diff options
Diffstat (limited to 'src/arrow/js/src')
110 files changed, 19516 insertions, 0 deletions
diff --git a/src/arrow/js/src/Arrow.dom.ts b/src/arrow/js/src/Arrow.dom.ts new file mode 100644 index 000000000..07f0c8b8e --- /dev/null +++ b/src/arrow/js/src/Arrow.dom.ts @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import streamAdapters from './io/adapters'; +import { Builder } from './builder/index'; +import { RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, } from './ipc/reader'; +import { RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, } from './ipc/writer'; +import { toDOMStream } from './io/whatwg/iterable'; +import { builderThroughDOMStream } from './io/whatwg/builder'; +import { recordBatchReaderThroughDOMStream } from './io/whatwg/reader'; +import { recordBatchWriterThroughDOMStream } from './io/whatwg/writer'; + +streamAdapters.toDOMStream = toDOMStream; +Builder['throughDOM'] = builderThroughDOMStream; +RecordBatchReader['throughDOM'] = recordBatchReaderThroughDOMStream; +RecordBatchFileReader['throughDOM'] = recordBatchReaderThroughDOMStream; +RecordBatchStreamReader['throughDOM'] = recordBatchReaderThroughDOMStream; +RecordBatchWriter['throughDOM'] = recordBatchWriterThroughDOMStream; +RecordBatchFileWriter['throughDOM'] = recordBatchWriterThroughDOMStream; +RecordBatchStreamWriter['throughDOM'] = recordBatchWriterThroughDOMStream; + +export { + DateUnit, IntervalUnit, MessageHeader, MetadataVersion, Precision, TimeUnit, Type, UnionMode, BufferType, + Data, + DataType, + Null, + Bool, + Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, + Float, Float16, Float32, Float64, + Utf8, + Binary, + FixedSizeBinary, + Date_, DateDay, DateMillisecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Decimal, + List, + Struct, + Union, DenseUnion, SparseUnion, + Dictionary, + Interval, IntervalDayTime, IntervalYearMonth, + FixedSizeList, + Map_, + Table, + Column, + Schema, Field, + Visitor, + Vector, + BaseVector, + BinaryVector, + BoolVector, + Chunked, + DateVector, DateDayVector, DateMillisecondVector, + DecimalVector, + DictionaryVector, + FixedSizeBinaryVector, + FixedSizeListVector, + FloatVector, 
Float16Vector, Float32Vector, Float64Vector, + IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector, + IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, + ListVector, + MapVector, + NullVector, + StructVector, + TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector, + TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector, + UnionVector, DenseUnionVector, SparseUnionVector, + Utf8Vector, + ByteStream, AsyncByteStream, AsyncByteQueue, ReadableSource, WritableSink, + RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader, + RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter, + MessageReader, AsyncMessageReader, JSONMessageReader, + Message, + RecordBatch, + ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions, + DataFrame, FilteredDataFrame, CountByResult, BindFunc, NextFunc, + predicate, + util, + Builder, + BinaryBuilder, + BoolBuilder, + DateBuilder, DateDayBuilder, DateMillisecondBuilder, + DecimalBuilder, + DictionaryBuilder, + FixedSizeBinaryBuilder, + FixedSizeListBuilder, + FloatBuilder, Float16Builder, Float32Builder, Float64Builder, + IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, + IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder, + ListBuilder, + MapBuilder, + NullBuilder, + StructBuilder, + TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, + TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, + UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, + Utf8Builder, + isTypedArray, +} from './Arrow'; diff --git 
a/src/arrow/js/src/Arrow.node.ts b/src/arrow/js/src/Arrow.node.ts new file mode 100644 index 000000000..44221f613 --- /dev/null +++ b/src/arrow/js/src/Arrow.node.ts @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import streamAdapters from './io/adapters'; +import { Builder } from './builder/index'; +import { RecordBatchReader } from './ipc/reader'; +import { RecordBatchWriter } from './ipc/writer'; +import { toNodeStream } from './io/node/iterable'; +import { builderThroughNodeStream } from './io/node/builder'; +import { recordBatchReaderThroughNodeStream } from './io/node/reader'; +import { recordBatchWriterThroughNodeStream } from './io/node/writer'; + +streamAdapters.toNodeStream = toNodeStream; +Builder['throughNode'] = builderThroughNodeStream; +RecordBatchReader['throughNode'] = recordBatchReaderThroughNodeStream; +RecordBatchWriter['throughNode'] = recordBatchWriterThroughNodeStream; + +export * from './Arrow.dom'; diff --git a/src/arrow/js/src/Arrow.ts b/src/arrow/js/src/Arrow.ts new file mode 100644 index 000000000..8bf296310 --- /dev/null +++ b/src/arrow/js/src/Arrow.ts @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +export { + DateUnit, + TimeUnit, + Precision, + UnionMode, + IntervalUnit, + MetadataVersion, +} from './fb/Schema'; + +export { MessageHeader } from './fb/Message'; + +export { Type, BufferType } from './enum'; + +export { Data } from './data'; +export { + DataType, + Null, + Bool, + Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, + Float, Float16, Float32, Float64, + Utf8, + Binary, + FixedSizeBinary, + Date_, DateDay, DateMillisecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Decimal, + List, + Struct, + Union, DenseUnion, SparseUnion, + Dictionary, + Interval, IntervalDayTime, IntervalYearMonth, + FixedSizeList, + Map_, +} from './type'; + +export { Table } from './table'; +export { Column } from './column'; +export { Visitor } from './visitor'; +export { Schema, Field } from './schema'; +export { + Vector, + BaseVector, + BinaryVector, + BoolVector, + Chunked, + DateVector, DateDayVector, DateMillisecondVector, + DecimalVector, + DictionaryVector, + FixedSizeBinaryVector, + FixedSizeListVector, + FloatVector, Float16Vector, Float32Vector, Float64Vector, + IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector, 
+ IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, + ListVector, + MapVector, + NullVector, + StructVector, + TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector, + TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector, + UnionVector, DenseUnionVector, SparseUnionVector, + Utf8Vector, +} from './vector/index'; + +export { + Builder, + BinaryBuilder, + BoolBuilder, + DateBuilder, DateDayBuilder, DateMillisecondBuilder, + DecimalBuilder, + DictionaryBuilder, + FixedSizeBinaryBuilder, + FixedSizeListBuilder, + FloatBuilder, Float16Builder, Float32Builder, Float64Builder, + IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, + IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder, + ListBuilder, + MapBuilder, + NullBuilder, + StructBuilder, + TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, + TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, + UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, + Utf8Builder, +} from './builder/index'; + +export { ByteStream, AsyncByteStream, AsyncByteQueue, ReadableSource, WritableSink } from './io/stream'; +export { RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader } from './ipc/reader'; +export { RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter } from './ipc/writer'; +export { MessageReader, AsyncMessageReader, JSONMessageReader } from './ipc/message'; +export { Message } from './ipc/metadata/message'; +export { RecordBatch } from './recordbatch'; +export { ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, 
ReadableDOMStreamOptions } from './io/interfaces'; +export { DataFrame, FilteredDataFrame, CountByResult, BindFunc, NextFunc } from './compute/dataframe'; + +import * as util_bn_ from './util/bn'; +import * as util_int_ from './util/int'; +import * as util_bit_ from './util/bit'; +import * as util_math_ from './util/math'; +import * as util_buffer_ from './util/buffer'; +import * as util_vector_ from './util/vector'; +import * as predicate from './compute/predicate'; +import { compareSchemas, compareFields, compareTypes } from './visitor/typecomparator'; + +export { predicate }; +/** @ignore */ +export const util = { + ...util_bn_, + ...util_int_, + ...util_bit_, + ...util_math_, + ...util_buffer_, + ...util_vector_, + compareSchemas, + compareFields, + compareTypes, +}; + +export { isTypedArray } from './util/args'; diff --git a/src/arrow/js/src/bin/arrow2csv.ts b/src/arrow/js/src/bin/arrow2csv.ts new file mode 100644 index 000000000..d5803cce0 --- /dev/null +++ b/src/arrow/js/src/bin/arrow2csv.ts @@ -0,0 +1,334 @@ +#! /usr/bin/env node + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
import * as fs from 'fs';
import * as stream from 'stream';
import { valueToString } from '../util/pretty';
import { Schema, RecordBatch, RecordBatchReader, AsyncByteQueue } from '../Arrow.node';

/* eslint-disable @typescript-eslint/no-require-imports */

// arrow2csv: reads Arrow data from the files named on the command line and/or
// from stdin (when stdin is not a TTY) and prints every record batch as
// delimited text rows on stdout.

const padLeft = require('pad-left');
const bignumJSONParse = require('json-bignum').parse;
const argv = require(`command-line-args`)(cliOpts(), { partial: true });
const files = argv.help ? [] : [...(argv.file || []), ...(argv._unknown || [])].filter(Boolean);

// Shared, mutable CLI state: the parsed options plus the "stdout went away" flag
// and the most recently measured column widths.
const state = { ...argv, closed: false, maxColWidths: [10] };

/** The subset of the shared CLI state that `batchesToString` reads and updates. */
type ToStringState = {
    hr: string;
    sep: string;
    schema: any;
    closed: boolean;
    metadata: boolean;
    maxColWidths: number[];
};

// Entry point: print every reader found in every source, then exit with 0, or with
// the result of `print_usage()` when no reader was found at all.
(async () => {

    const sources = argv.help ? [] : [
        ...files.map((file) => () => fs.createReadStream(file)),
        ...(process.stdin.isTTY ? [] : [() => process.stdin])
    ].filter(Boolean) as (() => NodeJS.ReadableStream)[];

    let reader: RecordBatchReader | null;
    let hasReaders = false;

    for (const source of sources) {
        if (state.closed) { break; }
        for await (reader of recordBatchReaders(source)) {
            hasReaders = true;
            const transformToString = batchesToString(state, reader.schema);
            await pipeTo(
                reader.pipe(transformToString),
                process.stdout, { end: false }
            ).catch(() => state.closed = true); // Handle EPIPE errors
        }
        if (state.closed) { break; }
    }

    return hasReaders ? 0 : print_usage();
})()
.then((x) => +x || 0, (err) => {
    if (err) {
        console.error(`${err?.stack || err}`);
    }
    return process.exitCode || 1;
}).then((code) => process.exit(code));

/**
 * Pipe `source` into `sink` and report completion as a Promise.
 *
 * Resolves when `source` emits `'end'`; rejects with the error when `sink` emits
 * `'error'`. Both listeners are removed as soon as either one fires. Errors emitted
 * by `source` itself are not observed here.
 *
 * @param source The readable stream to drain.
 * @param sink The writable stream to fill.
 * @param opts Options forwarded to `source.pipe()`.
 */
function pipeTo(source: NodeJS.ReadableStream, sink: NodeJS.WritableStream, opts?: { end: boolean }) {
    return new Promise<void>((resolve, reject) => {

        source.on('end', onEnd).pipe(sink, opts).on('error', onErr);

        function onEnd() { settle(undefined, resolve); }
        function onErr(err: any) { settle(err, reject); }
        function settle(e: any, cb: (e?: any) => void) {
            source.removeListener('end', onEnd);
            sink.removeListener('error', onErr);
            cb(e);
        }
    });
}

/**
 * Yield every `RecordBatchReader` found in a source stream.
 *
 * The source bytes are tee'd into two queues. The first is handed to
 * `RecordBatchReader.readAll()` directly. If that yields no reader (or throws), the
 * second queue is read to the end, parsed as JSON, and handed to `readAll()` again.
 * Failures of either attempt are swallowed on purpose: a source that yields nothing
 * simply contributes no readers.
 *
 * @param createSourceStream Factory for the stream to read; called exactly once.
 */
async function *recordBatchReaders(createSourceStream: () => NodeJS.ReadableStream) {

    const json = new AsyncByteQueue();
    const bytes = new AsyncByteQueue();
    const source = createSourceStream();
    let reader: RecordBatchReader | null = null;
    // tee the input source, just in case it's JSON
    source.on('end', () => [bytes, json].forEach((y) => y.close()))
        .on('data', (x) => [bytes, json].forEach((y) => y.write(x)))
        .on('error', (e) => [bytes, json].forEach((y) => y.abort(e)));

    try {
        for await (reader of RecordBatchReader.readAll(bytes)) {
            reader && (yield reader);
        }
        if (reader) return;
    } catch {
        // Not readable as-is; fall through and retry the same bytes as JSON.
    }

    await json.closed;
    if (source instanceof fs.ReadStream) { source.close(); }
    // If the data in the `json` ByteQueue parses to JSON, then assume it's Arrow JSON from a file or stdin
    try {
        for await (reader of RecordBatchReader.readAll(bignumJSONParse(await json.toString()))) {
            reader && (yield reader);
        }
    } catch {
        // Not JSON either: this source yields no (further) readers.
    }
}

/**
 * Create a Transform stream that accepts `RecordBatch` objects and emits utf8 text:
 * one line per row, with the header line repeated every 350 rows and whenever the
 * measured column widths change between batches.
 *
 * @param state Shared CLI state; `state.maxColWidths` is overwritten as batches are
 * measured, and `state.closed` stops output early.
 * @param schema Schema whose fields provide the header labels.
 */
function batchesToString(state: ToStringState, schema: Schema) {

    let rowId = 0;
    let batchId = -1;
    let maxColWidths = [10];
    const { hr, sep } = state;

    const header = ['row_id', ...schema.fields.map((f) => `${f}`)].map(valueToString);

    state.maxColWidths = header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length));

    return new stream.Transform({
        encoding: 'utf8',
        writableObjectMode: true,
        readableObjectMode: false,
        final(cb: (error?: Error | null) => void) {
            // if there were no batches, then print the Schema, and metadata
            if (batchId === -1) {
                hr && this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n\n`);
                this.push(`${formatRow(header, maxColWidths, sep)}\n`);
                if (state.metadata && schema.metadata.size > 0) {
                    this.push(`metadata:\n${formatMetadata(schema.metadata)}\n`);
                }
            }
            hr && this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n\n`);
            cb();
        },
        transform(batch: RecordBatch, _enc: string, cb: (error?: Error, data?: any) => void) {

            batch = !state.schema?.length ? batch : batch.select(...state.schema);

            if (state.closed) { return cb(undefined, null); }

            // Pass one to convert to strings and count max column widths
            state.maxColWidths = measureColumnWidths(rowId, batch, header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length)));

            // If this is the first batch in a stream, print a top horizontal rule, schema metadata,
            // and (when the batch has no rows or no columns) the header row
            if (++batchId === 0) {
                hr && this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n`);
                if (state.metadata && batch.schema.metadata.size > 0) {
                    this.push(`metadata:\n${formatMetadata(batch.schema.metadata)}\n`);
                    hr && this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n`);
                }
                if (batch.length <= 0 || batch.numCols <= 0) {
                    this.push(`${formatRow(header, maxColWidths = state.maxColWidths, sep)}\n`);
                }
            }

            if (batch.length > 0 && batch.numCols > 0) {
                // If any of the column widths changed, print the header again
                if (rowId % 350 !== 0 && JSON.stringify(state.maxColWidths) !== JSON.stringify(maxColWidths)) {
                    this.push(`${formatRow(header, state.maxColWidths, sep)}\n`);
                }
                maxColWidths = state.maxColWidths;
                for (const row of batch) {
                    if (state.closed) { break; } else if (!row) { continue; }
                    if (rowId++ % 350 === 0) {
                        this.push(`${formatRow(header, maxColWidths, sep)}\n`);
                    }
                    // NOTE(review): rowId has already been incremented, so printed ids start at 1,
                    // while the example in print_usage() shows ids starting at 0 - confirm which is intended.
                    this.push(`${formatRow([rowId, ...row.toArray()].map(valueToString), maxColWidths, sep)}\n`);
                }
            }
            cb();
        }
    });
}

/**
 * Build a horizontal rule line sized from the column widths and separator length.
 *
 * @param maxColWidths Width of each column.
 * @param hr Fill string handed to `padLeft`.
 * @param sep Column separator; only its length is used.
 */
function horizontalRule(maxColWidths: number[], hr = '', sep = ' | ') {
    const ruleWidth = maxColWidths.reduce((x, y) => x + y, -2 + maxColWidths.length * sep.length);
    return ` ${padLeft('', ruleWidth, hr)}`;
}

/**
 * Left-pad each cell to its column width and join the cells with `sep`.
 *
 * @param row Already-stringified cells.
 * @param maxColWidths Width of each column, by cell index.
 * @param sep Column separator.
 */
function formatRow(row: string[] = [], maxColWidths: number[] = [], sep = ' | ') {
    return `${row.map((x, j) => padLeft(x, maxColWidths[j])).join(sep)}`;
}

/**
 * Render schema metadata as one `key: value` entry per pair. Values that parse as
 * JSON are re-serialized with two-space indentation; all other values are used as-is.
 *
 * @param metadata The metadata key/value pairs.
 */
function formatMetadata(metadata: Map<string, string>) {

    return [...metadata].map(([key, val]) =>
        ` ${key}: ${formatMetadataValue(val)}`
    ).join(', \n');

    function formatMetadataValue(value = '') {
        let parsed = value;
        try {
            parsed = JSON.stringify(JSON.parse(value), null, 2);
        } catch {
            // Not JSON: keep the raw value.
        }
        return valueToString(parsed).split('\n').join('\n ');
    }
}

/**
 * Widen `maxColWidths` so that every value in `batch` fits its column.
 *
 * Index 0 is the row-id column; the value at position `j` of a row is measured into
 * index `j + 1`. The array passed in is mutated and returned.
 *
 * @param rowId Id of the first row in `batch`.
 * @param batch The rows to measure.
 * @param maxColWidths Starting widths.
 * @returns The same `maxColWidths` array, widened in place.
 */
function measureColumnWidths(rowId: number, batch: RecordBatch, maxColWidths: number[] = []) {
    for (const row of batch) {
        if (!row) { continue; }
        maxColWidths[0] = Math.max(maxColWidths[0] || 0, (`${rowId++}`).length);
        let j = 0;
        for (const val of row as Iterable<any>) {
            const elementWidth = val ? typedArrayElementWidths.get(val.constructor) : undefined;
            if (elementWidth !== undefined && (typeof val[Symbol.toPrimitive] !== 'function')) {
                // If we're printing a column of TypedArrays, ensure the column is wide enough to accommodate
                // the widest possible element for a given byte size, since JS omits leading zeroes. For example:
                // 1 | [1137743649,2170567488,244696391,2122556476]
                // 2 | null
                // 3 | [637174007,2142281880,961736230,2912449282]
                // 4 | [1035112265,21832886,412842672,2207710517]
                // 5 | null
                // 6 | null
                // 7 | [2755142991,4192423256,2994359,467878370]
                maxColWidths[j + 1] = Math.max(maxColWidths[j + 1] || 0,
                    2 + // brackets on each end
                    Math.max(0, val.length - 1) + // commas between elements (none for an empty array)
                    (val.length * elementWidth) // widest stringified element
                );
            } else {
                maxColWidths[j + 1] = Math.max(maxColWidths[j + 1] || 0, valueToString(val).length);
            }
            ++j;
        }
    }
    return maxColWidths;
}

// Measure the widest stringified element for each TypedArray variant.
const typedArrayElementWidths = (() => {
    const maxElementWidth = (ArrayType: any) => {
        // Only floating-point arrays store 0.5 as non-zero
        // (integer arrays truncate it, Uint8ClampedArray rounds it to 0).
        const isFloat = new ArrayType([0.5])[0] !== 0;
        if (isFloat) {
            // Floats have no single "widest" value; keep measuring the 0xFF..FE byte
            // pattern (0xFE last so the result is not NaN).
            const octets = Array.from({ length: ArrayType.BYTES_PER_ELEMENT - 1 }, () => 255);
            return `${new ArrayType(new Uint8Array([...octets, 254]).buffer)[0]}`.length;
        }
        const bits = ArrayType.BYTES_PER_ELEMENT * 8;
        // Only signed arrays store -1 as a negative number.
        const isSigned = new ArrayType([-1])[0] < 0;
        // The widest signed value is the minimum (it carries the minus sign);
        // the widest unsigned value is the maximum.
        const widest = isSigned ? -(2 ** (bits - 1)) : (2 ** bits) - 1;
        return `${widest}`.length;
    };
    return new Map<any, number>([
        [Int8Array, maxElementWidth(Int8Array)],
        [Int16Array, maxElementWidth(Int16Array)],
        [Int32Array, maxElementWidth(Int32Array)],
        [Uint8Array, maxElementWidth(Uint8Array)],
        [Uint16Array, maxElementWidth(Uint16Array)],
        [Uint32Array, maxElementWidth(Uint32Array)],
        [Float32Array, maxElementWidth(Float32Array)],
        [Float64Array, maxElementWidth(Float64Array)],
        [Uint8ClampedArray, maxElementWidth(Uint8ClampedArray)]
    ]);
})();

/**
 * The command-line option definitions. The same list is handed to the argument
 * parser at start-up and to the usage printer in `print_usage()`.
 */
function cliOpts() {
    return [
        {
            type: String,
            name: 'schema', alias: 's',
            optional: true, multiple: true,
            typeLabel: '{underline columns}',
            description: 'A space-delimited list of column names'
        },
        {
            type: String,
            name: 'file', alias: 'f',
            optional: true, multiple: true,
            description: 'The Arrow file to read'
        },
        {
            type: String,
            name: 'sep', optional: true, default: ' | ',
            description: 'The column separator character (default: " | ")'
        },
        {
            type: String,
            name: 'hr', optional: true, default: '',
            description: 'The horizontal border character (default: "")'
        },
        {
            type: Boolean,
            name: 'metadata', alias: 'm',
            optional: true, default: false,
            description: 'Flag to print Schema metadata (default: false)'
        },
        {
            type: Boolean,
            name: 'help', optional: true, default: false,
            description: 'Print this usage guide.'
        }
    ];
}

/**
 * Print the usage guide to stdout.
 *
 * @returns `1`, which the entry point uses as the process exit code.
 */
function print_usage() {
    console.log(require('command-line-usage')([
        {
            header: 'arrow2csv',
            content: 'Print a CSV from an Arrow file'
        },
        {
            header: 'Synopsis',
            content: [
                '$ arrow2csv {underline file.arrow} [{bold --schema} column_name ...]',
                '$ arrow2csv [{bold --schema} column_name ...] [{bold --file} {underline file.arrow}]',
                '$ arrow2csv {bold -s} column_1 {bold -s} column_2 [{bold -f} {underline file.arrow}]',
                '$ arrow2csv [{bold --help}]'
            ]
        },
        {
            header: 'Options',
            optionList: cliOpts()
        },
        {
            header: 'Example',
            content: [
                '$ arrow2csv --schema foo baz --sep " , " -f simple.arrow',
                '> "row_id", "foo: Int32", "baz: Utf8"',
                '> 0, 1, "aa"',
                '> 1, null, null',
                '> 2, 3, null',
                '> 3, 4, "bbb"',
                '> 4, 5, "cccc"',
            ]
        }
    ]));
    return 1;
}
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from './vector'; +import { BufferType } from './enum'; +import { Data, Buffers } from './data'; +import { createIsValidFunction } from './builder/valid'; +import { BuilderType as B, VectorType as V} from './interfaces'; +import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer'; +import { + DataType, strideForType, + Float, Int, Decimal, FixedSizeBinary, + Date_, Time, Timestamp, Interval, + Utf8, Binary, List, Map_ +} from './type'; + +/** + * A set of options required to create a `Builder` instance for a given `DataType`. + * @see {@link Builder} + */ +export interface BuilderOptions<T extends DataType = any, TNull = any> { + type: T; + nullValues?: TNull[] | ReadonlyArray<TNull> | null; + children?: { [key: string]: BuilderOptions } | BuilderOptions[]; +} + +/** + * A set of options to create an Iterable or AsyncIterable `Builder` transform function. + * @see {@link Builder.throughIterable} + * @see {@link Builder.throughAsyncIterable} + */ + +export interface IterableBuilderOptions<T extends DataType = any, TNull = any> extends BuilderOptions<T, TNull> { + highWaterMark?: number; + queueingStrategy?: 'bytes' | 'count'; + dictionaryHashFunction?: (value: any) => string | number; + valueToChildTypeId?: (builder: Builder<T, TNull>, value: any, offset: number) => number; +} + +/** + * An abstract base class for types that construct Arrow Vectors from arbitrary JavaScript values. 
+ * + * A `Builder` is responsible for writing arbitrary JavaScript values + * to ArrayBuffers and/or child Builders according to the Arrow specification + * for each DataType, creating or resizing the underlying ArrayBuffers as necessary. + * + * The `Builder` for each Arrow `DataType` handles converting and appending + * values for a given `DataType`. The high-level {@link Builder.new `Builder.new()`} convenience + * method creates the specific `Builder` subclass for the supplied `DataType`. + * + * Once created, `Builder` instances support both appending values to the end + * of the `Builder`, and random-access writes to specific indices + * (`Builder.prototype.append(value)` is a convenience method for + * `builder.set(builder.length, value)`). Appending or setting values beyond the + * Builder's current length may cause the builder to grow its underlying buffers + * or child Builders (if applicable) to accommodate the new values. + * + * After enough values have been written to a `Builder`, `Builder.prototype.flush()` + * will commit the values to the underlying ArrayBuffers (or child Builders). The + * internal Builder state will be reset, and an instance of `Data<T>` is returned. + * Alternatively, `Builder.prototype.toVector()` will flush the `Builder` and return + * an instance of `Vector<T>` instead. + * + * When there are no more values to write, use `Builder.prototype.finish()` to + * finalize the `Builder`. This does not reset the internal state, so it is + * necessary to call `Builder.prototype.flush()` or `toVector()` one last time + * if there are still values queued to be flushed. + * + * Note: calling `Builder.prototype.finish()` is required when using a `DictionaryBuilder`, + * because this is when it flushes the values that have been enqueued in its internal + * dictionary's `Builder`, and creates the `dictionaryVector` for the `Dictionary` `DataType`. 
+ * + * ```ts + * import { Builder, Utf8 } from 'apache-arrow'; + * + * const utf8Builder = Builder.new({ + * type: new Utf8(), + * nullValues: [null, 'n/a'] + * }); + * + * utf8Builder + * .append('hello') + * .append('n/a') + * .append('world') + * .append(null); + * + * const utf8Vector = utf8Builder.finish().toVector(); + * + * console.log(utf8Vector.toJSON()); + * // > ["hello", null, "world", null] + * ``` + * + * @typeparam T The `DataType` of this `Builder`. + * @typeparam TNull The type(s) of values which will be considered null-value sentinels. + */ +export abstract class Builder<T extends DataType = any, TNull = any> { + + /** + * Create a `Builder` instance based on the `type` property of the supplied `options` object. + * @param {BuilderOptions<T, TNull>} options An object with a required `DataType` instance + * and other optional parameters to be passed to the `Builder` subclass for the given `type`. + * + * @typeparam T The `DataType` of the `Builder` to create. + * @typeparam TNull The type(s) of values which will be considered null-value sentinels. 
+ * @nocollapse + */ + // @ts-ignore + public static new<T extends DataType = any, TNull = any>(options: BuilderOptions<T, TNull>): B<T, TNull> {} + + /** @nocollapse */ + // @ts-ignore + public static throughNode<T extends DataType = any, TNull = any>(options: import('./io/node/builder').BuilderDuplexOptions<T, TNull>): import('stream').Duplex { + throw new Error(`"throughNode" not available in this environment`); + } + /** @nocollapse */ + // @ts-ignore + public static throughDOM<T extends DataType = any, TNull = any>(options: import('./io/whatwg/builder').BuilderTransformOptions<T, TNull>): import('./io/whatwg/builder').BuilderTransform<T, TNull> { + throw new Error(`"throughDOM" not available in this environment`); + } + + /** + * Transform a synchronous `Iterable` of arbitrary JavaScript values into a + * sequence of Arrow Vector<T> following the chunking semantics defined in + * the supplied `options` argument. + * + * This function returns a function that accepts an `Iterable` of values to + * transform. When called, this function returns an Iterator of `Vector<T>`. + * + * The resulting `Iterator<Vector<T>>` yields Vectors based on the + * `queueingStrategy` and `highWaterMark` specified in the `options` argument. + * + * * If `queueingStrategy` is `"count"` (or omitted), The `Iterator<Vector<T>>` + * will flush the underlying `Builder` (and yield a new `Vector<T>`) once the + * Builder's `length` reaches or exceeds the supplied `highWaterMark`. + * * If `queueingStrategy` is `"bytes"`, the `Iterator<Vector<T>>` will flush + * the underlying `Builder` (and yield a new `Vector<T>`) once its `byteLength` + * reaches or exceeds the supplied `highWaterMark`. + * + * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use. 
+ * @returns A function which accepts a JavaScript `Iterable` of values to + * write, and returns an `Iterator` that yields Vectors according + * to the chunking semantics defined in the `options` argument. + * @nocollapse + */ + public static throughIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) { + return throughIterable(options); + } + + /** + * Transform an `AsyncIterable` of arbitrary JavaScript values into a + * sequence of Arrow Vector<T> following the chunking semantics defined in + * the supplied `options` argument. + * + * This function returns a function that accepts an `AsyncIterable` of values to + * transform. When called, this function returns an AsyncIterator of `Vector<T>`. + * + * The resulting `AsyncIterator<Vector<T>>` yields Vectors based on the + * `queueingStrategy` and `highWaterMark` specified in the `options` argument. + * + * * If `queueingStrategy` is `"count"` (or omitted), The `AsyncIterator<Vector<T>>` + * will flush the underlying `Builder` (and yield a new `Vector<T>`) once the + * Builder's `length` reaches or exceeds the supplied `highWaterMark`. + * * If `queueingStrategy` is `"bytes"`, the `AsyncIterator<Vector<T>>` will flush + * the underlying `Builder` (and yield a new `Vector<T>`) once its `byteLength` + * reaches or exceeds the supplied `highWaterMark`. + * + * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use. + * @returns A function which accepts a JavaScript `AsyncIterable` of values + * to write, and returns an `AsyncIterator` that yields Vectors + * according to the chunking semantics defined in the `options` + * argument. 
+ * @nocollapse + */ + public static throughAsyncIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) { + return throughAsyncIterable(options); + } + + /** + * Construct a builder with the given Arrow DataType with optional null values, + * which will be interpreted as "null" when set or appended to the `Builder`. + * @param {{ type: T, nullValues?: any[] }} options A `BuilderOptions` object used to create this `Builder`. + */ + constructor({ 'type': type, 'nullValues': nulls }: BuilderOptions<T, TNull>) { + this.type = type; + this.children = []; + this.nullValues = nulls; + this.stride = strideForType(type); + this._nulls = new BitmapBufferBuilder(); + if (nulls && nulls.length > 0) { /* sentinels supplied: install a matcher; otherwise the prototype default `_isValid` (always true) is used */ + this._isValid = createIsValidFunction(nulls); + } + } + + /** + * The Builder's `DataType` instance. + * @readonly + */ + public type: T; + /** + * The number of values written to the `Builder` that haven't been flushed yet. + * @readonly + */ + public length = 0; + /** + * A boolean indicating whether `Builder.prototype.finish()` has been called on this `Builder`. + * @readonly + */ + public finished = false; + /** + * The number of elements in the underlying values TypedArray that + * represent a single logical element, determined by this Builder's + * `DataType`. This is 1 for most types, but is larger when the `DataType` + * is `Int64`, `Uint64`, `Decimal`, `DateMillisecond`, certain variants of + * `Interval`, `Time`, or `Timestamp`, `FixedSizeBinary`, and `FixedSizeList`. + * @readonly + */ + public readonly stride: number; + /** The child Builders; initialized to an empty array by the constructor. */ public readonly children: Builder[]; + /** + * The list of null-value sentinels for this `Builder`. When one of these values + * is written to the `Builder` (either via `Builder.prototype.set()` or `Builder.prototype.append()`), + * a 0-bit (marking the slot as null) is written to this Builder's underlying null BitmapBufferBuilder.
+ * @readonly + */ + public readonly nullValues?: TNull[] | ReadonlyArray<TNull> | null; + + /** + * Flush the `Builder` and return a `Vector<T>`. + * @returns {Vector<T>} A `Vector<T>` of the flushed values. + */ + public toVector() { return Vector.new(this.flush()); } + + /** The `ArrayType` exposed by this Builder's `DataType`. */ public get ArrayType() { return this.type.ArrayType; } + /** The number of slots currently marked null (invalid) in the null bitmap. */ public get nullCount() { return this._nulls.numInvalid; } + /** The number of child Builders. */ public get numChildren() { return this.children.length; } + + /** + * @returns The aggregate length (in bytes) of the values that have been written, + * summed over the offsets, values, null-bitmap and type-id buffers that exist, + * plus the `byteLength` of every child Builder. + */ + public get byteLength(): number { + let size = 0; + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + this._typeIds && (size += this._typeIds.byteLength); + return this.children.reduce((size, child) => size + child.byteLength, size); + } + + /** + * @returns The number of rows that have been reserved to write new values, + * as reported by the null bitmap buffer. + */ + public get reservedLength(): number { + return this._nulls.reservedLength; + } + + /** + * @returns The aggregate length (in bytes) that has been reserved to write new values. + */ + public get reservedByteLength(): number { + let size = 0; + this._offsets && (size += this._offsets.reservedByteLength); + this._values && (size += this._values.reservedByteLength); + this._nulls && (size += this._nulls.reservedByteLength); + this._typeIds && (size += this._typeIds.reservedByteLength); + return this.children.reduce((size, child) => size + child.reservedByteLength, size); + } + + protected _offsets!: DataBufferBuilder<Int32Array>; + public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } + + protected _values!: BufferBuilder<T['TArray'], any>; + public get values() { return this._values ? this._values.buffer : null; } + + protected _nulls: BitmapBufferBuilder; + public get nullBitmap() { return this._nulls ?
this._nulls.buffer : null; } + + protected _typeIds!: DataBufferBuilder<Int8Array>; + public get typeIds() { return this._typeIds ? this._typeIds.buffer : null; } + + protected _isValid!: (value: T['TValue'] | TNull) => boolean; + protected _setValue!: (inst: Builder<T>, index: number, value: T['TValue']) => void; + + /** + * Appends a value (or null) to this `Builder`. + * This is equivalent to `builder.set(builder.length, value)`. + * @param {T['TValue'] | TNull } value The value to append. + */ + public append(value: T['TValue'] | TNull) { return this.set(this.length, value); } + + /** + * Validates whether a value is valid (true), or null (false). + * @param {T['TValue'] | TNull } value The value to compare against the null-value representations. + */ + public isValid(value: T['TValue'] | TNull): boolean { return this._isValid(value); } + + /** + * Write a value (or null-value sentinel) at the supplied index. + * If the value matches one of the null-value representations, a 0-bit is + * written to the null `BitmapBufferBuilder` and the value is not stored. + * Otherwise, a 1-bit is written to the null `BitmapBufferBuilder`, and the + * value is passed to `Builder.prototype.setValue()`. + * @param {number} index The index of the value to write. + * @param {T['TValue'] | TNull } value The value to write at the supplied index. + * @returns {this} The updated `Builder` instance. + */ + public set(index: number, value: T['TValue'] | TNull) { + if (this.setValid(index, this.isValid(value))) { + this.setValue(index, value); + } + return this; + } + + /** + * Write a value to the underlying buffers at the supplied index, bypassing + * the null-value check.
This is a low-level method that delegates to this Builder's + * `_setValue` function and does not update the null bitmap. + * @param {number} index The index of the value to write. + * @param {T['TValue']} value The value to write at the supplied index. + */ + public setValue(index: number, value: T['TValue']) { this._setValue(this, index, value); } + /** Writes `valid` (as a 1 or 0 bit) into the null bitmap at `index`, updates `length` from the bitmap's length, and returns `valid`. */ public setValid(index: number, valid: boolean) { + this.length = this._nulls.set(index, +valid).length; + return valid; + } + + // @ts-ignore + public addChild(child: Builder, name = `${this.numChildren}`) { + throw new Error(`Cannot append children to non-nested type "${this.type}"`); + } + + /** + * Retrieve the child `Builder` at the supplied `index`, or null if no child + * exists at that index. + * @param {number} index The index of the child `Builder` to retrieve. + * @returns {Builder | null} The child Builder at the supplied index or null. + */ + public getChildAt<R extends DataType = any>(index: number): Builder<R> | null { + return this.children[index] || null; + } + + /** + * Commit all the values that have been written to their underlying + * ArrayBuffers, including any child Builders if applicable, and reset + * the internal `Builder` state. + * @returns A `Data<T>` of the buffers and childData representing the values written.
+ */ + public flush() { + + const buffers: any = []; + const values = this._values; + const offsets = this._offsets; + const typeIds = this._typeIds; + const { length, nullCount } = this; + + if (typeIds) { /* Unions */ + buffers[BufferType.TYPE] = typeIds.flush(length); + // DenseUnions + offsets && (buffers[BufferType.OFFSET] = offsets.flush(length)); + } else if (offsets) { /* Variable-width primitives (Binary, Utf8) and Lists */ + // Binary, Utf8 -- flush values first: offsets.flush() clears the offsets that offsets.last() reads + values && (buffers[BufferType.DATA] = values.flush(offsets.last())); + buffers[BufferType.OFFSET] = offsets.flush(length); + } else if (values) { /* Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, and Interval) */ + buffers[BufferType.DATA] = values.flush(length); + } + + /* Only emit a validity buffer when at least one slot is null. */ nullCount > 0 && (buffers[BufferType.VALIDITY] = this._nulls.flush(length)); + + const data = Data.new<T>( + this.type, 0, length, nullCount, buffers as Buffers<T>, + this.children.map((child) => child.flush())) as Data<T>; + + this.clear(); + + return data; + } + + /** + * Finalize this `Builder`, and child builders if applicable. + * Sets `finished` to true and calls `finish()` on every child. + * @returns {this} The finalized `Builder` instance. + */ + public finish() { + this.finished = true; + this.children.forEach((child) => child.finish()); + return this; + } + + /** + * Clear this Builder's internal state, including child Builders if applicable, and reset the length to 0. + * @returns {this} The cleared `Builder` instance.
+ */ + public clear() { + this.length = 0; + this._offsets && (this._offsets.clear()); + this._values && (this._values.clear()); + this._nulls && (this._nulls.clear()); + this._typeIds && (this._typeIds.clear()); + this.children.forEach((child) => child.clear()); + return this; + } +} + +(Builder.prototype as any).length = 1; +(Builder.prototype as any).stride = 1; +(Builder.prototype as any).children = null; +(Builder.prototype as any).finished = false; +(Builder.prototype as any).nullValues = null; +(Builder.prototype as any)._isValid = () => true; /* default: every value is treated as valid unless the constructor installs a null-sentinel check */ + +/** + * Base class for Builders whose values live in a single `DataBufferBuilder` + * created from the type's `ArrayType` and this Builder's `stride`. + * @ignore + */ +export abstract class FixedWidthBuilder<T extends Int | Float | FixedSizeBinary | Date_ | Timestamp | Time | Decimal | Interval = any, TNull = any> extends Builder<T, TNull> { + constructor(opts: BuilderOptions<T, TNull>) { + super(opts); + this._values = new DataBufferBuilder(new this.ArrayType(0), this.stride); + } + public setValue(index: number, value: T['TValue']) { + const values = this._values; + /* grow the values buffer so that `index` is addressable before writing */ values.reserve(index - values.length + 1); + return super.setValue(index, value); + } +} + +/** + * Base class for Builders that hold written values in a pending `Map` (keyed by + * index) until `flush()` or `finish()` hands them to the subclass's `_flushPending`. + * @ignore + */ +export abstract class VariableWidthBuilder<T extends Binary | Utf8 | List | Map_, TNull = any> extends Builder<T, TNull> { + protected _pendingLength = 0; + protected _offsets: OffsetsBufferBuilder; + protected _pending: Map<number, any> | undefined; + constructor(opts: BuilderOptions<T, TNull>) { + super(opts); + this._offsets = new OffsetsBufferBuilder(); + } + public setValue(index: number, value: T['TValue']) { + const pending = this._pending || (this._pending = new Map()); + const current = pending.get(index); + /* overwriting a pending value: remove its length from the running total first */ current && (this._pendingLength -= current.length); + this._pendingLength += value.length; + pending.set(index, value); + } + public setValid(index: number, isValid: boolean) { + if (!super.setValid(index, isValid)) { + /* null slot: remember it as `undefined` so it is still visited when pending values are flushed */ (this._pending || (this._pending = new Map())).set(index, undefined); + return false; + } + return true; + } + public clear() { + this._pendingLength = 0;
this._pending = undefined; + return super.clear(); + } + public flush() { + this._flush(); + return super.flush(); + } + public finish() { + this._flush(); + return super.finish(); + } + protected _flush() { + const pending = this._pending; + const pendingLength = this._pendingLength; + /* reset the pending state, then hand any captured values to the subclass */ this._pendingLength = 0; + this._pending = undefined; + if (pending && pending.size > 0) { + this._flushPending(pending, pendingLength); + } + return this; + } + /** Subclass hook that writes the `pending` values (keyed by index; `undefined` marks a null slot); `pendingLength` is the summed `length` of the pending values. */ protected abstract _flushPending(pending: Map<number, any>, pendingLength: number): void; +} + +/** @ignore */ +type ThroughIterable<T extends DataType = any, TNull = any> = (source: Iterable<T['TValue'] | TNull>) => IterableIterator<V<T>>; + +/** + * Implementation behind `Builder.throughIterable`. + * Defaults: `queueingStrategy` is 'count'; `highWaterMark` is 1000 for 'count' and 2 ** 14 for 'bytes'. + * @ignore + */ +function throughIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) { + const { ['queueingStrategy']: queueingStrategy = 'count' } = options; + const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? 1000 : 2 ** 14 } = options; + const sizeProperty: 'length' | 'byteLength' = queueingStrategy !== 'bytes' ? 'length' : 'byteLength'; + return function*(source: Iterable<T['TValue'] | TNull>) { + let numChunks = 0; + const builder = Builder.new(options); + for (const value of source) { + if (builder.append(value)[sizeProperty] >= highWaterMark) { + ++numChunks && (yield builder.toVector()); + } + } + /* yield the remainder, or a single empty Vector when nothing was yielded at all */ if (builder.finish().length > 0 || numChunks === 0) { + yield builder.toVector(); + } + } as ThroughIterable<T, TNull>; +} + +/** @ignore */ +type ThroughAsyncIterable<T extends DataType = any, TNull = any> = (source: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull>) => AsyncIterableIterator<V<T>>; + +/** + * Implementation behind `Builder.throughAsyncIterable`; the async counterpart of `throughIterable`. + * @ignore + */ +function throughAsyncIterable<T extends DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) { + const { ['queueingStrategy']: queueingStrategy = 'count' } = options; + const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ?
1000 : 2 ** 14 } = options; + const sizeProperty: 'length' | 'byteLength' = queueingStrategy !== 'bytes' ? 'length' : 'byteLength'; + return async function* (source: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull>) { + let numChunks = 0; + const builder = Builder.new(options); + for await (const value of source) { + /* flush and yield a chunk once the builder's size reaches the high-water mark */ if (builder.append(value)[sizeProperty] >= highWaterMark) { + ++numChunks && (yield builder.toVector()); + } + } + /* yield the remainder, or a single empty Vector when nothing was yielded at all */ if (builder.finish().length > 0 || numChunks === 0) { + yield builder.toVector(); + } + } as ThroughAsyncIterable<T, TNull>; +} diff --git a/src/arrow/js/src/builder/binary.ts b/src/arrow/js/src/builder/binary.ts new file mode 100644 index 000000000..829da5c97 --- /dev/null +++ b/src/arrow/js/src/builder/binary.ts @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +import { Binary } from '../type'; +import { toUint8Array } from '../util/buffer'; +import { BufferBuilder } from './buffer'; +import { VariableWidthBuilder, BuilderOptions } from '../builder'; + +/** + * Builder for `Binary` values: each value is converted with `toUint8Array`, + * held as pending, and copied into a `Uint8Array` values buffer on flush. + * @ignore + */ +export class BinaryBuilder<TNull = any> extends VariableWidthBuilder<Binary, TNull> { + constructor(opts: BuilderOptions<Binary, TNull>) { + super(opts); + this._values = new BufferBuilder(new Uint8Array(0)); + } + public get byteLength(): number { + /* bytes still pending plus 4 bytes per element -- presumably the Int32 offsets not yet written; TODO confirm */ let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: Uint8Array) { + return super.setValue(index, toUint8Array(value)); + } + protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number) { + const offsets = this._offsets; + const data = this._values.reserve(pendingLength).buffer; + /* NOTE(review): `offset` starts at 0, so bytes are written from the start of the values buffer -- assumes it holds no earlier data; confirm */ let index = 0, length = 0, offset = 0, value: Uint8Array | undefined; + for ([index, value] of pending) { + if (value === undefined) { + /* null slot: zero-length entry */ offsets.set(index, 0); + } else { + length = value.length; + data.set(value, offset); + offsets.set(index, length); + offset += length; + } + } + } +} diff --git a/src/arrow/js/src/builder/bool.ts b/src/arrow/js/src/builder/bool.ts new file mode 100644 index 000000000..5c0e0950e --- /dev/null +++ b/src/arrow/js/src/builder/bool.ts @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Bool } from '../type'; +import { BitmapBufferBuilder } from './buffer'; +import { Builder, BuilderOptions } from '../builder'; + +/** + * Builder for `Bool` values; values are stored as bits in a `BitmapBufferBuilder`. + * @ignore + */ +export class BoolBuilder<TNull = any> extends Builder<Bool, TNull> { + constructor(options: BuilderOptions<Bool, TNull>) { + super(options); + this._values = new BitmapBufferBuilder(); + } + public setValue(index: number, value: boolean) { + /* `+value` turns the boolean into the 1/0 bit the bitmap expects */ this._values.set(index, +value); + } +} diff --git a/src/arrow/js/src/builder/buffer.ts b/src/arrow/js/src/builder/buffer.ts new file mode 100644 index 000000000..3c20cc001 --- /dev/null +++ b/src/arrow/js/src/builder/buffer.ts @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +import { memcpy } from '../util/buffer'; +import { BigIntAvailable, BigInt64Array, BigUint64Array } from '../util/compat'; +import { + TypedArray, TypedArrayConstructor, + BigIntArray, BigIntArrayConstructor +} from '../interfaces'; + +/** @ignore */ type DataValue<T> = T extends TypedArray ? number : T extends BigIntArray ? WideValue<T> : T; +/** @ignore */ type WideValue<T extends BigIntArray> = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never; +/** @ignore */ type ArrayCtor<T extends TypedArray | BigIntArray> = + T extends TypedArray ? TypedArrayConstructor<T> : + T extends BigIntArray ? BigIntArrayConstructor<T> : + any; + +/** + * Rounds `len` elements of `BPE` bytes each up to the next multiple of 64 bytes + * (64 bytes when the result would be 0) and returns that size as an element count. + * @ignore + */ +const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((len * BPE) + 63) & ~63) || 64) / BPE; +/** + * Returns `arr.subarray(0, len)` when `arr` has at least `len` elements; otherwise + * allocates a new array of length `len` with the same constructor and passes it, with `arr`, to `memcpy`. + * @ignore + */ +const sliceOrExtendArray = <T extends TypedArray | BigIntArray>(arr: T, len = 0) => ( + arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) +) as T; + +/** @ignore */ +export interface BufferBuilder<T extends TypedArray | BigIntArray = any, TValue = DataValue<T>> { + readonly offset: number; +} + +/** + * Growable wrapper around a typed array; `length` counts logical elements, + * each spanning `stride` slots of the underlying `buffer`. + * @ignore + */ +export class BufferBuilder<T extends TypedArray | BigIntArray = any, TValue = DataValue<T>> { + + constructor(buffer: T, stride = 1) { + this.buffer = buffer; + this.stride = stride; + this.BYTES_PER_ELEMENT = buffer.BYTES_PER_ELEMENT; + this.ArrayType = buffer.constructor as ArrayCtor<T>; + this._resize(this.length = buffer.length / stride | 0); + } + + public buffer: T; + public length: number; + public readonly stride: number; + public readonly ArrayType: ArrayCtor<T>; + public readonly BYTES_PER_ELEMENT: number; + + /** Bytes occupied by the `length` logical elements written so far. */ public get byteLength() { return this.length * this.stride * this.BYTES_PER_ELEMENT | 0; } + /** Number of logical elements the current `buffer` can hold. */ public get reservedLength() { return this.buffer.length / this.stride; } + /** Size in bytes of the current `buffer`. */ public get reservedByteLength() { return this.buffer.byteLength; } + + // @ts-ignore + public set(index: number, value: TValue) { return this; }
public append(value: TValue) { return this.set(this.length, value); } + /** Grows `length` by `extra` (when positive) and reallocates once the required slot count reaches the buffer's capacity: the rounded size on first allocation, twice the required size afterwards. */ public reserve(extra: number) { + if (extra > 0) { + this.length += extra; + const stride = this.stride; + const length = this.length * stride; + const reserved = this.buffer.length; + if (length >= reserved) { + this._resize(reserved === 0 + ? roundLengthUpToNearest64Bytes(length * 1, this.BYTES_PER_ELEMENT) + : roundLengthUpToNearest64Bytes(length * 2, this.BYTES_PER_ELEMENT) + ); + } + } + return this; + } + /** Returns the buffer sliced or extended to `length` elements rounded up to a 64-byte multiple, then clears this builder. */ public flush(length = this.length) { + length = roundLengthUpToNearest64Bytes(length * this.stride, this.BYTES_PER_ELEMENT); + const array = sliceOrExtendArray<T>(this.buffer, length); + this.clear(); + return array; + } + public clear() { + this.length = 0; + this._resize(0); + return this; + } + protected _resize(newLength: number) { + return this.buffer = <T> memcpy(new this.ArrayType(newLength), this.buffer); + } +} + +(BufferBuilder.prototype as any).offset = 0; + +/** + * `BufferBuilder` for numeric values, with indexed `get`/`set` and `last()`. + * @ignore + */ +export class DataBufferBuilder<T extends TypedArray> extends BufferBuilder<T, number> { + public last() { return this.get(this.length - 1); } + /** NOTE(review): reads `buffer[index]` while `set` writes `buffer[index * stride]`; the two agree only when `stride` is 1 -- confirm. */ public get(index: number) { return this.buffer[index]; } + public set(index: number, value: number) { + this.reserve(index - this.length + 1); + this.buffer[index * this.stride] = value; + return this; + } +} + +/** + * Bit-packed builder over a `Uint8Array` (stride 1 / 8, i.e. eight logical bits per byte) + * that also counts how many bits are currently set in `numValid`. + * @ignore + */ +export class BitmapBufferBuilder extends DataBufferBuilder<Uint8Array> { + + constructor(data = new Uint8Array(0)) { super(data, 1 / 8); } + + public numValid = 0; + public get numInvalid() { return this.length - this.numValid; } + public get(idx: number) { return this.buffer[idx >> 3] >> idx % 8 & 1; } + public set(idx: number, val: number) { + const { buffer } = this.reserve(idx - this.length + 1); + const byte = idx >> 3, bit = idx % 8, cur = buffer[byte] >> bit & 1; + // If `val` is truthy and the current bit is 0, flip it to 1 and increment `numValid`.
+ // If `val` is falsy and the current bit is 1, flip it to 0 and decrement `numValid`. + val ? cur === 0 && ((buffer[byte] |= (1 << bit)), ++this.numValid) + : cur === 1 && ((buffer[byte] &= ~(1 << bit)), --this.numValid); + return this; + } + public clear() { + this.numValid = 0; + return super.clear(); + } +} + +/** + * Builder for an offsets buffer. `set(index, value)` treats `value` as the length of + * element `index`: it writes `previous offset + value` at slot `index + 1`, after + * filling any skipped slots with the last written offset. + * @ignore + */ +export class OffsetsBufferBuilder extends DataBufferBuilder<Int32Array> { + constructor(data = new Int32Array(1)) { super(data, 1); } + public append(value: number) { + return this.set(this.length - 1, value); + } + public set(index: number, value: number) { + const offset = this.length - 1; + const buffer = this.reserve(index - offset + 1).buffer; + /* `index++` moves on to the slot that receives the end offset; when slots were skipped, fill them with the last offset */ if (offset < index++) { + buffer.fill(buffer[offset], offset, index); + } + buffer[index] = buffer[index - 1] + value; + return this; + } + public flush(length = this.length - 1) { + if (length > this.length) { + this.set(length - 1, 0); + } + return super.flush(length + 1); + } +} + +/** + * `BufferBuilder` that also keeps a 64-bit BigInt typed-array view (`buffer64`) + * over the same memory when `BigIntAvailable` is true. + * @ignore + */ +export class WideBufferBuilder<T extends TypedArray, R extends BigIntArray> extends BufferBuilder<T, DataValue<T>> { + public buffer64!: R; + protected _ArrayType64!: BigIntArrayConstructor<R>; + public get ArrayType64() { + return this._ArrayType64 || (this._ArrayType64 = <BigIntArrayConstructor<R>> (this.buffer instanceof Int32Array ?
BigInt64Array : BigUint64Array)); + } + public set(index: number, value: DataValue<T>) { + this.reserve(index - this.length + 1); + switch (typeof value) { + case 'bigint': /* written through the 64-bit view */ this.buffer64[index] = value; break; + case 'number': /* written into the first slot of the element */ this.buffer[index * this.stride] = value; break; + default: /* any other value is copied in as a typed array */ this.buffer.set(value as TypedArray, index * this.stride); + } + return this; + } + protected _resize(newLength: number) { + const data = super._resize(newLength); + const length = data.byteLength / (this.BYTES_PER_ELEMENT * this.stride); + /* re-create the 64-bit view, since `_resize` replaced the underlying buffer */ if (BigIntAvailable) { + this.buffer64 = new this.ArrayType64(data.buffer, data.byteOffset, length); + } + return data; + } +} diff --git a/src/arrow/js/src/builder/date.ts b/src/arrow/js/src/builder/date.ts new file mode 100644 index 000000000..e9748b58c --- /dev/null +++ b/src/arrow/js/src/builder/date.ts @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +import { FixedWidthBuilder } from '../builder'; +import { Date_, DateDay, DateMillisecond } from '../type'; + +/** + * Builder for `Date_` types; declares no members of its own beyond `FixedWidthBuilder`. + * @ignore + */ +export class DateBuilder<T extends Date_ = Date_, TNull = any> extends FixedWidthBuilder<T, TNull> {} +/** + * `DateBuilder` fixed to the `DateDay` type. + * @ignore + */ +export class DateDayBuilder<TNull = any> extends DateBuilder<DateDay, TNull> {} +/** + * `DateBuilder` fixed to the `DateMillisecond` type. + * @ignore + */ +export class DateMillisecondBuilder<TNull = any> extends DateBuilder<DateMillisecond, TNull> {} diff --git a/src/arrow/js/src/builder/decimal.ts b/src/arrow/js/src/builder/decimal.ts new file mode 100644 index 000000000..5814abd5b --- /dev/null +++ b/src/arrow/js/src/builder/decimal.ts @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Decimal } from '../type'; +import { FixedWidthBuilder } from '../builder'; + +/** + * Builder for `Decimal` values; declares no members of its own beyond `FixedWidthBuilder`. + * @ignore + */ +export class DecimalBuilder<TNull = any> extends FixedWidthBuilder<Decimal, TNull> {} diff --git a/src/arrow/js/src/builder/dictionary.ts b/src/arrow/js/src/builder/dictionary.ts new file mode 100644 index 000000000..6602825dd --- /dev/null +++ b/src/arrow/js/src/builder/dictionary.ts @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../vector'; +import { IntBuilder } from './int'; +import { Dictionary, DataType } from '../type'; +import { Builder, BuilderOptions } from '../builder'; + +/** Maps a value to the string or number key under which its dictionary index is cached. */ type DictionaryHashFunction = (x: any) => string | number; + +export interface DictionaryBuilderOptions<T extends DataType = any, TNull = any> extends BuilderOptions<T, TNull> { + dictionaryHashFunction?: DictionaryHashFunction; +} + +/** + * Builder for `Dictionary` types. Distinct values are appended to the `dictionary` + * builder, and each written slot stores the value's dictionary index in the `indices` builder. + * @ignore + */ +export class DictionaryBuilder<T extends Dictionary, TNull = any> extends Builder<T, TNull> { + + protected _dictionaryOffset: number; + protected _dictionary?: Vector<T['dictionary']>; + protected _keysToIndices: { [key: string]: number }; + public readonly indices: IntBuilder<T['indices']>; + public readonly dictionary: Builder<T['dictionary']>; + + constructor({ 'type': type, 'nullValues': nulls, 'dictionaryHashFunction': hashFn }: DictionaryBuilderOptions<T, TNull>) { + super({ type: new Dictionary(type.dictionary, type.indices, type.id, type.isOrdered) as T }); + /* validity is delegated to the `indices` builder (see the `nullCount` and `nullBitmap` getters) */ this._nulls = <any> null; + this._dictionaryOffset = 0; + this._keysToIndices = Object.create(null); + this.indices = Builder.new({ 'type': this.type.indices, 'nullValues': nulls }) as IntBuilder<T['indices']>; + this.dictionary = Builder.new({ 'type': this.type.dictionary,
'nullValues': null }) as Builder<T['dictionary']>; + /* a caller-supplied hash function replaces the default `valueToKey` on this instance */ if (typeof hashFn === 'function') { + this.valueToKey = hashFn; + } + } + + public get values() { return this.indices.values; } + public get nullCount() { return this.indices.nullCount; } + public get nullBitmap() { return this.indices.nullBitmap; } + public get byteLength() { return this.indices.byteLength + this.dictionary.byteLength; } + public get reservedLength() { return this.indices.reservedLength + this.dictionary.reservedLength; } + public get reservedByteLength() { return this.indices.reservedByteLength + this.dictionary.reservedByteLength; } + public isValid(value: T['TValue'] | TNull) { return this.indices.isValid(value); } + public setValid(index: number, valid: boolean) { + const indices = this.indices; + valid = indices.setValid(index, valid); + this.length = indices.length; + return valid; + } + public setValue(index: number, value: T['TValue']) { + const keysToIndices = this._keysToIndices; + const key = this.valueToKey(value); + let idx = keysToIndices[key]; + if (idx === undefined) { + /* first time this key is seen: append the value; its index is `_dictionaryOffset` plus its position in the dictionary builder */ keysToIndices[key] = idx = this._dictionaryOffset + this.dictionary.append(value).length - 1; + } + return this.indices.setValue(index, idx); + } + public flush() { + const type = this.type; + const prev = this._dictionary; + const curr = this.dictionary.toVector(); + const data = this.indices.flush().clone(type); + /* the emitted dictionary is the previously flushed entries followed by the new ones */ data.dictionary = prev ? prev.concat(curr) : curr; + /* unless finished, later indices continue after the entries flushed so far */ this.finished || (this._dictionaryOffset += curr.length); + this._dictionary = data.dictionary as Vector<T['dictionary']>; + this.clear(); + return data; + } + public finish() { + this.indices.finish(); + this.dictionary.finish(); + /* reset the index offset and forget every cached key */ this._dictionaryOffset = 0; + this._keysToIndices = Object.create(null); + return super.finish(); + } + public clear() { + this.indices.clear(); + this.dictionary.clear(); + return super.clear(); + } + public valueToKey(val: any): string | number { + return typeof val === 'string' ?
val : `${val}`; /* NOTE(review): non-strings are keyed by their template-literal form, so plain objects would all share the key "[object Object]" unless a `dictionaryHashFunction` is supplied -- confirm */ + } +} diff --git a/src/arrow/js/src/builder/fixedsizebinary.ts b/src/arrow/js/src/builder/fixedsizebinary.ts new file mode 100644 index 000000000..99aaf46a3 --- /dev/null +++ b/src/arrow/js/src/builder/fixedsizebinary.ts @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { FixedSizeBinary } from '../type'; +import { FixedWidthBuilder } from '../builder'; + +/** + * Builder for `FixedSizeBinary` values; declares no members of its own beyond `FixedWidthBuilder`. + * @ignore + */ +export class FixedSizeBinaryBuilder<TNull = any> extends FixedWidthBuilder<FixedSizeBinary, TNull> {} diff --git a/src/arrow/js/src/builder/fixedsizelist.ts b/src/arrow/js/src/builder/fixedsizelist.ts new file mode 100644 index 000000000..cc20f5ba2 --- /dev/null +++ b/src/arrow/js/src/builder/fixedsizelist.ts @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Run } from './run';
import { Field } from '../schema';
import { Builder } from '../builder';
import { DataType, FixedSizeList } from '../type';

/**
 * Builder for `FixedSizeList` values. It accepts at most one child builder,
 * and every value is passed through a reusable `Run` wrapper before being
 * handed to the base `setValue`.
 * @ignore
 */
export class FixedSizeListBuilder<T extends DataType = any, TNull = any> extends Builder<FixedSizeList<T>, TNull> {

    /** Wrapper re-bound to each incoming value (see `Run.bind`). */
    protected _run = new Run<T, TNull>();

    /** Binds `value` to the shared run and forwards the result to the base builder. */
    public setValue(index: number, value: T['TValue']) {
        const bound = this._run.bind(value);
        super.setValue(index, bound);
    }

    /**
     * Registers the single child builder and rebuilds `this.type` around it,
     * keeping the current `listSize`.
     *
     * @param child builder for the list elements
     * @param name  field name given to the child (defaults to `'0'`)
     * @returns the result of `children.push(child)`, i.e. the number of children after insertion.
     *          NOTE(review): `ListBuilder.addChild` returns `numChildren - 1` instead — confirm which one callers expect.
     * @throws Error if a child has already been added
     */
    public addChild(child: Builder<T>, name = '0') {
        if (this.numChildren > 0) {
            throw new Error('FixedSizeListBuilder can only have one child.');
        }
        const pushResult = this.children.push(child);
        const elementField = new Field(name, child.type, true);
        this.type = new FixedSizeList(this.type.listSize, elementField);
        return pushResult;
    }

    /** Drops the run's bound values, then defers to the base `clear()`. */
    public clear() {
        this._run.clear();
        return super.clear();
    }
}

// diff --git a/src/arrow/js/src/builder/float.ts b/src/arrow/js/src/builder/float.ts
// new file mode 100644, index 000000000..dbf4c0d06
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { float64ToUint16 } from '../util/math';
import { FixedWidthBuilder } from '../builder';
import { Float, Float16, Float32, Float64 } from '../type';

/**
 * Common base of the floating-point builders; adds nothing to `FixedWidthBuilder`.
 * @ignore
 */
export class FloatBuilder<T extends Float = Float, TNull = any> extends FixedWidthBuilder<T, TNull> {}

/**
 * Builder for `Float16` values.
 * @ignore
 */
export class Float16Builder<TNull = any> extends FloatBuilder<Float16, TNull> {
    /** Stores `value` after converting the JS float64 to a uint16 via `float64ToUint16`. */
    public setValue(index: number, value: number) {
        const encoded = float64ToUint16(value);
        this._values.set(index, encoded);
    }
}

/**
 * Builder for `Float32` values.
 * @ignore
 */
export class Float32Builder<TNull = any> extends FloatBuilder<Float32, TNull> {
    /** Writes `value` into the values buffer at `index` with no conversion. */
    public setValue(index: number, value: number) {
        this._values.set(index, value);
    }
}

/**
 * Builder for `Float64` values.
 * @ignore
 */
export class Float64Builder<TNull = any> extends FloatBuilder<Float64, TNull> {
    /** Writes `value` into the values buffer at `index` with no conversion. */
    public setValue(index: number, value: number) {
        this._values.set(index, value);
    }
}

// diff --git a/src/arrow/js/src/builder/index.ts b/src/arrow/js/src/builder/index.ts
// new file mode 100644, index 000000000..dfd9d54f1
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

/** @ignore */
export { Builder, BuilderOptions } from '../builder';
export { BoolBuilder } from './bool';
export { NullBuilder } from './null';
export { DateBuilder, DateDayBuilder, DateMillisecondBuilder } from './date';
export { DecimalBuilder } from './decimal';
export { DictionaryBuilder } from './dictionary';
export { FixedSizeBinaryBuilder } from './fixedsizebinary';
export { FloatBuilder, Float16Builder, Float32Builder, Float64Builder } from './float';
export { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from './int';
export { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from './time';
export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './timestamp';
export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './interval';
export { Utf8Builder } from './utf8';
export { BinaryBuilder } from './binary';
export { ListBuilder } from './list';
export { FixedSizeListBuilder } from './fixedsizelist';
export { MapBuilder } from './map';
export { StructBuilder } from './struct';
export { UnionBuilder, SparseUnionBuilder, DenseUnionBuilder } from './union';

import { Type } from '../enum';
import { Field } from '../schema';
import { DataType } from '../type';
import { Utf8Builder } from './utf8';
import { BuilderType as B } from '../interfaces';
import { Builder, BuilderOptions } from '../builder';
import { instance as setVisitor } from '../visitor/set';
import { instance as getBuilderConstructor } from '../visitor/builderctor';

/** @nocollapse */
Builder.new = newBuilder;

/**
 * Creates the builder matching `options.type` and, when the type has child
 * fields, recursively creates and attaches one builder per child.
 *
 * Child options come from `options['children']`, which may be an array
 * (looked up by child position) or an object (looked up by field name).
 * A child without an entry of its own falls back to an options object
 * carrying only the parent's `nullValues`.
 *
 * @param options builder options; `options.type` selects the builder class
 * @returns the newly constructed builder, with its child builders pushed onto `children`
 */
function newBuilder<T extends DataType = any, TNull = any>(options: BuilderOptions<T, TNull>): B<T, TNull> {

    const type = options.type;
    const BuilderCtor = getBuilderConstructor.getVisitFn<T>(type)();
    const builder = new BuilderCtor(options) as Builder<T, TNull>;
    const fields = type.children;

    if (fields && fields.length > 0) {

        const childOptions = options['children'] || [] as BuilderOptions[];
        const fallback = { 'nullValues': options['nullValues'] };

        fields.forEach((field: Field, position: number) => {
            // Positional lookup for arrays, by-name lookup for plain objects.
            const own = Array.isArray(childOptions)
                ? childOptions[position]
                : childOptions[field.name];
            const opts = own || fallback;
            builder.children.push(newBuilder({ ...opts, type: field.type }));
        });
    }

    return builder as B<T, TNull>;
}

// Install the set-visitor's per-type function as `_setValue` on the prototype of
// every builder class. Only `Type` entries whose value is a number other than
// `Type.NONE` are considered.
for (const key of Object.keys(Type) as any[]) {
    const candidate: any = Type[key];
    if (typeof candidate !== 'number' || candidate === Type.NONE) {
        continue;
    }
    const typeId = candidate as Type;
    const BuilderCtor = getBuilderConstructor.visit(typeId);
    BuilderCtor.prototype._setValue = setVisitor.getVisitFn(typeId);
}

// Utf8Builder is given the binary setter instead.
// NOTE(review): presumably because Utf8Builder.setValue encodes strings to bytes before storing — confirm.
(Utf8Builder.prototype as any)._setValue = setVisitor.visitBinary;

// diff --git a/src/arrow/js/src/builder/int.ts b/src/arrow/js/src/builder/int.ts
// new file mode 100644, index 000000000..5777bd125
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { bignumToBigInt } from '../util/bn';
import { WideBufferBuilder } from './buffer';
import { BigInt64Array } from '../util/compat';
import { FixedWidthBuilder, BuilderOptions } from '../builder';
import { Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64 } from '../type';

/**
 * Common base of the integer builders.
 * @ignore
 */
export class IntBuilder<T extends Int = Int, TNull = any> extends FixedWidthBuilder<T, TNull> {
    /** Writes `value` into the values buffer at `index`. */
    public setValue(index: number, value: T['TValue']) {
        this._values.set(index, value);
    }
}

/** @ignore */
export class Int8Builder<TNull = any> extends IntBuilder<Int8, TNull> {}
/** @ignore */
export class Int16Builder<TNull = any> extends IntBuilder<Int16, TNull> {}
/** @ignore */
export class Int32Builder<TNull = any> extends IntBuilder<Int32, TNull> {}

/**
 * Builder for `Int64` values, backed by a `WideBufferBuilder` over an `Int32Array`.
 * @ignore
 */
export class Int64Builder<TNull = any> extends IntBuilder<Int64, TNull> {
    protected _values: WideBufferBuilder<Int32Array, BigInt64Array>;

    /**
     * @param options builder options. Any `nullValues` are normalised through
     *                `toBigInt` before reaching the base constructor. The
     *                caller's object is left untouched: a shallow copy that
     *                carries the converted list is passed on instead.
     */
    constructor(options: BuilderOptions<Int64, TNull>) {
        const nullValues = options['nullValues'];
        if (nullValues) {
            // Copy rather than assign in place, so options objects that are
            // shared or reused by the caller are not modified.
            options = { ...options, 'nullValues': (nullValues as TNull[]).map(toBigInt) };
        }
        super(options);
        // NOTE(review): the second argument is presumably the stride (two 32-bit words per value) — confirm against WideBufferBuilder.
        this._values = new WideBufferBuilder(new Int32Array(0), 2);
    }

    /** The values builder's `buffer64`. */
    public get values64() { return this._values.buffer64; }

    /** Normalises `value` through `toBigInt` before the base validity check. */
    public isValid(value: Int32Array | bigint | TNull) { return super.isValid(toBigInt(value)); }
}

/** @ignore */
export class Uint8Builder<TNull = any> extends IntBuilder<Uint8, TNull> {}
/** @ignore */
export class Uint16Builder<TNull = any> extends IntBuilder<Uint16, TNull> {}
/** @ignore */
export class Uint32Builder<TNull = any> extends IntBuilder<Uint32, TNull> {}

/**
 * Builder for `Uint64` values, backed by a `WideBufferBuilder` over a `Uint32Array`.
 * @ignore
 */
export class Uint64Builder<TNull = any> extends IntBuilder<Uint64, TNull> {
    protected _values: WideBufferBuilder<Uint32Array, BigUint64Array>;

    /**
     * @param options builder options. Any `nullValues` are normalised through
     *                `toBigInt` before reaching the base constructor. The
     *                caller's object is left untouched: a shallow copy that
     *                carries the converted list is passed on instead.
     */
    constructor(options: BuilderOptions<Uint64, TNull>) {
        const nullValues = options['nullValues'];
        if (nullValues) {
            // Copy rather than assign in place, so options objects that are
            // shared or reused by the caller are not modified.
            options = { ...options, 'nullValues': (nullValues as TNull[]).map(toBigInt) };
        }
        super(options);
        // NOTE(review): the second argument is presumably the stride (two 32-bit words per value) — confirm against WideBufferBuilder.
        this._values = new WideBufferBuilder(new Uint32Array(0), 2);
    }

    /** The values builder's `buffer64`. */
    public get values64() { return this._values.buffer64; }

    /** Normalises `value` through `toBigInt` before the base validity check. */
    public isValid(value: Uint32Array | bigint | TNull) { return super.isValid(toBigInt(value)); }
}

/**
 * Normalises a candidate value. ArrayBuffer views are converted with
 * `bignumToBigInt`; anything else is returned unchanged.
 *
 * One scratch object (`memo`) is reused across calls: it is pointed at the
 * view's buffer, offset and byte length, passed to `bignumToBigInt`, and its
 * buffer reference is cleared again afterwards.
 *
 * NOTE(review): `memo` is seeded with the signed `BigInt64Array` and is also
 * used by `Uint64Builder` — confirm `bignumToBigInt` yields the intended
 * result for unsigned values at or above 2^63.
 */
const toBigInt = ((memo: any) => (value: any) => {
    if (ArrayBuffer.isView(value)) {
        memo.buffer = value.buffer;
        memo.byteOffset = value.byteOffset;
        memo.byteLength = value.byteLength;
        value = bignumToBigInt(memo);
        // Do not keep the caller's buffer alive through the shared scratch object.
        memo.buffer = null;
    }
    return value;
})({ 'BigIntArray': BigInt64Array });

// diff --git a/src/arrow/js/src/builder/interval.ts b/src/arrow/js/src/builder/interval.ts
// new file mode 100644, index 000000000..374228215
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { FixedWidthBuilder } from '../builder';
import { Interval, IntervalDayTime, IntervalYearMonth } from '../type';

/**
 * Common base of the interval builders; adds nothing to `FixedWidthBuilder`.
 * @ignore
 */
export class IntervalBuilder<T extends Interval = Interval, TNull = any>
    extends FixedWidthBuilder<T, TNull> {}

/**
 * Builder specialised to `IntervalDayTime`.
 * @ignore
 */
export class IntervalDayTimeBuilder<TNull = any>
    extends IntervalBuilder<IntervalDayTime, TNull> {}

/**
 * Builder specialised to `IntervalYearMonth`.
 * @ignore
 */
export class IntervalYearMonthBuilder<TNull = any>
    extends IntervalBuilder<IntervalYearMonth, TNull> {}

// diff --git a/src/arrow/js/src/builder/list.ts b/src/arrow/js/src/builder/list.ts
// new file mode 100644, index 000000000..844681eae
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Run } from './run';
import { Field } from '../schema';
import { DataType, List } from '../type';
import { OffsetsBufferBuilder } from './buffer';
import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder';

/**
 * Builder for variable-length `List` values. It accepts at most one child
 * builder. Pending values are written out in `_flushPending`, which records
 * each value's length in the offsets buffer.
 * @ignore
 */
export class ListBuilder<T extends DataType = any, TNull = any> extends VariableWidthBuilder<List<T>, TNull> {

    /** Wrapper re-bound to each flushed value (see `Run.bind`). */
    protected _run = new Run<T, TNull>();
    protected _offsets: OffsetsBufferBuilder;

    constructor(opts: BuilderOptions<List<T>, TNull>) {
        super(opts);
        this._offsets = new OffsetsBufferBuilder();
    }

    /**
     * Registers the single child builder and rebuilds `this.type` around it.
     *
     * @param child builder for the list elements
     * @param name  field name given to the child (defaults to `'0'`)
     * @returns `this.numChildren - 1`, evaluated after the child has been stored
     * @throws Error if a child has already been added
     */
    public addChild(child: Builder<T>, name = '0') {
        if (this.numChildren > 0) {
            throw new Error('ListBuilder can only have one child.');
        }
        this.children[this.numChildren] = child;
        const elementField = new Field(name, child.type, true);
        this.type = new List(elementField);
        return this.numChildren - 1;
    }

    /** Drops the run's bound values, then defers to the base `clear()`. */
    public clear() {
        this._run.clear();
        return super.clear();
    }

    /**
     * Writes every pending entry. An `undefined` entry records a length of 0.
     * Any other entry records its `length` and is stored through `_setValue`
     * after being bound to the shared run.
     */
    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
        const boundRun = this._run;
        const offsetsBuilder = this._offsets;
        const writeValue = this._setValue;
        let slot = 0, entry: Uint8Array | undefined;
        for ([slot, entry] of pending) {
            if (entry !== undefined) {
                offsetsBuilder.set(slot, entry.length);
                writeValue(this, slot, boundRun.bind(entry));
                continue;
            }
            offsetsBuilder.set(slot, 0);
        }
    }
}

// diff --git a/src/arrow/js/src/builder/map.ts b/src/arrow/js/src/builder/map.ts
// new file mode 100644, index 000000000..25affef2c
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Field } from '../schema';
import { DataType, Map_, Struct } from '../type';
import { Builder, VariableWidthBuilder } from '../builder';

/** @ignore */ type MapValue<K extends DataType = any, V extends DataType = any> = Map_<K, V>['TValue'];
/** @ignore */ type MapValues<K extends DataType = any, V extends DataType = any> = Map<number, MapValue<K, V> | undefined>;
/** @ignore */ type MapValueExt<K extends DataType = any, V extends DataType = any> = MapValue<K, V> | { [key: string]: V } | { [key: number]: V } ;

/**
 * Builder for `Map_` values. Values may be supplied either as a `Map` or as
 * a plain object, which is converted with `Object.entries`. It accepts at
 * most one child builder.
 * @ignore
 */
export class MapBuilder<K extends DataType = any, V extends DataType = any, TNull = any> extends VariableWidthBuilder<Map_<K, V>, TNull> {

    /** Values waiting to be flushed, keyed by row index. */
    protected _pending: MapValues<K, V> | undefined;

    /** Same as the base `set`, with the parameter type widened to also accept plain objects. */
    public set(index: number, value: MapValueExt<K, V> | TNull) {
        return super.set(index, value as MapValue<K, V> | TNull);
    }

    /**
     * Queues `value` for row `index`, first converting a plain object to a `Map`.
     * `_pendingLength` is kept in step with the queue: the size of any entry
     * being replaced is subtracted before the new size is added.
     */
    public setValue(index: number, value: MapValueExt<K, V>) {
        value = value instanceof Map ? value : new Map(Object.entries(value));
        const pending = this._pending || (this._pending = new Map() as MapValues<K, V>);
        const current = pending.get(index);
        current && (this._pendingLength -= current.size);
        this._pendingLength += value.size;
        pending.set(index, value);
    }

    /**
     * Registers the single child builder and rebuilds `this.type` around it,
     * keeping the current `keysSorted` flag.
     *
     * @param child builder for the `{ key, value }` struct entries
     * @param name  field name given to the child (defaults to the current child count)
     * @returns `this.numChildren - 1`, evaluated after the child has been stored
     * @throws Error if a child has already been added
     */
    public addChild(child: Builder<Struct<{ key: K; value: V }>>, name = `${this.numChildren}`) {
        if (this.numChildren > 0) {
            // The message previously named ListBuilder, which pointed at the wrong class.
            throw new Error('MapBuilder can only have one child.');
        }
        this.children[this.numChildren] = child;
        this.type = new Map_<K, V>(new Field(name, child.type, true), this.type.keysSorted);
        return this.numChildren - 1;
    }

    /**
     * Writes every pending entry. An `undefined` entry records a length of 0.
     * Any other entry records its `size` and is stored through `_setValue`.
     */
    protected _flushPending(pending: MapValues<K, V>) {
        const offsets = this._offsets;
        const setValue = this._setValue;
        pending.forEach((value, index) => {
            if (value === undefined) {
                offsets.set(index, 0);
            } else {
                offsets.set(index, value.size);
                setValue(this, index, value);
            }
        });
    }
}

// diff --git a/src/arrow/js/src/builder/null.ts b/src/arrow/js/src/builder/null.ts
// new file mode 100644, index 000000000..4be3f063b
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Null } from '../type';
import { Builder } from '../builder';

/**
 * Builder for the `Null` type. It stores no values and only tracks its length.
 * @ignore
 */
export class NullBuilder<TNull = any> extends Builder<Null, TNull> {
    /** Intentionally empty: there is nothing to store. */
    // @ts-ignore
    public setValue(index: number, value: null) {}

    /**
     * Grows `length` so that it covers `index`; it never shrinks.
     * @returns `valid`, unchanged
     */
    public setValid(index: number, valid: boolean) {
        const requiredLength = index + 1;
        this.length = Math.max(requiredLength, this.length);
        return valid;
    }
}

// diff --git a/src/arrow/js/src/builder/run.ts b/src/arrow/js/src/builder/run.ts
// new file mode 100644, index 000000000..5239f51f2
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Vector } from '../vector';
import { DataType } from '../type';

/**
 * Thin re-bindable wrapper around an array-like of values, exposing `length`
 * and `get(index)`.
 * @ignore
 */
export class Run<T extends DataType = any, TNull = any> {
    /** Currently bound values; set to `null` by `clear()`. */
    protected _values!: ArrayLike<T['TValue'] | TNull>;

    /** Length of the bound values. */
    public get length() {
        return this._values.length;
    }

    /** Element of the bound values at `index`. */
    public get(index: number) {
        return this._values[index];
    }

    /** Releases the bound values and returns `this`. */
    public clear() {
        this._values = null as any;
        return this;
    }

    /**
     * Points this run at `values`.
     * @returns `values` itself, without binding, when it is a `Vector`;
     *          otherwise this run after binding.
     */
    public bind(values: Vector<T> | ArrayLike<T['TValue'] | TNull>) {
        if (!(values instanceof Vector)) {
            this._values = values;
            return this as any;
        }
        return values;
    }
}

// diff --git a/src/arrow/js/src/builder/struct.ts b/src/arrow/js/src/builder/struct.ts
// new file mode 100644, index 000000000..4d12336ce
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Field } from '../schema';
import { Builder } from '../builder';
import { DataType, Struct } from '../type';

/**
 * Builder for `Struct` values, with one child builder per field.
 * @ignore
 */
export class StructBuilder<T extends { [key: string]: DataType } = any, TNull = any> extends Builder<Struct<T>, TNull> {
    /**
     * Appends a child builder and rebuilds `this.type` with one extra field.
     *
     * @param child builder for the new field
     * @param name  field name (defaults to the child count before insertion)
     * @returns the result of `children.push(child)`, i.e. the number of children after insertion.
     *          NOTE(review): this is a count, not a zero-based index — confirm what callers expect.
     */
    public addChild(child: Builder, name = `${this.numChildren}`) {
        const pushResult = this.children.push(child);
        const fields = [...this.type.children];
        fields.push(new Field(name, child.type, true));
        this.type = new Struct(fields);
        return pushResult;
    }
}

// diff --git a/src/arrow/js/src/builder/time.ts b/src/arrow/js/src/builder/time.ts
// new file mode 100644, index 000000000..bfa71d2b5
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { FixedWidthBuilder } from '../builder';
import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond } from '../type';

/**
 * Common base of the time-of-day builders; adds nothing to `FixedWidthBuilder`.
 * @ignore
 */
export class TimeBuilder<T extends Time = Time, TNull = any>
    extends FixedWidthBuilder<T, TNull> {}

/**
 * Builder specialised to `TimeSecond`.
 * @ignore
 */
export class TimeSecondBuilder<TNull = any>
    extends TimeBuilder<TimeSecond, TNull> {}

/**
 * Builder specialised to `TimeMillisecond`.
 * @ignore
 */
export class TimeMillisecondBuilder<TNull = any>
    extends TimeBuilder<TimeMillisecond, TNull> {}

/**
 * Builder specialised to `TimeMicrosecond`.
 * @ignore
 */
export class TimeMicrosecondBuilder<TNull = any>
    extends TimeBuilder<TimeMicrosecond, TNull> {}

/**
 * Builder specialised to `TimeNanosecond`.
 * @ignore
 */
export class TimeNanosecondBuilder<TNull = any>
    extends TimeBuilder<TimeNanosecond, TNull> {}

// diff --git a/src/arrow/js/src/builder/timestamp.ts b/src/arrow/js/src/builder/timestamp.ts
// new file mode 100644, index 000000000..49741e9ba
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { FixedWidthBuilder } from '../builder';
import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond } from '../type';

/**
 * Common base of the timestamp builders; adds nothing to `FixedWidthBuilder`.
 * @ignore
 */
export class TimestampBuilder<T extends Timestamp = Timestamp, TNull = any>
    extends FixedWidthBuilder<T, TNull> {}

/**
 * Builder specialised to `TimestampSecond`.
 * @ignore
 */
export class TimestampSecondBuilder<TNull = any>
    extends TimestampBuilder<TimestampSecond, TNull> {}

/**
 * Builder specialised to `TimestampMillisecond`.
 * @ignore
 */
export class TimestampMillisecondBuilder<TNull = any>
    extends TimestampBuilder<TimestampMillisecond, TNull> {}

/**
 * Builder specialised to `TimestampMicrosecond`.
 * @ignore
 */
export class TimestampMicrosecondBuilder<TNull = any>
    extends TimestampBuilder<TimestampMicrosecond, TNull> {}

/**
 * Builder specialised to `TimestampNanosecond`.
 * @ignore
 */
export class TimestampNanosecondBuilder<TNull = any>
    extends TimestampBuilder<TimestampNanosecond, TNull> {}

// diff --git a/src/arrow/js/src/builder/union.ts b/src/arrow/js/src/builder/union.ts
// new file mode 100644, index 000000000..18ac05bf6
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Field } from '../schema';
import { DataBufferBuilder } from './buffer';
import { Builder, BuilderOptions } from '../builder';
import { Union, SparseUnion, DenseUnion } from '../type';

/**
 * Options accepted by the union builders: the regular builder options plus an
 * optional callback that picks the child type id for a value.
 */
export interface UnionBuilderOptions<T extends Union = any, TNull = any> extends BuilderOptions<T, TNull> {
    valueToChildTypeId?: (builder: UnionBuilder<T, TNull>, value: any, offset: number) => number;
}

/**
 * Shared implementation of the union builders. Alongside each value it records
 * the child type id the value belongs to.
 * @ignore
 */
export abstract class UnionBuilder<T extends Union, TNull = any> extends Builder<T, TNull> {

    /** One type id per row. */
    protected _typeIds: DataBufferBuilder<Int8Array>;

    /**
     * @param options builder options; a function supplied as
     *                `valueToChildTypeId` replaces the default (throwing)
     *                `_valueToChildTypeId`
     */
    constructor(options: UnionBuilderOptions<T, TNull>) {
        super(options);
        this._typeIds = new DataBufferBuilder(new Int8Array(0), 1);
        const resolver = options['valueToChildTypeId'];
        if (typeof resolver === 'function') {
            this._valueToChildTypeId = resolver;
        }
    }

    /** The union type's `typeIdToChildIndex` lookup. */
    public get typeIdToChildIndex() {
        return this.type.typeIdToChildIndex;
    }

    /** Sets `value` at the current `length`, i.e. appends it. */
    public append(value: T['TValue'] | TNull, childTypeId?: number) {
        return this.set(this.length, value, childTypeId);
    }

    /**
     * Writes `value` at `index`. When `childTypeId` is omitted it is resolved
     * through `_valueToChildTypeId`. The value is only stored when `setValid`
     * reports the slot as valid.
     * @returns this builder
     */
    public set(index: number, value: T['TValue'] | TNull, childTypeId?: number) {
        const typeId = childTypeId === undefined
            ? this._valueToChildTypeId(this, value, index)
            : childTypeId;
        const valid = this.setValid(index, this.isValid(value));
        if (valid) {
            this.setValue(index, value, typeId);
        }
        return this;
    }

    /** Records `childTypeId` for the row, then stores the value through the base builder. */
    public setValue(index: number, value: T['TValue'], childTypeId?: number) {
        this._typeIds.set(index, childTypeId!);
        super.setValue(index, value);
    }

    /**
     * Appends a child builder and rebuilds `this.type` with one extra field and type id.
     *
     * @param child builder for the new variant
     * @param name  field name (defaults to the child count before insertion)
     * @returns the new child's type id, which is the result of `children.push(child)`
     */
    public addChild(child: Builder, name = `${this.children.length}`) {
        const childTypeId = this.children.push(child);
        const currentType = this.type;
        const fields = [...currentType.children, new Field(name, child.type)];
        const typeIds = [...currentType.typeIds, childTypeId];
        this.type = <T> new Union(currentType.mode, typeIds, fields);
        return childTypeId;
    }

    /**
     * Default resolver: always throws, because a type id cannot be derived
     * from the value alone.
     * @ignore
     */
    // @ts-ignore
    protected _valueToChildTypeId(builder: UnionBuilder<T, TNull>, value: any, offset: number): number {
        throw new Error(`Cannot map UnionBuilder value to child typeId. \
Pass the \`childTypeId\` as the second argument to unionBuilder.append(), \
or supply a \`valueToChildTypeId\` function as part of the UnionBuilder constructor options.`);
    }
}

/**
 * Builder for `SparseUnion`; adds nothing to `UnionBuilder`.
 * @ignore
 */
export class SparseUnionBuilder<T extends SparseUnion, TNull = any> extends UnionBuilder<T, TNull> {}

/**
 * Builder for `DenseUnion`. In addition to the type id it records, per row,
 * the selected child's length at the time of the write.
 * @ignore
 */
export class DenseUnionBuilder<T extends DenseUnion, TNull = any> extends UnionBuilder<T, TNull> {

    /** One offset per row. */
    protected _offsets: DataBufferBuilder<Int32Array>;

    constructor(options: UnionBuilderOptions<T, TNull>) {
        super(options);
        this._offsets = new DataBufferBuilder(new Int32Array(0));
    }

    /**
     * Stores the selected child's current length as the row's offset, then
     * defers to `UnionBuilder.setValue`.
     * @ignore
     */
    public setValue(index: number, value: T['TValue'], childTypeId?: number) {
        const childIndex = this.type.typeIdToChildIndex[childTypeId!];
        const target = this.getChildAt(childIndex)!;
        this._offsets.set(index, target.length);
        return super.setValue(index, value, childTypeId);
    }
}

// diff --git a/src/arrow/js/src/builder/utf8.ts b/src/arrow/js/src/builder/utf8.ts
// new file mode 100644, index 000000000..7564cdad6
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Utf8 } from '../type';
import { encodeUtf8 } from '../util/utf8';
import { BinaryBuilder } from './binary';
import { BufferBuilder } from './buffer';
import { VariableWidthBuilder, BuilderOptions } from '../builder';

/**
 * Builder for `Utf8` strings. Strings are encoded to bytes with `encodeUtf8`
 * before being stored.
 * @ignore
 */
export class Utf8Builder<TNull = any> extends VariableWidthBuilder<Utf8, TNull> {
    constructor(opts: BuilderOptions<Utf8, TNull>) {
        super(opts);
        this._values = new BufferBuilder(new Uint8Array(0));
    }

    /**
     * Byte-size figure: the pending length plus four bytes per element, plus
     * the byte lengths of whichever of the offsets, values and nulls buffers
     * currently exist.
     */
    public get byteLength(): number {
        let total = this._pendingLength + (this.length * 4);
        if (this._offsets) {
            total += this._offsets.byteLength;
        }
        if (this._values) {
            total += this._values.byteLength;
        }
        if (this._nulls) {
            total += this._nulls.byteLength;
        }
        return total;
    }

    /** Encodes `value` with `encodeUtf8` and hands the bytes to the base `setValue`. */
    public setValue(index: number, value: string) {
        const bytes = encodeUtf8(value) as any;
        return super.setValue(index, bytes);
    }

    /** Placeholder; replaced just below with `BinaryBuilder`'s implementation. */
    // @ts-ignore
    protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number): void {}
}

// Share BinaryBuilder's flushing logic instead of duplicating it.
(Utf8Builder.prototype as any)._flushPending = (BinaryBuilder.prototype as any)._flushPending;

// diff --git a/src/arrow/js/src/builder/valid.ts b/src/arrow/js/src/builder/valid.ts
// new file mode 100644, index 000000000..ae5b799fb
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
// See the License for the
// specific language governing permissions and limitations
// under the License.

import { DataType } from '../type';
import { valueToString } from '../util/pretty';
import { BigIntAvailable } from '../util/compat';

/**
 * Dynamically compile the null values into an `isValid()` function whose
 * implementation is a switch statement. Microbenchmarks in v8 indicate
 * this approach is 25% faster than using an ES6 Map.
 *
 * When `nullValues` is missing or empty, the returned function accepts
 * every value.
 *
 * @example
 * console.log(createIsValidFunction([null, 'N/A', NaN]));
 * `function (x) {
 *     if (x !== x) return false;
 *     switch (x) {
 *         case null:
 *         case "N/A":
 *             return false;
 *     }
 *     return true;
 * }`
 *
 * @ignore
 * @param nullValues the values that should be reported as invalid (null)
 * @returns a predicate returning `false` for any of `nullValues`, `true` otherwise
 */
export function createIsValidFunction<T extends DataType = any, TNull = any>(nullValues?: ReadonlyArray<TNull>) {

    // Nothing is configured as null: every value is valid.
    if (!nullValues || nullValues.length <= 0) {
        // @ts-ignore
        return function isValid(value: any) { return true; };
    }

    // Only NaN fails `x === x`, so this keeps every entry that can be a switch case.
    const switchable = nullValues.filter((candidate) => candidate === candidate);
    const hasNaN = switchable.length !== nullValues.length;

    const switchSource = switchable.length > 0 ? `
    switch (x) {${switchable.map((candidate) => `
        case ${valueToCase(candidate)}:`).join('')}
            return false;
    }` : '';

    // NaN doesn't equal anything including itself, so it doesn't work as a
    // switch case. Instead it is checked explicitly before the switch.
    const source = hasNaN ? `if (x !== x) return false;\n${switchSource}` : switchSource;

    return new Function(`x`, `${source}\nreturn true;`) as (value: T['TValue'] | TNull) => boolean;
}

/**
 * Renders a null-value candidate as the source text of a `case` label.
 * bigint values get an `n` suffix when `BigIntAvailable` is set and are
 * quoted otherwise; everything else is rendered by `valueToString`.
 *
 * @ignore
 */
function valueToCase(x: any) {
    if (typeof x === 'bigint') {
        return BigIntAvailable ? `${valueToString(x)}n` : `"${valueToString(x)}"`;
    }
    return valueToString(x);
}

// diff --git a/src/arrow/js/src/column.ts b/src/arrow/js/src/column.ts new file mode 100644 index 000000000..48b40e5a1 --- /dev/null +++ b/src/arrow/js/src/column.ts @@ -0,0 +1,136 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { Data } from './data';
import { Field } from './schema';
import { DataType } from './type';
import { Vector } from './vector';
import { Clonable, Sliceable, Applicative } from './vector';
import { VectorCtorArgs, VectorType as V } from './interfaces';
import { Chunked, SearchContinuation } from './vector/chunked';

export interface Column<T extends DataType = any> {
    concat(...others: Vector<T>[]): Column<T>;
    slice(begin?: number, end?: number): Column<T>;
    clone(chunks?: Vector<T>[], offsets?: Uint32Array): Column<T>;
}

/**
 * A `Chunked` vector paired with the `Field` (name, nullability, metadata)
 * that describes it.
 */
export class Column<T extends DataType = any>
    extends Chunked<T>
    implements Clonable<Column<T>>,
               Sliceable<Column<T>>,
               Applicative<T, Column<T>> {

    public static new<T extends DataType>(data: Data<T>, ...args: VectorCtorArgs<V<T>>): Column<T>;
    public static new<T extends DataType>(field: string | Field<T>, ...chunks: (Vector<T> | Vector<T>[])[]): Column<T>;
    public static new<T extends DataType>(field: string | Field<T>, data: Data<T>, ...args: VectorCtorArgs<V<T>>): Column<T>;
    /**
     * Creates a Column from either a name/`Field` followed by data, or from
     * data alone (in which case the column name is the empty string).
     *
     * - If a name is given, a nullable `Field` is created using the type of
     *   the first chunk.
     * - If a non-nullable `Field` is given but any chunk contains nulls, the
     *   field is cloned as nullable.
     *
     * @nocollapse
     */
    public static new<T extends DataType = any>(...args: any[]) {

        let [field, data, ...rest] = args as [
            string | Field<T>,
            Data<T> | Vector<T> | (Data<T> | Vector<T>)[],
            ...any[]
        ];

        if (typeof field !== 'string' && !(field instanceof Field)) {
            // No name or Field was supplied, so the first argument is the data
            // itself. Everything after it -- including the value that was
            // destructured into `data` above -- is a trailing argument and
            // must be forwarded rather than dropped.
            rest = args.slice(1);
            data = <Data<T> | Vector<T> | (Data<T> | Vector<T>)[]> field;
            field = '';
        }

        const chunks = Chunked.flatten<T>(
            Array.isArray(data) ? [...data, ...rest] :
            data instanceof Vector ? [data, ...rest] :
            [Vector.new(data, ...rest)]
        );

        if (typeof field === 'string') {
            const type = chunks[0].data.type;
            field = new Field(field, type, true);
        } else if (!field.nullable && chunks.some(({ nullCount }) => nullCount > 0)) {
            field = field.clone({ nullable: true });
        }
        return new Column(field, chunks);
    }

    /**
     * @param field the field describing this column
     * @param vectors the chunks of the column; flattened via `Chunked.flatten`
     * @param offsets optional chunk offsets forwarded to `Chunked`
     */
    constructor(field: Field<T>, vectors: Vector<T>[] = [], offsets?: Uint32Array) {
        vectors = Chunked.flatten<T>(...vectors);
        super(field.type, vectors, offsets);
        this._field = field;
        // Returning an object from a constructor replaces the instance being
        // constructed: single-chunk columns are swapped for the specialised
        // subclass. The `instanceof` guard stops the subclass from recursing.
        if (vectors.length === 1 && !(this instanceof SingleChunkColumn)) {
            return new SingleChunkColumn(field, vectors[0], this._chunkOffsets);
        }
    }

    protected _field: Field<T>;
    /** Lazily populated cache of child columns, indexed by child position. */
    protected _children?: Column[];

    public get field() { return this._field; }
    public get name() { return this._field.name; }
    public get nullable() { return this._field.nullable; }
    public get metadata() { return this._field.metadata; }

    /** Creates a new Column with the same field over `chunks` (defaults to this column's chunks). */
    public clone(chunks = this._chunks) {
        return new Column(this._field, chunks);
    }

    /**
     * Returns the child at `index` as a Column built from the matching child
     * of every chunk, or `null` when the index is out of range, the type has
     * no such child field, or no chunk yields a child vector. Results are
     * cached in `_children`.
     */
    public getChildAt<R extends DataType = any>(index: number): Column<R> | null {

        if (index < 0 || index >= this.numChildren) { return null; }

        const columns = this._children || (this._children = []);
        let column: Column<R>, field: Field<R>, chunks: Vector<R>[];

        if (column = columns[index]) { return column; }
        if (field = ((this.type.children || [])[index] as Field<R>)) {
            chunks = this._chunks
                .map((vector) => vector.getChildAt<R>(index))
                .filter((vec): vec is Vector<R> => vec != null);
            if (chunks.length > 0) {
                return (columns[index] = new Column<R>(field, chunks));
            }
        }

        return null;
    }
}

/**
 * Column backed by exactly one chunk. Element access is delegated straight
 * to that chunk, and `search` always resolves to chunk 0.
 *
 * @ignore
 */
class SingleChunkColumn<T extends DataType = any> extends Column<T> {
    protected _chunk: Vector<T>;
    constructor(field: Field<T>, vector: Vector<T>, offsets?: Uint32Array) {
        super(field, [vector], offsets);
        this._chunk = vector;
    }
    public search(index: number): [number, number] | null;
    public search<N extends SearchContinuation<Chunked<T>>>(index: number, then?: N): ReturnType<N>;
    public search<N extends SearchContinuation<Chunked<T>>>(index: number, then?: N) {
        // With a single chunk the position is always [chunk 0, index].
        return then ? then(this, 0, index) : [0, index];
    }
    public isValid(index: number): boolean {
        return this._chunk.isValid(index);
    }
    public get(index: number): T['TValue'] | null {
        return this._chunk.get(index);
    }
    public set(index: number, value: T['TValue'] | null): void {
        this._chunk.set(index, value);
    }
    public indexOf(element: T['TValue'], offset?: number): number {
        return this._chunk.indexOf(element, offset);
    }
}

// diff --git a/src/arrow/js/src/compute/dataframe.ts b/src/arrow/js/src/compute/dataframe.ts new file mode 100644 index 000000000..e9df37194 --- /dev/null +++ b/src/arrow/js/src/compute/dataframe.ts @@ -0,0 +1,288 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { Table } from '../table';
import { Vector } from '../vector';
import { IntVector } from '../vector/int';
import { Field, Schema } from '../schema';
import { Predicate, Col, PredicateFunc } from './predicate';
import { RecordBatch } from '../recordbatch';
import { VectorType as V } from '../interfaces';
import { DataType, Int, Struct, Dictionary } from '../type';

/** @ignore */
export type BindFunc = (batch: RecordBatch) => void;
/** @ignore */
export type NextFunc = (idx: number, batch: RecordBatch) => void;

/**
 * Picks the narrowest unsigned typed-array constructor whose elements can
 * hold a per-key count without wrapping. No key can be counted more often
 * than there are rows, so the total row count across all batches is used as
 * the upper bound.
 *
 * @ignore
 * @param batches the batches that will be scanned; only `length` is read
 */
function countsArrayTypeFor(batches: ReadonlyArray<{ readonly length: number }>) {
    let maxCount = 0;
    for (let i = -1, n = batches.length; ++i < n;) {
        maxCount += batches[i].length;
    }
    return maxCount > 0xFFFF ? Uint32Array :
           maxCount > 0xFF ? Uint16Array : Uint8Array;
}

/**
 * `DataFrame` extends {@link Table} with support for predicate filtering.
 *
 * You can construct `DataFrames` like tables or convert a `Table` to a `DataFrame`
 * with the constructor.
 *
 * ```ts
 * const df = new DataFrame(table);
 * ```
 */
export class DataFrame<T extends { [key: string]: DataType } = any> extends Table<T> {
    /** Returns a view over the same batches that only exposes rows matching `predicate`. */
    public filter(predicate: Predicate): FilteredDataFrame<T> {
        return new FilteredDataFrame<T>(this.chunks, predicate);
    }
    /**
     * Visits every row of every batch in order.
     *
     * @param next called with `(rowIndex, batch)` for each row
     * @param bind optional; called once per batch before its rows are visited
     */
    public scan(next: NextFunc, bind?: BindFunc) {
        const batches = this.chunks, numBatches = batches.length;
        for (let batchIndex = -1; ++batchIndex < numBatches;) {
            // load batches
            const batch = batches[batchIndex];
            if (bind) { bind(batch); }
            // yield all indices
            for (let index = -1, numRows = batch.length; ++index < numRows;) {
                next(index, batch);
            }
        }
    }
    /** Same as {@link scan}, but visits batches and rows from last to first. */
    public scanReverse(next: NextFunc, bind?: BindFunc) {
        const batches = this.chunks, numBatches = batches.length;
        for (let batchIndex = numBatches; --batchIndex >= 0;) {
            // load batches
            const batch = batches[batchIndex];
            if (bind) { bind(batch); }
            // yield all indices
            for (let index = batch.length; --index >= 0;) {
                next(index, batch);
            }
        }
    }
    /**
     * Counts how often each dictionary entry occurs in a dictionary-encoded column.
     *
     * @param name the column, as a `Col` or a column name
     * @returns a table of dictionary values and their occurrence counts
     * @throws Error if the column is not dictionary-encoded
     */
    public countBy(name: Col | string) {
        const batches = this.chunks, numBatches = batches.length;
        const count_by = typeof name === 'string' ? new Col(name) : name as Col;
        // Assume that all dictionary batches are deltas, which means that the
        // last record batch has the most complete dictionary
        count_by.bind(batches[numBatches - 1]);
        const vector = count_by.vector as V<Dictionary>;
        if (!DataType.isDictionary(vector.type)) {
            throw new Error('countBy currently only supports dictionary-encoded columns');
        }

        // Size the counters from the total number of rows so that a single
        // key occurring in every row of every batch still cannot overflow.
        const CountsArrayType = countsArrayTypeFor(batches);

        const counts = new CountsArrayType(vector.dictionary.length);
        for (let batchIndex = -1; ++batchIndex < numBatches;) {
            // load batches
            const batch = batches[batchIndex];
            // rebind the countBy Col
            count_by.bind(batch);
            const keys = (count_by.vector as V<Dictionary>).indices;
            // yield all indices
            for (let index = -1, numRows = batch.length; ++index < numRows;) {
                const key = keys.get(index);
                if (key !== null) { counts[key]++; }
            }
        }
        return new CountByResult(vector.dictionary, IntVector.from(counts));
    }
}

/**
 * Two-column table (`values`, `counts`) produced by `countBy`.
 *
 * @ignore
 */
export class CountByResult<T extends DataType = any, TCount extends Int = Int> extends Table<{ values: T; counts: TCount }> {
    constructor(values: Vector<T>, counts: V<TCount>) {
        type R = { values: T; counts: TCount };
        const schema = new Schema<R>([
            new Field('values', values.type),
            new Field('counts', counts.type)
        ]);
        super(new RecordBatch<R>(schema, counts.length, [values, counts]));
    }
    /** Returns a plain object mapping each value to its count. */
    public toJSON(): Record<string, unknown> {
        const values = this.getColumnAt(0)!;
        const counts = this.getColumnAt(1)!;
        const result = {} as { [k: string]: number | null };
        for (let i = -1; ++i < this.length;) {
            result[values.get(i)] = counts.get(i);
        }
        return result;
    }
}

/**
 * Iterator over the rows of a list of batches that satisfy a predicate.
 * The predicate is re-bound each time the iterator advances to a new batch.
 *
 * @ignore
 */
class FilteredBatchIterator<T extends { [key: string]: DataType }> implements IterableIterator<Struct<T>['TValue']> {
    private batchIndex = 0;
    private batch: RecordBatch<T>;
    private index = 0;
    private predicateFunc: PredicateFunc;

    constructor(
        private batches: RecordBatch<T>[],
        private predicate: Predicate
    ) {
        // TODO: bind batches lazily
        // If predicate doesn't match anything in the batch we don't need
        // to bind the callback
        this.batch = this.batches[this.batchIndex];
        this.predicateFunc = this.predicate.bind(this.batch);
    }

    next(): IteratorResult<Struct<T>['TValue']> {
        while (this.batchIndex < this.batches.length) {
            // Advance within the current batch until a row matches.
            while (this.index < this.batch.length) {
                if (this.predicateFunc(this.index, this.batch)) {
                    return {
                        value: this.batch.get(this.index++) as any,
                    };
                }
                this.index++;
            }

            // Current batch exhausted: move on and re-bind the predicate.
            if (++this.batchIndex < this.batches.length) {
                this.index = 0;
                this.batch = this.batches[this.batchIndex];
                this.predicateFunc = this.predicate.bind(this.batch);
            }
        }

        return {done: true, value: null};
    }

    [Symbol.iterator]() {
        return this;
    }
}

/**
 * A `DataFrame` whose scans, counts and iteration only see the rows that
 * satisfy a predicate. The underlying batches are not copied.
 *
 * @ignore
 */
export class FilteredDataFrame<T extends { [key: string]: DataType } = any> extends DataFrame<T> {
    private _predicate: Predicate;
    constructor (batches: RecordBatch<T>[], predicate: Predicate) {
        super(batches);
        this._predicate = predicate;
    }
    /**
     * Visits every matching row in order.
     *
     * @param next called with `(rowIndex, batch)` for each matching row
     * @param bind optional; called at most once per batch, just before the
     *             first matching row of that batch is visited
     */
    public scan(next: NextFunc, bind?: BindFunc) {
        // inlined version of this:
        // this.parent.scan((idx, columns) => {
        //     if (this.predicate(idx, columns)) next(idx, columns);
        // });
        const batches = this._chunks;
        const numBatches = batches.length;
        for (let batchIndex = -1; ++batchIndex < numBatches;) {
            // load batches
            const batch = batches[batchIndex];
            const predicate = this._predicate.bind(batch);
            let isBound = false;
            // yield all indices
            for (let index = -1, numRows = batch.length; ++index < numRows;) {
                if (predicate(index, batch)) {
                    // bind batches lazily - if predicate doesn't match anything
                    // in the batch we don't need to call bind on the batch
                    if (bind && !isBound) {
                        bind(batch);
                        isBound = true;
                    }
                    next(index, batch);
                }
            }
        }
    }
    /** Same as {@link scan}, but visits batches and rows from last to first. */
    public scanReverse(next: NextFunc, bind?: BindFunc) {
        const batches = this._chunks;
        const numBatches = batches.length;
        for (let batchIndex = numBatches; --batchIndex >= 0;) {
            // load batches
            const batch = batches[batchIndex];
            const predicate = this._predicate.bind(batch);
            let isBound = false;
            // yield all indices
            for (let index = batch.length; --index >= 0;) {
                if (predicate(index, batch)) {
                    // bind batches lazily - if predicate doesn't match anything
                    // in the batch we don't need to call bind on the batch
                    if (bind && !isBound) {
                        bind(batch);
                        isBound = true;
                    }
                    next(index, batch);
                }
            }
        }
    }
    /** Returns the number of rows that satisfy the predicate. */
    public count(): number {
        // inlined version of this:
        // let sum = 0;
        // this.parent.scan((idx, columns) => {
        //     if (this.predicate(idx, columns)) ++sum;
        // });
        // return sum;
        let sum = 0;
        const batches = this._chunks;
        const numBatches = batches.length;
        for (let batchIndex = -1; ++batchIndex < numBatches;) {
            // load batches
            const batch = batches[batchIndex];
            const predicate = this._predicate.bind(batch);
            for (let index = -1, numRows = batch.length; ++index < numRows;) {
                if (predicate(index, batch)) { ++sum; }
            }
        }
        return sum;
    }

    public [Symbol.iterator](): IterableIterator<Struct<T>['TValue']> {
        // inlined version of this:
        // this.parent.scan((idx, columns) => {
        //     if (this.predicate(idx, columns)) next(idx, columns);
        // });
        return new FilteredBatchIterator<T>(this._chunks, this._predicate);
    }
    /** Narrows the view further: rows must satisfy both the current and the new predicate. */
    public filter(predicate: Predicate): FilteredDataFrame<T> {
        return new FilteredDataFrame<T>(
            this._chunks,
            this._predicate.and(predicate)
        );
    }
    /**
     * Counts how often each dictionary entry occurs among the rows that
     * satisfy the predicate.
     *
     * @param name the column, as a `Col` or a column name
     * @returns a table of dictionary values and their occurrence counts
     * @throws Error if the column is not dictionary-encoded
     */
    public countBy(name: Col | string) {
        const batches = this._chunks, numBatches = batches.length;
        const count_by = typeof name === 'string' ? new Col(name) : name as Col;
        // Assume that all dictionary batches are deltas, which means that the
        // last record batch has the most complete dictionary
        count_by.bind(batches[numBatches - 1]);
        const vector = count_by.vector as V<Dictionary>;
        if (!DataType.isDictionary(vector.type)) {
            throw new Error('countBy currently only supports dictionary-encoded columns');
        }

        // Size the counters from the total number of rows (an upper bound on
        // the number of matching rows) so that no count can overflow.
        const CountsArrayType = countsArrayTypeFor(batches);

        const counts = new CountsArrayType(vector.dictionary.length);

        for (let batchIndex = -1; ++batchIndex < numBatches;) {
            // load batches
            const batch = batches[batchIndex];
            const predicate = this._predicate.bind(batch);
            // rebind the countBy Col
            count_by.bind(batch);
            const keys = (count_by.vector as V<Dictionary>).indices;
            // yield all indices
            for (let index = -1, numRows = batch.length; ++index < numRows;) {
                const key = keys.get(index);
                if (key !== null && predicate(index, batch)) { counts[key]++; }
            }
        }
        return new CountByResult(vector.dictionary, IntVector.from(counts));
    }
}

// diff --git a/src/arrow/js/src/compute/predicate.ts b/src/arrow/js/src/compute/predicate.ts new file mode 100644 index 000000000..52030763d --- /dev/null +++ b/src/arrow/js/src/compute/predicate.ts @@ -0,0 +1,292 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Vector } from '../vector';
import { RecordBatch } from '../recordbatch';
import { DictionaryVector } from '../vector/dictionary';

/** @ignore */
export type ValueFunc<T> = (idx: number, cols: RecordBatch) => T | null;
/** @ignore */
export type PredicateFunc = (idx: number, cols: RecordBatch) => boolean;

/**
 * Base class for the operands of a comparison: either a `Literal` or a `Col`.
 * Each comparison method wraps a non-`Value` argument in a `Literal`.
 *
 * @ignore
 */
export abstract class Value<T> {
    eq(other: Value<T> | T): Predicate {
        if (!(other instanceof Value)) { other = new Literal(other); }
        return new Equals(this, other);
    }
    le(other: Value<T> | T): Predicate {
        if (!(other instanceof Value)) { other = new Literal(other); }
        return new LTeq(this, other);
    }
    ge(other: Value<T> | T): Predicate {
        if (!(other instanceof Value)) { other = new Literal(other); }
        return new GTeq(this, other);
    }
    // The strict comparisons and `ne` are expressed as negations of the three above.
    lt(other: Value<T> | T): Predicate {
        return new Not(this.ge(other));
    }
    gt(other: Value<T> | T): Predicate {
        return new Not(this.le(other));
    }
    ne(other: Value<T> | T): Predicate {
        return new Not(this.eq(other));
    }
}

/**
 * A constant operand.
 *
 * @ignore
 */
export class Literal<T= any> extends Value<T> {
    constructor(public v: T) { super(); }
}

/**
 * A column operand, identified by name and resolved against a batch's
 * schema when bound.
 *
 * @ignore
 */
export class Col<T= any> extends Value<T> {
    /** The column vector of the most recently bound batch. */
    public vector!: Vector;
    /** Index of the column in the schema; unset until the first successful bind. */
    public colidx!: number;

    constructor(public name: string) { super(); }
    /**
     * Resolves this column in `batch` and returns an accessor for its values.
     *
     * @param batch the record batch to read from
     * @returns a function returning the column value at a row index
     * @throws Error if the schema has no field named `name`
     */
    bind(batch: RecordBatch): (idx: number, batch?: RecordBatch) => any {
        // Test for a resolved index explicitly. A truthiness check would treat
        // index 0 as "not resolved yet", and would treat the -1 left behind by
        // a failed lookup as resolved.
        if (typeof this.colidx !== 'number' || this.colidx < 0) {
            // Assume column index doesn't change between calls to bind
            let colidx = -1;
            const fields = batch.schema.fields;
            for (let idx = -1; ++idx < fields.length;) {
                if (fields[idx].name === this.name) {
                    colidx = idx;
                    break;
                }
            }
            if (colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); }
            // Only remember the index once the lookup has succeeded, so a
            // failed bind is retried (and reported) on the next call.
            this.colidx = colidx;
        }

        const vec = this.vector = batch.getChildAt(this.colidx)!;
        return (idx: number) => vec.get(idx);
    }
}

/**
 * A row predicate that must be bound to a batch before it can be evaluated.
 *
 * @ignore
 */
export abstract class Predicate {
    abstract bind(batch: RecordBatch): PredicateFunc;
    and(...expr: Predicate[]): And { return new And(this, ...expr); }
    or(...expr: Predicate[]): Or { return new Or(this, ...expr); }
    not(): Predicate { return new Not(this); }
}

/**
 * Base class for binary comparisons. `bind` dispatches to one of four hooks
 * depending on whether each side is a `Literal` or a `Col`.
 *
 * @ignore
 */
export abstract class ComparisonPredicate<T= any> extends Predicate {
    constructor(public readonly left: Value<T>, public readonly right: Value<T>) {
        super();
    }

    bind(batch: RecordBatch) {
        if (this.left instanceof Literal) {
            if (this.right instanceof Literal) {
                return this._bindLitLit(batch, this.left, this.right);
            } else { // right is a Col

                return this._bindLitCol(batch, this.left, this.right as Col);
            }
        } else { // left is a Col
            if (this.right instanceof Literal) {
                return this._bindColLit(batch, this.left as Col, this.right);
            } else { // right is a Col
                return this._bindColCol(batch, this.left as Col, this.right as Col);
            }
        }
    }

    protected abstract _bindLitLit(batch: RecordBatch, left: Literal, right: Literal): PredicateFunc;
    protected abstract _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc;
    protected abstract _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc;
    protected abstract _bindLitCol(batch: RecordBatch, lit: Literal, col: Col): PredicateFunc;
}

/**
 * Base class for predicates that combine several child predicates.
 *
 * @ignore
 */
export abstract class CombinationPredicate extends Predicate {
    readonly children: Predicate[];
    constructor(...children: Predicate[]) {
        super();
        this.children = children;
    }
}
// add children to prototype so it doesn't get mangled in es2015/umd
(<any> CombinationPredicate.prototype).children = Object.freeze([]); // freeze for safety

/**
 * True when every child predicate is true. Nested `And`s are flattened.
 *
 * @ignore
 */
export class And extends CombinationPredicate {
    constructor(...children: Predicate[]) {
        // Flatten any Ands
        children = children.reduce((accum: Predicate[], p: Predicate): Predicate[] => {
            return accum.concat(p instanceof And ? p.children : p);
        }, []);
        super(...children);
    }
    bind(batch: RecordBatch) {
        const bound = this.children.map((p) => p.bind(batch));
        return (idx: number, batch: RecordBatch) => bound.every((p) => p(idx, batch));
    }
}

/**
 * True when at least one child predicate is true. Nested `Or`s are flattened.
 *
 * @ignore
 */
export class Or extends CombinationPredicate {
    constructor(...children: Predicate[]) {
        // Flatten any Ors
        children = children.reduce((accum: Predicate[], p: Predicate): Predicate[] => {
            return accum.concat(p instanceof Or ? p.children : p);
        }, []);
        super(...children);
    }
    bind(batch: RecordBatch) {
        const bound = this.children.map((p) => p.bind(batch));
        return (idx: number, batch: RecordBatch) => bound.some((p) => p(idx, batch));
    }
}

/**
 * Equality comparison (loose `==`). For dictionary-encoded columns compared
 * against a literal, the literal is looked up in the dictionary once and
 * rows are then compared by key.
 *
 * @ignore
 */
export class Equals extends ComparisonPredicate {
    // Helpers used to cache dictionary reverse lookups between calls to bind
    private lastDictionary: Vector|undefined;
    private lastKey: number|undefined;

    protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc {
        const rtrn: boolean = left.v == right.v;
        return () => rtrn;
    }

    protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc {
        const left_func = left.bind(batch);
        const right_func = right.bind(batch);
        return (idx: number, batch: RecordBatch) => left_func(idx, batch) == right_func(idx, batch);
    }

    protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc {
        const col_func = col.bind(batch);
        if (col.vector instanceof DictionaryVector) {
            let key: any;
            const vector = col.vector as DictionaryVector;
            // Only repeat the reverse lookup when the dictionary has changed.
            if (vector.dictionary !== this.lastDictionary) {
                key = vector.reverseLookup(lit.v);
                this.lastDictionary = vector.dictionary;
                this.lastKey = key;
            } else {
                key = this.lastKey;
            }

            if (key === -1) {
                // the value doesn't exist in the dictionary - always return
                // false
                // TODO: special-case of PredicateFunc that encapsulates this
                // "always false" behavior. That way filtering operations don't
                // have to bother checking
                return () => false;
            } else {
                return (idx: number) => {
                    return vector.getKey(idx) === key;
                };
            }
        } else {
            return (idx: number, cols: RecordBatch) => col_func(idx, cols) == lit.v;
        }
    }

    protected _bindLitCol(batch: RecordBatch, lit: Literal, col: Col) {
        // Equals is commutative
        return this._bindColLit(batch, col, lit);
    }
}

/**
 * Less-than-or-equal comparison (`left <= right`).
 *
 * @ignore
 */
export class LTeq extends ComparisonPredicate {
    protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc {
        const rtrn: boolean = left.v <= right.v;
        return () => rtrn;
    }

    protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc {
        const left_func = left.bind(batch);
        const right_func = right.bind(batch);
        return (idx: number, cols: RecordBatch) => left_func(idx, cols) <= right_func(idx, cols);
    }

    protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc {
        const col_func = col.bind(batch);
        return (idx: number, cols: RecordBatch) => col_func(idx, cols) <= lit.v;
    }

    protected _bindLitCol(batch: RecordBatch, lit: Literal, col: Col) {
        const col_func = col.bind(batch);
        return (idx: number, cols: RecordBatch) => lit.v <= col_func(idx, cols);
    }
}

/**
 * Greater-than-or-equal comparison (`left >= right`).
 *
 * @ignore
 */
export class GTeq extends ComparisonPredicate {
    protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc {
        const rtrn: boolean = left.v >= right.v;
        return () => rtrn;
    }

    protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc {
        const left_func = left.bind(batch);
        const right_func = right.bind(batch);
        return (idx: number, cols: RecordBatch) => left_func(idx, cols) >= right_func(idx, cols);
    }

    protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc {
        const col_func = col.bind(batch);
        return (idx: number, cols: RecordBatch) => col_func(idx, cols) >= lit.v;
    }

    protected _bindLitCol(batch: RecordBatch, lit: Literal, col: Col) {
        const col_func = col.bind(batch);
        return (idx: number, cols: RecordBatch) => lit.v >= col_func(idx, cols);
    }
}

/**
 * Logical negation of a child predicate.
 *
 * @ignore
 */
export class Not extends Predicate {
    constructor(public readonly child: Predicate) {
        super();
    }

    bind(batch: RecordBatch) {
        const func = this.child.bind(batch);
        return (idx: number, batch: RecordBatch) => !func(idx, batch);
    }
}

/**
 * Predicate backed by user-supplied callbacks: `bind_` is invoked with each
 * batch, and `next` is returned as the row test.
 *
 * @ignore
 */
export class CustomPredicate extends Predicate {
    constructor(private next: PredicateFunc, private bind_: (batch: RecordBatch) => void) {
        super();
    }

    bind(batch: RecordBatch) {
        this.bind_(batch);
        return this.next;
    }
}

/** Wraps `v` in a `Literal`. */
export function lit(v: any): Value<any> { return new Literal(v); }
/** Creates a `Col` referring to the column named `n`. */
export function col(n: string): Col<any> { return new Col(n); }
/** Combines predicates with logical AND. */
export function and(...p: Predicate[]): And { return new And(...p); }
/** Combines predicates with logical OR. */
export function or(...p: Predicate[]): Or { return new Or(...p); }
/** Creates a `CustomPredicate` from a row test and a per-batch bind callback. */
export function custom(next: PredicateFunc, bind: (batch: RecordBatch) => void) {
    return new CustomPredicate(next, bind);
}

// diff --git a/src/arrow/js/src/data.ts b/src/arrow/js/src/data.ts new file mode 100644 index 000000000..2a549088c --- /dev/null +++ b/src/arrow/js/src/data.ts @@ -0,0 +1,295 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from './vector'; +import { truncateBitmap } from './util/bit'; +import { popcnt_bit_range } from './util/bit'; +import { BufferType, UnionMode, Type } from './enum'; +import { DataType, SparseUnion, DenseUnion, strideForType } from './type'; +import { toArrayBufferView, toUint8Array, toInt32Array } from './util/buffer'; +import { + Dictionary, + Null, Int, Float, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from './type'; + +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Vector.nullCount is called the +// first time, the null count will be computed. 
See ARROW-33 +/** @ignore */ export type kUnknownNullCount = -1; +/** @ignore */ export const kUnknownNullCount = -1; + +/** @ignore */ export type NullBuffer = Uint8Array | null | undefined; +/** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike<number> | Iterable<number> | undefined; +/** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike<number> | Iterable<number> | undefined; +/** @ignore */ export type DataBuffer<T extends DataType> = T['TArray'] | ArrayLike<number> | Iterable<number> | undefined; + +/** @ignore */ +export interface Buffers<T extends DataType> { + [BufferType.OFFSET]: Int32Array; + [BufferType.DATA]: T['TArray']; + [BufferType.VALIDITY]: Uint8Array; + [BufferType.TYPE]: T['TArray']; +} + +/** @ignore */ +export interface Data<T extends DataType = DataType> { + readonly TType: T['TType']; + readonly TArray: T['TArray']; + readonly TValue: T['TValue']; +} + +/** @ignore */ +export class Data<T extends DataType = DataType> { + + public readonly type: T; + public readonly length: number; + public readonly offset: number; + public readonly stride: number; + public readonly childData: Data[]; + + /** + * The dictionary for this Vector, if any. Only used for Dictionary type. 
+ */ + public dictionary?: Vector; + + public readonly values!: Buffers<T>[BufferType.DATA]; + public readonly typeIds!: Buffers<T>[BufferType.TYPE]; + public readonly nullBitmap!: Buffers<T>[BufferType.VALIDITY]; + public readonly valueOffsets!: Buffers<T>[BufferType.OFFSET]; + + public get typeId(): T['TType'] { return this.type.typeId; } + public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } + public get buffers() { + return [this.valueOffsets, this.values, this.nullBitmap, this.typeIds] as Buffers<T>; + } + public get byteLength(): number { + let byteLength = 0; + const { valueOffsets, values, nullBitmap, typeIds } = this; + valueOffsets && (byteLength += valueOffsets.byteLength); + values && (byteLength += values.byteLength); + nullBitmap && (byteLength += nullBitmap.byteLength); + typeIds && (byteLength += typeIds.byteLength); + return this.childData.reduce((byteLength, child) => byteLength + child.byteLength, byteLength); + } + + protected _nullCount: number | kUnknownNullCount; + + public get nullCount() { + let nullCount = this._nullCount; + let nullBitmap: Uint8Array | undefined; + if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { + this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); + } + return nullCount; + } + + constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial<Buffers<T>> | Data<T>, childData?: (Data | Vector)[], dictionary?: Vector) { + this.type = type; + this.dictionary = dictionary; + this.offset = Math.floor(Math.max(offset || 0, 0)); + this.length = Math.floor(Math.max(length || 0, 0)); + this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); + this.childData = (childData || []).map((x) => x instanceof Data ? 
x : x.data) as Data[]; + let buffer: Buffers<T>[keyof Buffers<T>]; + if (buffers instanceof Data) { + this.stride = buffers.stride; + this.values = buffers.values; + this.typeIds = buffers.typeIds; + this.nullBitmap = buffers.nullBitmap; + this.valueOffsets = buffers.valueOffsets; + } else { + this.stride = strideForType(type); + if (buffers) { + (buffer = (buffers as Buffers<T>)[0]) && (this.valueOffsets = buffer); + (buffer = (buffers as Buffers<T>)[1]) && (this.values = buffer); + (buffer = (buffers as Buffers<T>)[2]) && (this.nullBitmap = buffer); + (buffer = (buffers as Buffers<T>)[3]) && (this.typeIds = buffer); + } + } + } + + public clone<R extends DataType>(type: R, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers<R> = <any> this, childData: (Data | Vector)[] = this.childData) { + return new Data(type, offset, length, nullCount, buffers, childData, this.dictionary); + } + + public slice(offset: number, length: number): Data<T> { + const { stride, typeId, childData } = this; + // +true === 1, +false === 0, so this means + // we keep nullCount at 0 if it's already 0, + // otherwise set to the invalidated flag -1 + const nullCount = +(this._nullCount === 0) - 1; + const childStride = typeId === 16 /* FixedSizeList */ ? stride : 1; + const buffers = this._sliceBuffers(offset, length, stride, typeId); + return this.clone<T>(this.type, this.offset + offset, length, nullCount, buffers, + // Don't slice children if we have value offsets (the variable-width types) + (!childData.length || this.valueOffsets) ? 
childData : this._sliceChildren(childData, childStride * offset, childStride * length)); + } + + /** Returns a clone at offset 0 with length newLength and a freshly built validity bitmap: the first `length` slots are valid (overlaid with the existing nullBitmap when nullCount > 0) and the added slots are null, so nullCount grows by the number of added slots. Null-typed data is just cloned with the new length and nullCount 0. NOTE(review): presumably only called with newLength >= length; confirm against callers. */ + public _changeLengthAndBackfillNullBitmap(newLength: number): Data<T> { + if (this.typeId === Type.Null) { + return this.clone(this.type, 0, newLength, 0); + } + const { length, nullCount } = this; + // start initialized with 0s (nulls), then fill from 0 to length with 1s (not null) + const bitmap = new Uint8Array(((newLength + 63) & ~63) >> 3).fill(255, 0, length >> 3); + // set the low `length % 8` bits of the byte at index `length >> 3` to 1 (not null) + bitmap[length >> 3] = (1 << (length - (length & ~7))) - 1; + // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s + if (nullCount > 0) { + bitmap.set(truncateBitmap(this.offset, length, this.nullBitmap), 0); + } + const buffers = this.buffers; + buffers[BufferType.VALIDITY] = bitmap; + return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers); + } + + /** Builds the buffers for a slice of `length` slots at `offset`: typeIds are sub-arrayed when present; then either valueOffsets (keeping one extra trailing entry) or, when there are none, the data buffer scaled by stride, which is left whole when typeId is 6 (Bool). The validity bitmap is not sliced here. */ + protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers<T> { + let arr: any; + const { buffers } = this; + // If typeIds exist, slice the typeIds buffer + (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? 
arr : arr.subarray(stride * offset, stride * (offset + length))); + return buffers; + } + + /** Slices every child Data with the same offset and length. */ + protected _sliceChildren(childData: Data[], offset: number, length: number): Data[] { + return childData.map((child) => child.slice(offset, length)); + } + + // + // Convenience methods for creating Data instances for each of the Arrow Vector types + // + /* Dispatches on type.typeId to the matching static factory, defaulting nullCount to 0 and absent data, offset or typeId buffers to empty arrays; throws an Error when the typeId is not recognized. */ + /** @nocollapse */ + public static new<T extends DataType>(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial<Buffers<T>> | Data<T>, childData?: (Data | Vector)[], dictionary?: Vector): Data<T> { + if (buffers instanceof Data) { buffers = buffers.buffers; } else if (!buffers) { buffers = [] as Partial<Buffers<T>>; } + switch (type.typeId) { + case Type.Null: return <unknown> Data.Null( <unknown> type as Null, offset, length) as Data<T>; + case Type.Int: return <unknown> Data.Int( <unknown> type as Int, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Dictionary: return <unknown> Data.Dictionary( <unknown> type as Dictionary, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || [], dictionary!) 
as Data<T>; + case Type.Float: return <unknown> Data.Float( <unknown> type as Float, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Bool: return <unknown> Data.Bool( <unknown> type as Bool, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Decimal: return <unknown> Data.Decimal( <unknown> type as Decimal, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Date: return <unknown> Data.Date( <unknown> type as Date_, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Time: return <unknown> Data.Time( <unknown> type as Time, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Timestamp: return <unknown> Data.Timestamp( <unknown> type as Timestamp, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Interval: return <unknown> Data.Interval( <unknown> type as Interval, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.FixedSizeBinary: return <unknown> Data.FixedSizeBinary( <unknown> type as FixedSizeBinary, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Binary: return <unknown> Data.Binary( <unknown> type as Binary, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.OFFSET] || [], buffers[BufferType.DATA] || []) as Data<T>; + case Type.Utf8: return <unknown> Data.Utf8( <unknown> type as Utf8, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.OFFSET] || [], buffers[BufferType.DATA] || []) as Data<T>; + case Type.List: return <unknown> Data.List( <unknown> type as List, offset, 
length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.OFFSET] || [], (childData || [])[0]) as Data<T>; + case Type.FixedSizeList: return <unknown> Data.FixedSizeList( <unknown> type as FixedSizeList, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], (childData || [])[0]) as Data<T>; + case Type.Struct: return <unknown> Data.Struct( <unknown> type as Struct, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], childData || []) as Data<T>; + case Type.Map: return <unknown> Data.Map( <unknown> type as Map_, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.OFFSET] || [], (childData || [])[0]) as Data<T>; + case Type.Union: return <unknown> Data.Union( <unknown> type as Union, offset, length, nullCount || 0, buffers[BufferType.VALIDITY], buffers[BufferType.TYPE] || [], buffers[BufferType.OFFSET] || childData, childData) as Data<T>; + } + throw new Error(`Unrecognized typeId ${type.typeId}`); + } + + /* Null data carries no buffers; nullCount is passed as 0. */ + /** @nocollapse */ + public static Null<T extends Null>(type: T, offset: number, length: number) { + return new Data(type, offset, length, 0); + } + /* Buffers are [undefined, data via toArrayBufferView(type.ArrayType, ...), nullBitmap via toUint8Array]. */ + /** @nocollapse */ + public static Int<T extends Int>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /* The data buffer holds the indices, converted with type.indices.ArrayType; childData is empty and the dictionary Vector is attached. */ + /** @nocollapse */ + public static Dictionary<T extends Dictionary>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>, dictionary: Vector<T['dictionary']>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView<T['TArray']>(type.indices.ArrayType, data), toUint8Array(nullBitmap)], [], dictionary); + } + /** @nocollapse */ + public static Float<T extends Float>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new 
Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /* Bool, Decimal, Date, Time, Timestamp, Interval and FixedSizeBinary all use the same buffer layout: [undefined, data via toArrayBufferView(type.ArrayType, ...), nullBitmap via toUint8Array]. */ + /** @nocollapse */ + public static Bool<T extends Bool>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static Decimal<T extends Decimal>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static Date<T extends Date_>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static Time<T extends Time>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static Timestamp<T extends Timestamp>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static Interval<T extends Interval>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static 
FixedSizeBinary<T extends FixedSizeBinary>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toUint8Array(nullBitmap)]); + } + /* Buffers are [valueOffsets via toInt32Array, data via toUint8Array, nullBitmap via toUint8Array]. */ + /** @nocollapse */ + public static Binary<T extends Binary>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [toInt32Array(valueOffsets), toUint8Array(data), toUint8Array(nullBitmap)]); + } + /** @nocollapse */ + public static Utf8<T extends Utf8>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, data: DataBuffer<T>) { + return new Data(type, offset, length, nullCount, [toInt32Array(valueOffsets), toUint8Array(data), toUint8Array(nullBitmap)]); + } + /* Keeps valueOffsets and the validity bitmap but no data buffer; the child, when given, becomes the only childData entry. */ + /** @nocollapse */ + public static List<T extends List>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, child: Data<T['valueType']> | Vector<T['valueType']>) { + return new Data(type, offset, length, nullCount, [toInt32Array(valueOffsets), undefined, toUint8Array(nullBitmap)], child ? [child] : []); + } + /* Only a validity bitmap is stored; the child, when given, becomes the only childData entry. */ + /** @nocollapse */ + public static FixedSizeList<T extends FixedSizeList>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, child: Data<T['valueType']> | Vector<T['valueType']>) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toUint8Array(nullBitmap)], child ? 
[child] : []); + } + /* Only a validity bitmap is stored; children are passed through as childData. */ + /** @nocollapse */ + public static Struct<T extends Struct>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, children: (Data | Vector)[]) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toUint8Array(nullBitmap)], children); + } + /* Keeps valueOffsets and the validity bitmap; the child, when given, becomes the only childData entry. */ + /** @nocollapse */ + public static Map<T extends Map_>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, child: (Data | Vector)) { + return new Data(type, offset, length, nullCount, [toInt32Array(valueOffsets), undefined, toUint8Array(nullBitmap)], child ? [child] : []); + } + public static Union<T extends SparseUnion>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, children: (Data | Vector)[], _?: any): Data<T>; + public static Union<T extends DenseUnion>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsets: ValueOffsetsBuffer, children: (Data | Vector)[]): Data<T>; + public static Union<T extends Union>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsetsOrChildren: ValueOffsetsBuffer | (Data | Vector)[], children?: (Data | Vector)[]): Data<T>; + /* Shared implementation of the overloads: when type.mode is UnionMode.Sparse the seventh argument is used as the children; otherwise it is converted with toInt32Array into the OFFSET buffer and the eighth argument supplies the children. */ + /** @nocollapse */ + public static Union<T extends Union>(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsetsOrChildren: ValueOffsetsBuffer | (Data | Vector)[], children?: (Data | Vector)[]) { + const buffers = <unknown> [ + undefined, undefined, + toUint8Array(nullBitmap), + toArrayBufferView(type.ArrayType, typeIds) + ] as Partial<Buffers<T>>; + if (type.mode === UnionMode.Sparse) { + return new Data(type, offset, length, nullCount, buffers, valueOffsetsOrChildren as (Data | Vector)[]); + } + buffers[BufferType.OFFSET] = toInt32Array(<ValueOffsetsBuffer> valueOffsetsOrChildren); + 
return new Data(type, offset, length, nullCount, buffers, children); + } +} + +(Data.prototype as any).childData = Object.freeze([]); diff --git a/src/arrow/js/src/enum.ts b/src/arrow/js/src/enum.ts new file mode 100644 index 000000000..517aa27e8 --- /dev/null +++ b/src/arrow/js/src/enum.ts @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +export { + DateUnit, + TimeUnit, + Precision, + UnionMode, + IntervalUnit, + MetadataVersion, +} from './fb/Schema'; + +export { MessageHeader } from './fb/Message'; + +/** + * Main data type enumeration. + * + * Data types in this library are all *logical*. They can be expressed as + * either a primitive physical type (bytes or bits of some fixed size), a + * nested type consisting of other data types, or another data type (e.g. a + * timestamp encoded as an int64). + * + * **Note**: Only enum values 0-17 (NONE through Map) are written to an Arrow + * IPC payload. + * + * The rest of the values are specified here so TypeScript can narrow the type + * signatures further beyond the base Arrow Types. The Arrow DataTypes include + * metadata like `bitWidth` that impact the type signatures of the values we + * accept and return. 
+ * + * For example, the `Int8Vector` reads 1-byte numbers from an `Int8Array`, an + * `Int32Vector` reads a 4-byte number from an `Int32Array`, and an `Int64Vector` + * reads a pair of 4-byte lo, hi 32-bit integers as a zero-copy slice from the + * underlying `Int32Array`. + * + * Library consumers benefit by knowing the narrowest type, since we can ensure + * the types across all public methods are propagated, and never bail to `any`. + * These values are _never_ used at runtime, and they will _never_ be written + * to the flatbuffers metadata of serialized Arrow IPC payloads. + */ +export enum Type { + /** The default placeholder type */ + NONE = 0, + /** A NULL type having no physical storage */ + Null = 1, + /** Signed or unsigned 8, 16, 32, or 64-bit little-endian integer */ + Int = 2, + /** 2, 4, or 8-byte floating point value */ + Float = 3, + /** Variable-length bytes (no guarantee of UTF8-ness) */ + Binary = 4, + /** UTF8 variable-length string as List<Char> */ + Utf8 = 5, + /** Boolean as 1 bit, LSB bit-packed ordering */ + Bool = 6, + /** Precision-and-scale-based decimal type. Storage type depends on the parameters. */ + Decimal = 7, + /** int32_t days or int64_t milliseconds since the UNIX epoch */ + Date = 8, + /** Time as signed 32 or 64-bit integer, representing either seconds, milliseconds, microseconds, or nanoseconds since midnight */ + Time = 9, + /** Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond) */ + Timestamp = 10, + /** YEAR_MONTH or DAY_TIME interval in SQL style */ + Interval = 11, + /** A list of some logical data type */ + List = 12, + /** Struct of logical types */ + Struct = 13, + /** Union of logical types */ + Union = 14, + /** Fixed-size binary. Each value occupies the same number of bytes */ + FixedSizeBinary = 15, + /** Fixed-size list. 
Each value contains the same number of child elements */ + FixedSizeList = 16, + /** Map of named logical types */ + Map = 17, + + /** Dictionary aka Category type */ + Dictionary = -1, + Int8 = -2, + Int16 = -3, + Int32 = -4, + Int64 = -5, + Uint8 = -6, + Uint16 = -7, + Uint32 = -8, + Uint64 = -9, + Float16 = -10, + Float32 = -11, + Float64 = -12, + DateDay = -13, + DateMillisecond = -14, + TimestampSecond = -15, + TimestampMillisecond = -16, + TimestampMicrosecond = -17, + TimestampNanosecond = -18, + TimeSecond = -19, + TimeMillisecond = -20, + TimeMicrosecond = -21, + TimeNanosecond = -22, + DenseUnion = -23, + SparseUnion = -24, + IntervalDayTime = -25, + IntervalYearMonth = -26, +} + +export enum BufferType { + /** + * used in List type, Dense Union and variable length primitive types (String, Binary) + */ + OFFSET = 0, + + /** + * actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector + */ + DATA = 1, + + /** + * Bit vector indicating if each value is null + */ + VALIDITY = 2, + + /** + * Type vector used in Union type + */ + TYPE = 3 + } diff --git a/src/arrow/js/src/fb/.eslintrc.js b/src/arrow/js/src/fb/.eslintrc.js new file mode 100644 index 000000000..d448540e4 --- /dev/null +++ b/src/arrow/js/src/fb/.eslintrc.js @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* ESLint overrides for this directory: switches off the no-require-imports and no-inferrable-types rules of @typescript-eslint. */ +module.exports = { + rules: { + "@typescript-eslint/no-require-imports": "off", + "@typescript-eslint/no-inferrable-types": "off" + }, +};
\ No newline at end of file diff --git a/src/arrow/js/src/fb/File.ts b/src/arrow/js/src/fb/File.ts new file mode 100644 index 000000000..5746dd183 --- /dev/null +++ b/src/arrow/js/src/fb/File.ts @@ -0,0 +1,300 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +import { flatbuffers } from 'flatbuffers'; +import * as NS13596923344997147894 from './Schema'; +/** + * ---------------------------------------------------------------------- + * Arrow File metadata + * + * + * @constructor + */ +export class Footer { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Footer + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Footer { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Footer= obj + * @returns Footer + */ + static getRootAsFooter(bb: flatbuffers.ByteBuffer, obj?: Footer): Footer { + return (obj || new Footer()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Footer= obj + * @returns Footer + */ + static getSizePrefixedRootAsFooter(bb: flatbuffers.ByteBuffer, obj?: Footer): Footer { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Footer()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @returns MetadataVersion + */ + version(): NS13596923344997147894.MetadataVersion { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? /** */ (this.bb!.readInt16(this.bb_pos + offset)) : NS13596923344997147894.MetadataVersion.V1; + } + + /** + * @param Schema= obj + * @returns Schema|null + */ + schema(obj?: NS13596923344997147894.Schema): NS13596923344997147894.Schema | null { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? (obj || new NS13596923344997147894.Schema()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) 
: null; + } + + /** + * @param number index + * @param Block= obj + * @returns Block + */ + dictionaries(index: number, obj?: Block): Block | null { + const offset = this.bb!.__offset(this.bb_pos, 8); + return offset ? (obj || new Block()).__init(this.bb!.__vector(this.bb_pos + offset) + index * 24, this.bb!) : null; + } + + /** + * @returns number + */ + dictionariesLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 8); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * @param number index + * @param Block= obj + * @returns Block + */ + recordBatches(index: number, obj?: Block): Block | null { + const offset = this.bb!.__offset(this.bb_pos, 10); + return offset ? (obj || new Block()).__init(this.bb!.__vector(this.bb_pos + offset) + index * 24, this.bb!) : null; + } + + /** + * @returns number + */ + recordBatchesLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 10); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * User-defined metadata + * + * @param number index + * @param KeyValue= obj + * @returns KeyValue + */ + customMetadata(index: number, obj?: NS13596923344997147894.KeyValue): NS13596923344997147894.KeyValue | null { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? (obj || new NS13596923344997147894.KeyValue()).__init(this.bb!.__indirect(this.bb!.__vector(this.bb_pos + offset) + index * 4), this.bb!) : null; + } + + /** + * @returns number + */ + customMetadataLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? 
this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * @param flatbuffers.Builder builder + */ + static startFooter(builder: flatbuffers.Builder) { + builder.startObject(5); + } + + /** + * @param flatbuffers.Builder builder + * @param MetadataVersion version + */ + static addVersion(builder: flatbuffers.Builder, version: NS13596923344997147894.MetadataVersion) { + builder.addFieldInt16(0, version, NS13596923344997147894.MetadataVersion.V1); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset schemaOffset + */ + static addSchema(builder: flatbuffers.Builder, schemaOffset: flatbuffers.Offset) { + builder.addFieldOffset(1, schemaOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset dictionariesOffset + */ + static addDictionaries(builder: flatbuffers.Builder, dictionariesOffset: flatbuffers.Offset) { + builder.addFieldOffset(2, dictionariesOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param number numElems + */ + static startDictionariesVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(24, numElems, 8); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset recordBatchesOffset + */ + static addRecordBatches(builder: flatbuffers.Builder, recordBatchesOffset: flatbuffers.Offset) { + builder.addFieldOffset(3, recordBatchesOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param number numElems + */ + static startRecordBatchesVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(24, numElems, 8); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset customMetadataOffset + */ + static addCustomMetadata(builder: flatbuffers.Builder, customMetadataOffset: flatbuffers.Offset) { + builder.addFieldOffset(4, customMetadataOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param Array.<flatbuffers.Offset> data + * @returns flatbuffers.Offset 
+ */ + static createCustomMetadataVector(builder: flatbuffers.Builder, data: flatbuffers.Offset[]): flatbuffers.Offset { + builder.startVector(4, data.length, 4); + for (let i = data.length - 1; i >= 0; i--) { + builder.addOffset(data[i]); + } + return builder.endVector(); + } + + /** + * @param flatbuffers.Builder builder + * @param number numElems + */ + static startCustomMetadataVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(4, numElems, 4); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endFooter(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset offset + */ + static finishFooterBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Offset) { + builder.finish(offset); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset offset + */ + static finishSizePrefixedFooterBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Offset) { + builder.finish(offset, undefined, true); + } + + static createFooter(builder: flatbuffers.Builder, version: NS13596923344997147894.MetadataVersion, schemaOffset: flatbuffers.Offset, dictionariesOffset: flatbuffers.Offset, recordBatchesOffset: flatbuffers.Offset, customMetadataOffset: flatbuffers.Offset): flatbuffers.Offset { + Footer.startFooter(builder); + Footer.addVersion(builder, version); + Footer.addSchema(builder, schemaOffset); + Footer.addDictionaries(builder, dictionariesOffset); + Footer.addRecordBatches(builder, recordBatchesOffset); + Footer.addCustomMetadata(builder, customMetadataOffset); + return Footer.endFooter(builder); + } +} +/** + * @constructor + */ +export class Block { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Block + */ + __init(i: number, bb: 
flatbuffers.ByteBuffer): Block { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Index to the start of the RecordBlock (note this is past the Message header) + * + * @returns flatbuffers.Long + */ + offset(): flatbuffers.Long { + return this.bb!.readInt64(this.bb_pos); + } + + /** + * Length of the metadata + * + * @returns number + */ + metaDataLength(): number { + return this.bb!.readInt32(this.bb_pos + 8); + } + + /** + * Length of the data (this is aligned so there can be a gap between this and + * the metadata). + * + * @returns flatbuffers.Long + */ + bodyLength(): flatbuffers.Long { + return this.bb!.readInt64(this.bb_pos + 16); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Long offset + * @param number metaDataLength + * @param flatbuffers.Long bodyLength + * @returns flatbuffers.Offset + */ + static createBlock(builder: flatbuffers.Builder, offset: flatbuffers.Long, metaDataLength: number, bodyLength: flatbuffers.Long): flatbuffers.Offset { + builder.prep(8, 24); + builder.writeInt64(bodyLength); + builder.pad(4); + builder.writeInt32(metaDataLength); + builder.writeInt64(offset); + return builder.offset(); + } + +} diff --git a/src/arrow/js/src/fb/Message.ts b/src/arrow/js/src/fb/Message.ts new file mode 100644 index 000000000..973eb0425 --- /dev/null +++ b/src/arrow/js/src/fb/Message.ts @@ -0,0 +1,709 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +import { flatbuffers } from 'flatbuffers'; +import * as NS13596923344997147894 from './Schema'; +/** + * @enum {number} + */ +export enum CompressionType { + LZ4_FRAME = 0, + ZSTD = 1 +} + +/** + * Provided for forward compatibility in case we need to support different + * strategies for compressing the IPC message body (like whole-body + * compression rather than buffer-level) in the future + * + * @enum {number} + */ +export enum BodyCompressionMethod { + /** + * Each constituent buffer is first compressed with the indicated + * 
compressor, and then written with the uncompressed length in the first 8 + * bytes as a 64-bit little-endian signed integer followed by the compressed + * buffer bytes (and then padding as required by the protocol). The + * uncompressed length may be set to -1 to indicate that the data that + * follows is not compressed, which can be useful for cases where + * compression does not yield appreciable savings. + */ + BUFFER = 0 +} + +/** + * ---------------------------------------------------------------------- + * The root Message type + * This union enables us to easily send different message types without + * redundant storage, and in the future we can easily add new message types. + * + * Arrow implementations do not need to implement all of the message types, + * which may include experimental metadata types. For maximum compatibility, + * it is best to send data using RecordBatch + * + * @enum {number} + */ +export enum MessageHeader { + NONE = 0, + Schema = 1, + DictionaryBatch = 2, + RecordBatch = 3, + Tensor = 4, + SparseTensor = 5 +} + +/** + * ---------------------------------------------------------------------- + * Data structures for describing a table row batch (a collection of + * equal-length Arrow arrays) + * Metadata about a field at some level of a nested type tree (but not + * its children). 
+ * + * For example, a List<Int16> with values [[1, 2, 3], null, [4], [5, 6], null] + * would have {length: 5, null_count: 2} for its List node, and {length: 6, + * null_count: 0} for its Int16 node, as separate FieldNode structs + * + * @constructor + */ +export class FieldNode { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns FieldNode + */ + __init(i: number, bb: flatbuffers.ByteBuffer): FieldNode { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * The number of value slots in the Arrow array at this level of a nested + * tree + * + * @returns flatbuffers.Long + */ + length(): flatbuffers.Long { + return this.bb!.readInt64(this.bb_pos); + } + + /** + * The number of observed nulls. Fields with null_count == 0 may choose not + * to write their physical validity bitmap out as a materialized buffer, + * instead setting the length of the bitmap buffer to 0. + * + * @returns flatbuffers.Long + */ + nullCount(): flatbuffers.Long { + return this.bb!.readInt64(this.bb_pos + 8); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Long length + * @param flatbuffers.Long null_count + * @returns flatbuffers.Offset + */ + static createFieldNode(builder: flatbuffers.Builder, length: flatbuffers.Long, null_count: flatbuffers.Long): flatbuffers.Offset { + builder.prep(8, 16); + builder.writeInt64(null_count); + builder.writeInt64(length); + return builder.offset(); + } + +} +/** + * Optional compression for the memory buffers constituting IPC message + * bodies. 
Intended for use with RecordBatch but could be used for other + * message types + * + * @constructor + */ +export class BodyCompression { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns BodyCompression + */ + __init(i: number, bb: flatbuffers.ByteBuffer): BodyCompression { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new BodyCompression when obj is omitted) initialised at + * the position found by adding the int32 read at bb.position() to + * bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param BodyCompression= obj + * @returns BodyCompression + */ + static getRootAsBodyCompression(bb: flatbuffers.ByteBuffer, obj?: BodyCompression): BodyCompression { + return (obj || new BodyCompression()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsBodyCompression, but first advances the position + * of bb by flatbuffers.SIZE_PREFIX_LENGTH. Note that this mutates the + * position of the bb that was passed in. + * + * @param flatbuffers.ByteBuffer bb + * @param BodyCompression= obj + * @returns BodyCompression + */ + static getSizePrefixedRootAsBodyCompression(bb: flatbuffers.ByteBuffer, obj?: BodyCompression): BodyCompression { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new BodyCompression()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Compressor library. Read as an int8; falls back to + * CompressionType.LZ4_FRAME when the field offset lookup returns 0 + * (presumably: field not stored in the buffer). + * + * @returns CompressionType + */ + codec(): CompressionType { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? /** */ (this.bb!.readInt8(this.bb_pos + offset)) : CompressionType.LZ4_FRAME; + } + + /** + * Indicates the way the record batch body was compressed. Read as an int8; + * falls back to BodyCompressionMethod.BUFFER when the field offset lookup + * returns 0. + * + * @returns BodyCompressionMethod + */ + method(): BodyCompressionMethod { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ?
/** */ (this.bb!.readInt8(this.bb_pos + offset)) : BodyCompressionMethod.BUFFER; + } + + /** + * Opens a new object with 2 field slots (codec and method). + * + * @param flatbuffers.Builder builder + */ + static startBodyCompression(builder: flatbuffers.Builder) { + builder.startObject(2); + } + + /** + * Adds codec as int8 field 0; the default passed to the builder is + * CompressionType.LZ4_FRAME, matching the fallback in codec(). + * + * @param flatbuffers.Builder builder + * @param CompressionType codec + */ + static addCodec(builder: flatbuffers.Builder, codec: CompressionType) { + builder.addFieldInt8(0, codec, CompressionType.LZ4_FRAME); + } + + /** + * Adds method as int8 field 1; the default passed to the builder is + * BodyCompressionMethod.BUFFER, matching the fallback in method(). + * + * @param flatbuffers.Builder builder + * @param BodyCompressionMethod method + */ + static addMethod(builder: flatbuffers.Builder, method: BodyCompressionMethod) { + builder.addFieldInt8(1, method, BodyCompressionMethod.BUFFER); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endBodyCompression(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts the object, adds codec and method, ends the + * object and returns the resulting offset. + * + * @param flatbuffers.Builder builder + * @param CompressionType codec + * @param BodyCompressionMethod method + * @returns flatbuffers.Offset + */ + static createBodyCompression(builder: flatbuffers.Builder, codec: CompressionType, method: BodyCompressionMethod): flatbuffers.Offset { + BodyCompression.startBodyCompression(builder); + BodyCompression.addCodec(builder, codec); + BodyCompression.addMethod(builder, method); + return BodyCompression.endBodyCompression(builder); + } +} +/** + * A data header describing the shared memory layout of a "record" or "row" + * batch. Some systems call this a "row batch" internally and others a "record + * batch".
+ * + * @constructor + */ +export class RecordBatch { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns RecordBatch + */ + __init(i: number, bb: flatbuffers.ByteBuffer): RecordBatch { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new RecordBatch when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param RecordBatch= obj + * @returns RecordBatch + */ + static getRootAsRecordBatch(bb: flatbuffers.ByteBuffer, obj?: RecordBatch): RecordBatch { + return (obj || new RecordBatch()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsRecordBatch, but first advances the position of + * bb by flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param RecordBatch= obj + * @returns RecordBatch + */ + static getSizePrefixedRootAsRecordBatch(bb: flatbuffers.ByteBuffer, obj?: RecordBatch): RecordBatch { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new RecordBatch()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Number of records / rows. The arrays in the batch should all have this + * length. Returns createLong(0, 0) when the field offset lookup returns 0. + * + * @returns flatbuffers.Long + */ + length(): flatbuffers.Long { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? this.bb!.readInt64(this.bb_pos + offset) : this.bb!.createLong(0, 0); + } + + /** + * Nodes correspond to the pre-ordered flattened logical schema. + * + * Element index is located at the vector start plus index * 16, so each + * FieldNode is addressed with a 16-byte stride. index is not range-checked + * here; compare against nodesLength() before calling. Returns null when the + * field offset lookup returns 0. + * + * @param number index + * @param FieldNode= obj + * @returns FieldNode|null + */ + nodes(index: number, obj?: FieldNode): FieldNode | null { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? (obj || new FieldNode()).__init(this.bb!.__vector(this.bb_pos + offset) + index * 16, this.bb!) : null; + } + + /** + * Length of the nodes vector, or 0 when the field offset lookup returns 0. + * + * @returns number + */ + nodesLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ?
this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * Buffers correspond to the pre-ordered flattened buffer tree + * + * The number of buffers appended to this list depends on the schema. For + * example, most primitive arrays will have 2 buffers, 1 for the validity + * bitmap and 1 for the values. For struct arrays, there will only be a + * single buffer for the validity (nulls) bitmap + * + * Element index is located at the vector start plus index * 16 and is not + * range-checked here. Returns null when the field offset lookup returns 0. + * + * @param number index + * @param Buffer= obj + * @returns Buffer|null + */ + buffers(index: number, obj?: NS13596923344997147894.Buffer): NS13596923344997147894.Buffer | null { + const offset = this.bb!.__offset(this.bb_pos, 8); + return offset ? (obj || new NS13596923344997147894.Buffer()).__init(this.bb!.__vector(this.bb_pos + offset) + index * 16, this.bb!) : null; + } + + /** + * Length of the buffers vector, or 0 when the field offset lookup returns 0. + * + * @returns number + */ + buffersLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 8); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * Optional compression of the message body. Returns null when the field + * offset lookup returns 0, so callers must handle the null case. + * + * @param BodyCompression= obj + * @returns BodyCompression|null + */ + compression(obj?: BodyCompression): BodyCompression | null { + const offset = this.bb!.__offset(this.bb_pos, 10); + return offset ? (obj || new BodyCompression()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!)
: null; + } + + /** + * Opens a new object with 4 field slots (length, nodes, buffers, + * compression). + * + * @param flatbuffers.Builder builder + */ + static startRecordBatch(builder: flatbuffers.Builder) { + builder.startObject(4); + } + + /** + * Adds length as int64 field 0, with createLong(0, 0) as the default. + * + * @param flatbuffers.Builder builder + * @param flatbuffers.Long length + */ + static addLength(builder: flatbuffers.Builder, length: flatbuffers.Long) { + builder.addFieldInt64(0, length, builder.createLong(0, 0)); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset nodesOffset + */ + static addNodes(builder: flatbuffers.Builder, nodesOffset: flatbuffers.Offset) { + builder.addFieldOffset(1, nodesOffset, 0); + } + + /** + * Starts the nodes vector via startVector(16, numElems, 8); the 16 matches + * the stride used by nodes(). + * + * @param flatbuffers.Builder builder + * @param number numElems + */ + static startNodesVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(16, numElems, 8); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset buffersOffset + */ + static addBuffers(builder: flatbuffers.Builder, buffersOffset: flatbuffers.Offset) { + builder.addFieldOffset(2, buffersOffset, 0); + } + + /** + * Starts the buffers vector via startVector(16, numElems, 8); the 16 + * matches the stride used by buffers(). + * + * @param flatbuffers.Builder builder + * @param number numElems + */ + static startBuffersVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(16, numElems, 8); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset compressionOffset + */ + static addCompression(builder: flatbuffers.Builder, compressionOffset: flatbuffers.Offset) { + builder.addFieldOffset(3, compressionOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endRecordBatch(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts the object, adds length, nodes, buffers and + * compression in that order, ends the object and returns its offset. The + * three offsets must already have been produced with the same builder. + * + * @param flatbuffers.Builder builder + * @param flatbuffers.Long length + * @param flatbuffers.Offset nodesOffset + * @param flatbuffers.Offset buffersOffset + * @param flatbuffers.Offset compressionOffset + * @returns flatbuffers.Offset + */ + static createRecordBatch(builder: flatbuffers.Builder, length: flatbuffers.Long, nodesOffset: flatbuffers.Offset, buffersOffset: flatbuffers.Offset, compressionOffset: flatbuffers.Offset): flatbuffers.Offset { + RecordBatch.startRecordBatch(builder); + RecordBatch.addLength(builder,
length); + RecordBatch.addNodes(builder, nodesOffset); + RecordBatch.addBuffers(builder, buffersOffset); + RecordBatch.addCompression(builder, compressionOffset); + return RecordBatch.endRecordBatch(builder); + } +} +/** + * For sending dictionary encoding information. Any Field can be + * dictionary-encoded, but in this case none of its children may be + * dictionary-encoded. + * There is one vector / column per dictionary, but that vector / column + * may be spread across multiple dictionary batches by using the isDelta + * flag + * + * @constructor + */ +export class DictionaryBatch { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns DictionaryBatch + */ + __init(i: number, bb: flatbuffers.ByteBuffer): DictionaryBatch { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new DictionaryBatch when obj is omitted) initialised at + * the position found by adding the int32 read at bb.position() to + * bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param DictionaryBatch= obj + * @returns DictionaryBatch + */ + static getRootAsDictionaryBatch(bb: flatbuffers.ByteBuffer, obj?: DictionaryBatch): DictionaryBatch { + return (obj || new DictionaryBatch()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsDictionaryBatch, but first advances the position + * of bb by flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param DictionaryBatch= obj + * @returns DictionaryBatch + */ + static getSizePrefixedRootAsDictionaryBatch(bb: flatbuffers.ByteBuffer, obj?: DictionaryBatch): DictionaryBatch { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new DictionaryBatch()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * The dictionary id referred to by isDelta, read as an int64. Returns + * createLong(0, 0) when the field offset lookup returns 0. + * + * @returns flatbuffers.Long + */ + id(): flatbuffers.Long { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? this.bb!.readInt64(this.bb_pos + offset) : this.bb!.createLong(0, 0); + } + + /** + * The nested RecordBatch table of this dictionary batch, reached through + * __indirect. Returns null when the field offset lookup returns 0. + * + * @param RecordBatch= obj + * @returns RecordBatch|null + */ + data(obj?: RecordBatch): RecordBatch | null { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ?
(obj || new RecordBatch()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) : null; + } + + /** + * If isDelta is true the values in the dictionary are to be appended to a + * dictionary with the indicated id. If isDelta is false this dictionary + * should replace the existing dictionary. + * + * Stored as an int8 and converted with !!; false when the field offset + * lookup returns 0. + * + * @returns boolean + */ + isDelta(): boolean { + const offset = this.bb!.__offset(this.bb_pos, 8); + return offset ? !!this.bb!.readInt8(this.bb_pos + offset) : false; + } + + /** + * Opens a new object with 3 field slots (id, data, isDelta). + * + * @param flatbuffers.Builder builder + */ + static startDictionaryBatch(builder: flatbuffers.Builder) { + builder.startObject(3); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Long id + */ + static addId(builder: flatbuffers.Builder, id: flatbuffers.Long) { + builder.addFieldInt64(0, id, builder.createLong(0, 0)); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset dataOffset + */ + static addData(builder: flatbuffers.Builder, dataOffset: flatbuffers.Offset) { + builder.addFieldOffset(1, dataOffset, 0); + } + + /** + * Adds isDelta as int8 field 2. The unary plus converts the boolean to a + * number (true to 1, false to 0); the default is false, i.e. 0. + * + * @param flatbuffers.Builder builder + * @param boolean isDelta + */ + static addIsDelta(builder: flatbuffers.Builder, isDelta: boolean) { + builder.addFieldInt8(2, +isDelta, +false); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endDictionaryBatch(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts the object, adds id, data and isDelta in that + * order, ends the object and returns its offset. + * + * @param flatbuffers.Builder builder + * @param flatbuffers.Long id + * @param flatbuffers.Offset dataOffset + * @param boolean isDelta + * @returns flatbuffers.Offset + */ + static createDictionaryBatch(builder: flatbuffers.Builder, id: flatbuffers.Long, dataOffset: flatbuffers.Offset, isDelta: boolean): flatbuffers.Offset { + DictionaryBatch.startDictionaryBatch(builder); + DictionaryBatch.addId(builder, id); + DictionaryBatch.addData(builder, dataOffset); + DictionaryBatch.addIsDelta(builder, isDelta); + return DictionaryBatch.endDictionaryBatch(builder); + } +} +/** + * Accessor for the Message table, which exposes a metadata version, a header + * union (discriminated by headerType), a body length and custom metadata. + * + * @constructor + */ +export class Message { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0;
+ /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Message + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Message { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new Message when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param Message= obj + * @returns Message + */ + static getRootAsMessage(bb: flatbuffers.ByteBuffer, obj?: Message): Message { + return (obj || new Message()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsMessage, but first advances the position of bb by + * flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param Message= obj + * @returns Message + */ + static getSizePrefixedRootAsMessage(bb: flatbuffers.ByteBuffer, obj?: Message): Message { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Message()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Metadata version, read as an int16. Falls back to MetadataVersion.V1 when + * the field offset lookup returns 0. + * + * @returns MetadataVersion + */ + version(): NS13596923344997147894.MetadataVersion { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? /** */ (this.bb!.readInt16(this.bb_pos + offset)) : NS13596923344997147894.MetadataVersion.V1; + } + + /** + * Discriminator for header(), read with readUint8. Falls back to + * MessageHeader.NONE when the field offset lookup returns 0. + * + * @returns MessageHeader + */ + headerType(): MessageHeader { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? /** */ (this.bb!.readUint8(this.bb_pos + offset)) : MessageHeader.NONE; + } + + /** + * Resolves the header union into obj through __union and returns the + * result, or null when the field offset lookup returns 0. + * NOTE(review): obj presumably has to be an instance of the table type + * indicated by headerType() - confirm against callers. + * + * @param flatbuffers.Table obj + * @returns ?flatbuffers.Table + */ + header<T extends flatbuffers.Table>(obj: T): T | null { + const offset = this.bb!.__offset(this.bb_pos, 8); + return offset ? this.bb!.__union(obj, this.bb_pos + offset) : null; + } + + /** + * Body length, read as an int64. Returns createLong(0, 0) when the field + * offset lookup returns 0. + * + * @returns flatbuffers.Long + */ + bodyLength(): flatbuffers.Long { + const offset = this.bb!.__offset(this.bb_pos, 10); + return offset ?
this.bb!.readInt64(this.bb_pos + offset) : this.bb!.createLong(0, 0); + } + + /** + * Returns the KeyValue at index in the custom metadata vector. Elements are + * addressed with a 4-byte stride and followed through __indirect. index is + * not range-checked here; compare against customMetadataLength() first. + * Returns null when the field offset lookup returns 0. + * + * @param number index + * @param KeyValue= obj + * @returns KeyValue|null + */ + customMetadata(index: number, obj?: NS13596923344997147894.KeyValue): NS13596923344997147894.KeyValue | null { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? (obj || new NS13596923344997147894.KeyValue()).__init(this.bb!.__indirect(this.bb!.__vector(this.bb_pos + offset) + index * 4), this.bb!) : null; + } + + /** + * Length of the custom metadata vector, or 0 when the field offset lookup + * returns 0. + * + * @returns number + */ + customMetadataLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * Opens a new object with 5 field slots (version, headerType, header, + * bodyLength, customMetadata). + * + * @param flatbuffers.Builder builder + */ + static startMessage(builder: flatbuffers.Builder) { + builder.startObject(5); + } + + /** + * @param flatbuffers.Builder builder + * @param MetadataVersion version + */ + static addVersion(builder: flatbuffers.Builder, version: NS13596923344997147894.MetadataVersion) { + builder.addFieldInt16(0, version, NS13596923344997147894.MetadataVersion.V1); + } + + /** + * Adds headerType as field 1 through addFieldInt8, while headerType() reads + * it back with readUint8. + * + * @param flatbuffers.Builder builder + * @param MessageHeader headerType + */ + static addHeaderType(builder: flatbuffers.Builder, headerType: MessageHeader) { + builder.addFieldInt8(1, headerType, MessageHeader.NONE); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset headerOffset + */ + static addHeader(builder: flatbuffers.Builder, headerOffset: flatbuffers.Offset) { + builder.addFieldOffset(2, headerOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Long bodyLength + */ + static addBodyLength(builder: flatbuffers.Builder, bodyLength: flatbuffers.Long) { + builder.addFieldInt64(3, bodyLength, builder.createLong(0, 0)); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset customMetadataOffset + */ + static addCustomMetadata(builder: flatbuffers.Builder, customMetadataOffset:
flatbuffers.Offset) { + builder.addFieldOffset(4, customMetadataOffset, 0); + } + + /** + * Builds the custom metadata vector from data. The offsets are added from + * the last element to the first. An empty data array yields an empty vector. + * + * @param flatbuffers.Builder builder + * @param Array.<flatbuffers.Offset> data + * @returns flatbuffers.Offset + */ + static createCustomMetadataVector(builder: flatbuffers.Builder, data: flatbuffers.Offset[]): flatbuffers.Offset { + builder.startVector(4, data.length, 4); + for (let i = data.length - 1; i >= 0; i--) { + builder.addOffset(data[i]); + } + return builder.endVector(); + } + + /** + * @param flatbuffers.Builder builder + * @param number numElems + */ + static startCustomMetadataVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(4, numElems, 4); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endMessage(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + /** + * Finishes the buffer with offset as its root by calling builder.finish. + * + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset offset + */ + static finishMessageBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Offset) { + builder.finish(offset); + } + + /** + * Finishes the buffer by calling builder.finish(offset, undefined, true). + * NOTE(review): the third argument presumably requests a size prefix, as + * the method name suggests - confirm against flatbuffers.Builder.finish. + * + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset offset + */ + static finishSizePrefixedMessageBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Offset) { + builder.finish(offset, undefined, true); + } + /** + * Convenience wrapper: starts the object, adds version, headerType, header, + * bodyLength and customMetadata in that order, ends the object and returns + * its offset. It does not finish the buffer. + * + * @param flatbuffers.Builder builder + * @param MetadataVersion version + * @param MessageHeader headerType + * @param flatbuffers.Offset headerOffset + * @param flatbuffers.Long bodyLength + * @param flatbuffers.Offset customMetadataOffset + * @returns flatbuffers.Offset + */ + static createMessage(builder: flatbuffers.Builder, version: NS13596923344997147894.MetadataVersion, headerType: MessageHeader, headerOffset: flatbuffers.Offset, bodyLength: flatbuffers.Long, customMetadataOffset: flatbuffers.Offset): flatbuffers.Offset { + Message.startMessage(builder); + Message.addVersion(builder, version); + Message.addHeaderType(builder, headerType); + Message.addHeader(builder, headerOffset); + Message.addBodyLength(builder, bodyLength); + Message.addCustomMetadata(builder, customMetadataOffset); + return Message.endMessage(builder); + } +} diff --git a/src/arrow/js/src/fb/Schema.ts b/src/arrow/js/src/fb/Schema.ts new
file mode 100644 index 000000000..f675bc2a0 --- /dev/null +++ b/src/arrow/js/src/fb/Schema.ts @@ -0,0 +1,2658 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +import { flatbuffers } from 'flatbuffers'; +/** + * Logical types, vector layouts, and schemas + * + * @enum {number} + */ +export enum MetadataVersion { + /** + * 0.1.0 (October 2016). + */ + V1 = 0, + + /** + * 0.2.0 (February 2017). Non-backwards compatible with V1. + */ + V2 = 1, + + /** + * 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. + */ + V3 = 2, + + /** + * >= 0.8.0 (December 2017). Non-backwards compatible with V3. + */ + V4 = 3, + + /** + * >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 + * metadata and IPC messages). Implementations are recommended to provide a + * V4 compatibility mode with V5 format changes disabled. + * + * Incompatible changes between V4 and V5: + * - Union buffer layout has changed. In V5, Unions don't have a validity + * bitmap buffer. + */ + V5 = 4 +} + +/** + * Represents Arrow Features that might not have full support + * within implementations. This is intended to be used in + * two scenarios: + * 1. A mechanism for readers of Arrow Streams + * and files to understand that the stream or file makes + * use of a feature that isn't supported or unknown to + * the implementation (and therefore can meet the Arrow + * forward compatibility guarantees). + * 2. A means of negotiating between a client and server + * what features a stream is allowed to use. The enum + * values here are intended to represent higher level + * features; additional details may be negotiated + * with key-value pairs specific to the protocol. + * + * Enums added to this list should be assigned power-of-two values + * to facilitate exchanging and comparing bitmaps for supported + * features. + * + * @enum {number} + */ +export enum Feature { + /** + * Needed to make flatbuffers happy.
+ */ + UNUSED = 0, + + /** + * The stream makes use of multiple full dictionaries with the + * same ID and assumes clients implement dictionary replacement + * correctly. + */ + DICTIONARY_REPLACEMENT = 1, + + /** + * The stream makes use of compressed bodies as described + * in Message.fbs. + */ + COMPRESSED_BODY = 2 +} + +/** + * Mode of a Union type, as returned by Union.mode(), which falls back to + * Sparse when the field is not found. + * + * @enum {number} + */ +export enum UnionMode { + Sparse = 0, + Dense = 1 +} + +/** + * @enum {number} + */ +export enum Precision { + HALF = 0, + SINGLE = 1, + DOUBLE = 2 +} + +/** + * @enum {number} + */ +export enum DateUnit { + DAY = 0, + MILLISECOND = 1 +} + +/** + * @enum {number} + */ +export enum TimeUnit { + SECOND = 0, + MILLISECOND = 1, + MICROSECOND = 2, + NANOSECOND = 3 +} + +/** + * @enum {number} + */ +export enum IntervalUnit { + YEAR_MONTH = 0, + DAY_TIME = 1 +} + +/** + * ---------------------------------------------------------------------- + * Top-level Type value, enabling extensible type-specific metadata. We can + * add new logical types to Type without breaking backwards compatibility. + * + * @enum {number} + */ +export enum Type { + NONE = 0, + Null = 1, + Int = 2, + FloatingPoint = 3, + Binary = 4, + Utf8 = 5, + Bool = 6, + Decimal = 7, + Date = 8, + Time = 9, + Timestamp = 10, + Interval = 11, + List = 12, + Struct_ = 13, + Union = 14, + FixedSizeBinary = 15, + FixedSizeList = 16, + Map = 17, + Duration = 18, + LargeBinary = 19, + LargeUtf8 = 20, + LargeList = 21 +} + +/** + * ---------------------------------------------------------------------- + * Dictionary encoding metadata. + * Maintained for forwards compatibility: in the future, dictionaries + * might be explicit maps between integers and values, + * allowing for non-contiguous index values. + * + * @enum {number} + */ +export enum DictionaryKind { + DenseArray = 0 +} + +/** + * ---------------------------------------------------------------------- + * Endianness of the platform producing the data + * + * @enum {number} + */ +export enum Endianness { + Little
= 0, + Big = 1 +} + +/** + * These are stored in the flatbuffer in the Type union below + * (in this generated file the corresponding Type enum is declared earlier). + * Null has no fields of its own: startNull opens an object with 0 slots. + * + * @constructor + */ +export class Null { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Null + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Null { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new Null when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param Null= obj + * @returns Null + */ + static getRootAsNull(bb: flatbuffers.ByteBuffer, obj?: Null): Null { + return (obj || new Null()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsNull, but first advances the position of bb by + * flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param Null= obj + * @returns Null + */ + static getSizePrefixedRootAsNull(bb: flatbuffers.ByteBuffer, obj?: Null): Null { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Null()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Opens a new object with 0 field slots. + * + * @param flatbuffers.Builder builder + */ + static startNull(builder: flatbuffers.Builder) { + builder.startObject(0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endNull(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts and immediately ends an empty Null object and + * returns its offset. + * + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static createNull(builder: flatbuffers.Builder): flatbuffers.Offset { + Null.startNull(builder); + return Null.endNull(builder); + } +} +/** + * A Struct_ in the flatbuffer metadata is the same as an Arrow Struct + * (according to the physical memory layout).
We used Struct_ here as + * Struct is a reserved word in Flatbuffers + * + * @constructor + */ +export class Struct_ { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Struct_ + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Struct_ { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new Struct_ when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param Struct_= obj + * @returns Struct_ + */ + static getRootAsStruct_(bb: flatbuffers.ByteBuffer, obj?: Struct_): Struct_ { + return (obj || new Struct_()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsStruct_, but first advances the position of bb by + * flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param Struct_= obj + * @returns Struct_ + */ + static getSizePrefixedRootAsStruct_(bb: flatbuffers.ByteBuffer, obj?: Struct_): Struct_ { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Struct_()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Opens a new object with 0 field slots. + * + * @param flatbuffers.Builder builder + */ + static startStruct_(builder: flatbuffers.Builder) { + builder.startObject(0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endStruct_(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts and immediately ends an empty Struct_ object + * and returns its offset. + * + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static createStruct_(builder: flatbuffers.Builder): flatbuffers.Offset { + Struct_.startStruct_(builder); + return Struct_.endStruct_(builder); + } +} +/** + * Accessor for the List table, which carries no fields of its own + * (startList opens an object with 0 slots). + * + * @constructor + */ +export class List { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns List + */ + __init(i: number, bb: flatbuffers.ByteBuffer): List { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new List when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param List= obj + * @returns List + */ + static getRootAsList(bb: flatbuffers.ByteBuffer, obj?: List): List { + return (obj ||
new List()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsList, but first advances the position of bb by + * flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param List= obj + * @returns List + */ + static getSizePrefixedRootAsList(bb: flatbuffers.ByteBuffer, obj?: List): List { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new List()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Opens a new object with 0 field slots. + * + * @param flatbuffers.Builder builder + */ + static startList(builder: flatbuffers.Builder) { + builder.startObject(0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endList(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts and immediately ends an empty List object and + * returns its offset. + * + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static createList(builder: flatbuffers.Builder): flatbuffers.Offset { + List.startList(builder); + return List.endList(builder); + } +} +/** + * Same as List, but with 64-bit offsets, allowing it to represent + * extremely large data values. + * + * @constructor + */ +export class LargeList { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns LargeList + */ + __init(i: number, bb: flatbuffers.ByteBuffer): LargeList { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new LargeList when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param LargeList= obj + * @returns LargeList + */ + static getRootAsLargeList(bb: flatbuffers.ByteBuffer, obj?: LargeList): LargeList { + return (obj || new LargeList()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsLargeList, but first advances the position of bb + * by flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param LargeList= obj + * @returns LargeList + */ + static getSizePrefixedRootAsLargeList(bb: flatbuffers.ByteBuffer, obj?: LargeList): LargeList { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new LargeList()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.Builder builder
+ */ + static startLargeList(builder: flatbuffers.Builder) { + builder.startObject(0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endLargeList(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts and immediately ends an empty LargeList + * object and returns its offset. + * + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static createLargeList(builder: flatbuffers.Builder): flatbuffers.Offset { + LargeList.startLargeList(builder); + return LargeList.endLargeList(builder); + } +} +/** + * Accessor for the FixedSizeList table, whose single field is listSize. + * + * @constructor + */ +export class FixedSizeList { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns FixedSizeList + */ + __init(i: number, bb: flatbuffers.ByteBuffer): FixedSizeList { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new FixedSizeList when obj is omitted) initialised at + * the position found by adding the int32 read at bb.position() to + * bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param FixedSizeList= obj + * @returns FixedSizeList + */ + static getRootAsFixedSizeList(bb: flatbuffers.ByteBuffer, obj?: FixedSizeList): FixedSizeList { + return (obj || new FixedSizeList()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsFixedSizeList, but first advances the position of + * bb by flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param FixedSizeList= obj + * @returns FixedSizeList + */ + static getSizePrefixedRootAsFixedSizeList(bb: flatbuffers.ByteBuffer, obj?: FixedSizeList): FixedSizeList { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new FixedSizeList()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Number of list items per value. Read as an int32; returns 0 when the + * field offset lookup returns 0. + * + * @returns number + */ + listSize(): number { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ?
this.bb!.readInt32(this.bb_pos + offset) : 0; + } + + /** + * Opens a new object with 1 field slot (listSize). + * + * @param flatbuffers.Builder builder + */ + static startFixedSizeList(builder: flatbuffers.Builder) { + builder.startObject(1); + } + + /** + * Adds listSize as int32 field 0, with 0 as the default. + * + * @param flatbuffers.Builder builder + * @param number listSize + */ + static addListSize(builder: flatbuffers.Builder, listSize: number) { + builder.addFieldInt32(0, listSize, 0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endFixedSizeList(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts the object, adds listSize, ends the object + * and returns its offset. + * + * @param flatbuffers.Builder builder + * @param number listSize + * @returns flatbuffers.Offset + */ + static createFixedSizeList(builder: flatbuffers.Builder, listSize: number): flatbuffers.Offset { + FixedSizeList.startFixedSizeList(builder); + FixedSizeList.addListSize(builder, listSize); + return FixedSizeList.endFixedSizeList(builder); + } +} +/** + * A Map is a logical nested type that is represented as + * + * List<entries: Struct<key: K, value: V>> + * + * In this layout, the keys and values are each respectively contiguous. We do + * not constrain the key and value types, so the application is responsible + * for ensuring that the keys are hashable and unique. Whether the keys are sorted + * may be set in the metadata for this field. + * + * In a field with Map type, the field has a child Struct field, which then + * has two children: key type and the second the value type. The names of the + * child fields may be respectively "entries", "key", and "value", but this is + * not enforced. + * + * Map + * - child[0] entries: Struct + * - child[0] key: K + * - child[1] value: V + * + * Neither the "entries" field nor the "key" field may be nullable. + * + * The metadata is structured so that Arrow systems without special handling + * for Map can make Map an alias for List. The "layout" attribute for the Map + * field must have the same contents as a List.
+ * + * @constructor + */ +export class Map { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Map + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Map { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new Map when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * Note: this class is itself named Map, so new Map() here constructs this + * accessor class and not the built-in ES Map collection. + * + * @param flatbuffers.ByteBuffer bb + * @param Map= obj + * @returns Map + */ + static getRootAsMap(bb: flatbuffers.ByteBuffer, obj?: Map): Map { + return (obj || new Map()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsMap, but first advances the position of bb by + * flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param Map= obj + * @returns Map + */ + static getSizePrefixedRootAsMap(bb: flatbuffers.ByteBuffer, obj?: Map): Map { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Map()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Set to true if the keys within each value are sorted. Stored as an int8 + * and converted with !!; false when the field offset lookup returns 0. + * + * @returns boolean + */ + keysSorted(): boolean { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ?
!!this.bb!.readInt8(this.bb_pos + offset) : false; + } + + /** + * Opens a new object with 1 field slot (keysSorted). + * + * @param flatbuffers.Builder builder + */ + static startMap(builder: flatbuffers.Builder) { + builder.startObject(1); + } + + /** + * Adds keysSorted as int8 field 0. The unary plus converts the boolean to a + * number (true to 1, false to 0); the default is false, i.e. 0. + * + * @param flatbuffers.Builder builder + * @param boolean keysSorted + */ + static addKeysSorted(builder: flatbuffers.Builder, keysSorted: boolean) { + builder.addFieldInt8(0, +keysSorted, +false); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endMap(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + /** + * Convenience wrapper: starts the object, adds keysSorted, ends the object + * and returns its offset. + * + * @param flatbuffers.Builder builder + * @param boolean keysSorted + * @returns flatbuffers.Offset + */ + static createMap(builder: flatbuffers.Builder, keysSorted: boolean): flatbuffers.Offset { + Map.startMap(builder); + Map.addKeysSorted(builder, keysSorted); + return Map.endMap(builder); + } +} +/** + * A union is a complex type with children in Field. + * By default, ids in the type vector refer to the offsets in the children. + * Optionally, typeIds provides an indirection between the child offset and + * the type id: for each child, typeIds[offset] is the id used in the type + * vector. + * + * @constructor + */ +export class Union { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * Binds this accessor to position i of bb and returns this, so calls can + * be chained. + * + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Union + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Union { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * Returns obj (or a new Union when obj is omitted) initialised at the + * position found by adding the int32 read at bb.position() to bb.position(). + * + * @param flatbuffers.ByteBuffer bb + * @param Union= obj + * @returns Union + */ + static getRootAsUnion(bb: flatbuffers.ByteBuffer, obj?: Union): Union { + return (obj || new Union()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * Same lookup as getRootAsUnion, but first advances the position of bb by + * flatbuffers.SIZE_PREFIX_LENGTH (mutating the bb passed in). + * + * @param flatbuffers.ByteBuffer bb + * @param Union= obj + * @returns Union + */ + static getSizePrefixedRootAsUnion(bb: flatbuffers.ByteBuffer, obj?: Union): Union { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Union()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + }
+ + /** + * @returns UnionMode + */ + mode(): UnionMode { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? /** */ (this.bb!.readInt16(this.bb_pos + offset)) : UnionMode.Sparse; + } + + /** + * @param number index + * @returns number + */ + typeIds(index: number): number | null { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? this.bb!.readInt32(this.bb!.__vector(this.bb_pos + offset) + index * 4) : 0; + } + + /** + * @returns number + */ + typeIdsLength(): number { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? this.bb!.__vector_len(this.bb_pos + offset) : 0; + } + + /** + * @returns Int32Array + */ + typeIdsArray(): Int32Array | null { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? new Int32Array(this.bb!.bytes().buffer, this.bb!.bytes().byteOffset + this.bb!.__vector(this.bb_pos + offset), this.bb!.__vector_len(this.bb_pos + offset)) : null; + } + + /** + * @param flatbuffers.Builder builder + */ + static startUnion(builder: flatbuffers.Builder) { + builder.startObject(2); + } + + /** + * @param flatbuffers.Builder builder + * @param UnionMode mode + */ + static addMode(builder: flatbuffers.Builder, mode: UnionMode) { + builder.addFieldInt16(0, mode, UnionMode.Sparse); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset typeIdsOffset + */ + static addTypeIds(builder: flatbuffers.Builder, typeIdsOffset: flatbuffers.Offset) { + builder.addFieldOffset(1, typeIdsOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param Array.<number> data + * @returns flatbuffers.Offset + */ + static createTypeIdsVector(builder: flatbuffers.Builder, data: number[] | Int32Array): flatbuffers.Offset { + builder.startVector(4, data.length, 4); + for (let i = data.length - 1; i >= 0; i--) { + builder.addInt32(data[i]); + } + return builder.endVector(); + } + + /** + * @param flatbuffers.Builder builder + * @param number numElems + */ + static 
startTypeIdsVector(builder: flatbuffers.Builder, numElems: number) { + builder.startVector(4, numElems, 4); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endUnion(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createUnion(builder: flatbuffers.Builder, mode: UnionMode, typeIdsOffset: flatbuffers.Offset): flatbuffers.Offset { + Union.startUnion(builder); + Union.addMode(builder, mode); + Union.addTypeIds(builder, typeIdsOffset); + return Union.endUnion(builder); + } +} +/** + * @constructor + */ +export class Int { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Int + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Int { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Int= obj + * @returns Int + */ + static getRootAsInt(bb: flatbuffers.ByteBuffer, obj?: Int): Int { + return (obj || new Int()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Int= obj + * @returns Int + */ + static getSizePrefixedRootAsInt(bb: flatbuffers.ByteBuffer, obj?: Int): Int { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Int()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @returns number + */ + bitWidth(): number { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? this.bb!.readInt32(this.bb_pos + offset) : 0; + } + + /** + * @returns boolean + */ + isSigned(): boolean { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? 
!!this.bb!.readInt8(this.bb_pos + offset) : false; + } + + /** + * @param flatbuffers.Builder builder + */ + static startInt(builder: flatbuffers.Builder) { + builder.startObject(2); + } + + /** + * @param flatbuffers.Builder builder + * @param number bitWidth + */ + static addBitWidth(builder: flatbuffers.Builder, bitWidth: number) { + builder.addFieldInt32(0, bitWidth, 0); + } + + /** + * @param flatbuffers.Builder builder + * @param boolean isSigned + */ + static addIsSigned(builder: flatbuffers.Builder, isSigned: boolean) { + builder.addFieldInt8(1, +isSigned, +false); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endInt(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createInt(builder: flatbuffers.Builder, bitWidth: number, isSigned: boolean): flatbuffers.Offset { + Int.startInt(builder); + Int.addBitWidth(builder, bitWidth); + Int.addIsSigned(builder, isSigned); + return Int.endInt(builder); + } +} +/** + * @constructor + */ +export class FloatingPoint { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns FloatingPoint + */ + __init(i: number, bb: flatbuffers.ByteBuffer): FloatingPoint { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param FloatingPoint= obj + * @returns FloatingPoint + */ + static getRootAsFloatingPoint(bb: flatbuffers.ByteBuffer, obj?: FloatingPoint): FloatingPoint { + return (obj || new FloatingPoint()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param FloatingPoint= obj + * @returns FloatingPoint + */ + static getSizePrefixedRootAsFloatingPoint(bb: flatbuffers.ByteBuffer, obj?: FloatingPoint): FloatingPoint { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new 
FloatingPoint()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @returns Precision + */ + precision(): Precision { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? /** */ (this.bb!.readInt16(this.bb_pos + offset)) : Precision.HALF; + } + + /** + * @param flatbuffers.Builder builder + */ + static startFloatingPoint(builder: flatbuffers.Builder) { + builder.startObject(1); + } + + /** + * @param flatbuffers.Builder builder + * @param Precision precision + */ + static addPrecision(builder: flatbuffers.Builder, precision: Precision) { + builder.addFieldInt16(0, precision, Precision.HALF); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endFloatingPoint(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createFloatingPoint(builder: flatbuffers.Builder, precision: Precision): flatbuffers.Offset { + FloatingPoint.startFloatingPoint(builder); + FloatingPoint.addPrecision(builder, precision); + return FloatingPoint.endFloatingPoint(builder); + } +} +/** + * Unicode with UTF-8 encoding + * + * @constructor + */ +export class Utf8 { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Utf8 + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Utf8 { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Utf8= obj + * @returns Utf8 + */ + static getRootAsUtf8(bb: flatbuffers.ByteBuffer, obj?: Utf8): Utf8 { + return (obj || new Utf8()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Utf8= obj + * @returns Utf8 + */ + static getSizePrefixedRootAsUtf8(bb: flatbuffers.ByteBuffer, obj?: Utf8): Utf8 { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new 
Utf8()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.Builder builder + */ + static startUtf8(builder: flatbuffers.Builder) { + builder.startObject(0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endUtf8(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createUtf8(builder: flatbuffers.Builder): flatbuffers.Offset { + Utf8.startUtf8(builder); + return Utf8.endUtf8(builder); + } +} +/** + * Opaque binary data + * + * @constructor + */ +export class Binary { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Binary + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Binary { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Binary= obj + * @returns Binary + */ + static getRootAsBinary(bb: flatbuffers.ByteBuffer, obj?: Binary): Binary { + return (obj || new Binary()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Binary= obj + * @returns Binary + */ + static getSizePrefixedRootAsBinary(bb: flatbuffers.ByteBuffer, obj?: Binary): Binary { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Binary()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.Builder builder + */ + static startBinary(builder: flatbuffers.Builder) { + builder.startObject(0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endBinary(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createBinary(builder: flatbuffers.Builder): flatbuffers.Offset { + Binary.startBinary(builder); + return Binary.endBinary(builder); + } +} 
/**
 * Same as Utf8, but with 64-bit offsets, allowing to represent
 * extremely large data values. The table has no fields.
 */
export class LargeUtf8 {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): LargeUtf8 {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh `LargeUtf8`) at the position computed from
   * the int32 stored at the buffer's current position.
   */
  static getRootAsLargeUtf8(bb: flatbuffers.ByteBuffer, obj?: LargeUtf8): LargeUtf8 {
    const target = obj || new LargeUtf8();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsLargeUtf8`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsLargeUtf8(bb: flatbuffers.ByteBuffer, obj?: LargeUtf8): LargeUtf8 {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return LargeUtf8.getRootAsLargeUtf8(bb, obj);
  }

  /**
   * Opens a new object with zero field slots on `builder`.
   */
  static startLargeUtf8(builder: flatbuffers.Builder) {
    builder.startObject(0);
  }

  /**
   * Closes the object opened by `startLargeUtf8` and returns the builder's offset for it.
   */
  static endLargeUtf8(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, end.
   */
  static createLargeUtf8(builder: flatbuffers.Builder): flatbuffers.Offset {
    LargeUtf8.startLargeUtf8(builder);
    const finished = LargeUtf8.endLargeUtf8(builder);
    return finished;
  }
}

/**
 * Same as Binary, but with 64-bit offsets, allowing to represent
 * extremely large data values.
 *
 * @constructor
 */
export class LargeBinary {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): LargeBinary {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh `LargeBinary`) at the position computed from
   * the int32 stored at the buffer's current position.
   */
  static getRootAsLargeBinary(bb: flatbuffers.ByteBuffer, obj?: LargeBinary): LargeBinary {
    const target = obj || new LargeBinary();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsLargeBinary`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsLargeBinary(bb: flatbuffers.ByteBuffer, obj?: LargeBinary): LargeBinary {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return LargeBinary.getRootAsLargeBinary(bb, obj);
  }

  /**
   * Opens a new object with zero field slots on `builder`.
   */
  static startLargeBinary(builder: flatbuffers.Builder) {
    builder.startObject(0);
  }

  /**
   * Closes the object opened by `startLargeBinary` and returns the builder's offset for it.
   */
  static endLargeBinary(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, end.
   */
  static createLargeBinary(builder: flatbuffers.Builder): flatbuffers.Offset {
    LargeBinary.startLargeBinary(builder);
    const finished = LargeBinary.endLargeBinary(builder);
    return finished;
  }
}

/**
 * Table accessor exposing a single `byteWidth` field.
 */
export class FixedSizeBinary {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): FixedSizeBinary {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh `FixedSizeBinary`) at the position computed
   * from the int32 stored at the buffer's current position.
   */
  static getRootAsFixedSizeBinary(bb: flatbuffers.ByteBuffer, obj?: FixedSizeBinary): FixedSizeBinary {
    const target = obj || new FixedSizeBinary();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsFixedSizeBinary`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsFixedSizeBinary(bb: flatbuffers.ByteBuffer, obj?: FixedSizeBinary): FixedSizeBinary {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return FixedSizeBinary.getRootAsFixedSizeBinary(bb, obj);
  }

  /**
   * Number of bytes per value.
   *
   * @returns the stored int32, or 0 when the field offset is 0
   */
  byteWidth(): number {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return 0;
    }
    return buf.readInt32(this.bb_pos + fieldPos);
  }

  /**
   * Opens a new object with a single field slot on `builder`.
   */
  static startFixedSizeBinary(builder: flatbuffers.Builder) {
    builder.startObject(1);
  }

  /**
   * Stores `byteWidth` in slot 0 as an int32; the declared default is 0.
   */
  static addByteWidth(builder: flatbuffers.Builder, byteWidth: number) {
    builder.addFieldInt32(0, byteWidth, 0);
  }

  /**
   * Closes the object opened by `startFixedSizeBinary` and returns the builder's offset for it.
   */
  static endFixedSizeBinary(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, add `byteWidth`, end.
   */
  static createFixedSizeBinary(builder: flatbuffers.Builder, byteWidth: number): flatbuffers.Offset {
    FixedSizeBinary.startFixedSizeBinary(builder);
    FixedSizeBinary.addByteWidth(builder, byteWidth);
    const finished = FixedSizeBinary.endFixedSizeBinary(builder);
    return finished;
  }
}

/**
 * Table accessor with no fields.
 */
export class Bool {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Bool {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh `Bool`) at the position computed from the
   * int32 stored at the buffer's current position.
   */
  static getRootAsBool(bb: flatbuffers.ByteBuffer, obj?: Bool): Bool {
    const target = obj || new Bool();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsBool`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsBool(bb: flatbuffers.ByteBuffer, obj?: Bool): Bool {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return Bool.getRootAsBool(bb, obj);
  }

  /**
   * Opens a new object with zero field slots on `builder`.
   */
  static startBool(builder: flatbuffers.Builder) {
    builder.startObject(0);
  }

  /**
   * Closes the object opened by `startBool` and returns the builder's offset for it.
   */
  static endBool(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, end.
   */
  static createBool(builder: flatbuffers.Builder): flatbuffers.Offset {
    Bool.startBool(builder);
    const finished = Bool.endBool(builder);
    return finished;
  }
}
/**
 * Exact decimal value represented as an integer value in two's
 * complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
 * are used. The representation uses the endianness indicated
 * in the Schema.
*
 * @constructor
 */
export class Decimal {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Decimal {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh `Decimal`) at the position computed from the
   * int32 stored at the buffer's current position.
   */
  static getRootAsDecimal(bb: flatbuffers.ByteBuffer, obj?: Decimal): Decimal {
    const target = obj || new Decimal();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsDecimal`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsDecimal(bb: flatbuffers.ByteBuffer, obj?: Decimal): Decimal {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return Decimal.getRootAsDecimal(bb, obj);
  }

  /**
   * Total number of decimal digits.
   *
   * @returns the stored int32, or 0 when the field offset is 0
   */
  precision(): number {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return 0;
    }
    return buf.readInt32(this.bb_pos + fieldPos);
  }

  /**
   * Number of digits after the decimal point ".".
   *
   * @returns the stored int32, or 0 when the field offset is 0
   */
  scale(): number {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 6);
    if (!fieldPos) {
      return 0;
    }
    return buf.readInt32(this.bb_pos + fieldPos);
  }

  /**
   * Number of bits per value. The only accepted widths are 128 and 256.
   * We use bitWidth for consistency with Int::bitWidth.
   *
   * @returns the stored int32, or 128 when the field offset is 0
   */
  bitWidth(): number {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 8);
    if (!fieldPos) {
      return 128;
    }
    return buf.readInt32(this.bb_pos + fieldPos);
  }

  /**
   * Opens a new object with three field slots on `builder`.
   */
  static startDecimal(builder: flatbuffers.Builder) {
    builder.startObject(3);
  }

  /**
   * Stores `precision` in slot 0 as an int32; the declared default is 0.
   */
  static addPrecision(builder: flatbuffers.Builder, precision: number) {
    builder.addFieldInt32(0, precision, 0);
  }

  /**
   * Stores `scale` in slot 1 as an int32; the declared default is 0.
   */
  static addScale(builder: flatbuffers.Builder, scale: number) {
    builder.addFieldInt32(1, scale, 0);
  }

  /**
   * Stores `bitWidth` in slot 2 as an int32; the declared default is 128.
   */
  static addBitWidth(builder: flatbuffers.Builder, bitWidth: number) {
    builder.addFieldInt32(2, bitWidth, 128);
  }

  /**
   * Closes the object opened by `startDecimal` and returns the builder's offset for it.
   */
  static endDecimal(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, add `precision`, `scale`, `bitWidth`, end.
   */
  static createDecimal(builder: flatbuffers.Builder, precision: number, scale: number, bitWidth: number): flatbuffers.Offset {
    Decimal.startDecimal(builder);
    Decimal.addPrecision(builder, precision);
    Decimal.addScale(builder, scale);
    Decimal.addBitWidth(builder, bitWidth);
    const finished = Decimal.endDecimal(builder);
    return finished;
  }
}

/**
 * Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
 * epoch (1970-01-01), stored in either of two units:
 *
 * * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
 *   leap seconds), where the values are evenly divisible by 86400000
 * * Days (32 bits) since the UNIX epoch
 */
export class Date {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Date {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh instance of this `Date` class) at the
   * position computed from the int32 stored at the buffer's current position.
   */
  static getRootAsDate(bb: flatbuffers.ByteBuffer, obj?: Date): Date {
    const target = obj || new Date();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsDate`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsDate(bb: flatbuffers.ByteBuffer, obj?: Date): Date {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return Date.getRootAsDate(bb, obj);
  }

  /**
   * @returns the stored unit, or `DateUnit.MILLISECOND` when the field offset is 0
   */
  unit(): DateUnit {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return DateUnit.MILLISECOND;
    }
    return buf.readInt16(this.bb_pos + fieldPos);
  }

  /**
   * Opens a new object with a single field slot on `builder`.
   */
  static startDate(builder: flatbuffers.Builder) {
    builder.startObject(1);
  }

  /**
   * Stores `unit` in slot 0 as an int16; the declared default is `DateUnit.MILLISECOND`.
   */
  static addUnit(builder: flatbuffers.Builder, unit: DateUnit) {
    builder.addFieldInt16(0, unit, DateUnit.MILLISECOND);
  }

  /**
   * Closes the object opened by `startDate` and returns the builder's offset for it.
   */
  static endDate(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, add `unit`, end.
   */
  static createDate(builder: flatbuffers.Builder, unit: DateUnit): flatbuffers.Offset {
    Date.startDate(builder);
    Date.addUnit(builder, unit);
    const finished = Date.endDate(builder);
    return finished;
  }
}
/**
 * Time type.
The physical storage type depends on the unit
 * - SECOND and MILLISECOND: 32 bits
 * - MICROSECOND and NANOSECOND: 64 bits
 *
 * @constructor
 */
export class Time {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Points this accessor at position `i` of `bb` and returns `this`.
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Time {
    this.bb = bb;
    this.bb_pos = i;
    return this;
  }

  /**
   * Initialises `obj` (or a fresh `Time`) at the position computed from the
   * int32 stored at the buffer's current position.
   */
  static getRootAsTime(bb: flatbuffers.ByteBuffer, obj?: Time): Time {
    const target = obj || new Time();
    const rootPos = bb.readInt32(bb.position()) + bb.position();
    return target.__init(rootPos, bb);
  }

  /**
   * Same as `getRootAsTime`, after advancing the buffer position by
   * `flatbuffers.SIZE_PREFIX_LENGTH`.
   */
  static getSizePrefixedRootAsTime(bb: flatbuffers.ByteBuffer, obj?: Time): Time {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    return Time.getRootAsTime(bb, obj);
  }

  /**
   * @returns the stored unit, or `TimeUnit.MILLISECOND` when the field offset is 0
   */
  unit(): TimeUnit {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return TimeUnit.MILLISECOND;
    }
    return buf.readInt16(this.bb_pos + fieldPos);
  }

  /**
   * @returns the stored int32 bit width, or 32 when the field offset is 0
   */
  bitWidth(): number {
    const buf = this.bb!;
    const fieldPos = buf.__offset(this.bb_pos, 6);
    if (!fieldPos) {
      return 32;
    }
    return buf.readInt32(this.bb_pos + fieldPos);
  }

  /**
   * Opens a new object with two field slots on `builder`.
   */
  static startTime(builder: flatbuffers.Builder) {
    builder.startObject(2);
  }

  /**
   * Stores `unit` in slot 0 as an int16; the declared default is `TimeUnit.MILLISECOND`.
   */
  static addUnit(builder: flatbuffers.Builder, unit: TimeUnit) {
    builder.addFieldInt16(0, unit, TimeUnit.MILLISECOND);
  }

  /**
   * Stores `bitWidth` in slot 1 as an int32; the declared default is 32.
   */
  static addBitWidth(builder: flatbuffers.Builder, bitWidth: number) {
    builder.addFieldInt32(1, bitWidth, 32);
  }

  /**
   * Closes the object opened by `startTime` and returns the builder's offset for it.
   */
  static endTime(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Convenience wrapper: start, add `unit`, add `bitWidth`, end.
   */
  static createTime(builder: flatbuffers.Builder, unit: TimeUnit, bitWidth: number): flatbuffers.Offset {
    Time.startTime(builder);
    Time.addUnit(builder, unit);
    Time.addBitWidth(builder, bitWidth);
    const finished = Time.endTime(builder);
    return finished;
  }
}
/**
 * Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding
 * leap seconds, as a 64-bit integer. Note that UNIX time does not include
 * leap seconds.
 *
 * The Timestamp metadata supports both "time zone naive" and "time zone
 * aware" timestamps.
Read about the timezone attribute for more detail + * + * @constructor + */ +export class Timestamp { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Timestamp + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Timestamp { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Timestamp= obj + * @returns Timestamp + */ + static getRootAsTimestamp(bb: flatbuffers.ByteBuffer, obj?: Timestamp): Timestamp { + return (obj || new Timestamp()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Timestamp= obj + * @returns Timestamp + */ + static getSizePrefixedRootAsTimestamp(bb: flatbuffers.ByteBuffer, obj?: Timestamp): Timestamp { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Timestamp()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @returns TimeUnit + */ + unit(): TimeUnit { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? /** */ (this.bb!.readInt16(this.bb_pos + offset)) : TimeUnit.SECOND; + } + + /** + * The time zone is a string indicating the name of a time zone, one of: + * + * * As used in the Olson time zone database (the "tz database" or + * "tzdata"), such as "America/New_York" + * * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + * + * Whether a timezone string is present indicates different semantics about + * the data: + * + * * If the time zone is null or equal to an empty string, the data is "time + * zone naive" and shall be displayed *as is* to the user, not localized + * to the locale of the user. 
This data can be though of as UTC but + * without having "UTC" as the time zone, it is not considered to be + * localized to any time zone + * + * * If the time zone is set to a valid value, values can be displayed as + * "localized" to that time zone, even though the underlying 64-bit + * integers are identical to the same data stored in UTC. Converting + * between time zones is a metadata-only operation and does not change the + * underlying values + * + * @param flatbuffers.Encoding= optionalEncoding + * @returns string|Uint8Array|null + */ + timezone(): string | null; + timezone(optionalEncoding: flatbuffers.Encoding): string | Uint8Array | null; + timezone(optionalEncoding?: any): string | Uint8Array | null { + const offset = this.bb!.__offset(this.bb_pos, 6); + return offset ? this.bb!.__string(this.bb_pos + offset, optionalEncoding) : null; + } + + /** + * @param flatbuffers.Builder builder + */ + static startTimestamp(builder: flatbuffers.Builder) { + builder.startObject(2); + } + + /** + * @param flatbuffers.Builder builder + * @param TimeUnit unit + */ + static addUnit(builder: flatbuffers.Builder, unit: TimeUnit) { + builder.addFieldInt16(0, unit, TimeUnit.SECOND); + } + + /** + * @param flatbuffers.Builder builder + * @param flatbuffers.Offset timezoneOffset + */ + static addTimezone(builder: flatbuffers.Builder, timezoneOffset: flatbuffers.Offset) { + builder.addFieldOffset(1, timezoneOffset, 0); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endTimestamp(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createTimestamp(builder: flatbuffers.Builder, unit: TimeUnit, timezoneOffset: flatbuffers.Offset): flatbuffers.Offset { + Timestamp.startTimestamp(builder); + Timestamp.addUnit(builder, unit); + Timestamp.addTimezone(builder, timezoneOffset); + return Timestamp.endTimestamp(builder); + } +} +/** + * @constructor + */ +export class 
Interval { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Interval + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Interval { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Interval= obj + * @returns Interval + */ + static getRootAsInterval(bb: flatbuffers.ByteBuffer, obj?: Interval): Interval { + return (obj || new Interval()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Interval= obj + * @returns Interval + */ + static getSizePrefixedRootAsInterval(bb: flatbuffers.ByteBuffer, obj?: Interval): Interval { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Interval()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @returns IntervalUnit + */ + unit(): IntervalUnit { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? 
/** */ (this.bb!.readInt16(this.bb_pos + offset)) : IntervalUnit.YEAR_MONTH; + } + + /** + * @param flatbuffers.Builder builder + */ + static startInterval(builder: flatbuffers.Builder) { + builder.startObject(1); + } + + /** + * @param flatbuffers.Builder builder + * @param IntervalUnit unit + */ + static addUnit(builder: flatbuffers.Builder, unit: IntervalUnit) { + builder.addFieldInt16(0, unit, IntervalUnit.YEAR_MONTH); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endInterval(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createInterval(builder: flatbuffers.Builder, unit: IntervalUnit): flatbuffers.Offset { + Interval.startInterval(builder); + Interval.addUnit(builder, unit); + return Interval.endInterval(builder); + } +} +/** + * @constructor + */ +export class Duration { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns Duration + */ + __init(i: number, bb: flatbuffers.ByteBuffer): Duration { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Duration= obj + * @returns Duration + */ + static getRootAsDuration(bb: flatbuffers.ByteBuffer, obj?: Duration): Duration { + return (obj || new Duration()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param Duration= obj + * @returns Duration + */ + static getSizePrefixedRootAsDuration(bb: flatbuffers.ByteBuffer, obj?: Duration): Duration { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Duration()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @returns TimeUnit + */ + unit(): TimeUnit { + const offset = this.bb!.__offset(this.bb_pos, 4); + return offset ? 
/** */ (this.bb!.readInt16(this.bb_pos + offset)) : TimeUnit.MILLISECOND; + } + + /** + * @param flatbuffers.Builder builder + */ + static startDuration(builder: flatbuffers.Builder) { + builder.startObject(1); + } + + /** + * @param flatbuffers.Builder builder + * @param TimeUnit unit + */ + static addUnit(builder: flatbuffers.Builder, unit: TimeUnit) { + builder.addFieldInt16(0, unit, TimeUnit.MILLISECOND); + } + + /** + * @param flatbuffers.Builder builder + * @returns flatbuffers.Offset + */ + static endDuration(builder: flatbuffers.Builder): flatbuffers.Offset { + const offset = builder.endObject(); + return offset; + } + + static createDuration(builder: flatbuffers.Builder, unit: TimeUnit): flatbuffers.Offset { + Duration.startDuration(builder); + Duration.addUnit(builder, unit); + return Duration.endDuration(builder); + } +} +/** + * ---------------------------------------------------------------------- + * user defined key value pairs to add custom metadata to arrow + * key namespacing is the responsibility of the user + * + * @constructor + */ +export class KeyValue { + bb: flatbuffers.ByteBuffer | null = null; + + bb_pos: number = 0; + /** + * @param number i + * @param flatbuffers.ByteBuffer bb + * @returns KeyValue + */ + __init(i: number, bb: flatbuffers.ByteBuffer): KeyValue { + this.bb_pos = i; + this.bb = bb; + return this; + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param KeyValue= obj + * @returns KeyValue + */ + static getRootAsKeyValue(bb: flatbuffers.ByteBuffer, obj?: KeyValue): KeyValue { + return (obj || new KeyValue()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** + * @param flatbuffers.ByteBuffer bb + * @param KeyValue= obj + * @returns KeyValue + */ + static getSizePrefixedRootAsKeyValue(bb: flatbuffers.ByteBuffer, obj?: KeyValue): KeyValue { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new KeyValue()).__init(bb.readInt32(bb.position()) + bb.position(), bb); + } + + /** 
* @param flatbuffers.Encoding= optionalEncoding
   * @returns string|Uint8Array|null
   */
  key(): string | null;
  key(optionalEncoding: flatbuffers.Encoding): string | Uint8Array | null;
  key(optionalEncoding?: any): string | Uint8Array | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return null;
    }
    return this.bb!.__string(this.bb_pos + fieldPos, optionalEncoding);
  }

  /**
   * Reads the `value` string. `optionalEncoding` is forwarded unchanged to
   * the buffer's `__string` reader.
   *
   * @returns the decoded value, or null when the field's offset lookup is 0
   */
  value(): string | null;
  value(optionalEncoding: flatbuffers.Encoding): string | Uint8Array | null;
  value(optionalEncoding?: any): string | Uint8Array | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 6);
    if (!fieldPos) {
      return null;
    }
    return this.bb!.__string(this.bb_pos + fieldPos, optionalEncoding);
  }

  /**
   * Opens a new KeyValue table on `builder` with room for two fields.
   */
  static startKeyValue(builder: flatbuffers.Builder) {
    builder.startObject(2);
  }

  /**
   * Stores `keyOffset` in field slot 0 (default 0).
   */
  static addKey(builder: flatbuffers.Builder, keyOffset: flatbuffers.Offset) {
    builder.addFieldOffset(0, keyOffset, 0);
  }

  /**
   * Stores `valueOffset` in field slot 1 (default 0).
   */
  static addValue(builder: flatbuffers.Builder, valueOffset: flatbuffers.Offset) {
    builder.addFieldOffset(1, valueOffset, 0);
  }

  /**
   * Closes the table opened by `startKeyValue`.
   *
   * @returns the value produced by `builder.endObject()`
   */
  static endKeyValue(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * One-shot helper: start the table, add key then value, end the table.
   */
  static createKeyValue(builder: flatbuffers.Builder, keyOffset: flatbuffers.Offset, valueOffset: flatbuffers.Offset): flatbuffers.Offset {
    KeyValue.startKeyValue(builder);
    KeyValue.addKey(builder, keyOffset);
    KeyValue.addValue(builder, valueOffset);
    const finished = KeyValue.endKeyValue(builder);
    return finished;
  }
}
/**
 * @constructor
 */
export class DictionaryEncoding {
  bb: flatbuffers.ByteBuffer | null =
null;

  bb_pos: number = 0;

  /**
   * Binds this accessor to position `i` inside buffer `bb`.
   *
   * @returns this same instance, so calls can be chained
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): DictionaryEncoding {
    this.bb_pos = i;
    this.bb = bb;
    return this;
  }

  /**
   * Initialises a DictionaryEncoding at
   * `bb.readInt32(bb.position()) + bb.position()`. `obj` is reused when
   * supplied; otherwise a new instance is allocated.
   */
  static getRootAsDictionaryEncoding(bb: flatbuffers.ByteBuffer, obj?: DictionaryEncoding): DictionaryEncoding {
    const target = obj || new DictionaryEncoding();
    return target.__init(bb.readInt32(bb.position()) + bb.position(), bb);
  }

  /**
   * Like `getRootAsDictionaryEncoding`, but first advances the position of
   * `bb` by `flatbuffers.SIZE_PREFIX_LENGTH` (this mutates `bb`).
   */
  static getSizePrefixedRootAsDictionaryEncoding(bb: flatbuffers.ByteBuffer, obj?: DictionaryEncoding): DictionaryEncoding {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    const target = obj || new DictionaryEncoding();
    return target.__init(bb.readInt32(bb.position()) + bb.position(), bb);
  }

  /**
   * The known dictionary id in the application where this data is used. In
   * the file or streaming formats, the dictionary ids are found in the
   * DictionaryBatch messages.
   *
   * @returns the stored int64, or `createLong(0, 0)` when the field's offset
   *          lookup is 0
   */
  id(): flatbuffers.Long {
    const fieldPos = this.bb!.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return this.bb!.createLong(0, 0);
    }
    return this.bb!.readInt64(this.bb_pos + fieldPos);
  }

  /**
   * The dictionary indices are constrained to be non-negative integers. If
   * this field is null, the indices must be signed int32. To maximize
   * cross-language compatibility and performance, implementations are
   * recommended to prefer signed integer types over unsigned integer types
   * and to avoid uint64 indices unless they are required by an application.
   *
   * @param obj optional Int accessor to reuse instead of allocating one
   * @returns the Int accessor, or null when the field's offset lookup is 0
   */
  indexType(obj?: Int): Int | null {
    const offset = this.bb!.__offset(this.bb_pos, 6);
    return offset ?
(obj || new Int()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) : null;
  }

  /**
   * By default, dictionaries are not ordered, or the order does not have
   * semantic meaning. In some statistical applications, dictionary encoding
   * is used to represent ordered categorical data, and this flag preserves
   * that metadata.
   *
   * @returns the stored flag, or false when the field's offset lookup is 0
   */
  isOrdered(): boolean {
    const fieldPos = this.bb!.__offset(this.bb_pos, 8);
    if (!fieldPos) {
      return false;
    }
    return !!this.bb!.readInt8(this.bb_pos + fieldPos);
  }

  /**
   * @returns the stored int16 kind, or `DictionaryKind.DenseArray` when the
   *          field's offset lookup is 0
   */
  dictionaryKind(): DictionaryKind {
    const fieldPos = this.bb!.__offset(this.bb_pos, 10);
    if (!fieldPos) {
      return DictionaryKind.DenseArray;
    }
    return this.bb!.readInt16(this.bb_pos + fieldPos);
  }

  /**
   * Opens a new DictionaryEncoding table on `builder` with room for four
   * fields.
   */
  static startDictionaryEncoding(builder: flatbuffers.Builder) {
    builder.startObject(4);
  }

  /**
   * Writes `id` into field slot 0 as an int64; the default handed to the
   * builder is `builder.createLong(0, 0)`.
   */
  static addId(builder: flatbuffers.Builder, id: flatbuffers.Long) {
    const zero = builder.createLong(0, 0);
    builder.addFieldInt64(0, id, zero);
  }

  /**
   * Stores `indexTypeOffset` in field slot 1 (default 0).
   */
  static addIndexType(builder: flatbuffers.Builder, indexTypeOffset: flatbuffers.Offset) {
    builder.addFieldOffset(1, indexTypeOffset, 0);
  }

  /**
   * Writes `isOrdered` into field slot 2 as an int8; the boolean is coerced
   * to a number with unary plus, and the default is `+false`.
   */
  static addIsOrdered(builder: flatbuffers.Builder, isOrdered: boolean) {
    builder.addFieldInt8(2, +isOrdered, +false);
  }

  /**
   * Writes `dictionaryKind` into field slot 3 as an int16 (default
   * `DictionaryKind.DenseArray`).
   */
  static addDictionaryKind(builder: flatbuffers.Builder, dictionaryKind: DictionaryKind) {
    builder.addFieldInt16(3, dictionaryKind, DictionaryKind.DenseArray);
  }

  /**
   * Closes the table opened by `startDictionaryEncoding`.
   *
   * @returns the value produced by `builder.endObject()`
   */
  static endDictionaryEncoding(builder: flatbuffers.Builder):
flatbuffers.Offset {
    const offset = builder.endObject();
    return offset;
  }

  /**
   * One-shot helper: start the table, add id, index type, ordered flag and
   * dictionary kind (in that order), then end the table.
   */
  static createDictionaryEncoding(builder: flatbuffers.Builder, id: flatbuffers.Long, indexTypeOffset: flatbuffers.Offset, isOrdered: boolean, dictionaryKind: DictionaryKind): flatbuffers.Offset {
    DictionaryEncoding.startDictionaryEncoding(builder);
    DictionaryEncoding.addId(builder, id);
    DictionaryEncoding.addIndexType(builder, indexTypeOffset);
    DictionaryEncoding.addIsOrdered(builder, isOrdered);
    DictionaryEncoding.addDictionaryKind(builder, dictionaryKind);
    const finished = DictionaryEncoding.endDictionaryEncoding(builder);
    return finished;
  }
}
/**
 * ----------------------------------------------------------------------
 * A field represents a named column in a record / row batch, or a child of
 * a nested type.
 *
 * @constructor
 */
export class Field {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Binds this accessor to position `i` inside buffer `bb`.
   *
   * @returns this same instance, so calls can be chained
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Field {
    this.bb_pos = i;
    this.bb = bb;
    return this;
  }

  /**
   * Initialises a Field at `bb.readInt32(bb.position()) + bb.position()`.
   * `obj` is reused when supplied; otherwise a new instance is allocated.
   */
  static getRootAsField(bb: flatbuffers.ByteBuffer, obj?: Field): Field {
    const target = obj || new Field();
    return target.__init(bb.readInt32(bb.position()) + bb.position(), bb);
  }

  /**
   * Like `getRootAsField`, but first advances the position of `bb` by
   * `flatbuffers.SIZE_PREFIX_LENGTH` (this mutates `bb`).
   */
  static getSizePrefixedRootAsField(bb: flatbuffers.ByteBuffer, obj?: Field): Field {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    const target = obj || new Field();
    return target.__init(bb.readInt32(bb.position()) + bb.position(), bb);
  }

  /**
   * Name is not required, e.g. in
a List
   *
   * @param flatbuffers.Encoding= optionalEncoding
   * @returns string|Uint8Array|null
   */
  name(): string | null;
  name(optionalEncoding: flatbuffers.Encoding): string | Uint8Array | null;
  name(optionalEncoding?: any): string | Uint8Array | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 4);
    if (!fieldPos) {
      return null;
    }
    return this.bb!.__string(this.bb_pos + fieldPos, optionalEncoding);
  }

  /**
   * Whether or not this field can contain nulls. Should be true in general.
   *
   * @returns the stored flag, or false when the field's offset lookup is 0
   */
  nullable(): boolean {
    const fieldPos = this.bb!.__offset(this.bb_pos, 6);
    if (!fieldPos) {
      return false;
    }
    return !!this.bb!.readInt8(this.bb_pos + fieldPos);
  }

  /**
   * @returns the stored uint8 type tag, or `Type.NONE` when the field's
   *          offset lookup is 0
   */
  typeType(): Type {
    const fieldPos = this.bb!.__offset(this.bb_pos, 8);
    if (!fieldPos) {
      return Type.NONE;
    }
    return this.bb!.readUint8(this.bb_pos + fieldPos);
  }

  /**
   * This is the type of the decoded value if the field is dictionary encoded.
   *
   * @param obj table accessor handed to the buffer's `__union` reader
   * @returns the result of `__union`, or null when the field's offset lookup
   *          is 0
   */
  type<T extends flatbuffers.Table>(obj: T): T | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 10);
    if (!fieldPos) {
      return null;
    }
    return this.bb!.__union(obj, this.bb_pos + fieldPos);
  }

  /**
   * Present only if the field is dictionary encoded.
   *
   * @param obj optional DictionaryEncoding accessor to reuse
   * @returns the accessor, or null when the field's offset lookup is 0
   */
  dictionary(obj?: DictionaryEncoding): DictionaryEncoding | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 12);
    if (!fieldPos) {
      return null;
    }
    const target = obj || new DictionaryEncoding();
    return target.__init(this.bb!.__indirect(this.bb_pos + fieldPos), this.bb!);
  }

  /**
   * Children apply only to nested data types like Struct, List and Union.
   * For primitive types children will have length 0.
   *
   * @param index position within the children vector (4 bytes per entry)
   * @param obj optional Field accessor to reuse
   * @returns the child accessor, or null when the field's offset lookup is 0
   */
  children(index: number, obj?: Field): Field | null {
    const offset = this.bb!.__offset(this.bb_pos, 14);
    return offset ?
(obj || new Field()).__init(this.bb!.__indirect(this.bb!.__vector(this.bb_pos + offset) + index * 4), this.bb!) : null;
  }

  /**
   * @returns the length of the children vector, or 0 when the field's offset
   *          lookup is 0
   */
  childrenLength(): number {
    const fieldPos = this.bb!.__offset(this.bb_pos, 14);
    if (!fieldPos) {
      return 0;
    }
    return this.bb!.__vector_len(this.bb_pos + fieldPos);
  }

  /**
   * User-defined metadata.
   *
   * @param index position within the metadata vector (4 bytes per entry)
   * @param obj optional KeyValue accessor to reuse
   * @returns the entry accessor, or null when the field's offset lookup is 0
   */
  customMetadata(index: number, obj?: KeyValue): KeyValue | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 16);
    if (!fieldPos) {
      return null;
    }
    const target = obj || new KeyValue();
    const entryPos = this.bb!.__vector(this.bb_pos + fieldPos) + index * 4;
    return target.__init(this.bb!.__indirect(entryPos), this.bb!);
  }

  /**
   * @returns the length of the metadata vector, or 0 when the field's offset
   *          lookup is 0
   */
  customMetadataLength(): number {
    const fieldPos = this.bb!.__offset(this.bb_pos, 16);
    if (!fieldPos) {
      return 0;
    }
    return this.bb!.__vector_len(this.bb_pos + fieldPos);
  }

  /**
   * Opens a new Field table on `builder` with room for seven fields.
   */
  static startField(builder: flatbuffers.Builder) {
    builder.startObject(7);
  }

  /**
   * Stores `nameOffset` in field slot 0 (default 0).
   */
  static addName(builder: flatbuffers.Builder, nameOffset: flatbuffers.Offset) {
    builder.addFieldOffset(0, nameOffset, 0);
  }

  /**
   * Writes `nullable` into field slot 1 as an int8; the boolean is coerced
   * to a number with unary plus, and the default is `+false`.
   */
  static addNullable(builder: flatbuffers.Builder, nullable: boolean) {
    builder.addFieldInt8(1, +nullable, +false);
  }

  /**
   * Writes the type tag into field slot 2 as an int8 (default `Type.NONE`).
   */
  static addTypeType(builder: flatbuffers.Builder, typeType: Type) {
    builder.addFieldInt8(2, typeType, Type.NONE);
  }

  /**
   * Stores `typeOffset` in field slot 3 (default 0).
   */
  static addType(builder: flatbuffers.Builder, typeOffset: flatbuffers.Offset) {
    builder.addFieldOffset(3, typeOffset, 0);
  }

  /**
   * Stores `dictionaryOffset` in field slot 4 (default 0).
   */
  static addDictionary(builder:
flatbuffers.Builder, dictionaryOffset: flatbuffers.Offset) {
    builder.addFieldOffset(4, dictionaryOffset, 0);
  }

  /**
   * Stores `childrenOffset` in field slot 5 (default 0).
   */
  static addChildren(builder: flatbuffers.Builder, childrenOffset: flatbuffers.Offset) {
    builder.addFieldOffset(5, childrenOffset, 0);
  }

  /**
   * Builds the children vector from `data`. Entries are added from the last
   * element down to the first.
   *
   * @returns the value produced by `builder.endVector()`
   */
  static createChildrenVector(builder: flatbuffers.Builder, data: flatbuffers.Offset[]): flatbuffers.Offset {
    builder.startVector(4, data.length, 4);
    let remaining = data.length;
    while (remaining-- > 0) {
      builder.addOffset(data[remaining]);
    }
    return builder.endVector();
  }

  /**
   * Starts a children vector of `numElems` entries (element size 4,
   * alignment 4).
   */
  static startChildrenVector(builder: flatbuffers.Builder, numElems: number) {
    builder.startVector(4, numElems, 4);
  }

  /**
   * Stores `customMetadataOffset` in field slot 6 (default 0).
   */
  static addCustomMetadata(builder: flatbuffers.Builder, customMetadataOffset: flatbuffers.Offset) {
    builder.addFieldOffset(6, customMetadataOffset, 0);
  }

  /**
   * Builds the custom-metadata vector from `data`. Entries are added from
   * the last element down to the first.
   *
   * @returns the value produced by `builder.endVector()`
   */
  static createCustomMetadataVector(builder: flatbuffers.Builder, data: flatbuffers.Offset[]): flatbuffers.Offset {
    builder.startVector(4, data.length, 4);
    let remaining = data.length;
    while (remaining-- > 0) {
      builder.addOffset(data[remaining]);
    }
    return builder.endVector();
  }

  /**
   * Starts a custom-metadata vector of `numElems` entries (element size 4,
   * alignment 4).
   */
  static startCustomMetadataVector(builder: flatbuffers.Builder, numElems: number) {
    builder.startVector(4, numElems, 4);
  }

  /**
   * Closes the table opened by `startField`.
   *
   * @returns the value produced by `builder.endObject()`
   */
  static endField(builder: flatbuffers.Builder): flatbuffers.Offset {
    const offset =
builder.endObject();
    return offset;
  }

  /**
   * One-shot helper: start the table, add name, nullable, type tag, type,
   * dictionary, children and custom metadata (in that order), then end the
   * table.
   */
  static createField(builder: flatbuffers.Builder, nameOffset: flatbuffers.Offset, nullable: boolean, typeType: Type, typeOffset: flatbuffers.Offset, dictionaryOffset: flatbuffers.Offset, childrenOffset: flatbuffers.Offset, customMetadataOffset: flatbuffers.Offset): flatbuffers.Offset {
    Field.startField(builder);
    Field.addName(builder, nameOffset);
    Field.addNullable(builder, nullable);
    Field.addTypeType(builder, typeType);
    Field.addType(builder, typeOffset);
    Field.addDictionary(builder, dictionaryOffset);
    Field.addChildren(builder, childrenOffset);
    Field.addCustomMetadata(builder, customMetadataOffset);
    const finished = Field.endField(builder);
    return finished;
  }
}
/**
 * ----------------------------------------------------------------------
 * A Buffer represents a single contiguous memory segment.
 *
 * @constructor
 */
export class Buffer {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Binds this accessor to position `i` inside buffer `bb`.
   *
   * @returns this same instance, so calls can be chained
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Buffer {
    this.bb_pos = i;
    this.bb = bb;
    return this;
  }

  /**
   * The relative offset into the shared memory page where the bytes for
   * this buffer start. Read as an int64 directly at `bb_pos`.
   *
   * @returns flatbuffers.Long
   */
  offset(): flatbuffers.Long {
    const here = this.bb_pos;
    return this.bb!.readInt64(here);
  }

  /**
   * The absolute length (in bytes) of the memory buffer. The memory is found
   * from offset (inclusive) to offset + length (non-inclusive). When building
   * messages using the encapsulated IPC message, padding bytes may be written
   * after a buffer, but such padding bytes do not need to be accounted for in
   * the size here.
*
   * @returns flatbuffers.Long
   */
  length(): flatbuffers.Long {
    const lengthPos = this.bb_pos + 8;
    return this.bb!.readInt64(lengthPos);
  }

  /**
   * Writes a Buffer inline: `prep(8, 16)`, then `length`, then `offset`.
   * The write order is the reverse of the read layout used by `length()`
   * (which reads at `bb_pos + 8`), so it must not be swapped.
   *
   * @returns the value produced by `builder.offset()` after both writes
   */
  static createBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Long, length: flatbuffers.Long): flatbuffers.Offset {
    builder.prep(8, 16);
    builder.writeInt64(length);
    builder.writeInt64(offset);
    const written = builder.offset();
    return written;
  }

}
/**
 * ----------------------------------------------------------------------
 * A Schema describes the columns in a row batch.
 *
 * @constructor
 */
export class Schema {
  bb: flatbuffers.ByteBuffer | null = null;

  bb_pos: number = 0;

  /**
   * Binds this accessor to position `i` inside buffer `bb`.
   *
   * @returns this same instance, so calls can be chained
   */
  __init(i: number, bb: flatbuffers.ByteBuffer): Schema {
    this.bb_pos = i;
    this.bb = bb;
    return this;
  }

  /**
   * Initialises a Schema at `bb.readInt32(bb.position()) + bb.position()`.
   * `obj` is reused when supplied; otherwise a new instance is allocated.
   */
  static getRootAsSchema(bb: flatbuffers.ByteBuffer, obj?: Schema): Schema {
    const target = obj || new Schema();
    return target.__init(bb.readInt32(bb.position()) + bb.position(), bb);
  }

  /**
   * Like `getRootAsSchema`, but first advances the position of `bb` by
   * `flatbuffers.SIZE_PREFIX_LENGTH` (this mutates `bb`).
   */
  static getSizePrefixedRootAsSchema(bb: flatbuffers.ByteBuffer, obj?: Schema): Schema {
    bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH);
    const target = obj || new Schema();
    return target.__init(bb.readInt32(bb.position()) + bb.position(), bb);
  }

  /**
   * Endianness of the buffer. It is Little Endian by default; if the
   * endianness doesn't match the underlying system then the vectors need to
   * be converted.
   *
   * @returns Endianness
   */
  endianness(): Endianness {
    const offset = this.bb!.__offset(this.bb_pos, 4);
    return offset ?
/** */ (this.bb!.readInt16(this.bb_pos + offset)) : Endianness.Little;
  }

  /**
   * @param index position within the fields vector (4 bytes per entry)
   * @param obj optional Field accessor to reuse
   * @returns the field accessor, or null when the field's offset lookup is 0
   */
  fields(index: number, obj?: Field): Field | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 6);
    if (!fieldPos) {
      return null;
    }
    const target = obj || new Field();
    const entryPos = this.bb!.__vector(this.bb_pos + fieldPos) + index * 4;
    return target.__init(this.bb!.__indirect(entryPos), this.bb!);
  }

  /**
   * @returns the length of the fields vector, or 0 when the field's offset
   *          lookup is 0
   */
  fieldsLength(): number {
    const fieldPos = this.bb!.__offset(this.bb_pos, 6);
    if (!fieldPos) {
      return 0;
    }
    return this.bb!.__vector_len(this.bb_pos + fieldPos);
  }

  /**
   * @param index position within the metadata vector (4 bytes per entry)
   * @param obj optional KeyValue accessor to reuse
   * @returns the entry accessor, or null when the field's offset lookup is 0
   */
  customMetadata(index: number, obj?: KeyValue): KeyValue | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 8);
    if (!fieldPos) {
      return null;
    }
    const target = obj || new KeyValue();
    const entryPos = this.bb!.__vector(this.bb_pos + fieldPos) + index * 4;
    return target.__init(this.bb!.__indirect(entryPos), this.bb!);
  }

  /**
   * @returns the length of the metadata vector, or 0 when the field's offset
   *          lookup is 0
   */
  customMetadataLength(): number {
    const fieldPos = this.bb!.__offset(this.bb_pos, 8);
    if (!fieldPos) {
      return 0;
    }
    return this.bb!.__vector_len(this.bb_pos + fieldPos);
  }

  /**
   * Features used in the stream/file.
   *
   * @param index position within the features vector (8 bytes per entry)
   * @returns the stored int64, or `createLong(0, 0)` when the field's offset
   *          lookup is 0 (null is declared in the signature but never
   *          produced by this body)
   */
  features(index: number): flatbuffers.Long | null {
    const fieldPos = this.bb!.__offset(this.bb_pos, 10);
    if (!fieldPos) {
      return this.bb!.createLong(0, 0);
    }
    return this.bb!.readInt64(this.bb!.__vector(this.bb_pos + fieldPos) + index * 8);
  }

  /**
   * @returns the length of the features vector, or 0 when the field's offset
   *          lookup is 0
   */
  featuresLength(): number {
    const offset = this.bb!.__offset(this.bb_pos, 10);
    return offset ?
this.bb!.__vector_len(this.bb_pos + offset) : 0;
  }

  /**
   * Opens a new Schema table on `builder` with room for four fields.
   */
  static startSchema(builder: flatbuffers.Builder) {
    builder.startObject(4);
  }

  /**
   * Writes `endianness` into field slot 0 as an int16 (default
   * `Endianness.Little`).
   */
  static addEndianness(builder: flatbuffers.Builder, endianness: Endianness) {
    builder.addFieldInt16(0, endianness, Endianness.Little);
  }

  /**
   * Stores `fieldsOffset` in field slot 1 (default 0).
   */
  static addFields(builder: flatbuffers.Builder, fieldsOffset: flatbuffers.Offset) {
    builder.addFieldOffset(1, fieldsOffset, 0);
  }

  /**
   * Builds the fields vector from `data`. Entries are added from the last
   * element down to the first.
   *
   * @returns the value produced by `builder.endVector()`
   */
  static createFieldsVector(builder: flatbuffers.Builder, data: flatbuffers.Offset[]): flatbuffers.Offset {
    builder.startVector(4, data.length, 4);
    let remaining = data.length;
    while (remaining-- > 0) {
      builder.addOffset(data[remaining]);
    }
    return builder.endVector();
  }

  /**
   * Starts a fields vector of `numElems` entries (element size 4,
   * alignment 4).
   */
  static startFieldsVector(builder: flatbuffers.Builder, numElems: number) {
    builder.startVector(4, numElems, 4);
  }

  /**
   * Stores `customMetadataOffset` in field slot 2 (default 0).
   */
  static addCustomMetadata(builder: flatbuffers.Builder, customMetadataOffset: flatbuffers.Offset) {
    builder.addFieldOffset(2, customMetadataOffset, 0);
  }

  /**
   * Builds the custom-metadata vector from `data`. Entries are added from
   * the last element down to the first.
   *
   * @returns the value produced by `builder.endVector()`
   */
  static createCustomMetadataVector(builder: flatbuffers.Builder, data: flatbuffers.Offset[]): flatbuffers.Offset {
    builder.startVector(4, data.length, 4);
    let remaining = data.length;
    while (remaining-- > 0) {
      builder.addOffset(data[remaining]);
    }
    return builder.endVector();
  }

  /**
   * Starts a custom-metadata vector of `numElems` entries (element size 4,
   * alignment 4).
   */
  static
startCustomMetadataVector(builder: flatbuffers.Builder, numElems: number) {
    builder.startVector(4, numElems, 4);
  }

  /**
   * Stores `featuresOffset` in field slot 3 (default 0).
   */
  static addFeatures(builder: flatbuffers.Builder, featuresOffset: flatbuffers.Offset) {
    builder.addFieldOffset(3, featuresOffset, 0);
  }

  /**
   * Builds the features vector from `data` as int64 entries. Entries are
   * added from the last element down to the first.
   *
   * @returns the value produced by `builder.endVector()`
   */
  static createFeaturesVector(builder: flatbuffers.Builder, data: flatbuffers.Long[]): flatbuffers.Offset {
    builder.startVector(8, data.length, 8);
    let remaining = data.length;
    while (remaining-- > 0) {
      builder.addInt64(data[remaining]);
    }
    return builder.endVector();
  }

  /**
   * Starts a features vector of `numElems` entries (element size 8,
   * alignment 8).
   */
  static startFeaturesVector(builder: flatbuffers.Builder, numElems: number) {
    builder.startVector(8, numElems, 8);
  }

  /**
   * Closes the table opened by `startSchema`.
   *
   * @returns the value produced by `builder.endObject()`
   */
  static endSchema(builder: flatbuffers.Builder): flatbuffers.Offset {
    return builder.endObject();
  }

  /**
   * Finishes the buffer with `offset` as its root via `builder.finish`.
   */
  static finishSchemaBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Offset) {
    builder.finish(offset);
  }

  /**
   * Same as `finishSchemaBuffer`, but passes `undefined` and `true` as the
   * second and third arguments of `builder.finish`.
   */
  static finishSizePrefixedSchemaBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Offset) {
    builder.finish(offset, undefined, true);
  }

  /**
   * One-shot helper: start the table, add endianness, fields, custom
   * metadata and features (in that order), then end the table.
   */
  static createSchema(builder: flatbuffers.Builder, endianness: Endianness, fieldsOffset: flatbuffers.Offset, customMetadataOffset: flatbuffers.Offset, featuresOffset: flatbuffers.Offset): flatbuffers.Offset {
    Schema.startSchema(builder);
    Schema.addEndianness(builder, endianness);
    Schema.addFields(builder, fieldsOffset);
    Schema.addCustomMetadata(builder, customMetadataOffset);
Schema.addFeatures(builder, featuresOffset); + return Schema.endSchema(builder); + } +} diff --git a/src/arrow/js/src/interfaces.ts b/src/arrow/js/src/interfaces.ts new file mode 100644 index 000000000..43977ca7a --- /dev/null +++ b/src/arrow/js/src/interfaces.ts @@ -0,0 +1,417 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
import { Data } from './data';
import { Type } from './enum';
import * as type from './type';
import { DataType } from './type';
import * as vecs from './vector/index';
import * as builders from './builder/index';
import { BuilderOptions } from './builder/index';

/** @ignore */
type FloatArray = Float32Array | Float64Array;
/** @ignore */
type IntArray = Int8Array | Int16Array | Int32Array;
/** @ignore */
type UintArray = Uint8Array | Uint16Array | Uint32Array | Uint8ClampedArray;
/**
 * Union of the float, signed-int and unsigned-int typed arrays above.
 * @ignore
 */
export type TypedArray = FloatArray | IntArray | UintArray;
/**
 * Union of the two 64-bit BigInt typed arrays.
 * @ignore
 */
export type BigIntArray = BigInt64Array | BigUint64Array;

/**
 * Structural description of a typed-array constructor whose instances are
 * `T` and whose elements are `number`.
 * @ignore
 */
export interface TypedArrayConstructor<T extends TypedArray> {
    readonly prototype: T;
    new(length?: number): T;
    new(array: Iterable<number>): T;
    new(buffer: ArrayBufferLike, byteOffset?: number, length?: number): T;
    /**
     * The size in bytes of each element in the array.
     */
    readonly BYTES_PER_ELEMENT: number;
    /**
     * Returns a new array from a set of elements.
     * @param items A set of elements to include in the new array object.
     */
    of(...items: number[]): T;
    /**
     * Creates an array from an array-like or iterable object.
     * @param arrayLike An array-like or iterable object to convert to an array.
     * @param mapfn A mapping function to call on every element of the array.
     * @param thisArg Value of 'this' used to invoke the mapfn.
     */
    from(arrayLike: ArrayLike<number>, mapfn?: (v: number, k: number) => number, thisArg?: any): T;
    from<U>(arrayLike: ArrayLike<U>, mapfn: (v: U, k: number) => number, thisArg?: any): T;
}

/**
 * Structural description of a typed-array constructor whose instances are
 * `T` and whose elements are `bigint`.
 * @ignore
 */
export interface BigIntArrayConstructor<T extends BigIntArray> {
    readonly prototype: T;
    new(length?: number): T;
    new(array: Iterable<bigint>): T;
    new(buffer: ArrayBufferLike, byteOffset?: number, length?: number): T;
    /**
     * The size in bytes of each element in the array.
*/
    readonly BYTES_PER_ELEMENT: number;
    /**
     * Returns a new array from a set of elements.
     * @param items A set of elements to include in the new array object.
     */
    of(...items: bigint[]): T;
    /**
     * Creates an array from an array-like or iterable object.
     * @param arrayLike An array-like or iterable object to convert to an array.
     * @param mapfn A mapping function to call on every element of the array.
     * @param thisArg Value of 'this' used to invoke the mapfn.
     */
    from(arrayLike: ArrayLike<bigint>, mapfn?: (v: bigint, k: number) => bigint, thisArg?: any): T;
    from<U>(arrayLike: ArrayLike<U>, mapfn: (v: U, k: number) => bigint, thisArg?: any): T;
}

/**
 * Tuple of the constructor arguments that follow the leading `Data<R>`
 * parameter of a vector constructor; `never` when `TCtor` does not match.
 * @ignore
 */
export type VectorCtorArgs<
    T extends VectorType<R>,
    R extends DataType = any,
    TArgs extends any[] = any[],
    TCtor extends new (data: Data<R>, ...args: TArgs) => T =
                  new (data: Data<R>, ...args: TArgs) => T
> = TCtor extends new (data: Data<R>, ...args: infer TArgs) => T ? TArgs : never;

/**
 * Tuple of the constructor arguments that follow the leading `type: R`
 * parameter of a builder constructor; `never` when `TCtor` does not match.
 * @ignore
 */
export type BuilderCtorArgs<
    T extends BuilderType<R, any>,
    R extends DataType = any,
    TArgs extends any[] = any[],
    TCtor extends new (type: R, ...args: TArgs) => T =
                  new (type: R, ...args: TArgs) => T
> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never;

/**
 * Obtain the constructor function of an instance type.
 * @ignore
 */
export type ConstructorType<
    T,
    TCtor extends new (...args: any[]) => T =
                  new (...args: any[]) => T
> = TCtor extends new (...args: any[]) => T ? TCtor : never;

/**
 * Constructor type for a vector `T`, taking `(type, data?, offsets?)`;
 * `never` when `TCtor` does not match that shape.
 * @ignore
 */
export type VectorCtorType<
    T extends VectorType<R>,
    R extends DataType = any,
    TCtor extends new (type: R, data?: Data<R>[], offsets?: Uint32Array) => T =
                  new (type: R, data?: Data<R>[], offsets?: Uint32Array) => T
> = TCtor extends new (type: R, data?: Data<R>[], offsets?: Uint32Array) => T ?
TCtor : never;

/**
 * Constructor type for a builder `T`, taking a single `BuilderOptions`
 * argument; `never` when `TCtor` does not match that shape.
 * @ignore
 */
export type BuilderCtorType<
    T extends BuilderType<R, any>,
    R extends DataType = any,
    TCtor extends new (options: BuilderOptions<R, any>) => T =
                  new (options: BuilderOptions<R, any>) => T
> = TCtor extends new (options: BuilderOptions<R, any>) => T ? TCtor : never;

/**
 * Vector type for a `Type` enum member (via `TypeToVector`) or a `DataType`
 * (via `DataTypeToVector`); falls back to `vecs.BaseVector<any>`.
 * @ignore
 */
export type VectorType<T extends Type | DataType = any> =
    T extends Type ? TypeToVector<T> :
    T extends DataType ? DataTypeToVector<T> :
    vecs.BaseVector<any>
    ;

/**
 * Builder type for a `Type` enum member (via `TypeToBuilder`) or a
 * `DataType` (via `DataTypeToBuilder`); falls back to
 * `builders.Builder<any, TNull>`.
 * @ignore
 */
export type BuilderType<T extends Type | DataType = any, TNull = any> =
    T extends Type ? TypeToBuilder<T, TNull> :
    T extends DataType ? DataTypeToBuilder<T, TNull> :
    builders.Builder<any, TNull>
    ;

/**
 * Vector constructor type for a vector, a `Type` enum member or a
 * `DataType`; the branches are tested in that order.
 * @ignore
 */
export type VectorCtor<T extends Type | DataType | VectorType> =
    T extends VectorType ? VectorCtorType<VectorType<T['TType']>> :
    T extends Type ? VectorCtorType<VectorType<T>> :
    T extends DataType ? VectorCtorType<VectorType<T['TType']>> :
    VectorCtorType<vecs.BaseVector<any>>
    ;

/**
 * Builder constructor type for a `Type` enum member or a `DataType`.
 * @ignore
 */
export type BuilderCtor<T extends Type | DataType = any> =
    T extends Type ? BuilderCtorType<BuilderType<T>> :
    T extends DataType ? BuilderCtorType<BuilderType<T>> :
    BuilderCtorType<builders.Builder>
    ;

/**
 * DataType constructor type for a `DataType`, a vector (through its `type`
 * member) or a `Type` enum member; `never` otherwise.
 * @ignore
 */
export type DataTypeCtor<T extends Type | DataType | VectorType = any> =
    T extends DataType ? ConstructorType<T> :
    T extends VectorType ? ConstructorType<T['type']> :
    T extends Type ? ConstructorType<TypeToDataType<T>> :
    never
    ;

/**
 * Maps a typed-array instance type to the matching Arrow DataType.
 * `Uint8ClampedArray` is excluded by the constraint.
 * @ignore
 */
export type TypedArrayDataType<T extends Exclude<TypedArray, Uint8ClampedArray> | BigIntArray> =
    T extends Int8Array ? type.Int8 :
    T extends Int16Array ? type.Int16 :
    T extends Int32Array ? type.Int32 :
    T extends BigInt64Array ? type.Int64 :
    T extends Uint8Array ? type.Uint8 :
    T extends Uint16Array ? type.Uint16 :
    T extends Uint32Array ? type.Uint32 :
    T extends BigUint64Array ? type.Uint64 :
    T extends Float32Array ?
type.Float32 :
    T extends Float64Array ? type.Float64 :
    never;

/**
 * Lookup table from a `Type` enum member to its vector class; the table is
 * indexed by `T`. Unlisted numeric keys resolve to `vecs.Vector<any>`.
 * @ignore
 */
type TypeToVector<T extends Type> = {
    [key: number]: vecs.Vector<any>;
    [Type.Null]: vecs.NullVector;
    [Type.Bool]: vecs.BoolVector;
    [Type.Int8]: vecs.Int8Vector;
    [Type.Int16]: vecs.Int16Vector;
    [Type.Int32]: vecs.Int32Vector;
    [Type.Int64]: vecs.Int64Vector;
    [Type.Uint8]: vecs.Uint8Vector;
    [Type.Uint16]: vecs.Uint16Vector;
    [Type.Uint32]: vecs.Uint32Vector;
    [Type.Uint64]: vecs.Uint64Vector;
    [Type.Int]: vecs.IntVector;
    [Type.Float16]: vecs.Float16Vector;
    [Type.Float32]: vecs.Float32Vector;
    [Type.Float64]: vecs.Float64Vector;
    [Type.Float]: vecs.FloatVector;
    [Type.Utf8]: vecs.Utf8Vector;
    [Type.Binary]: vecs.BinaryVector;
    [Type.FixedSizeBinary]: vecs.FixedSizeBinaryVector;
    [Type.Date]: vecs.DateVector;
    [Type.DateDay]: vecs.DateDayVector;
    [Type.DateMillisecond]: vecs.DateMillisecondVector;
    [Type.Timestamp]: vecs.TimestampVector;
    [Type.TimestampSecond]: vecs.TimestampSecondVector;
    [Type.TimestampMillisecond]: vecs.TimestampMillisecondVector;
    [Type.TimestampMicrosecond]: vecs.TimestampMicrosecondVector;
    [Type.TimestampNanosecond]: vecs.TimestampNanosecondVector;
    [Type.Time]: vecs.TimeVector;
    [Type.TimeSecond]: vecs.TimeSecondVector;
    [Type.TimeMillisecond]: vecs.TimeMillisecondVector;
    [Type.TimeMicrosecond]: vecs.TimeMicrosecondVector;
    [Type.TimeNanosecond]: vecs.TimeNanosecondVector;
    [Type.Decimal]: vecs.DecimalVector;
    [Type.Union]: vecs.UnionVector;
    [Type.DenseUnion]: vecs.DenseUnionVector;
    [Type.SparseUnion]: vecs.SparseUnionVector;
    [Type.Interval]: vecs.IntervalVector;
    [Type.IntervalDayTime]: vecs.IntervalDayTimeVector;
    [Type.IntervalYearMonth]: vecs.IntervalYearMonthVector;
    [Type.Map]: vecs.MapVector;
    [Type.List]: vecs.ListVector;
    [Type.Struct]: vecs.StructVector;
    [Type.Dictionary]: vecs.DictionaryVector;
[Type.FixedSizeList]: vecs.FixedSizeListVector;
}[T];

/**
 * Lookup table from a `DataType` to its vector class; the table is indexed
 * by `T['TType']`. Every entry also checks that `T` extends the matching
 * DataType and resolves to `never` otherwise. Nested types forward their
 * child type parameters to the vector.
 * @ignore
 */
type DataTypeToVector<T extends DataType = any> = {
    [key: number]: vecs.Vector<any>;
    [Type.Null]: T extends type.Null ? vecs.NullVector : never;
    [Type.Bool]: T extends type.Bool ? vecs.BoolVector : never;
    [Type.Int8]: T extends type.Int8 ? vecs.Int8Vector : never;
    [Type.Int16]: T extends type.Int16 ? vecs.Int16Vector : never;
    [Type.Int32]: T extends type.Int32 ? vecs.Int32Vector : never;
    [Type.Int64]: T extends type.Int64 ? vecs.Int64Vector : never;
    [Type.Uint8]: T extends type.Uint8 ? vecs.Uint8Vector : never;
    [Type.Uint16]: T extends type.Uint16 ? vecs.Uint16Vector : never;
    [Type.Uint32]: T extends type.Uint32 ? vecs.Uint32Vector : never;
    [Type.Uint64]: T extends type.Uint64 ? vecs.Uint64Vector : never;
    [Type.Int]: T extends type.Int ? vecs.IntVector : never;
    [Type.Float16]: T extends type.Float16 ? vecs.Float16Vector : never;
    [Type.Float32]: T extends type.Float32 ? vecs.Float32Vector : never;
    [Type.Float64]: T extends type.Float64 ? vecs.Float64Vector : never;
    [Type.Float]: T extends type.Float ? vecs.FloatVector : never;
    [Type.Utf8]: T extends type.Utf8 ? vecs.Utf8Vector : never;
    [Type.Binary]: T extends type.Binary ? vecs.BinaryVector : never;
    [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? vecs.FixedSizeBinaryVector : never;
    [Type.Date]: T extends type.Date_ ? vecs.DateVector : never;
    [Type.DateDay]: T extends type.DateDay ? vecs.DateDayVector : never;
    [Type.DateMillisecond]: T extends type.DateMillisecond ? vecs.DateMillisecondVector : never;
    [Type.Timestamp]: T extends type.Timestamp ? vecs.TimestampVector : never;
    [Type.TimestampSecond]: T extends type.TimestampSecond ? vecs.TimestampSecondVector : never;
    [Type.TimestampMillisecond]: T extends type.TimestampMillisecond ? vecs.TimestampMillisecondVector : never;
    [Type.TimestampMicrosecond]: T extends type.TimestampMicrosecond ? vecs.TimestampMicrosecondVector : never;
    [Type.TimestampNanosecond]: T extends type.TimestampNanosecond ? vecs.TimestampNanosecondVector : never;
    [Type.Time]: T extends type.Time ? vecs.TimeVector : never;
    [Type.TimeSecond]: T extends type.TimeSecond ? vecs.TimeSecondVector : never;
    [Type.TimeMillisecond]: T extends type.TimeMillisecond ? vecs.TimeMillisecondVector : never;
    [Type.TimeMicrosecond]: T extends type.TimeMicrosecond ? vecs.TimeMicrosecondVector : never;
    [Type.TimeNanosecond]: T extends type.TimeNanosecond ? vecs.TimeNanosecondVector : never;
    [Type.Decimal]: T extends type.Decimal ? vecs.DecimalVector : never;
    [Type.Union]: T extends type.Union ? vecs.UnionVector : never;
    [Type.DenseUnion]: T extends type.DenseUnion ? vecs.DenseUnionVector : never;
    [Type.SparseUnion]: T extends type.SparseUnion ? vecs.SparseUnionVector : never;
    [Type.Interval]: T extends type.Interval ? vecs.IntervalVector : never;
    [Type.IntervalDayTime]: T extends type.IntervalDayTime ? vecs.IntervalDayTimeVector : never;
    [Type.IntervalYearMonth]: T extends type.IntervalYearMonth ? vecs.IntervalYearMonthVector : never;
    [Type.Map]: T extends type.Map_ ? vecs.MapVector<T['keyType'], T['valueType']> : never;
    [Type.List]: T extends type.List ? vecs.ListVector<T['valueType']> : never;
    [Type.Struct]: T extends type.Struct ? vecs.StructVector<T['dataTypes']> : never;
    [Type.Dictionary]: T extends type.Dictionary ? vecs.DictionaryVector<T['valueType'], T['indices']> : never;
    [Type.FixedSizeList]: T extends type.FixedSizeList ? vecs.FixedSizeListVector<T['valueType']> : never;
}[T['TType']];

/**
 * Lookup table from a `Type` enum member to its DataType class; the table
 * is indexed by `T`. Unlisted numeric keys resolve to `type.DataType`.
 * @ignore
 */
export type TypeToDataType<T extends Type> = {
    [key: number]: type.DataType;
    [Type.Null]: type.Null;
    [Type.Bool]: type.Bool;
    [Type.Int8]: type.Int8;
    [Type.Int16]: type.Int16;
    [Type.Int32]: type.Int32;
    [Type.Int64]: type.Int64;
    [Type.Uint8]: type.Uint8;
    [Type.Uint16]: type.Uint16;
    [Type.Uint32]: type.Uint32;
    [Type.Uint64]: type.Uint64;
    [Type.Int]: type.Int;
    [Type.Float16]: type.Float16;
    [Type.Float32]: type.Float32;
    [Type.Float64]: type.Float64;
    [Type.Float]: type.Float;
    [Type.Utf8]: type.Utf8;
    [Type.Binary]: type.Binary;
    [Type.FixedSizeBinary]: type.FixedSizeBinary;
    [Type.Date]: type.Date_;
    [Type.DateDay]: type.DateDay;
    [Type.DateMillisecond]: type.DateMillisecond;
    [Type.Timestamp]: type.Timestamp;
    [Type.TimestampSecond]: type.TimestampSecond;
    [Type.TimestampMillisecond]: type.TimestampMillisecond;
    [Type.TimestampMicrosecond]: type.TimestampMicrosecond;
    [Type.TimestampNanosecond]: type.TimestampNanosecond;
    [Type.Time]: type.Time;
    [Type.TimeSecond]: type.TimeSecond;
    [Type.TimeMillisecond]: type.TimeMillisecond;
    [Type.TimeMicrosecond]: type.TimeMicrosecond;
    [Type.TimeNanosecond]: type.TimeNanosecond;
    [Type.Decimal]: type.Decimal;
    [Type.Union]: type.Union;
    [Type.DenseUnion]: type.DenseUnion;
    [Type.SparseUnion]: type.SparseUnion;
    [Type.Interval]: type.Interval;
    [Type.IntervalDayTime]: type.IntervalDayTime;
    [Type.IntervalYearMonth]: type.IntervalYearMonth;
    [Type.Map]: type.Map_;
    [Type.List]: type.List;
    [Type.Struct]: type.Struct;
    [Type.Dictionary]: type.Dictionary;
    [Type.FixedSizeList]: type.FixedSizeList;
}[T];

/** @ignore */
type TypeToBuilder<T extends Type = any, TNull = any> = {
    [key: number]: builders.Builder;
    [Type.Null]: builders.NullBuilder<TNull>;
    [Type.Bool]: builders.BoolBuilder<TNull>
; + [Type.Int8 ]: builders.Int8Builder<TNull> ; + [Type.Int16 ]: builders.Int16Builder<TNull> ; + [Type.Int32 ]: builders.Int32Builder<TNull> ; + [Type.Int64 ]: builders.Int64Builder<TNull> ; + [Type.Uint8 ]: builders.Uint8Builder<TNull> ; + [Type.Uint16 ]: builders.Uint16Builder<TNull> ; + [Type.Uint32 ]: builders.Uint32Builder<TNull> ; + [Type.Uint64 ]: builders.Uint64Builder<TNull> ; + [Type.Int ]: builders.IntBuilder<any, TNull> ; + [Type.Float16 ]: builders.Float16Builder<TNull> ; + [Type.Float32 ]: builders.Float32Builder<TNull> ; + [Type.Float64 ]: builders.Float64Builder<TNull> ; + [Type.Float ]: builders.FloatBuilder<any, TNull> ; + [Type.Utf8 ]: builders.Utf8Builder<TNull> ; + [Type.Binary ]: builders.BinaryBuilder<TNull> ; + [Type.FixedSizeBinary ]: builders.FixedSizeBinaryBuilder<TNull> ; + [Type.Date ]: builders.DateBuilder<any, TNull> ; + [Type.DateDay ]: builders.DateDayBuilder<TNull> ; + [Type.DateMillisecond ]: builders.DateMillisecondBuilder<TNull> ; + [Type.Timestamp ]: builders.TimestampBuilder<any, TNull> ; + [Type.TimestampSecond ]: builders.TimestampSecondBuilder<TNull> ; + [Type.TimestampMillisecond ]: builders.TimestampMillisecondBuilder<TNull> ; + [Type.TimestampMicrosecond ]: builders.TimestampMicrosecondBuilder<TNull> ; + [Type.TimestampNanosecond ]: builders.TimestampNanosecondBuilder<TNull> ; + [Type.Time ]: builders.TimeBuilder<any, TNull> ; + [Type.TimeSecond ]: builders.TimeSecondBuilder<TNull> ; + [Type.TimeMillisecond ]: builders.TimeMillisecondBuilder<TNull> ; + [Type.TimeMicrosecond ]: builders.TimeMicrosecondBuilder<TNull> ; + [Type.TimeNanosecond ]: builders.TimeNanosecondBuilder<TNull> ; + [Type.Decimal ]: builders.DecimalBuilder<TNull> ; + [Type.Union ]: builders.UnionBuilder<any, TNull> ; + [Type.DenseUnion ]: builders.DenseUnionBuilder<any, TNull> ; + [Type.SparseUnion ]: builders.SparseUnionBuilder<any, TNull> ; + [Type.Interval ]: builders.IntervalBuilder<any, TNull> ; + [Type.IntervalDayTime ]: 
builders.IntervalDayTimeBuilder<TNull> ; + [Type.IntervalYearMonth ]: builders.IntervalYearMonthBuilder<TNull> ; + [Type.Map ]: builders.MapBuilder<any, any, TNull> ; + [Type.List ]: builders.ListBuilder<any, TNull> ; + [Type.Struct ]: builders.StructBuilder<any, TNull> ; + [Type.Dictionary ]: builders.DictionaryBuilder<any, TNull> ; + [Type.FixedSizeList ]: builders.FixedSizeListBuilder<any, TNull> ; +}[T]; + +/** @ignore */ +type DataTypeToBuilder<T extends DataType = any, TNull = any> = { + [key: number ]: builders.Builder<any, TNull> ; + [Type.Null ]: T extends type.Null ? builders.NullBuilder<TNull> : never ; + [Type.Bool ]: T extends type.Bool ? builders.BoolBuilder<TNull> : never ; + [Type.Int8 ]: T extends type.Int8 ? builders.Int8Builder<TNull> : never ; + [Type.Int16 ]: T extends type.Int16 ? builders.Int16Builder<TNull> : never ; + [Type.Int32 ]: T extends type.Int32 ? builders.Int32Builder<TNull> : never ; + [Type.Int64 ]: T extends type.Int64 ? builders.Int64Builder<TNull> : never ; + [Type.Uint8 ]: T extends type.Uint8 ? builders.Uint8Builder<TNull> : never ; + [Type.Uint16 ]: T extends type.Uint16 ? builders.Uint16Builder<TNull> : never ; + [Type.Uint32 ]: T extends type.Uint32 ? builders.Uint32Builder<TNull> : never ; + [Type.Uint64 ]: T extends type.Uint64 ? builders.Uint64Builder<TNull> : never ; + [Type.Int ]: T extends type.Int ? builders.IntBuilder<T, TNull> : never ; + [Type.Float16 ]: T extends type.Float16 ? builders.Float16Builder<TNull> : never ; + [Type.Float32 ]: T extends type.Float32 ? builders.Float32Builder<TNull> : never ; + [Type.Float64 ]: T extends type.Float64 ? builders.Float64Builder<TNull> : never ; + [Type.Float ]: T extends type.Float ? builders.FloatBuilder<T, TNull> : never ; + [Type.Utf8 ]: T extends type.Utf8 ? builders.Utf8Builder<TNull> : never ; + [Type.Binary ]: T extends type.Binary ? builders.BinaryBuilder<TNull> : never ; + [Type.FixedSizeBinary ]: T extends type.FixedSizeBinary ? 
builders.FixedSizeBinaryBuilder<TNull> : never ; + [Type.Date ]: T extends type.Date_ ? builders.DateBuilder<T, TNull> : never ; + [Type.DateDay ]: T extends type.DateDay ? builders.DateDayBuilder<TNull> : never ; + [Type.DateMillisecond ]: T extends type.DateMillisecond ? builders.DateMillisecondBuilder<TNull> : never ; + [Type.Timestamp ]: T extends type.Timestamp ? builders.TimestampBuilder<T, TNull> : never ; + [Type.TimestampSecond ]: T extends type.TimestampSecond ? builders.TimestampSecondBuilder<TNull> : never ; + [Type.TimestampMillisecond ]: T extends type.TimestampMillisecond ? builders.TimestampMillisecondBuilder<TNull> : never ; + [Type.TimestampMicrosecond ]: T extends type.TimestampMicrosecond ? builders.TimestampMicrosecondBuilder<TNull> : never ; + [Type.TimestampNanosecond ]: T extends type.TimestampNanosecond ? builders.TimestampNanosecondBuilder<TNull> : never ; + [Type.Time ]: T extends type.Time ? builders.TimeBuilder<T, TNull> : never ; + [Type.TimeSecond ]: T extends type.TimeSecond ? builders.TimeSecondBuilder<TNull> : never ; + [Type.TimeMillisecond ]: T extends type.TimeMillisecond ? builders.TimeMillisecondBuilder<TNull> : never ; + [Type.TimeMicrosecond ]: T extends type.TimeMicrosecond ? builders.TimeMicrosecondBuilder<TNull> : never ; + [Type.TimeNanosecond ]: T extends type.TimeNanosecond ? builders.TimeNanosecondBuilder<TNull> : never ; + [Type.Decimal ]: T extends type.Decimal ? builders.DecimalBuilder<TNull> : never ; + [Type.Union ]: T extends type.Union ? builders.UnionBuilder<T, TNull> : never ; + [Type.DenseUnion ]: T extends type.DenseUnion ? builders.DenseUnionBuilder<T, TNull> : never ; + [Type.SparseUnion ]: T extends type.SparseUnion ? builders.SparseUnionBuilder<T, TNull> : never ; + [Type.Interval ]: T extends type.Interval ? builders.IntervalBuilder<T, TNull> : never ; + [Type.IntervalDayTime ]: T extends type.IntervalDayTime ? 
builders.IntervalDayTimeBuilder<TNull> : never ; + [Type.IntervalYearMonth ]: T extends type.IntervalYearMonth ? builders.IntervalYearMonthBuilder<TNull> : never ; + [Type.Map ]: T extends type.Map_ ? builders.MapBuilder<T['keyType'], T['valueType'], TNull> : never ; + [Type.List ]: T extends type.List ? builders.ListBuilder<T['valueType'], TNull> : never ; + [Type.Struct ]: T extends type.Struct ? builders.StructBuilder<T['dataTypes'], TNull> : never ; + [Type.Dictionary ]: T extends type.Dictionary ? builders.DictionaryBuilder<T, TNull> : never ; + [Type.FixedSizeList ]: T extends type.FixedSizeList ? builders.FixedSizeListBuilder<T['valueType'], TNull> : never ; +}[T['TType']]; diff --git a/src/arrow/js/src/io/adapters.ts b/src/arrow/js/src/io/adapters.ts new file mode 100644 index 000000000..a83346ef7 --- /dev/null +++ b/src/arrow/js/src/io/adapters.ts @@ -0,0 +1,398 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 

import {
    toUint8Array,
    joinUint8Arrays,
    ArrayBufferViewInput,
    toUint8ArrayIterator,
    toUint8ArrayAsyncIterator
} from '../util/buffer';

import { ReadableDOMStreamOptions } from './interfaces';

interface ReadableStreamReadResult<T> { done: boolean; value: T }
type Uint8ArrayGenerator = Generator<Uint8Array, null, { cmd: 'peek' | 'read'; size: number }>;
type AsyncUint8ArrayGenerator = AsyncGenerator<Uint8Array, null, { cmd: 'peek' | 'read'; size: number }>;

/**
 * Adapters that wrap a byte source in a generator driven by `{ cmd, size }`
 * commands passed to `next()`. Every `from*` adapter is "pumped" once, so the
 * generator returned to the caller is already parked at its first `yield` and
 * the first command the caller sends is the one that gets acted on.
 *
 * `toDOMStream` and `toNodeStream` are placeholders that throw; they are meant
 * to be overwritten by an environment-specific entry point.
 * @ignore
 */
export default {
    fromIterable<T extends ArrayBufferViewInput>(source: Iterable<T> | T): Uint8ArrayGenerator {
        return pump(fromIterable<T>(source));
    },
    fromAsyncIterable<T extends ArrayBufferViewInput>(source: AsyncIterable<T> | PromiseLike<T>): AsyncUint8ArrayGenerator {
        return pump(fromAsyncIterable<T>(source));
    },
    fromDOMStream<T extends ArrayBufferViewInput>(source: ReadableStream<T>): AsyncUint8ArrayGenerator {
        return pump(fromDOMStream<T>(source));
    },
    fromNodeStream(stream: NodeJS.ReadableStream): AsyncUint8ArrayGenerator {
        return pump(fromNodeStream(stream));
    },
    // @ts-ignore
    toDOMStream<T>(source: Iterable<T> | AsyncIterable<T>, options?: ReadableDOMStreamOptions): ReadableStream<T> {
        throw new Error(`"toDOMStream" not available in this environment`);
    },
    // @ts-ignore
    toNodeStream<T>(source: Iterable<T> | AsyncIterable<T>, options?: import('stream').ReadableOptions): import('stream').Readable {
        throw new Error(`"toNodeStream" not available in this environment`);
    },
};

/**
 * Advances a freshly created generator to its first `yield` and returns it.
 * @ignore
 */
const pump = <T extends Uint8ArrayGenerator | AsyncUint8ArrayGenerator>(iterator: T) => { iterator.next(); return iterator; };

/**
 * Command-driven byte generator over a synchronous source.
 *
 * Chunks pulled from the source are queued until at least `size` bytes are
 * buffered (or the source is done); the generator then yields a byte range
 * for each command it receives while the request can be served from the queue.
 * @ignore
 */
function* fromIterable<T extends ArrayBufferViewInput>(source: Iterable<T> | T): Uint8ArrayGenerator {

    let done: boolean | undefined, threw = false;
    let buffers: Uint8Array[] = [], buffer: Uint8Array;
    let cmd: 'peek' | 'read', size: number, bufferLength = 0;

    // 'peek' returns the joined bytes without touching the queue; 'read'
    // additionally replaces the queue and its length with what is left over.
    function byteRange() {
        if (cmd === 'peek') {
            return joinUint8Arrays(buffers, size)[0];
        }
        [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size);
        return buffer;
    }

    // Yield so the caller can inject the read command before creating the source Iterator
    ({ cmd, size } = yield <any> null);

    // initialize the iterator
    const it = toUint8ArrayIterator(source)[Symbol.iterator]();

    try {
        do {
            // read the next value, hinting how many more bytes are wanted
            // (no hint when the remaining size is NaN)
            ({ done, value: buffer } = isNaN(size - bufferLength) ?
                it.next(undefined) : it.next(size - bufferLength));
            // if chunk is not null or empty, push it onto the queue
            if (!done && buffer.byteLength > 0) {
                buffers.push(buffer);
                bufferLength += buffer.byteLength;
            }
            // If we have enough bytes in our buffer, yield chunks until we don't
            if (done || size <= bufferLength) {
                do {
                    ({ cmd, size } = yield byteRange());
                } while (size < bufferLength);
            }
        } while (!done);
    } catch (e) {
        // NOTE(review): the error is forwarded to the source iterator and is
        // not rethrown from here unless `it.throw` itself throws.
        (threw = true) && (typeof it.throw === 'function') && (it.throw(e));
    } finally {
        (threw === false) && (typeof it.return === 'function') && (it.return(null!));
    }
    return null;
}

/**
 * Command-driven byte generator over an async iterable (or a promise of a
 * single value). Same buffering protocol as `fromIterable`, with every pull
 * from the source awaited.
 * @ignore
 */
async function* fromAsyncIterable<T extends ArrayBufferViewInput>(source: AsyncIterable<T> | PromiseLike<T>): AsyncUint8ArrayGenerator {

    let done: boolean | undefined, threw = false;
    let buffers: Uint8Array[] = [], buffer: Uint8Array;
    let cmd: 'peek' | 'read', size: number, bufferLength = 0;

    // 'peek' returns the joined bytes without touching the queue; 'read'
    // additionally replaces the queue and its length with what is left over.
    function byteRange() {
        if (cmd === 'peek') {
            return joinUint8Arrays(buffers, size)[0];
        }
        [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size);
        return buffer;
    }

    // Yield so the caller can inject the read command before creating the source AsyncIterator
    ({ cmd, size } = (yield <any> null)!);

    // initialize the iterator
    const it = toUint8ArrayAsyncIterator(source)[Symbol.asyncIterator]();

    try {
        do {
            // read the next value
            ({ done, value: buffer } = isNaN(size - bufferLength)
                ? await it.next(undefined)
                : await it.next(size - bufferLength));
            // if chunk is not null or empty, push it onto the queue
            if (!done && buffer.byteLength > 0) {
                buffers.push(buffer);
                bufferLength += buffer.byteLength;
            }
            // If we have enough bytes in our buffer, yield chunks until we don't
            if (done || size <= bufferLength) {
                do {
                    ({ cmd, size } = yield byteRange());
                } while (size < bufferLength);
            }
        } while (!done);
    } catch (e) {
        // NOTE(review): forwarded to the source iterator, not rethrown here
        // unless `it.throw` itself rejects.
        (threw = true) && (typeof it.throw === 'function') && (await it.throw(e));
    } finally {
        (threw === false) && (typeof it.return === 'function') && (await it.return(new Uint8Array(0)));
    }
    return null;
}

// All this manual Uint8Array chunk management can be avoided if/when engines
// add support for ArrayBuffer.transfer() or ArrayBuffer.prototype.realloc():
// https://github.com/domenic/proposal-arraybuffer-transfer
/**
 * Command-driven byte generator over a WHATWG `ReadableStream`. The stream is
 * only locked (via `AdaptiveByteReader`) once the first command has arrived.
 * @ignore
 */
async function* fromDOMStream<T extends ArrayBufferViewInput>(source: ReadableStream<T>): AsyncUint8ArrayGenerator {

    let done = false, threw = false;
    let buffers: Uint8Array[] = [], buffer: Uint8Array;
    let cmd: 'peek' | 'read', size: number, bufferLength = 0;

    // 'peek' returns the joined bytes without touching the queue; 'read'
    // additionally replaces the queue and its length with what is left over.
    function byteRange() {
        if (cmd === 'peek') {
            return joinUint8Arrays(buffers, size)[0];
        }
        [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size);
        return buffer;
    }

    // Yield so the caller can inject the read command before we establish the ReadableStream lock
    ({ cmd, size } = yield <any> null);

    // initialize the reader and lock the stream
    const it = new AdaptiveByteReader(source);

    try {
        do {
            // read the next value
            ({ done, value: buffer } = isNaN(size - bufferLength)
                ? await it['read'](undefined)
                : await it['read'](size - bufferLength));
            // if chunk is not null or empty, push it onto the queue
            if (!done && buffer.byteLength > 0) {
                buffers.push(toUint8Array(buffer));
                bufferLength += buffer.byteLength;
            }
            // If we have enough bytes in our buffer, yield chunks until we don't
            if (done || size <= bufferLength) {
                do {
                    ({ cmd, size } = yield byteRange());
                } while (size < bufferLength);
            }
        } while (!done);
    } catch (e) {
        (threw = true) && (await it['cancel'](e));
    } finally {
        // On normal completion cancel the reader; after an error the reader
        // was already cancelled above, so only release a lock that is still held.
        (threw === false) ? (await it['cancel']())
            : source['locked'] && it.releaseLock();
    }
    return null;
}

/**
 * Reader over a `ReadableStream` that tries to acquire a BYOB reader first
 * and falls back to the default reader when acquiring one throws. At most one
 * of the two readers is held at a time; switching releases the other's lock.
 * @ignore
 */
class AdaptiveByteReader<T extends ArrayBufferViewInput> {

    private supportsBYOB: boolean;
    private byobReader: ReadableStreamBYOBReader | null = null;
    private defaultReader: ReadableStreamDefaultReader<T> | null = null;
    private reader: ReadableStreamBYOBReader | ReadableStreamDefaultReader<T> | null;

    constructor(private source: ReadableStream<T>) {
        try {
            this.supportsBYOB = !!(this.reader = this.getBYOBReader());
        } catch (e) {
            // Acquiring the default reader yields a truthy object, so this
            // deliberately evaluates to `false`.
            this.supportsBYOB = !(this.reader = this.getDefaultReader());
        }
    }

    /** Resolves when the current reader closes; rejections are swallowed. */
    get closed(): Promise<void> {
        return this.reader ? this.reader['closed'].catch(() => {}) : Promise.resolve();
    }

    /** Releases the lock held by the current reader (if any) and forgets all readers. */
    releaseLock(): void {
        if (this.reader) {
            this.reader.releaseLock();
        }
        this.reader = this.byobReader = this.defaultReader = null;
    }

    /** Cancels the current reader (ignoring rejection), then releases the lock if the stream is still locked. */
    async cancel(reason?: any): Promise<void> {
        const { reader, source } = this;
        reader && (await reader['cancel'](reason).catch(() => {}));
        source && (source['locked'] && this.releaseLock());
    }

    /**
     * Reads the next chunk.
     * @param size Number of bytes wanted. `0` returns an empty chunk without
     * touching the stream; a non-number (or no BYOB support) reads whatever
     * chunk the default reader delivers.
     */
    async read(size?: number): Promise<ReadableStreamReadResult<Uint8Array>> {
        if (size === 0) {
            return { done: this.reader == null, value: new Uint8Array(0) };
        }
        const result = !this.supportsBYOB || typeof size !== 'number'
            ? await this.getDefaultReader().read()
            : await this.readFromBYOBReader(size);
        !result.done && (result.value = toUint8Array(result as ReadableStreamReadResult<Uint8Array>));
        return result as ReadableStreamReadResult<Uint8Array>;
    }

    private getDefaultReader() {
        if (this.byobReader) { this.releaseLock(); }
        if (!this.defaultReader) {
            this.defaultReader = this.source['getReader']();
            // We have to catch and swallow errors here to avoid uncaught promise rejection exceptions
            // that seem to be raised when we call `releaseLock()` on this reader. I'm still mystified
            // about why these errors are raised, but I'm sure there's some important spec reason that
            // I haven't considered. I hate to employ such an anti-pattern here, but it seems like the
            // only solution in this case :/
            this.defaultReader['closed'].catch(() => {});
        }
        return (this.reader = this.defaultReader);
    }

    private getBYOBReader() {
        if (this.defaultReader) { this.releaseLock(); }
        if (!this.byobReader) {
            this.byobReader = this.source['getReader']({ mode: 'byob' });
            // We have to catch and swallow errors here to avoid uncaught promise rejection exceptions
            // that seem to be raised when we call `releaseLock()` on this reader. I'm still mystified
            // about why these errors are raised, but I'm sure there's some important spec reason that
            // I haven't considered. I hate to employ such an anti-pattern here, but it seems like the
            // only solution in this case :/
            this.byobReader['closed'].catch(() => {});
        }
        return (this.reader = this.byobReader);
    }

    // This strategy plucked from the example in the streams spec:
    // https://streams.spec.whatwg.org/#example-manual-read-bytes
    private async readFromBYOBReader(size: number) {
        return await readInto(this.getBYOBReader(), new ArrayBuffer(size), 0, size);
    }
}

/**
 * Fills `buffer` from a BYOB reader until `size` bytes have been read or the
 * stream is done, re-issuing the read with the buffer handed back by each
 * previous read.
 *
 * @param reader BYOB reader to pull from.
 * @param buffer Backing buffer for the read.
 * @param offset Number of bytes already written into `buffer`.
 * @param size Total number of bytes wanted.
 * @returns The bytes read so far as a view starting at byte 0, with the
 * stream's `done` flag. If the reader hands back no view at all (as it does
 * when the stream was cancelled), an empty, done result is returned instead
 * of dereferencing the missing view.
 * @ignore
 */
async function readInto(reader: ReadableStreamBYOBReader, buffer: ArrayBufferLike, offset: number, size: number): Promise<ReadableStreamReadResult<Uint8Array>> {
    if (offset >= size) {
        return { done: false, value: new Uint8Array(buffer, 0, size) };
    }
    const { done, value } = await reader.read(new Uint8Array(buffer, offset, size - offset));
    if (value == null) {
        // No view came back, so there is nothing to measure or re-wrap; the
        // buffer passed to `read` must not be reused after the call either.
        return { done: true, value: new Uint8Array(0) };
    }
    if (((offset += value.byteLength) < size) && !done) {
        return await readInto(reader, value.buffer, offset, size);
    }
    return { done, value: new Uint8Array(value.buffer, 0, offset) };
}

/** @ignore */
type EventName = 'end' | 'error' | 'readable';
/** @ignore */
type Event = [EventName, (_: any) => void, Promise<[EventName, Error | null]>];
/**
 * Registers a one-shot listener for `event` on `stream`.
 * @returns A tuple of the event name, the registered handler (so it can be
 * removed later), and a promise resolving to `[event, payload]` when it fires.
 * @ignore
 */
const onEvent = <T extends string>(stream: NodeJS.ReadableStream, event: T) => {
    const handler = (_: any) => resolve([event, _]);
    let resolve: (value?: [T, any] | PromiseLike<[T, any]>) => void;
    return [event, handler, new Promise<[T, any]>(
        (r) => (resolve = r) && stream['once'](event, handler)
    )] as Event;
};

/**
 * Command-driven byte generator over a Node.js readable stream. Listeners are
 * attached only after the first command arrives; a TTY stream is treated as
 * empty. On exit the listeners are removed and the stream's `destroy` (when
 * present) is called.
 * @ignore
 */
async function* fromNodeStream(stream: NodeJS.ReadableStream): AsyncUint8ArrayGenerator {

    const events: Event[] = [];
    let event: EventName = 'error';
    let done = false, err: Error | null = null;
    let cmd: 'peek' | 'read', size: number, bufferLength = 0;
    let buffers: Uint8Array[] = [], buffer: Uint8Array | Buffer | string;

    // 'peek' returns the joined bytes without touching the queue; 'read'
    // additionally replaces the queue and its length with what is left over.
    function byteRange() {
        if (cmd === 'peek') {
            return joinUint8Arrays(buffers, size)[0];
        }
        [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size);
        return buffer;
    }

    // Yield so the caller can inject the read command before we
    // add the listener for the source stream's 'readable' event.
    ({ cmd, size } = yield <any> null);

    // ignore stdin if it's a TTY
    if ((stream as any)['isTTY']) {
        yield new Uint8Array(0);
        return null;
    }

    try {
        // initialize the stream event handlers
        events[0] = onEvent(stream, 'end');
        events[1] = onEvent(stream, 'error');

        do {
            events[2] = onEvent(stream, 'readable');

            // wait on the first message event from the stream
            [event, err] = await Promise.race(events.map((x) => x[2]));

            // if the stream emitted an Error, stop reading; the error is
            // handed to `cleanup` in the `finally` block below
            if (event === 'error') { break; }
            if (!(done = event === 'end')) {
                // If the remaining size is not finite (NaN or Infinity), request
                // to read everything in the stream's internal buffer
                if (!isFinite(size - bufferLength)) {
                    buffer = toUint8Array(stream['read'](undefined));
                } else {
                    buffer = toUint8Array(stream['read'](size - bufferLength));
                    // If fewer bytes came back than were requested, then the requested amount is
                    // more than the stream has in its internal buffer. In this case the stream
                    // needs a "kick" to tell it to continue emitting readable events, so request
                    // to read everything the stream has in its internal buffer right now.
                    if (buffer.byteLength < (size - bufferLength)) {
                        buffer = toUint8Array(stream['read'](undefined));
                    }
                }
                // if chunk is not null or empty, push it onto the queue
                if (buffer.byteLength > 0) {
                    buffers.push(buffer);
                    bufferLength += buffer.byteLength;
                }
            }
            // If we have enough bytes in our buffer, yield chunks until we don't
            if (done || size <= bufferLength) {
                do {
                    ({ cmd, size } = yield byteRange());
                } while (size < bufferLength);
            }
        } while (!done);
    } finally {
        await cleanup(events, event === 'error' ? err : null);
    }

    return null;

    // Drops the buffered chunks, removes every registered listener and calls
    // the stream's `destroy` with `err`. The returned promise rejects only if
    // `destroy` itself throws.
    function cleanup<T extends Error | null | void>(events: Event[], err?: T) {
        buffer = buffers = <any> null;
        return new Promise<T>((resolve, reject) => {
            for (const [evt, fn] of events) {
                stream['off'](evt, fn);
            }
            try {
                // Some stream implementations don't call the destroy callback,
                // because it's really a node-internal API. Just calling `destroy`
                // here should be enough to conform to the ReadableStream contract
                const destroy = (stream as any)['destroy'];
                destroy && destroy.call(stream, err);
                err = undefined;
            } catch (e) { err = e || err; } finally {
                err != null ? reject(err) : resolve();
            }
        });
    }
}
diff --git a/src/arrow/js/src/io/file.ts b/src/arrow/js/src/io/file.ts new file mode 100644 index 000000000..20b7dbf02 --- /dev/null +++ b/src/arrow/js/src/io/file.ts @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.

import { FileHandle } from './interfaces';
import { ByteStream, AsyncByteStream } from './stream';
import { ArrayBufferViewInput, toUint8Array } from '../util/buffer';

/**
 * Random-access, cursor-based reader over an in-memory byte buffer.
 * @ignore
 */
export class RandomAccessFile extends ByteStream {
    public size: number;
    public position = 0;
    protected buffer: Uint8Array | null;
    /**
     * @param buffer Bytes to read from.
     * @param byteLength Logical size; defaults to the buffer's byte length.
     */
    constructor(buffer: ArrayBufferViewInput, byteLength?: number) {
        super();
        this.buffer = toUint8Array(buffer);
        this.size = typeof byteLength === 'undefined' ? this.buffer.byteLength : byteLength;
    }
    /** Reads a little-endian 32-bit signed integer at `position` without moving the cursor. */
    public readInt32(position: number) {
        const { buffer, byteOffset } = this.readAt(position, 4);
        return new DataView(buffer, byteOffset).getInt32(0, true);
    }
    /**
     * Moves the cursor to `position`, clamped to `size`.
     * @returns `true` when `position` is before the end.
     */
    public seek(position: number) {
        this.position = Math.min(position, this.size);
        return position < this.size;
    }
    /**
     * Reads up to `nBytes` bytes from the cursor and advances it.
     * @param nBytes Byte count; anything that is not a number means "to the end".
     * @returns A view over the bytes, or `null` when closed or at the end.
     */
    public read(nBytes?: number | null) {
        const { buffer, size, position } = this;
        if (buffer && position < size) {
            if (typeof nBytes !== 'number') { nBytes = Infinity; }
            this.position = Math.min(size,
                position + Math.min(size - position, nBytes));
            return buffer.subarray(position, this.position);
        }
        return null;
    }
    /**
     * Returns a view over `[position, min(size, position + nBytes))` without
     * moving the cursor, or a zero-filled `nBytes` buffer when closed.
     */
    public readAt(position: number, nBytes: number) {
        const buf = this.buffer;
        const end = Math.min(this.size, position + nBytes);
        return buf ? buf.subarray(position, end) : new Uint8Array(nBytes);
    }
    /** Drops the reference to the underlying buffer. */
    public close() { this.buffer && (this.buffer = null); }
    public throw(value?: any) { this.close(); return { done: true, value }; }
    public return(value?: any) { this.close(); return { done: true, value }; }
}

/**
 * Random-access, cursor-based reader over a file handle. When no byte length
 * is given, the size is obtained from `file.stat()`; every method that needs
 * `size` first awaits that pending lookup.
 * @ignore
 */
export class AsyncRandomAccessFile extends AsyncByteStream {
    public size!: number;
    public position = 0;
    public _pending?: Promise<void>;
    protected _handle: FileHandle | null;
    /**
     * @param file Open file handle to read from.
     * @param byteLength Logical size; when omitted it is read from `file.stat()`.
     */
    constructor(file: FileHandle, byteLength?: number) {
        super();
        this._handle = file;
        if (typeof byteLength === 'number') {
            this.size = byteLength;
        } else {
            this._pending = (async () => {
                this.size = (await file.stat()).size;
                delete this._pending;
            })();
        }
    }
    /** Reads a little-endian 32-bit signed integer at `position` without moving the cursor. */
    public async readInt32(position: number) {
        const { buffer, byteOffset } = await this.readAt(position, 4);
        return new DataView(buffer, byteOffset).getInt32(0, true);
    }
    /**
     * Moves the cursor to `position`, clamped to `size`.
     * @returns `true` when `position` is before the end.
     */
    public async seek(position: number) {
        this._pending && await this._pending;
        this.position = Math.min(position, this.size);
        return position < this.size;
    }
    /**
     * Reads up to `nBytes` bytes from the cursor and advances it.
     * @param nBytes Byte count; anything that is not a number means "to the end".
     * @returns The bytes read, or `null` when closed, at the end, or when the
     * file yields no bytes at all. If the file ends before `size`, only the
     * bytes actually read are returned and the cursor stops after them.
     */
    public async read(nBytes?: number | null) {
        this._pending && await this._pending;
        const { _handle: file, size, position } = this;
        if (file && position < size) {
            if (typeof nBytes !== 'number') { nBytes = Infinity; }
            const end = Math.min(size, position + Math.min(size - position, nBytes));
            const buffer = new Uint8Array(Math.max(0, (this.position = end) - position));
            let offset = 0;
            while (offset < buffer.byteLength) {
                const { bytesRead } = await file.read(buffer, offset, buffer.byteLength - offset, position + offset);
                // A zero-byte read means the file ended early; without this
                // check the loop would never make progress.
                if (bytesRead <= 0) { break; }
                offset += bytesRead;
            }
            if (offset < buffer.byteLength) {
                this.position = position + offset;
                return offset > 0 ? buffer.subarray(0, offset) : null;
            }
            return buffer;
        }
        return null;
    }
    /**
     * Reads exactly `nBytes` bytes starting at `position` without moving the
     * cursor. A range that ends at or before `size` is read from the file
     * (including one that ends exactly at the end); otherwise, or when closed,
     * a zero-filled `nBytes` buffer is returned.
     */
    public async readAt(position: number, nBytes: number) {
        this._pending && await this._pending;
        const { _handle: file, size } = this;
        if (file && (position + nBytes) <= size) {
            const buffer = new Uint8Array(nBytes);
            let offset = 0;
            // A single read may deliver fewer bytes than asked for, so keep
            // reading until the range is filled or the file has nothing more.
            while (offset < nBytes) {
                const { bytesRead } = await file.read(buffer, offset, nBytes - offset, position + offset);
                if (bytesRead <= 0) { break; }
                offset += bytesRead;
            }
            return buffer;
        }
        return new Uint8Array(nBytes);
    }
    /** Clears the handle reference first, then closes the handle if there was one. */
    public async close() { const f = this._handle; this._handle = null; f && await f.close(); }
    public async throw(value?: any) { await this.close(); return { done: true, value }; }
    public async return(value?: any) { await this.close(); return { done: true, value }; }
}
diff --git a/src/arrow/js/src/io/interfaces.ts b/src/arrow/js/src/io/interfaces.ts new file mode 100644 index 000000000..4b5641ff1 --- /dev/null +++ b/src/arrow/js/src/io/interfaces.ts @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations
// under the License.
import streamAdapters from './adapters';

/**
 * Frozen `{ done: true, value: undefined }` iterator result, shared by the
 * streams in this module to signal completion.
 * @ignore
 */
export const ITERATOR_DONE: any = Object.freeze({ done: true, value: void (0) });

/**
 * Alias for the promise-based file handle type of Node's `fs` module.
 * @ignore
 */
export type FileHandle = import('fs').promises.FileHandle;
/**
 * Plain-object shape accepted by {@link ArrowJSON}.
 * @ignore
 */
export type ArrowJSONLike = { schema: any; batches?: any[]; dictionaries?: any[] };
/**
 * Options forwarded to `toDOMStream()` implementations.
 * @ignore
 */
export type ReadableDOMStreamOptions = { type: 'bytes' | undefined; autoAllocateChunkSize?: number; highWaterMark?: number };

/**
 * Thin wrapper around an {@link ArrowJSONLike} object that substitutes an
 * empty array for a missing `batches` or `dictionaries` entry.
 * @ignore
 */
export class ArrowJSON {
    constructor(private _json: ArrowJSONLike) {}

    /** The `schema` entry of the wrapped object, returned as-is. */
    public get schema(): any {
        return this._json['schema'];
    }

    /** The `batches` entry, or an empty array when it is absent/falsy. */
    public get batches(): any[] {
        const batches = this._json['batches'];
        return (batches ? batches : []) as any[];
    }

    /** The `dictionaries` entry, or an empty array when it is absent/falsy. */
    public get dictionaries(): any[] {
        const dictionaries = this._json['dictionaries'];
        return (dictionaries ? dictionaries : []) as any[];
    }
}

/**
 * Contract for an asynchronous, pull-based source of `T` values.
 * @ignore
 */
export interface Readable<T> {

    readonly closed: Promise<void>;
    cancel(reason?: any): Promise<void>;

    read(size?: number | null): Promise<T | null>;
    peek(size?: number | null): Promise<T | null>;
    throw(value?: any): Promise<IteratorResult<any>>;
    return(value?: any): Promise<IteratorResult<any>>;
    next(size?: number | null): Promise<IteratorResult<T>>;
}

/**
 * Contract for a sink that accepts `T` values.
 * @ignore
 */
export interface Writable<T> {
    readonly closed: Promise<void>;
    close(): void;
    write(chunk: T): void;
    abort(reason?: any): void;
}

/**
 * A {@link Readable} and {@link Writable} pair that is also async-iterable
 * and convertible to DOM / Node streams.
 * @ignore
 */
export interface ReadableWritable<TReadable, TWritable> extends Readable<TReadable>, Writable<TWritable> {
    [Symbol.asyncIterator](): AsyncIterableIterator<TReadable>;
    toDOMStream(options?: ReadableDOMStreamOptions): ReadableStream<TReadable>;
    toNodeStream(options?: import('stream').ReadableOptions): import('stream').Readable;
}

/**
 * Base class providing `tee`/`pipe`/`pipeTo`/`pipeThrough` on top of the two
 * abstract conversions. Each conversion is performed at most once per
 * instance; the resulting stream is cached and reused by later calls.
 * @ignore
 */
export abstract class ReadableInterop<T> {

    public abstract toDOMStream(options?: ReadableDOMStreamOptions): ReadableStream<T>;
    public abstract toNodeStream(options?: import('stream').ReadableOptions): import('stream').Readable;

    /** Tees the (cached) DOM stream. */
    public tee(): [ReadableStream<T>, ReadableStream<T>] {
        const domStream = this._getDOMStream();
        return domStream.tee();
    }

    /** Pipes the (cached) Node stream into `writable`. */
    public pipe<R extends NodeJS.WritableStream>(writable: R, options?: { end?: boolean }) {
        const nodeStream = this._getNodeStream();
        return nodeStream.pipe(writable, options);
    }

    /** Pipes the (cached) DOM stream into `writable`. */
    public pipeTo(writable: WritableStream<T>, options?: PipeOptions) {
        const domStream = this._getDOMStream();
        return domStream.pipeTo(writable, options);
    }

    /** Pipes the (cached) DOM stream through `duplex`. */
    public pipeThrough<R extends ReadableStream<any>>(duplex: { writable: WritableStream<T>; readable: R }, options?: PipeOptions) {
        const domStream = this._getDOMStream();
        return domStream.pipeThrough(duplex, options);
    }

    protected _DOMStream?: ReadableStream<T>;
    private _getDOMStream() {
        let cached = this._DOMStream;
        if (!cached) {
            cached = this._DOMStream = this.toDOMStream();
        }
        return cached;
    }

    protected _nodeStream?: import('stream').Readable;
    private _getNodeStream() {
        let cached = this._nodeStream;
        if (!cached) {
            cached = this._nodeStream = this.toNodeStream();
        }
        return cached;
    }
}

/**
 * The `resolve`/`reject` pair of a promise handed to a waiting reader.
 * @ignore
 */
type Resolution<T> = { resolve: (value?: T | PromiseLike<T>) => void; reject: (reason?: any) => void };

/**
 * Unbounded async FIFO. Writers push values, readers pull them through the
 * async-iterator protocol. A value written while a reader is waiting is
 * handed straight to that reader; otherwise it is buffered in `_values`.
 * @ignore
 */
export class AsyncQueue<TReadable = Uint8Array, TWritable = TReadable> extends ReadableInterop<TReadable>
    implements AsyncIterableIterator<TReadable>, ReadableWritable<TReadable, TWritable> {

    protected _values: TWritable[] = [];
    protected _error?: { error: any };
    protected _closedPromise: Promise<void>;
    // Doubles as the "is open" flag: defined while open, undefined once closed.
    protected _closedPromiseResolve?: (value?: any) => void;
    protected resolvers: Resolution<IteratorResult<TReadable>>[] = [];

    constructor() {
        super();
        this._closedPromise = new Promise((resolve) => this._closedPromiseResolve = resolve);
    }

    /** Promise resolved by `close()`. */
    public get closed(): Promise<void> { return this._closedPromise; }

    /** Closes the queue (via `return`). */
    public async cancel(reason?: any) { await this.return(reason); }

    /**
     * Delivers `value` to the oldest waiting reader, or buffers it when no
     * reader is waiting. Throws if the queue has been closed.
     */
    public write(value: TWritable) {
        if (!this._ensureOpen()) { return; }
        if (this.resolvers.length > 0) {
            this.resolvers.shift()!.resolve({ done: false, value } as any);
        } else {
            this._values.push(value);
        }
    }

    /**
     * While the queue is open: rejects the oldest waiting reader with
     * `{ done: true, value }`, or, when nobody is waiting, stores the error
     * so that a later `next()` rejects with it. No-op once closed.
     */
    public abort(value?: any) {
        if (!this._closedPromiseResolve) { return; }
        if (this.resolvers.length > 0) {
            this.resolvers.shift()!.reject({ done: true, value });
        } else {
            this._error = { error: value };
        }
    }

    /**
     * Completes every waiting reader with `ITERATOR_DONE`, resolves the
     * `closed` promise, and marks the queue closed. No-op if already closed.
     */
    public close() {
        const settleClosed = this._closedPromiseResolve;
        if (!settleClosed) { return; }
        for (const waiter of this.resolvers.splice(0)) {
            waiter.resolve(ITERATOR_DONE);
        }
        settleClosed();
        this._closedPromiseResolve = undefined;
    }

    public [Symbol.asyncIterator]() { return this; }

    /**
     * Converts to a DOM stream. While the queue is open (or holds a stored
     * error) the queue itself is the async source; once closed without an
     * error, the remaining buffered values are used as a plain iterable.
     */
    public toDOMStream(options?: ReadableDOMStreamOptions) {
        const useLiveQueue = this._closedPromiseResolve || this._error;
        return streamAdapters.toDOMStream(
            useLiveQueue
                ? (this as AsyncIterable<TReadable>)
                : (this._values as any) as Iterable<TReadable>,
            options);
    }

    /** Node-stream counterpart of {@link toDOMStream}; same source selection. */
    public toNodeStream(options?: import('stream').ReadableOptions) {
        const useLiveQueue = this._closedPromiseResolve || this._error;
        return streamAdapters.toNodeStream(
            useLiveQueue
                ? (this as AsyncIterable<TReadable>)
                : (this._values as any) as Iterable<TReadable>,
            options);
    }

    /** Aborts the queue with the given value and reports completion. */
    public async throw(_?: any) {
        await this.abort(_);
        return ITERATOR_DONE;
    }

    /** Closes the queue and reports completion. */
    public async return(_?: any) {
        await this.close();
        return ITERATOR_DONE;
    }

    public async read(size?: number | null): Promise<TReadable | null> {
        const result = await this.next(size, 'read');
        return result.value;
    }

    public async peek(size?: number | null): Promise<TReadable | null> {
        const result = await this.next(size, 'peek');
        return result.value;
    }

    /**
     * Resolution order: buffered value, then stored error (rejects), then
     * "done" if closed; otherwise the caller is parked until a later
     * `write`/`abort`/`close` settles it. Arguments are ignored here.
     */
    public next(..._args: any[]): Promise<IteratorResult<TReadable>> {
        if (this._values.length > 0) {
            return Promise.resolve({ done: false, value: this._values.shift()! } as any);
        }
        if (this._error) {
            return Promise.reject({ done: true, value: this._error.error });
        }
        if (!this._closedPromiseResolve) {
            return Promise.resolve(ITERATOR_DONE);
        }
        return new Promise<IteratorResult<TReadable>>((resolve, reject) => {
            this.resolvers.push({ resolve, reject });
        });
    }

    /** Returns `true` while open; throws once the queue has been closed. */
    protected _ensureOpen() {
        if (!this._closedPromiseResolve) {
            throw new Error(`AsyncQueue is closed`);
        }
        return true;
    }
}

// diff --git a/src/arrow/js/src/io/node/builder.ts b/src/arrow/js/src/io/node/builder.ts
// new file mode 100644
// index 000000000..eb9579536
// --- /dev/null
// +++ b/src/arrow/js/src/io/node/builder.ts
// @@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { Duplex } from 'stream';
import { DataType } from '../../type';
import { Builder, BuilderOptions } from '../../builder/index';

/**
 * Options accepted by {@link builderThroughNodeStream}: the builder options
 * plus stream tuning knobs.
 * @ignore
 */
export interface BuilderDuplexOptions<T extends DataType = any, TNull = any> extends BuilderOptions<T, TNull> {
    autoDestroy?: boolean;
    highWaterMark?: number;
    queueingStrategy?: 'bytes' | 'count';
    dictionaryHashFunction?: (value: any) => string | number;
    valueToChildTypeId?: (builder: Builder<T, TNull>, value: any, offset: number) => number;
}

/**
 * Creates a Node `Duplex` whose writable side accepts values and whose
 * readable side emits the vectors flushed from a `Builder` created from
 * `options`.
 * @ignore
 */
export function builderThroughNodeStream<T extends DataType = any, TNull = any>(options: BuilderDuplexOptions<T, TNull>) {
    const builder = Builder.new(options);
    return new BuilderDuplex(builder, options);
}

/** @ignore */
type CB = (error?: Error | null | undefined) => void;

/**
 * Measures a builder by element count (used for the 'count' strategy).
 * @ignore
 */
function builderLength<T extends DataType = any>(builder: Builder<T>) {
    return builder.length;
}

/**
 * Measures a builder by byte length (used for the 'bytes' strategy).
 * @ignore
 */
function builderByteLength<T extends DataType = any>(builder: Builder<T>) {
    return builder.byteLength;
}

/**
 * Object-mode duplex around a `Builder`. Written values are appended to the
 * builder; whenever the builder's measured size reaches the desired size its
 * contents are pushed to the readable side as a vector.
 * @ignore
 */
class BuilderDuplex<T extends DataType = any, TNull = any> extends Duplex {

    private _finished: boolean;
    private _numChunks: number;
    private _desiredSize: number;
    private _builder: Builder<T, TNull>;
    private _getSize: (builder: Builder<T, TNull>) => number;

    constructor(builder: Builder<T, TNull>, options: BuilderDuplexOptions<T, TNull>) {

        const { queueingStrategy = 'count', autoDestroy = true } = options;
        const measuredInBytes = queueingStrategy === 'bytes';
        // Default flush threshold: 2 ** 14 bytes, or 1000 elements.
        const { highWaterMark = measuredInBytes ? 2 ** 14 : 1000 } = options;

        // The underlying stream is always created with a high-water mark of 1;
        // the caller's `highWaterMark` only seeds `_desiredSize` below.
        super({ autoDestroy, highWaterMark: 1, allowHalfOpen: true, writableObjectMode: true, readableObjectMode: true });

        this._numChunks = 0;
        this._finished = false;
        this._builder = builder;
        this._desiredSize = highWaterMark;
        this._getSize = measuredInBytes ? builderByteLength : builderLength;
    }

    /** Records the requested size as the new flush threshold and tries to flush. */
    _read(size: number) {
        this._desiredSize = size;
        this._maybeFlush(this._builder, size);
    }

    /** Finishes the builder, flushes what remains, then acknowledges. */
    _final(cb?: CB) {
        const finished = this._builder.finish();
        this._maybeFlush(finished, this._desiredSize);
        if (cb) { cb(); }
    }

    /** Appends one value, flushes if the threshold is reached, then acknowledges. */
    _write(value: any, _: string, cb?: CB) {
        const appended = this._builder.append(value);
        const result = this._maybeFlush(appended, this._desiredSize);
        if (cb) { cb(); }
        return result;
    }

    /** Clears the builder and forwards the error (if any) to the callback. */
    _destroy(err: Error | null, cb?: (error: Error | null) => void) {
        this._builder.clear();
        if (cb) { cb(err); }
    }

    /**
     * Pushes the builder's contents when its measured size is at least
     * `size`. Once the builder is finished, pushes any remainder (or a single
     * vector if nothing was ever pushed), ends the readable side exactly
     * once, and returns `false`. Otherwise returns whether the builder's
     * size is still below the writable high-water mark.
     */
    private _maybeFlush(builder: Builder<T, TNull>, size: number) {
        if (this._getSize(builder) >= size) {
            this._numChunks += 1;
            this.push(builder.toVector());
        }
        if (!builder.finished) {
            return this._getSize(builder) < this.writableHighWaterMark;
        }
        if (builder.length > 0 || this._numChunks === 0) {
            this._numChunks += 1;
            this.push(builder.toVector());
        }
        if (!this._finished) {
            this._finished = true;
            this.push(null);
        }
        return false;
    }
}

// diff --git a/src/arrow/js/src/io/node/iterable.ts b/src/arrow/js/src/io/node/iterable.ts
// new file mode 100644
// index 000000000..457bc894d
// --- /dev/null
// +++ b/src/arrow/js/src/io/node/iterable.ts
// @@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Readable } from 'stream';
import { isIterable, isAsyncIterable } from '../../util/compat';

/** @ignore */
type ReadableOptions = import('stream').ReadableOptions;
/** @ignore */
type SourceIterator<T> = Generator<T, void, number | null>;
/** @ignore */
type AsyncSourceIterator<T> = AsyncGenerator<T, void, number | null>;

/**
 * Wraps an iterable or async iterable in a Node `Readable`.
 *
 * @param source  The values to expose. Async iterables take precedence when
 *                an object implements both protocols.
 * @param options Passed to the `Readable` constructor. Unless `objectMode`
 *                is set, the requested read size is forwarded to the
 *                iterator's `next()` and chunks are counted by `byteLength`.
 * @throws Error when `source` is neither iterable nor async-iterable.
 * @ignore
 */
export function toNodeStream<T>(source: Iterable<T> | AsyncIterable<T>, options?: ReadableOptions): Readable {
    if (isAsyncIterable<T>(source)) { return new AsyncIterableReadable(source[Symbol.asyncIterator]() as AsyncSourceIterator<T>, options); }
    if (isIterable<T>(source)) { return new IterableReadable(source[Symbol.iterator]() as SourceIterator<T>, options); }
    /* istanbul ignore next */
    throw new Error(`toNodeStream() must be called with an Iterable or AsyncIterable`);
}

/**
 * Normalizes an arbitrary thrown/rejected value into an `Error`, as required
 * by the stream destroy callback.
 * @ignore
 */
function toError(reason: unknown): Error {
    return reason instanceof Error ? reason : new Error(String(reason));
}

/**
 * `Readable` fed by a synchronous iterator.
 * @ignore
 */
class IterableReadable<T extends Uint8Array | any> extends Readable {
    private _pulling: boolean;
    private _bytesMode: boolean;
    private _iterator: SourceIterator<T>;
    constructor(it: SourceIterator<T>, options?: ReadableOptions) {
        super(options);
        this._iterator = it;
        this._pulling = false;
        // Byte mode unless the caller explicitly asked for object mode.
        this._bytesMode = !options || !options.objectMode;
    }
    _read(size: number) {
        const it = this._iterator;
        // `_pulling` guards against re-entrant reads while a pull is running.
        if (it && !this._pulling && (this._pulling = true)) {
            this._pulling = this._pull(size, it);
        }
    }
    /**
     * Hands the destroy reason to the iterator (`throw` when there is an
     * error and the iterator supports it, otherwise `return`).
     *
     * The callback is always invoked exactly once. If the iterator throws
     * while being torn down (a generator re-throws the injected error unless
     * it catches it), that error is reported through the callback instead of
     * escaping before the callback has been called.
     */
    _destroy(e: Error | null, cb: (e: Error | null) => void) {
        const it = this._iterator;
        const fn: any = it && ((e != null && it.throw) || it.return);
        try {
            fn?.call(it, e);
        } catch (err) {
            cb && cb(toError(err));
            return;
        }
        cb && cb(null);
    }
    /**
     * Pushes values until the consumer applies back-pressure, the size budget
     * is used up, or the iterator is exhausted.
     * @returns `true` when the stream is no longer readable.
     */
    private _pull(size: number, it: SourceIterator<T>) {
        const bm = this._bytesMode;
        let r: IteratorResult<T> | null = null;
        while (this.readable && !(r = it.next(bm ? size : null)).done) {
            if (size != null) {
                // Byte chunks consume their length; anything else counts as one.
                size -= (bm && ArrayBuffer.isView(r.value) ? r.value.byteLength : 1);
            }
            if (!this.push(r.value) || size <= 0) { break; }
        }
        // End the stream and release the iterator once either side is done.
        if ((r?.done || !this.readable) && (this.push(null) || true)) {
            it.return && it.return();
        }
        return !this.readable;
    }
}

/**
 * `Readable` fed by an asynchronous iterator.
 * @ignore
 */
class AsyncIterableReadable<T extends Uint8Array | any> extends Readable {
    private _pulling: boolean;
    private _bytesMode: boolean;
    private _iterator: AsyncSourceIterator<T>;
    // Set when the iterator itself failed during a pull; see `_destroy`.
    private _sourceError: Error | null;
    constructor(it: AsyncSourceIterator<T>, options?: ReadableOptions) {
        super(options);
        this._iterator = it;
        this._pulling = false;
        this._sourceError = null;
        // Byte mode unless the caller explicitly asked for object mode.
        this._bytesMode = !options || !options.objectMode;
    }
    /**
     * Starts a pull unless one is already in flight. A pull that rejects
     * (the source iterator failed) destroys the stream with that error
     * rather than leaving an unhandled rejection and a stalled stream.
     */
    _read(size: number) {
        const it = this._iterator;
        if (it && !this._pulling && (this._pulling = true)) {
            this._pull(size, it).then(
                (pulling) => { this._pulling = pulling; },
                (reason) => {
                    this._pulling = false;
                    this._sourceError = toError(reason);
                    this.destroy(this._sourceError);
                }
            );
        }
    }
    /**
     * Hands the destroy reason to the iterator (`throw` when there is an
     * error and the iterator supports it, otherwise `return`) and waits for
     * it to settle.
     *
     * The callback is always invoked exactly once:
     * - an error that originated in the iterator is reported directly,
     *   without being fed back into the iterator that produced it;
     * - a rejection (or synchronous throw) from `throw()`/`return()` is
     *   reported through the callback instead of being left unhandled.
     */
    _destroy(e: Error | null, cb: (e: Error | null) => void) {
        const done = (err: Error | null) => { cb && cb(err); };
        if (e != null && e === this._sourceError) {
            done(e);
            return;
        }
        const it = this._iterator;
        const fn: any = it && ((e != null && it.throw) || it.return);
        if (!fn) {
            done(null);
            return;
        }
        let settled: unknown;
        try {
            settled = fn.call(it, e);
        } catch (err) {
            done(toError(err));
            return;
        }
        Promise.resolve(settled).then(
            () => done(null),
            (reason) => done(toError(reason))
        );
    }
    /**
     * Async counterpart of `IterableReadable._pull`.
     * @returns `true` when the stream is no longer readable.
     */
    private async _pull(size: number, it: AsyncSourceIterator<T>) {
        const bm = this._bytesMode;
        let r: IteratorResult<T> | null = null;
        while (this.readable && !(r = await it.next(bm ? size : null)).done) {
            if (size != null) {
                // Byte chunks consume their length; anything else counts as one.
                size -= (bm && ArrayBuffer.isView(r.value) ? r.value.byteLength : 1);
            }
            if (!this.push(r.value) || size <= 0) { break; }
        }
        // End the stream and release the iterator once either side is done.
        if ((r?.done || !this.readable) && (this.push(null) || true)) {
            it.return && it.return();
        }
        return !this.readable;
    }
}

// diff --git a/src/arrow/js/src/io/node/reader.ts b/src/arrow/js/src/io/node/reader.ts
// new file mode 100644
// index 000000000..a51fb0b40
// --- /dev/null
// +++ b/src/arrow/js/src/io/node/reader.ts
// @@ -0,0 +1,86 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { DataType } from '../../type';
import { Duplex, DuplexOptions } from 'stream';
import { RecordBatch } from '../../recordbatch';
import { AsyncByteQueue } from '../../io/stream';
import { RecordBatchReader } from '../../ipc/reader';

/**
 * Creates a Node `Duplex` that accepts bytes on its writable side and emits
 * record batches (object mode) on its readable side.
 * @ignore
 */
export function recordBatchReaderThroughNodeStream<T extends { [key: string]: DataType } = any>(options?: DuplexOptions & { autoDestroy: boolean }) {
    return new RecordBatchReaderDuplex<T>(options);
}

/** @ignore */
type CB = (error?: Error | null | undefined) => void;

/**
 * Duplex that buffers written bytes in an `AsyncByteQueue` and lazily opens
 * a `RecordBatchReader` over that queue on the first read.
 * @ignore
 */
class RecordBatchReaderDuplex<T extends { [key: string]: DataType } = any> extends Duplex {
    private _pulling = false;
    private _autoDestroy = true;
    private _reader: RecordBatchReader | null;
    private _asyncQueue: AsyncByteQueue | null;

    constructor(options?: DuplexOptions & { autoDestroy: boolean }) {
        // The object-mode flags are fixed and override anything in `options`.
        super({ allowHalfOpen: false, ...options, readableObjectMode: true, writableObjectMode: false });
        this._reader = null;
        this._pulling = false;
        this._asyncQueue = new AsyncByteQueue();
        // Only an explicit boolean overrides the default of `true`.
        const hasExplicitFlag = options && (typeof options.autoDestroy === 'boolean');
        this._autoDestroy = hasExplicitFlag ? options!.autoDestroy : true;
    }

    /** Closes the byte queue (if still present), then acknowledges. */
    _final(cb?: CB) {
        const queue = this._asyncQueue;
        queue?.close();
        if (cb) { cb(); }
    }

    /** Forwards the chunk to the byte queue (if still present), then acknowledges. */
    _write(x: any, _: string, cb: CB) {
        const queue = this._asyncQueue;
        queue?.write(x);
        if (cb) { cb(); }
        return true;
    }

    /** Starts a pull unless one is already in flight; opens the reader on first use. */
    _read(size: number) {
        const queue = this._asyncQueue;
        if (!queue || this._pulling) { return; }
        this._pulling = true;
        (async () => {
            if (!this._reader) {
                this._reader = await this._open(queue);
            }
            this._pulling = await this._pull(size, this._reader);
        })();
    }

    /**
     * Aborts (on error) or closes the byte queue, drops the reader and queue
     * references, and completes with `null`.
     */
    _destroy(err: Error | null, cb: (error: Error | null) => void) {
        const queue = this._asyncQueue;
        if (queue) {
            if (err) {
                queue.abort(err);
            } else {
                queue.close();
            }
        }
        this._reader = null;
        this._asyncQueue = null;
        cb(null);
    }

    /** Creates a reader over `source` and opens it with this stream's `autoDestroy` setting. */
    async _open(source: AsyncByteQueue) {
        const reader = await RecordBatchReader.from<T>(source);
        return await reader.open({ autoDestroy: this._autoDestroy });
    }

    /**
     * Pushes up to `size` record batches, stopping early on back-pressure.
     * When the stream is no longer readable, or the reader is exhausted and
     * either auto-destroys or reports `closed` after being reset and
     * re-opened, the readable side is ended and the reader cancelled.
     * @returns `true` when the stream is no longer readable.
     */
    async _pull(size: number, reader: RecordBatchReader<T>) {
        let r: IteratorResult<RecordBatch<T>> | null = null;
        while (this.readable) {
            r = await reader.next();
            if (r.done) { break; }
            const accepted = this.push(r.value);
            if (!accepted || (size != null && --size <= 0)) { break; }
        }
        if (!this.readable || (r?.done && (reader.autoDestroy || (await reader.reset().open()).closed))) {
            this.push(null);
            await reader.cancel();
        }
        return !this.readable;
    }
}

// diff --git a/src/arrow/js/src/io/node/writer.ts b/src/arrow/js/src/io/node/writer.ts
// new file mode 100644
// index 000000000..79d61b9a3
// --- /dev/null
// +++ b/src/arrow/js/src/io/node/writer.ts
// @@ -0,0 +1,77 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { DataType } from '../../type';
import { Duplex, DuplexOptions } from 'stream';
import { AsyncByteStream } from '../../io/stream';
import { RecordBatchWriter } from '../../ipc/writer';

/**
 * Creates a Node `Duplex` that accepts values on its writable side (object
 * mode) and emits the bytes produced by a new writer of the receiver class.
 * Must be invoked with a `RecordBatchWriter` constructor as `this`.
 * @ignore
 */
export function recordBatchWriterThroughNodeStream<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, options?: DuplexOptions & { autoDestroy: boolean }) {
    const writer = new this<T>(options);
    return new RecordBatchWriterDuplex(writer);
}

/** @ignore */
type CB = (error?: Error | null | undefined) => void;

/**
 * Duplex that forwards writes to a `RecordBatchWriter` and reads the
 * writer's output back through an `AsyncByteStream`.
 * @ignore
 */
class RecordBatchWriterDuplex<T extends { [key: string]: DataType } = any> extends Duplex {
    private _pulling = false;
    private _reader: AsyncByteStream | null;
    private _writer: RecordBatchWriter | null;

    constructor(writer: RecordBatchWriter<T>, options?: DuplexOptions) {
        // The object-mode flags are fixed and override anything in `options`.
        super({ allowHalfOpen: false, ...options, writableObjectMode: true, readableObjectMode: false });
        this._writer = writer;
        this._reader = new AsyncByteStream(writer);
    }

    /** Closes the writer (if still present), then acknowledges. */
    _final(cb?: CB) {
        const target = this._writer;
        target?.close();
        if (cb) { cb(); }
    }

    /** Forwards the value to the writer (if still present), then acknowledges. */
    _write(x: any, _: string, cb: CB) {
        const target = this._writer;
        target?.write(x);
        if (cb) { cb(); }
        return true;
    }

    /** Starts a pull unless one is already in flight. */
    _read(size: number) {
        const byteStream = this._reader;
        if (!byteStream || this._pulling) { return; }
        this._pulling = true;
        (async () => this._pulling = await this._pull(size, byteStream))();
    }

    /**
     * Aborts (on error) or closes the writer, drops the writer and reader
     * references, and completes with `null`.
     */
    _destroy(err: Error | null, cb: (error: Error | null) => void) {
        const target = this._writer;
        if (target) {
            if (err) {
                target.abort(err);
            } else {
                target.close();
            }
        }
        this._writer = null;
        this._reader = null;
        cb(null);
    }

    /**
     * Pushes chunks until roughly `size` bytes have been delivered, the
     * consumer applies back-pressure, or the byte stream is exhausted. When
     * either side is done, ends the readable side and cancels the reader.
     * @returns `true` when the stream is no longer readable.
     */
    async _pull(size: number, reader: AsyncByteStream) {
        let r: IteratorResult<Uint8Array> | null = null;
        while (this.readable) {
            r = await reader.next(size || null);
            if (r.done) { break; }
            if (size != null && r.value) {
                size -= r.value.byteLength;
            }
            const accepted = this.push(r.value);
            if (!accepted || size <= 0) { break; }
        }
        if (r?.done || !this.readable) {
            this.push(null);
            await reader.cancel();
        }
        return !this.readable;
    }
}

// diff --git a/src/arrow/js/src/io/stream.ts b/src/arrow/js/src/io/stream.ts
// new file mode 100644
// index 000000000..2384ab0b9
// --- /dev/null
// +++ b/src/arrow/js/src/io/stream.ts
// @@ -0,0 +1,152 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import streamAdapters from './adapters';
import { decodeUtf8 } from '../util/utf8';
import { ITERATOR_DONE, Readable, Writable, AsyncQueue } from './interfaces';
import { toUint8Array, joinUint8Arrays, ArrayBufferViewInput } from '../util/buffer';

import {
    isPromise, isFetchResponse,
    isIterable, isAsyncIterable,
    isReadableDOMStream, isReadableNodeStream
} from '../util/compat';

/**
 * Anything this module can write to.
 * @ignore
 */
export type WritableSink<T> = Writable<T> | WritableStream<T> | NodeJS.WritableStream | null;
/**
 * Anything this module can read from.
 * @ignore
 */
export type ReadableSource<T> = Readable<T> | PromiseLike<T> | AsyncIterable<T> | ReadableStream<T> | NodeJS.ReadableStream | null;

/**
 * An {@link AsyncQueue} of byte chunks. Written values are converted to
 * `Uint8Array`; empty chunks are dropped.
 * @ignore
 */
export class AsyncByteQueue<T extends ArrayBufferViewInput = Uint8Array> extends AsyncQueue<Uint8Array, T> {
    /** Converts `value` to bytes and enqueues it unless it is empty. */
    public write(value: ArrayBufferViewInput | Uint8Array) {
        value = toUint8Array(value);
        if (value.byteLength > 0) {
            return super.write(value as T);
        }
    }

    /**
     * Decodes the queue's bytes as UTF-8: synchronously from the chunks
     * buffered right now when `sync` is `true`, otherwise after draining
     * the queue.
     */
    public toString(sync: true): string;
    public toString(sync?: false): Promise<string>;
    public toString(sync = false) {
        if (sync) {
            return decodeUtf8(this.toUint8Array(true));
        }
        return this.toUint8Array(false).then(decodeUtf8);
    }

    /**
     * Joins the queue's chunks into one `Uint8Array`: synchronously from the
     * chunks buffered right now when `sync` is `true`, otherwise by
     * iterating the queue until it completes.
     */
    public toUint8Array(sync: true): Uint8Array;
    public toUint8Array(sync?: false): Promise<Uint8Array>;
    public toUint8Array(sync = false) {
        if (sync) {
            return joinUint8Arrays(this._values as any[])[0];
        }
        const drain = async () => {
            const chunks = [];
            let totalBytes = 0;
            for await (const chunk of this) {
                chunks.push(chunk);
                totalBytes += chunk.byteLength;
            }
            return joinUint8Arrays(chunks, totalBytes)[0];
        };
        return drain();
    }
}

/**
 * Synchronous byte stream over an iterable of buffers, with `peek`/`read`.
 * @ignore
 */
export class ByteStream implements IterableIterator<Uint8Array> {
    private source!: ByteStreamSource<Uint8Array>;
    constructor(source?: Iterable<ArrayBufferViewInput> | ArrayBufferViewInput) {
        // Without a source the stream is left uninitialised.
        if (source) {
            const iterator = streamAdapters.fromIterable(source);
            this.source = new ByteStreamSource(iterator);
        }
    }
    [Symbol.iterator]() { return this; }
    public next(value?: any) { return this.source.next(value); }
    public throw(value?: any) { return this.source.throw(value); }
    public return(value?: any) { return this.source.return(value); }
    public peek(size?: number | null) { return this.source.peek(size); }
    public read(size?: number | null) { return this.source.read(size); }
}

/**
 * Asynchronous byte stream over any supported source, with `peek`/`read`.
 * @ignore
 */
export class AsyncByteStream implements Readable<Uint8Array>, AsyncIterableIterator<Uint8Array> {
    private source!: AsyncByteStreamSource<Uint8Array>;
    constructor(source?: PromiseLike<ArrayBufferViewInput> | Response | ReadableStream<ArrayBufferViewInput> | NodeJS.ReadableStream | AsyncIterable<ArrayBufferViewInput> | Iterable<ArrayBufferViewInput>) {
        // The order of these checks is significant: the first match wins.
        // An unrecognised (or missing) source leaves the stream uninitialised.
        if (source instanceof AsyncByteStream) {
            // Share the other stream's underlying source rather than wrap it.
            this.source = (source as AsyncByteStream).source;
        } else if (source instanceof AsyncByteQueue) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source));
        } else if (isReadableNodeStream(source)) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromNodeStream(source));
        } else if (isReadableDOMStream<ArrayBufferViewInput>(source)) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromDOMStream(source));
        } else if (isFetchResponse(source)) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromDOMStream(source.body!));
        } else if (isIterable<ArrayBufferViewInput>(source)) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromIterable(source));
        } else if (isPromise<ArrayBufferViewInput>(source)) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source));
        } else if (isAsyncIterable<ArrayBufferViewInput>(source)) {
            this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source));
        }
    }
    [Symbol.asyncIterator]() { return this; }
    public next(value?: any) { return this.source.next(value); }
    public throw(value?: any) { return this.source.throw(value); }
    public return(value?: any) { return this.source.return(value); }
    public get closed(): Promise<void> { return this.source.closed; }
    public cancel(reason?: any) { return this.source.cancel(reason); }
    public peek(size?: number | null) { return this.source.peek(size); }
    public read(size?: number | null) { return this.source.read(size); }
}

/** @ignore */
type ByteStreamSourceIterator<T> = Generator<T, null, { cmd: 'peek' | 'read'; size?: number | null }>;
/** @ignore */
type AsyncByteStreamSourceIterator<T> = AsyncGenerator<T, null, { cmd: 'peek' | 'read'; size?: number | null }>;

/**
 * Drives a synchronous source generator by sending it `{ cmd, size }`
 * commands through `next()`.
 * @ignore
 */
class ByteStreamSource<T> {
    constructor(protected source: ByteStreamSourceIterator<T>) {}

    public cancel(reason?: any) { this.return(reason); }

    public peek(size?: number | null): T | null {
        const result = this.next(size, 'peek');
        return result.value;
    }

    public read(size?: number | null): T | null {
        const result = this.next(size, 'read');
        return result.value;
    }

    public next(size?: number | null, cmd: 'peek' | 'read' = 'read') {
        return this.source.next({ cmd, size });
    }

    /** Forwards to the generator's `throw`; falls back to `ITERATOR_DONE`. */
    public throw(value?: any) {
        const result = this.source.throw && this.source.throw(value);
        return Object.create(result || ITERATOR_DONE);
    }

    /** Forwards to the generator's `return`; falls back to `ITERATOR_DONE`. */
    public return(value?: any) {
        const result = this.source.return && this.source.return(value);
        return Object.create(result || ITERATOR_DONE);
    }
}

/**
 * Asynchronous counterpart of {@link ByteStreamSource}. Also exposes a
 * `closed` promise that resolves once `throw()` or `return()` has run.
 * @ignore
 */
class AsyncByteStreamSource<T> implements Readable<T> {

    private _closedPromise: Promise<void>;
    private _closedPromiseResolve?: (value?: any) => void;

    constructor (protected source: ByteStreamSourceIterator<T> | AsyncByteStreamSourceIterator<T>) {
        this._closedPromise = new Promise((resolve) => this._closedPromiseResolve = resolve);
    }

    public async cancel(reason?: any) { await this.return(reason); }

    public get closed(): Promise<void> { return this._closedPromise; }

    public async read(size?: number | null): Promise<T | null> {
        const result = await this.next(size, 'read');
        return result.value;
    }

    public async peek(size?: number | null): Promise<T | null> {
        const result = await this.next(size, 'peek');
        return result.value;
    }

    public async next(size?: number | null, cmd: 'peek' | 'read' = 'read') {
        return (await this.source.next({ cmd, size }));
    }

    /** Forwards to the source's `throw`, then resolves `closed`. */
    public async throw(value?: any) {
        const result = (this.source.throw && await this.source.throw(value)) || ITERATOR_DONE;
        if (this._closedPromiseResolve) {
            this._closedPromiseResolve();
        }
        this._closedPromiseResolve = undefined;
        return Object.create(result);
    }

    /** Forwards to the source's `return`, then resolves `closed`. */
    public async return(value?: any) {
        const result = (this.source.return && await this.source.return(value)) || ITERATOR_DONE;
        if (this._closedPromiseResolve) {
            this._closedPromiseResolve();
        }
        this._closedPromiseResolve = undefined;
        return Object.create(result);
    }
}

// diff --git a/src/arrow/js/src/io/whatwg/builder.ts b/src/arrow/js/src/io/whatwg/builder.ts
// new file mode 100644
// index 000000000..c65511844
// --- /dev/null
// +++ b/src/arrow/js/src/io/whatwg/builder.ts
// @@ -0,0 +1,116 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { DataType } from '../../type';
import { Vector } from '../../vector';
import { VectorType as V } from '../../interfaces';
import { Builder, BuilderOptions } from '../../builder/index';

/**
 * Options accepted by {@link builderThroughDOMStream}: the builder options
 * plus queueing strategies for each side of the transform.
 * @ignore
 */
export interface BuilderTransformOptions<T extends DataType = any, TNull = any> extends BuilderOptions<T, TNull> {
    queueingStrategy?: 'bytes' | 'count';
    dictionaryHashFunction?: (value: any) => string | number;
    readableStrategy?: { highWaterMark?: number; size?: any; type?: 'bytes' };
    writableStrategy?: { highWaterMark?: number; size?: any; type?: 'bytes' };
    valueToChildTypeId?: (builder: Builder<T, TNull>, value: any, offset: number) => number;
}

/**
 * Creates a `{ readable, writable }` pair that turns written values into
 * vectors.
 * @ignore
 */
export function builderThroughDOMStream<T extends DataType = any, TNull = any>(options: BuilderTransformOptions<T, TNull>) {
    return new BuilderTransform(options);
}

/**
 * Measures a vector or builder by element count ('count' strategy).
 * @ignore
 */
function chunkLength<T extends DataType = any>(chunk: Vector<T> | Builder<T>) {
    return chunk.length;
}

/**
 * Measures a vector or builder by byte length ('bytes' strategy).
 * @ignore
 */
function chunkByteLength<T extends DataType = any>(chunk: Vector<T> | Builder<T>) {
    return chunk.byteLength;
}

/**
 * WHATWG-streams transform around a `Builder`. Values are appended to the
 * builder from the writable side's `size` callback; the builder's contents
 * are enqueued on the readable side as a vector whenever the buffered size
 * reaches the readable controller's desired size.
 * @ignore
 */
export class BuilderTransform<T extends DataType = any, TNull = any> {

    public readable: ReadableStream<V<T>>;
    public writable: WritableStream<T['TValue'] | TNull>;
    public _controller: ReadableStreamDefaultController<V<T>> | null;

    private _numChunks = 0;
    private _finished = false;
    private _bufferedSize = 0;
    private _builder: Builder<T, TNull>;
    private _getSize: (builder: Builder<T, TNull>) => number;

    constructor(options: BuilderTransformOptions<T, TNull>) {

        // Access properties by string indexers to defeat closure compiler

        const {
            ['readableStrategy']: readableStrategy,
            ['writableStrategy']: writableStrategy,
            ['queueingStrategy']: queueingStrategy = 'count',
            ...builderOptions
        } = options;

        const measuredInBytes = queueingStrategy === 'bytes';

        this._controller = null;
        this._builder = Builder.new<T, TNull>(builderOptions);
        this._getSize = measuredInBytes ? chunkByteLength : chunkLength;

        // Default high-water mark: 2 ** 14 bytes, or 1000 elements.
        const { ['highWaterMark']: readableHighWaterMark = measuredInBytes ? 2 ** 14 : 1000 } = { ...readableStrategy };
        const { ['highWaterMark']: writableHighWaterMark = measuredInBytes ? 2 ** 14 : 1000 } = { ...writableStrategy };

        // `start` and `pull` behave identically: remember the controller and
        // flush if enough has been buffered.
        const rememberAndFlush = (controller: ReadableStreamDefaultController<V<T>>) => {
            this._controller = controller;
            this._maybeFlush(this._builder, controller);
        };

        this['readable'] = new ReadableStream<V<T>>({
            ['cancel']: () => { this._builder.clear(); },
            ['pull']: rememberAndFlush,
            ['start']: rememberAndFlush,
        }, {
            'highWaterMark': readableHighWaterMark,
            'size': measuredInBytes ? chunkByteLength : chunkLength,
        });

        this['writable'] = new WritableStream({
            ['abort']: () => { this._builder.clear(); },
            // The value was already appended by the `size` callback below.
            ['write']: () => { this._maybeFlush(this._builder, this._controller); },
            ['close']: () => { this._maybeFlush(this._builder.finish(), this._controller); },
        }, {
            'highWaterMark': writableHighWaterMark,
            'size': (value: T['TValue'] | TNull) => this._writeValueAndReturnChunkSize(value),
        });
    }

    /**
     * Appends `value` to the builder and returns by how much the builder's
     * measured size grew.
     */
    private _writeValueAndReturnChunkSize(value: T['TValue'] | TNull) {
        const sizeBefore = this._bufferedSize;
        const appended = this._builder.append(value);
        this._bufferedSize = this._getSize(appended);
        return this._bufferedSize - sizeBefore;
    }

    /**
     * Enqueues the builder's contents when the buffered size has reached the
     * controller's desired size. Once the builder is finished, enqueues any
     * remainder (or a single vector if nothing was ever enqueued) and closes
     * the readable side exactly once. Does nothing without a controller.
     */
    private _maybeFlush(builder: Builder<T, TNull>, controller: ReadableStreamDefaultController<V<T>> | null) {
        if (controller === null) { return; }
        if (this._bufferedSize >= controller.desiredSize!) {
            this._numChunks += 1;
            this._enqueue(controller, builder.toVector());
        }
        if (!builder.finished) { return; }
        if (builder.length > 0 || this._numChunks === 0) {
            this._numChunks += 1;
            this._enqueue(controller, builder.toVector());
        }
        if (!this._finished) {
            this._finished = true;
            this._enqueue(controller, null);
        }
    }

    /**
     * Resets the buffered size, forgets the stored controller, then enqueues
     * `chunk` — or closes the stream when `chunk` is `null`.
     */
    private _enqueue(controller: ReadableStreamDefaultController<V<T>>, chunk: V<T> | null) {
        this._bufferedSize = 0;
        this._controller = null;
        if (chunk === null) {
            controller.close();
        } else {
            controller.enqueue(chunk);
        }
    }
}

// diff --git a/src/arrow/js/src/io/whatwg/iterable.ts b/src/arrow/js/src/io/whatwg/iterable.ts
// new file mode 100644
// index 000000000..ce9e97369
// --- /dev/null
// +++ b/src/arrow/js/src/io/whatwg/iterable.ts
// @@ -0,0 +1,93 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
/**
 * Adapt a synchronous iterable into a WHATWG `ReadableStream`.
 *
 * The source iterator is created on `start` and advanced whenever the stream
 * starts or pulls. Values are enqueued until the iterator is exhausted (the
 * stream is then closed) or the controller's desired size has been used up.
 * Values that are ArrayBuffer views are converted with `toUint8Array` first.
 *
 * @param source The iterable to read from.
 * @param options Optional stream options; `type: 'bytes'` switches on
 * byte-length accounting and forwards the remaining size to `iterator.next()`.
 * @returns A `ReadableStream` yielding the iterable's values.
 * @ignore
 */
function iterableAsReadableDOMStream<T>(source: Iterable<T>, options?: ReadableDOMStreamOptions) {

    const byteMode = options?.type === 'bytes';
    const byteHighWaterMark = options?.highWaterMark || (2 ** 24);
    let iterator: SourceIterator<T> | null = null;

    // Enqueue values until the iterator finishes or the size budget is spent.
    const drain = (controller: ReadableStreamDefaultController<T>, iter: SourceIterator<T>) => {
        let budget = controller.desiredSize || null;
        for (;;) {
            const step = iter.next(byteMode ? budget : null);
            if (step.done) {
                break;
            }
            if (ArrayBuffer.isView(step.value)) {
                const bytes = toUint8Array(step.value);
                if (bytes) {
                    // In byte mode charge the chunk's byte length; the `+ 1`
                    // offsets the unconditional decrement below.
                    if (budget != null && byteMode) {
                        budget = budget - bytes.byteLength + 1;
                    }
                    step.value = <any> bytes;
                }
            }
            controller.enqueue(step.value);
            if (budget != null && --budget <= 0) {
                return;
            }
        }
        controller.close();
    };

    return new ReadableStream<T>({
        ...options as any,
        start(controller) {
            if (!iterator) {
                iterator = source[Symbol.iterator]() as SourceIterator<T>;
            }
            drain(controller, iterator);
        },
        pull(controller) {
            if (iterator) {
                drain(controller, iterator);
            } else {
                controller.close();
            }
        },
        cancel() {
            // Give the iterator a chance to clean up, then drop it.
            if (iterator?.return) {
                iterator.return();
            }
            iterator = null;
        }
    }, { highWaterMark: byteMode ? byteHighWaterMark : undefined, ...options });
}
/**
 * Create a writable/readable stream pair that turns written bytes into
 * `RecordBatch` values.
 *
 * Bytes written to `writable` go into an `AsyncByteQueue`; a
 * `RecordBatchReader` is opened over that queue the first time the readable
 * side starts, and each batch it yields is enqueued on `readable`.
 *
 * @param writableStrategy Queuing strategy for the writable side, merged over
 * a default `highWaterMark` of 2 ** 14.
 * @param readableStrategy Options passed to `RecordBatchReader#open()`.
 * @returns An object holding the `writable` and `readable` streams.
 * @ignore
 */
export function recordBatchReaderThroughDOMStream<T extends { [key: string]: DataType } = any>(writableStrategy?: ByteLengthQueuingStrategy, readableStrategy?: { autoDestroy: boolean }) {

    const queue = new AsyncByteQueue();
    let reader: RecordBatchReader<T> | null = null;

    // Open a reader over the byte queue.
    const openReader = async () => {
        const created = await RecordBatchReader.from<T>(queue);
        return await created.open(readableStrategy);
    };

    // Enqueue batches until the reader is done or the desired size is used up.
    const pump = async (controller: ReadableStreamDefaultController<RecordBatch<T>>, batches: RecordBatchReader<T>) => {
        let remaining = controller.desiredSize;
        for (;;) {
            const step: IteratorResult<RecordBatch<T>> = await batches.next();
            if (step.done) {
                break;
            }
            controller.enqueue(step.value);
            if (remaining != null && --remaining <= 0) {
                return;
            }
        }
        controller.close();
    };

    const readable = new ReadableStream<RecordBatch<T>>({
        async cancel() { await queue.close(); },
        async start(controller) {
            if (!reader) {
                reader = await openReader();
            }
            await pump(controller, reader);
        },
        async pull(controller) {
            if (reader) {
                await pump(controller, reader);
            } else {
                controller.close();
            }
        }
    });

    const writable = new WritableStream(queue, { 'highWaterMark': 2 ** 14, ...writableStrategy });

    return { writable, readable };
}
/**
 * Create a writable/readable stream pair that serializes written
 * `RecordBatch` values into bytes.
 *
 * Must be invoked with `this` bound to a `RecordBatchWriter` constructor. A
 * writer of that class is created and used as the sink of `writable`, and an
 * `AsyncByteStream` over that writer feeds the `'bytes'`-typed `readable`.
 *
 * @param writableStrategy Passed to the writer constructor and used as the
 * queuing strategy of the writable stream.
 * @param readableStrategy Queuing strategy for the readable side, merged over
 * a default `highWaterMark` of 2 ** 14.
 * @returns An object holding the `writable` and `readable` streams.
 * @ignore
 */
export function recordBatchWriterThroughDOMStream<T extends { [key: string]: DataType } = any>(
    this: typeof RecordBatchWriter,
    writableStrategy?: QueuingStrategy<RecordBatch<T>> & { autoDestroy: boolean },
    readableStrategy?: { highWaterMark?: number; size?: any }
) {

    const writer = new this<T>(writableStrategy);
    const bytes = new AsyncByteStream(writer);

    // Enqueue chunks until the byte stream ends or the desired size is used up.
    const pump = async (controller: ReadableStreamDefaultController<Uint8Array>) => {
        let remaining = controller.desiredSize;
        for (;;) {
            const chunk: Uint8Array | null = await bytes.read(remaining || null);
            if (!chunk) {
                break;
            }
            controller.enqueue(chunk);
            if (remaining != null) {
                remaining -= chunk.byteLength;
                if (remaining <= 0) {
                    return;
                }
            }
        }
        controller.close();
    };

    const readable = new ReadableStream({
        type: 'bytes',
        async cancel() { await bytes.cancel(); },
        async pull(controller) { await pump(controller); },
        async start(controller) { await pump(controller); },
    }, { 'highWaterMark': 2 ** 14, ...readableStrategy });

    const writable = new WritableStream(writer, writableStrategy);

    return { writable, readable };
}
/**
 * Synchronous iterator over the IPC `Message`s in a byte source.
 *
 * Each message is read as a 4-byte metadata length (optionally preceded by a
 * -1 continuation marker), followed by the metadata itself, which is decoded
 * with `Message.decode`.
 * @ignore
 */
export class MessageReader implements IterableIterator<Message> {
    protected source: ByteStream;

    /** Use the given `ByteStream` as-is, or wrap any other input in one. */
    constructor(source: ByteStream | ArrayBufferViewInput | Iterable<ArrayBufferViewInput>) {
        if (source instanceof ByteStream) {
            this.source = source;
        } else {
            this.source = new ByteStream(source);
        }
    }

    public [Symbol.iterator](): IterableIterator<Message> {
        return this as IterableIterator<Message>;
    }

    /** Read the next message, or finish when the length or metadata is missing. */
    public next(): IteratorResult<Message> {
        let length = this.readMetadataLength();
        if (length.done) {
            return ITERATOR_DONE;
        }
        // ARROW-6313: If the first 4 bytes are continuation indicator (-1), read
        // the next 4 for the 32-bit metadata length. Otherwise, assume this is a
        // pre-v0.15 message, where the first 4 bytes are the metadata length.
        if (length.value === -1) {
            length = this.readMetadataLength();
            if (length.done) {
                return ITERATOR_DONE;
            }
        }
        const metadata = this.readMetadata(length.value);
        if (metadata.done) {
            return ITERATOR_DONE;
        }
        return (<any> metadata) as IteratorResult<Message>;
    }

    public throw(value?: any) { return this.source.throw(value); }
    public return(value?: any) { return this.source.return(value); }

    /**
     * Read the next message, optionally requiring a specific header type.
     * @returns The message, or `null` when the iterator is done.
     * @throws Error if `type` is given and the message's header type differs.
     */
    public readMessage<T extends MessageHeader>(type?: T | null) {
        const result: IteratorResult<Message<T>> = this.next();
        if (result.done) {
            return null;
        }
        if ((type != null) && result.value.headerType !== type) {
            throw new Error(invalidMessageType(type));
        }
        return result.value;
    }

    /**
     * Read `bodyLength` bytes of message body.
     * @returns The bytes read (an empty array when `bodyLength <= 0`).
     * @throws Error if fewer than `bodyLength` bytes could be read.
     */
    public readMessageBody(bodyLength: number): Uint8Array {
        if (bodyLength <= 0) {
            return new Uint8Array(0);
        }
        const bytes = toUint8Array(this.source.read(bodyLength));
        if (bytes.byteLength < bodyLength) {
            throw new Error(invalidMessageBodyLength(bodyLength, bytes.byteLength));
        }
        // 1. Work around bugs in fs.ReadStream's internal Buffer pooling, see: https://github.com/nodejs/node/issues/24817
        const isAligned = bytes.byteOffset % 8 === 0;
        // 2. Work around https://github.com/whatwg/streams/blob/0ebe4b042e467d9876d80ae045de3843092ad797/reference-implementation/lib/helpers.js#L126
        const isInBounds = (bytes.byteOffset + bytes.byteLength) <= bytes.buffer.byteLength;
        // Copy the bytes unless both conditions hold.
        return isAligned && isInBounds ? bytes : bytes.slice();
    }

    /**
     * Read the next message as a Schema message and return its header.
     * @param throwIfNull When true, throw instead of returning a missing schema.
     */
    public readSchema(throwIfNull = false) {
        const type = MessageHeader.Schema;
        const message = this.readMessage(type);
        const schema = message?.header();
        if (throwIfNull && !schema) {
            throw new Error(nullMessage(type));
        }
        return schema;
    }

    /** Read a 32-bit length prefix; a missing buffer or a zero length means done. */
    protected readMetadataLength(): IteratorResult<number> {
        const prefix = this.source.read(PADDING);
        const length = prefix ? new ByteBuffer(prefix).readInt32(0) || 0 : 0;
        return { done: length === 0, value: length };
    }

    /**
     * Read and decode `metadataLength` bytes of message metadata.
     * @throws Error if fewer than `metadataLength` bytes could be read.
     */
    protected readMetadata(metadataLength: number): IteratorResult<Message> {
        const metadata = this.source.read(metadataLength);
        if (!metadata) {
            return ITERATOR_DONE;
        }
        if (metadata.byteLength < metadataLength) {
            throw new Error(invalidMessageMetadata(metadataLength, metadata.byteLength));
        }
        return { done: false, value: Message.decode(metadata) };
    }
}
+ if ((r.value === -1) && + (r = await this.readMetadataLength()).done) { return ITERATOR_DONE; } + if ((r = await this.readMetadata(r.value)).done) { return ITERATOR_DONE; } + return (<any> r) as IteratorResult<Message>; + } + public async throw(value?: any) { return await this.source.throw(value); } + public async return(value?: any) { return await this.source.return(value); } + public async readMessage<T extends MessageHeader>(type?: T | null) { + let r: IteratorResult<Message<T>>; + if ((r = await this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public async readMessageBody(bodyLength: number): Promise<Uint8Array> { + if (bodyLength <= 0) { return new Uint8Array(0); } + const buf = toUint8Array(await this.source.read(bodyLength)); + if (buf.byteLength < bodyLength) { + throw new Error(invalidMessageBodyLength(bodyLength, buf.byteLength)); + } + // 1. Work around bugs in fs.ReadStream's internal Buffer pooling, see: https://github.com/nodejs/node/issues/24817 + // 2. Work around https://github.com/whatwg/streams/blob/0ebe4b042e467d9876d80ae045de3843092ad797/reference-implementation/lib/helpers.js#L126 + return /* 1. */ (buf.byteOffset % 8 === 0) && + /* 2. */ (buf.byteOffset + buf.byteLength) <= buf.buffer.byteLength ? 
buf : buf.slice(); + } + public async readSchema(throwIfNull = false) { + const type = MessageHeader.Schema; + const message = await this.readMessage(type); + const schema = message?.header(); + if (throwIfNull && !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } + protected async readMetadataLength(): Promise<IteratorResult<number>> { + const buf = await this.source.read(PADDING); + const bb = buf && new ByteBuffer(buf); + const len = bb?.readInt32(0) || 0; + return { done: len === 0, value: len }; + } + protected async readMetadata(metadataLength: number): Promise<IteratorResult<Message>> { + const buf = await this.source.read(metadataLength); + if (!buf) { return ITERATOR_DONE; } + if (buf.byteLength < metadataLength) { + throw new Error(invalidMessageMetadata(metadataLength, buf.byteLength)); + } + return { done: false, value: Message.decode(buf) }; + } +} + +/** @ignore */ +export class JSONMessageReader extends MessageReader { + private _schema = false; + private _json: ArrowJSON; + private _body: any[] = []; + private _batchIndex = 0; + private _dictionaryIndex = 0; + constructor(source: ArrowJSON | ArrowJSONLike) { + super(new Uint8Array(0)); + this._json = source instanceof ArrowJSON ? 
/** @ignore Byte width of the length/padding prefix read by the message readers. */
export const PADDING = 4;
/** @ignore The Arrow magic string. */
export const MAGIC_STR = 'ARROW1';
/** @ignore The magic string as bytes, one char code per character. */
export const MAGIC = new Uint8Array(MAGIC_STR.length).map((_, i) => MAGIC_STR.charCodeAt(i));

/**
 * Test whether the bytes of `buffer` starting at `index` equal `MAGIC`.
 *
 * @param buffer The bytes to inspect.
 * @param index Offset of the first byte to compare; defaults to 0.
 * @returns `true` only if every magic byte matches. Positions past the end of
 * `buffer` never match.
 * @ignore
 */
export function checkForMagicArrowString(buffer: Uint8Array, index = 0) {
    return MAGIC.every((byte, i) => byte === buffer[index + i]);
}

/** @ignore Number of bytes in `MAGIC`. */
export const magicLength = MAGIC.length;
/** @ignore One magic string plus the padding prefix. */
export const magicAndPadding = magicLength + PADDING;
/** @ignore Two magic strings plus the padding prefix. */
export const magicX2AndPadding = magicLength * 2 + PADDING;
+ +/* eslint-disable @typescript-eslint/naming-convention */ + +import { + Block as _Block, + Footer as _Footer +} from '../../fb/File'; + +import { flatbuffers } from 'flatbuffers'; + +import Long = flatbuffers.Long; +import Builder = flatbuffers.Builder; +import ByteBuffer = flatbuffers.ByteBuffer; + +import { Schema } from '../../schema'; +import { MetadataVersion } from '../../enum'; +import { toUint8Array } from '../../util/buffer'; +import { ArrayBufferViewInput } from '../../util/buffer'; + +/** @ignore */ +class Footer_ { + + /** @nocollapse */ + public static decode(buf: ArrayBufferViewInput) { + buf = new ByteBuffer(toUint8Array(buf)); + const footer = _Footer.getRootAsFooter(buf); + const schema = Schema.decode(footer.schema()!); + return new OffHeapFooter(schema, footer) as Footer_; + } + + /** @nocollapse */ + public static encode(footer: Footer_) { + + const b: Builder = new Builder(); + const schemaOffset = Schema.encode(b, footer.schema); + + _Footer.startRecordBatchesVector(b, footer.numRecordBatches); + [...footer.recordBatches()].slice().reverse().forEach((rb) => FileBlock.encode(b, rb)); + const recordBatchesOffset = b.endVector(); + + _Footer.startDictionariesVector(b, footer.numDictionaries); + [...footer.dictionaryBatches()].slice().reverse().forEach((db) => FileBlock.encode(b, db)); + + const dictionaryBatchesOffset = b.endVector(); + + _Footer.startFooter(b); + _Footer.addSchema(b, schemaOffset); + _Footer.addVersion(b, MetadataVersion.V4); + _Footer.addRecordBatches(b, recordBatchesOffset); + _Footer.addDictionaries(b, dictionaryBatchesOffset); + _Footer.finishFooterBuffer(b, _Footer.endFooter(b)); + + return b.asUint8Array(); + } + + protected _recordBatches!: FileBlock[]; + protected _dictionaryBatches!: FileBlock[]; + public get numRecordBatches() { return this._recordBatches.length; } + public get numDictionaries() { return this._dictionaryBatches.length; } + + constructor(public schema: Schema, + public version: MetadataVersion = 
/**
 * Location of one message inside an Arrow file: its byte `offset`, the length
 * of its metadata and the length of its body.
 *
 * Offsets and body lengths are stored as 64-bit flatbuffers `Long`s. They are
 * exposed here as plain numbers built from both the high and low 32-bit words,
 * so values of 2^31 and above are no longer truncated or turned negative.
 * Values above `Number.MAX_SAFE_INTEGER` still lose precision.
 * @ignore
 */
export class FileBlock {

    /** @nocollapse Build a `FileBlock` from its flatbuffers representation. */
    public static decode(block: _Block) {
        return new FileBlock(block.metaDataLength(), block.bodyLength(), block.offset());
    }

    /** @nocollapse Write `fileBlock` into the flatbuffers builder `b`. */
    public static encode(b: Builder, fileBlock: FileBlock) {
        const { metaDataLength } = fileBlock;
        const offset = numberToLong(fileBlock.offset);
        const bodyLength = numberToLong(fileBlock.bodyLength);
        return _Block.createBlock(b, offset, metaDataLength, bodyLength);
    }

    public offset: number;
    public bodyLength: number;
    public metaDataLength: number;

    /**
     * @param metaDataLength Length of the message metadata.
     * @param bodyLength Length of the message body, as a number or a `Long`.
     * @param offset Byte offset of the message, as a number or a `Long`.
     */
    constructor(metaDataLength: number, bodyLength: Long | number, offset: Long | number) {
        this.metaDataLength = metaDataLength;
        this.offset = longToNumber(offset);
        this.bodyLength = longToNumber(bodyLength);
    }
}

/** @ignore Convert a number or a 64-bit `Long` (high/low 32-bit words) to a number. */
function longToNumber(value: Long | number): number {
    if (typeof value === 'number') {
        return value;
    }
    // `low` is stored as a signed int32; `>>> 0` reinterprets it as unsigned.
    return value.high * 0x100000000 + (value.low >>> 0);
}

/** @ignore Split a number into the low and high 32-bit words of a `Long`. */
function numberToLong(value: number): Long {
    return new Long(value % 0x100000000 | 0, Math.floor(value / 0x100000000));
}
/**
 * Flatten JSON columns into a list of `FieldNode`s.
 *
 * Each column contributes one node (its `count` and the null count derived
 * from its `VALIDITY` array), immediately followed by the nodes of its
 * `children`, recursively.
 *
 * @param xs The JSON columns; a missing value is treated as an empty list.
 * @returns The field nodes in depth-first order.
 * @ignore
 */
function fieldNodesFromJSON(xs: any[]): FieldNode[] {
    const nodes: FieldNode[] = [];
    (xs || []).forEach((column: any) => {
        nodes.push(new FieldNode(
            column['count'],
            nullCountFromJSON(column['VALIDITY'])
        ));
        for (const childNode of fieldNodesFromJSON(column['children'])) {
            nodes.push(childNode);
        }
    });
    return nodes;
}
/**
 * Count the entries of a JSON validity array that are exactly `0`.
 *
 * @param validity The validity values; a missing array counts as empty.
 * @returns The number of entries equal to `0`.
 * @ignore
 */
function nullCountFromJSON(validity: number[]) {
    const flags = validity || [];
    return flags.filter((flag) => flag === 0).length;
}
/**
 * Build the `DataType` described by a JSON field.
 *
 * The type is selected by `f['type']['name']`; type-specific settings are read
 * from the same `f['type']` object.
 *
 * @param f The JSON field.
 * @param children Already-decoded child fields, used by nested types.
 * @returns The matching `DataType` instance.
 * @throws Error if the type name is not recognized.
 * @ignore
 */
function typeFromJSON(f: any, children?: Field[]): DataType<any> {

    const t = f['type'];
    const typeId = t['name'];
    const kids = children || [];

    switch (typeId) {
        // Types determined by their name and child fields alone
        case 'NONE':
        case 'null':
            return new Null();
        case 'binary':
            return new Binary();
        case 'utf8':
            return new Utf8();
        case 'bool':
            return new Bool();
        case 'list':
            return new List(kids[0]);
        case 'struct':
        case 'struct_':
            return new Struct(kids);

        // Types that take settings from the JSON type object
        case 'int':
            return new Int(t['isSigned'], t['bitWidth'] as IntBitWidth);
        case 'floatingpoint':
            return new Float(Precision[t['precision']] as any);
        case 'decimal':
            return new Decimal(t['scale'], t['precision']);
        case 'date':
            return new Date_(DateUnit[t['unit']] as any);
        case 'time':
            return new Time(TimeUnit[t['unit']] as any, t['bitWidth'] as TimeBitWidth);
        case 'timestamp':
            return new Timestamp(TimeUnit[t['unit']] as any, t['timezone']);
        case 'interval':
            return new Interval(IntervalUnit[t['unit']] as any);
        case 'union':
            return new Union(UnionMode[t['mode']] as any, (t['typeIds'] || []), kids);
        case 'fixedsizebinary':
            return new FixedSizeBinary(t['byteWidth']);
        case 'fixedsizelist':
            return new FixedSizeList(t['listSize'], kids[0]);
        case 'map':
            return new Map_(kids[0], t['keysSorted']);

        default:
            throw new Error(`Unrecognized type: "${typeId}"`);
    }
}

/* eslint-disable brace-style */

import { flatbuffers } from 'flatbuffers';

import {
    Type,
    Int as _Int,
    Field as _Field,
    Schema as _Schema,
    Buffer as _Buffer,
    KeyValue as _KeyValue,
    Endianness as _Endianness,
    DictionaryEncoding as _DictionaryEncoding,
    FloatingPoint as _FloatingPoint,
    Decimal as _Decimal,
    Date as _Date,
    Time as _Time,
    Timestamp as _Timestamp,
    Interval as _Interval,
    Union as _Union,
    FixedSizeBinary as _FixedSizeBinary,
    FixedSizeList as _FixedSizeList,
    Map as _Map,
} from '../../fb/Schema';

import {
    Message as _Message,
    FieldNode as _FieldNode,
    RecordBatch as _RecordBatch,
    DictionaryBatch as _DictionaryBatch,
} from '../../fb/Message';

import { Schema, Field } from '../../schema';
import { toUint8Array } from '../../util/buffer';
import { ArrayBufferViewInput } from '../../util/buffer';
import { MessageHeader, MetadataVersion } from '../../enum';
import { instance as typeAssembler } from '../../visitor/typeassembler';
import { fieldFromJSON, schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from './json';

import Long = flatbuffers.Long;
import Builder = flatbuffers.Builder;
import ByteBuffer = flatbuffers.ByteBuffer;

import {
    DataType, Dictionary, TimeBitWidth,
    Utf8, Binary, Decimal, FixedSizeBinary,
    List, FixedSizeList, Map_, Struct, Union,
    Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys,
} from '../../type';

/**
 * Envelope pairing a header (Schema, RecordBatch or DictionaryBatch metadata)
 * with the length of the body that accompanies it. The header is produced
 * on demand by the `_createHeader` callback.
 *
 * @ignore
 * @private
 **/
export class Message<T extends MessageHeader = any> {

    /**
     * Wraps a JSON message; the header is built from `msg` when `header()` is called.
     * @nocollapse
     */
    public static fromJSON<T extends MessageHeader>(msg: any, headerType: T): Message<T> {
        const message = new Message(0, MetadataVersion.V4, headerType);
        message._createHeader = messageHeaderFromJSON(msg, headerType);
        return message;
    }

    /**
     * Reads body length, version and header type from a flatbuffer-encoded Message;
     * the header itself is decoded when `header()` is called.
     * @nocollapse
     */
    public static decode(buf: ArrayBufferViewInput) {
        buf = new ByteBuffer(toUint8Array(buf));
        const _message = _Message.getRootAsMessage(buf);
        const bodyLength: Long = _message.bodyLength()!;
        const version: MetadataVersion = _message.version();
        const headerType: MessageHeader = _message.headerType();
        const message = new Message(bodyLength, version, headerType);
        message._createHeader = decodeMessageHeader(_message, headerType);
        return message;
    }

    /**
     * Serialises a Message (header included) into a flatbuffer byte array.
     * The version written is always `MetadataVersion.V4`.
     * @nocollapse
     */
    public static encode<T extends MessageHeader>(message: Message<T>) {
        const b = new Builder();
        // -1 is kept when the header is none of the three known kinds
        let headerOffset = -1;
        if (message.isSchema()) {
            headerOffset = Schema.encode(b, message.header() as Schema);
        } else if (message.isRecordBatch()) {
            headerOffset = RecordBatch.encode(b, message.header() as RecordBatch);
        } else if (message.isDictionaryBatch()) {
            headerOffset = DictionaryBatch.encode(b, message.header() as DictionaryBatch);
        }
        _Message.startMessage(b);
        _Message.addVersion(b, MetadataVersion.V4);
        _Message.addHeader(b, headerOffset);
        _Message.addHeaderType(b, message.headerType);
        _Message.addBodyLength(b, new Long(message.bodyLength, 0));
        _Message.finishMessageBuffer(b, _Message.endMessage(b));
        return b.asUint8Array();
    }

    /**
     * Wraps an already-built header object. A Schema message always gets a body length of 0.
     * @throws Error when `header` is not a Schema, RecordBatch or DictionaryBatch.
     * @nocollapse
     */
    public static from(header: Schema | RecordBatch | DictionaryBatch, bodyLength = 0) {
        if (header instanceof Schema) {
            return new Message(0, MetadataVersion.V4, MessageHeader.Schema, header);
        }
        if (header instanceof RecordBatch) {
            return new Message(bodyLength, MetadataVersion.V4, MessageHeader.RecordBatch, header);
        }
        if (header instanceof DictionaryBatch) {
            return new Message(bodyLength, MetadataVersion.V4, MessageHeader.DictionaryBatch, header);
        }
        throw new Error(`Unrecognized Message header: ${header}`);
    }

    public body: Uint8Array;
    protected _headerType: T;
    protected _bodyLength: number;
    protected _version: MetadataVersion;
    public get type() { return this.headerType; }
    public get version() { return this._version; }
    public get headerType() { return this._headerType; }
    public get bodyLength() { return this._bodyLength; }
    protected _createHeader!: MessageHeaderDecoder;
    public header() { return this._createHeader<T>(); }
    public isSchema(): this is Message<MessageHeader.Schema> {
        return this.headerType === MessageHeader.Schema;
    }
    public isRecordBatch(): this is Message<MessageHeader.RecordBatch> {
        return this.headerType === MessageHeader.RecordBatch;
    }
    public isDictionaryBatch(): this is Message<MessageHeader.DictionaryBatch> {
        return this.headerType === MessageHeader.DictionaryBatch;
    }

    constructor(bodyLength: Long | number, version: MetadataVersion, headerType: T, header?: any) {
        this._version = version;
        this._headerType = headerType;
        this.body = new Uint8Array(0);
        // Only install a header factory when a header object was handed in
        if (header) {
            this._createHeader = () => header;
        }
        // A Long contributes only its low word
        this._bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low;
    }
}

/**
 * Record batch metadata: the row count plus the field nodes and buffer regions of the batch.
 *
 * @ignore
 * @private
 **/
export class RecordBatch {
    protected _length: number;
    protected _nodes: FieldNode[];
    protected _buffers: BufferRegion[];
    public get nodes() { return this._nodes; }
    public get length() { return this._length; }
    public get buffers() { return this._buffers; }
    constructor(length: Long | number, nodes: FieldNode[], buffers: BufferRegion[]) {
        this._nodes = nodes;
        this._buffers = buffers;
        this._length = typeof length === 'number' ? length : length.low;
    }
}

/**
 * Dictionary batch metadata: a RecordBatch description tagged with a dictionary id
 * and a delta flag. `length`, `nodes` and `buffers` forward to the wrapped batch.
 *
 * @ignore
 * @private
 **/
export class DictionaryBatch {

    protected _id: number;
    protected _isDelta: boolean;
    protected _data: RecordBatch;
    public get id() { return this._id; }
    public get data() { return this._data; }
    public get isDelta() { return this._isDelta; }
    public get length(): number { return this.data.length; }
    public get nodes(): FieldNode[] { return this.data.nodes; }
    public get buffers(): BufferRegion[] { return this.data.buffers; }

    constructor(data: RecordBatch, id: Long | number, isDelta = false) {
        this._data = data;
        this._isDelta = isDelta;
        this._id = typeof id === 'number' ? id : id.low;
    }
}

/**
 * Offset/length pair describing one buffer.
 *
 * @ignore
 * @private
 **/
export class BufferRegion {
    public offset: number;
    public length: number;
    constructor(offset: Long | number, length: Long | number) {
        this.offset = typeof offset === 'number' ? offset : offset.low;
        this.length = typeof length === 'number' ? length : length.low;
    }
}

/**
 * Length/null-count pair describing one field node.
 *
 * @ignore
 * @private
 **/
export class FieldNode {
    public length: number;
    public nullCount: number;
    constructor(length: Long | number, nullCount: Long | number) {
        this.length = typeof length === 'number' ? length : length.low;
        this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low;
    }
}

/**
 * @ignore
 * Returns a factory that builds the header object for `type` from a JSON message.
 * The factory throws when `type` is not one of the three handled header kinds.
 */
function messageHeaderFromJSON(message: any, type: MessageHeader) {
    return (() => {
        switch (type) {
            case MessageHeader.Schema: return Schema.fromJSON(message);
            case MessageHeader.RecordBatch: return RecordBatch.fromJSON(message);
            case MessageHeader.DictionaryBatch: return DictionaryBatch.fromJSON(message);
        }
        throw new Error(`Unrecognized Message type: { name: ${MessageHeader[type]}, type: ${type} }`);
    }) as MessageHeaderDecoder;
}

/**
 * @ignore
 * Returns a factory that decodes the header object for `type` from a flatbuffer Message.
 * The factory throws when `type` is not one of the three handled header kinds.
 */
function decodeMessageHeader(message: _Message, type: MessageHeader) {
    return (() => {
        switch (type) {
            case MessageHeader.Schema:
                return Schema.decode(message.header(new _Schema())!);
            case MessageHeader.RecordBatch:
                return RecordBatch.decode(message.header(new _RecordBatch())!, message.version());
            case MessageHeader.DictionaryBatch:
                return DictionaryBatch.decode(message.header(new _DictionaryBatch())!, message.version());
        }
        throw new Error(`Unrecognized Message type: { name: ${MessageHeader[type]}, type: ${type} }`);
    }) as MessageHeaderDecoder;
}

// Attach the encode/decode/fromJSON helpers as statics on the classes they serve.
Field['encode'] = encodeField;
Field['decode'] = decodeField;
Field['fromJSON'] = fieldFromJSON;

Schema['encode'] = encodeSchema;
Schema['decode'] = decodeSchema;
Schema['fromJSON'] = schemaFromJSON;

RecordBatch['encode'] = encodeRecordBatch;
RecordBatch['decode'] = decodeRecordBatch;
RecordBatch['fromJSON'] = recordBatchFromJSON;

DictionaryBatch['encode'] = encodeDictionaryBatch;
DictionaryBatch['decode'] = decodeDictionaryBatch;
DictionaryBatch['fromJSON'] = dictionaryBatchFromJSON;

FieldNode['encode'] = encodeFieldNode;
FieldNode['decode'] = decodeFieldNode;

BufferRegion['encode'] = encodeBufferRegion;
BufferRegion['decode'] = decodeBufferRegion;

declare module '../../schema' { + namespace Field { + export { encodeField as encode }; + export { decodeField as decode }; + export { fieldFromJSON as fromJSON }; + } + namespace Schema { + export { encodeSchema as
encode }; + export { decodeSchema as decode }; + export { schemaFromJSON as fromJSON }; + } +}

// Type-level counterpart of the static assignments above for the classes of this module.
declare module './message' {
    namespace RecordBatch {
        export { encodeRecordBatch as encode };
        export { decodeRecordBatch as decode };
        export { recordBatchFromJSON as fromJSON };
    }
    namespace DictionaryBatch {
        export { encodeDictionaryBatch as encode };
        export { decodeDictionaryBatch as decode };
        export { dictionaryBatchFromJSON as fromJSON };
    }
    namespace FieldNode {
        export { encodeFieldNode as encode };
        export { decodeFieldNode as decode };
    }
    namespace BufferRegion {
        export { encodeBufferRegion as encode };
        export { decodeBufferRegion as decode };
    }
}

/**
 * @ignore
 * Decodes a flatbuffer Schema; dictionary value types met along the way are
 * recorded in `dictionaries`, which is also handed to the resulting Schema.
 */
function decodeSchema(_schema: _Schema, dictionaries: Map<number, DataType> = new Map()) {
    const fields = decodeSchemaFields(_schema, dictionaries);
    return new Schema(fields, decodeCustomMetadata(_schema), dictionaries);
}

/** @ignore Decodes flatbuffer RecordBatch metadata (length, field nodes, buffer regions). */
function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V4) {
    return new RecordBatch(batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version));
}

/** @ignore Decodes flatbuffer DictionaryBatch metadata together with its nested RecordBatch. */
function decodeDictionaryBatch(batch: _DictionaryBatch, version = MetadataVersion.V4) {
    return new DictionaryBatch(RecordBatch.decode(batch.data()!, version), batch.id(), batch.isDelta());
}

/** @ignore */
function decodeBufferRegion(b: _Buffer) {
    return new BufferRegion(b.offset(), b.length());
}

/** @ignore */
function decodeFieldNode(f: _FieldNode) {
    return new FieldNode(f.length(), f.nullCount());
}

/** @ignore Collects the field nodes of a batch, skipping slots that read back as empty. */
function decodeFieldNodes(batch: _RecordBatch) {
    const nodes = [] as FieldNode[];
    const n = batch.nodesLength();
    for (let i = 0; i < n; ++i) {
        const f = batch.nodes(i);
        if (f) {
            nodes.push(FieldNode.decode(f));
        }
    }
    return nodes;
}

/** @ignore Collects the buffer regions of a batch, skipping slots that read back as empty. */
function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) {
    const bufferRegions = [] as BufferRegion[];
    const n = batch.buffersLength();
    for (let i = 0; i < n; ++i) {
        const b = batch.buffers(i);
        if (b) {
            // If this Arrow buffer was written before version 4,
            // advance the buffer's bb_pos 8 bytes to skip past
            // the now-removed page_id field
            if (version < MetadataVersion.V4) {
                b.bb_pos += (8 * (i + 1));
            }
            bufferRegions.push(BufferRegion.decode(b));
        }
    }
    return bufferRegions;
}

/** @ignore Decodes the top-level fields of a flatbuffer Schema. */
function decodeSchemaFields(schema: _Schema, dictionaries?: Map<number, DataType>) {
    const fields = [] as Field[];
    for (let f, i = -1, j = -1, n = schema.fieldsLength(); ++i < n;) {
        if (f = schema.fields(i)) {
            fields[++j] = Field.decode(f, dictionaries);
        }
    }
    return fields;
}

/** @ignore Decodes the child fields of a flatbuffer Field. */
function decodeFieldChildren(field: _Field, dictionaries?: Map<number, DataType>): Field[] {
    const children = [] as Field[];
    for (let f, i = -1, j = -1, n = field.childrenLength(); ++i < n;) {
        if (f = field.children(i)) {
            children[++j] = Field.decode(f, dictionaries);
        }
    }
    return children;
}

/**
 * @ignore
 * Decodes one flatbuffer Field. Dictionary-encoded fields are wrapped in a
 * Dictionary type whose value type is shared per dictionary id through `dictionaries`.
 */
function decodeField(f: _Field, dictionaries?: Map<number, DataType>) {

    let id: number;
    let field: Field | void;
    let type: DataType<any>;
    let keys: _Int | TKeys | null;
    let dictType: Dictionary;
    let dictMeta: _DictionaryEncoding | null;

    if (!dictionaries || !(dictMeta = f.dictionary())) {
        // No dictionary encoding
        type = decodeFieldType(f, decodeFieldChildren(f, dictionaries));
        field = new Field(f.name()!, type, f.nullable(), decodeCustomMetadata(f));
    } else if (!dictionaries.has(id = dictMeta.id().low)) {
        // Dictionary encoded and the first time this dictionary id is seen: decode the
        // data type and child fields, wrap in a Dictionary type, and insert the data
        // type into the dictionary types map.
        // (a dictionary index defaults to signed 32 bit int if unspecified)
        keys = (keys = dictMeta.indexType()) ? decodeIndexType(keys) as TKeys : new Int32();
        dictionaries.set(id, type = decodeFieldType(f, decodeFieldChildren(f, dictionaries)));
        dictType = new Dictionary(type, keys, id, dictMeta.isOrdered());
        field = new Field(f.name()!, dictType, f.nullable(), decodeCustomMetadata(f));
    } else {
        // Dictionary encoded and this dictionary id was already seen in the schema:
        // reuse the data type and wrap it in a new Dictionary type and field.
        // (a dictionary index defaults to signed 32 bit int if unspecified)
        keys = (keys = dictMeta.indexType()) ? decodeIndexType(keys) as TKeys : new Int32();
        dictType = new Dictionary(dictionaries.get(id)!, keys, id, dictMeta.isOrdered());
        field = new Field(f.name()!, dictType, f.nullable(), decodeCustomMetadata(f));
    }
    return field || null;
}

/** @ignore Copies the key/value metadata of a Schema or Field into a Map; entries with a null key are skipped. */
function decodeCustomMetadata(parent?: _Schema | _Field | null) {
    const data = new Map<string, string>();
    if (!parent) {
        return data;
    }
    const n = parent.customMetadataLength() | 0;
    for (let i = 0; i < n; ++i) {
        const entry = parent.customMetadata(i);
        const key = entry ? entry.key() : null;
        if (entry && key != null) {
            data.set(key, entry.value()!);
        }
    }
    return data;
}

/** @ignore */
function decodeIndexType(_type: _Int) {
    return new Int(_type.isSigned(), _type.bitWidth() as IntBitWidth);
}

/**
 * @ignore
 * Maps the type of a flatbuffer Field onto the matching DataType.
 * @throws Error when the type id is not one of the handled ids.
 */
function decodeFieldType(f: _Field, children?: Field[]): DataType<any> {

    const typeId = f.typeType();
    const kids = children || [];

    switch (typeId) {
        // Types that carry no parameters of their own
        case Type['NONE']:
        case Type['Null']: return new Null();
        case Type['Binary']: return new Binary();
        case Type['Utf8']: return new Utf8();
        case Type['Bool']: return new Bool();
        case Type['List']: return new List(kids[0]);
        case Type['Struct_']: return new Struct(kids);
        // Types whose parameters are read from the flatbuffer type table
        case Type['Int']: {
            const t = f.type(new _Int())!;
            return new Int(t.isSigned(), t.bitWidth());
        }
        case Type['FloatingPoint']: {
            const t = f.type(new _FloatingPoint())!;
            return new Float(t.precision());
        }
        case Type['Decimal']: {
            const t = f.type(new _Decimal())!;
            return new Decimal(t.scale(), t.precision());
        }
        case Type['Date']: {
            const t = f.type(new _Date())!;
            return new Date_(t.unit());
        }
        case Type['Time']: {
            const t = f.type(new _Time())!;
            return new Time(t.unit(), t.bitWidth() as TimeBitWidth);
        }
        case Type['Timestamp']: {
            const t = f.type(new _Timestamp())!;
            return new Timestamp(t.unit(), t.timezone());
        }
        case Type['Interval']: {
            const t = f.type(new _Interval())!;
            return new Interval(t.unit());
        }
        case Type['Union']: {
            const t = f.type(new _Union())!;
            return new Union(t.mode(), t.typeIdsArray() || [], kids);
        }
        case Type['FixedSizeBinary']: {
            const t = f.type(new _FixedSizeBinary())!;
            return new FixedSizeBinary(t.byteWidth());
        }
        case Type['FixedSizeList']: {
            const t = f.type(new _FixedSizeList())!;
            return new FixedSizeList(t.listSize(), kids[0]);
        }
        case Type['Map']: {
            const t = f.type(new _Map())!;
            return new Map_(kids[0], t.keysSorted());
        }
    }
    throw new Error(`Unrecognized type: "${Type[typeId]}" (${typeId})`);
}

/** @ignore Writes a Schema (fields, endianness, optional custom metadata) and returns its offset. */
function encodeSchema(b: Builder, schema: Schema) {

    const fieldOffsets = schema.fields.map((f) => Field.encode(b, f));

    _Schema.startFieldsVector(b, fieldOffsets.length);

    const fieldsVectorOffset = _Schema.createFieldsVector(b, fieldOffsets);

    // -1 marks "no metadata to write"
    let metadataOffset = -1;
    if (schema.metadata && schema.metadata.size > 0) {
        metadataOffset = _Schema.createCustomMetadataVector(b, [...schema.metadata].map(([k, v]) => {
            const key = b.createString(`${k}`);
            const val = b.createString(`${v}`);
            _KeyValue.startKeyValue(b);
            _KeyValue.addKey(b, key);
            _KeyValue.addValue(b, val);
            return _KeyValue.endKeyValue(b);
        }));
    }

    _Schema.startSchema(b);
    _Schema.addFields(b, fieldsVectorOffset);
    _Schema.addEndianness(b, platformIsLittleEndian ? _Endianness.Little : _Endianness.Big);

    if (metadataOffset !== -1) { _Schema.addCustomMetadata(b, metadataOffset); }

    return _Schema.endSchema(b);
}

/**
 * @ignore
 * Writes a Field and returns its offset. For dictionary fields the written type is
 * that of the dictionary values, and the dictionary encoding is written alongside it.
 */
function encodeField(b: Builder, field: Field) {

    // -1 marks "nothing written" for each optional part
    let nameOffset = -1;
    let typeOffset = -1;
    let dictionaryOffset = -1;

    const type = field.type;
    let typeId: Type = <any> field.typeId;

    if (!DataType.isDictionary(type)) {
        typeOffset = typeAssembler.visit(type, b)!;
    } else {
        typeId = type.dictionary.typeId;
        dictionaryOffset = typeAssembler.visit(type, b)!;
        typeOffset = typeAssembler.visit(type.dictionary, b)!;
    }

    const childOffsets = (type.children || []).map((f: Field) => Field.encode(b, f));
    const childrenVectorOffset = _Field.createChildrenVector(b, childOffsets);

    let metadataOffset = -1;
    if (field.metadata && field.metadata.size > 0) {
        metadataOffset = _Field.createCustomMetadataVector(b, [...field.metadata].map(([k, v]) => {
            const key = b.createString(`${k}`);
            const val = b.createString(`${v}`);
            _KeyValue.startKeyValue(b);
            _KeyValue.addKey(b, key);
            _KeyValue.addValue(b, val);
            return _KeyValue.endKeyValue(b);
        }));
    }

    if (field.name) {
        nameOffset = b.createString(field.name);
    }

    _Field.startField(b);
    _Field.addType(b, typeOffset);
    _Field.addTypeType(b, typeId);
    _Field.addChildren(b, childrenVectorOffset);
    _Field.addNullable(b, !!field.nullable);

    if (nameOffset !== -1) { _Field.addName(b, nameOffset); }
    if (dictionaryOffset !== -1) { _Field.addDictionary(b, dictionaryOffset); }
    if (metadataOffset !== -1) { _Field.addCustomMetadata(b, metadataOffset); }

    return _Field.endField(b);
}

/** @ignore Writes RecordBatch metadata and returns its offset; nodes and buffers are emitted in reverse order. */
function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) {

    const nodes = recordBatch.nodes || [];
    const buffers = recordBatch.buffers || [];

    _RecordBatch.startNodesVector(b, nodes.length);
    nodes.slice().reverse().forEach((n) => FieldNode.encode(b, n));

    const nodesVectorOffset = b.endVector();

    _RecordBatch.startBuffersVector(b, buffers.length);
    buffers.slice().reverse().forEach((b_) => BufferRegion.encode(b, b_));

    const buffersVectorOffset = b.endVector();

    _RecordBatch.startRecordBatch(b);
    _RecordBatch.addLength(b, new Long(recordBatch.length, 0));
    _RecordBatch.addNodes(b, nodesVectorOffset);
    _RecordBatch.addBuffers(b, buffersVectorOffset);
    return _RecordBatch.endRecordBatch(b);
}

/** @ignore Writes DictionaryBatch metadata (nested RecordBatch first) and returns its offset. */
function encodeDictionaryBatch(b: Builder, dictionaryBatch: DictionaryBatch) {
    const dataOffset = RecordBatch.encode(b, dictionaryBatch.data);
    _DictionaryBatch.startDictionaryBatch(b);
    _DictionaryBatch.addId(b, new Long(dictionaryBatch.id, 0));
    _DictionaryBatch.addIsDelta(b, dictionaryBatch.isDelta);
    _DictionaryBatch.addData(b, dataOffset);
    return _DictionaryBatch.endDictionaryBatch(b);
}

/** @ignore */
function encodeFieldNode(b: Builder, node: FieldNode) {
    return _FieldNode.createFieldNode(b, new Long(node.length, 0), new Long(node.nullCount, 0));
}

/** @ignore */
function encodeBufferRegion(b: Builder, node: BufferRegion) {
    return _Buffer.createBuffer(b, new Long(node.offset, 0), new Long(node.length, 0));
}

/** @ignore True when typed arrays on this platform store multi-byte values little-endian. */
const platformIsLittleEndian = (function() {
    const buffer = new ArrayBuffer(2);
    new DataView(buffer).setInt16(0, 256, true /* littleEndian */);
    // Int16Array uses the platform's endianness.
    return new Int16Array(buffer)[0] === 256;
})();

/** @ignore Factory signature whose result type follows the MessageHeader kind it is called with. */
type MessageHeaderDecoder = <T extends MessageHeader>() => T extends MessageHeader.Schema ? Schema
    : T extends MessageHeader.RecordBatch ? RecordBatch
    : T extends MessageHeader.DictionaryBatch ? DictionaryBatch : never;
diff --git a/src/arrow/js/src/ipc/reader.ts b/src/arrow/js/src/ipc/reader.ts new file mode 100644 index 000000000..a150ac1bb --- /dev/null +++ b/src/arrow/js/src/ipc/reader.ts @@ -0,0 +1,739 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../vector'; +import { DataType } from '../type'; +import { MessageHeader } from '../enum'; +import { Footer } from './metadata/file'; +import { Schema, Field } from '../schema'; +import streamAdapters from '../io/adapters'; +import { Message } from './metadata/message'; +import * as metadata from './metadata/message'; +import { ArrayBufferViewInput } from '../util/buffer'; +import { ByteStream, AsyncByteStream } from '../io/stream'; +import { RandomAccessFile, AsyncRandomAccessFile } from '../io/file'; +import { VectorLoader, JSONVectorLoader } from '../visitor/vectorloader'; +import { RecordBatch, _InternalEmptyPlaceholderRecordBatch } from '../recordbatch'; +import { + FileHandle, + ArrowJSONLike, + ITERATOR_DONE, + ReadableInterop, +} from '../io/interfaces'; +import { + MessageReader, AsyncMessageReader, JSONMessageReader, + checkForMagicArrowString, magicLength, magicAndPadding, magicX2AndPadding +} from './message'; +import { + isPromise, + isIterable, isAsyncIterable, + isIteratorResult, isArrowJSON, + isFileHandle, isFetchResponse, + isReadableDOMStream, isReadableNodeStream +} from '../util/compat'; + +/** @ignore */ export type FromArg0 = ArrowJSONLike; +/** @ignore */ export type FromArg1 = PromiseLike<ArrowJSONLike>; +/** @ignore */ 
export type FromArg2 = Iterable<ArrayBufferViewInput> | ArrayBufferViewInput;
/** @ignore */ export type FromArg3 = PromiseLike<Iterable<ArrayBufferViewInput> | ArrayBufferViewInput>;
/** @ignore */ export type FromArg4 = Response | NodeJS.ReadableStream | ReadableStream<ArrayBufferViewInput> | AsyncIterable<ArrayBufferViewInput>;
/** @ignore */ export type FromArg5 = FileHandle | PromiseLike<FileHandle> | PromiseLike<FromArg4>;
/** @ignore */ export type FromArgs = FromArg0 | FromArg1 | FromArg2 | FromArg3 | FromArg4 | FromArg5;

/** @ignore */ type OpenOptions = { autoDestroy?: boolean };
/** @ignore */ type RecordBatchReaders<T extends { [key: string]: DataType } = any> = RecordBatchFileReader<T> | RecordBatchStreamReader<T>;
/** @ignore */ type AsyncRecordBatchReaders<T extends { [key: string]: DataType } = any> = AsyncRecordBatchFileReader<T> | AsyncRecordBatchStreamReader<T>;
/** @ignore */ type RecordBatchFileReaders<T extends { [key: string]: DataType } = any> = RecordBatchFileReader<T> | AsyncRecordBatchFileReader<T>;
/** @ignore */ type RecordBatchStreamReaders<T extends { [key: string]: DataType } = any> = RecordBatchStreamReader<T> | AsyncRecordBatchStreamReader<T>;

/**
 * Public reader facade. Every operation is forwarded to the `_impl` object
 * handed to the constructor, so whether calls return values or Promises
 * depends on that implementation.
 */
export class RecordBatchReader<T extends { [key: string]: DataType } = any> extends ReadableInterop<RecordBatch<T>> {

    protected _impl: RecordBatchReaderImpls<T>;
    protected constructor(impl: RecordBatchReaderImpls<T>) {
        super();
        this._impl = impl;
    }

    public get closed() { return this._impl.closed; }
    public get schema() { return this._impl.schema; }
    public get autoDestroy() { return this._impl.autoDestroy; }
    public get dictionaries() { return this._impl.dictionaries; }
    public get numDictionaries() { return this._impl.numDictionaries; }
    public get numRecordBatches() { return this._impl.numRecordBatches; }
    /** The file footer, or `null` when the underlying impl is not a file reader. */
    public get footer(): Footer | null { return this._impl.isFile() ? this._impl.footer : null; }

    public isSync(): this is RecordBatchReaders<T> { return this._impl.isSync(); }
    public isAsync(): this is AsyncRecordBatchReaders<T> { return this._impl.isAsync(); }
    public isFile(): this is RecordBatchFileReaders<T> { return this._impl.isFile(); }
    public isStream(): this is RecordBatchStreamReaders<T> { return this._impl.isStream(); }

    public next() {
        return this._impl.next();
    }
    public throw(value?: any) {
        return this._impl.throw(value);
    }
    public return(value?: any) {
        return this._impl.return(value);
    }
    public cancel() {
        return this._impl.cancel();
    }
    /** Resets the impl and drops the cached DOM/node stream references. */
    public reset(schema?: Schema<T> | null): this {
        this._impl.reset(schema);
        this._DOMStream = undefined;
        this._nodeStream = undefined;
        return this;
    }
    /** Opens the impl; resolves to (or returns) this reader rather than the impl. */
    public open(options?: OpenOptions) {
        const opening = this._impl.open(options);
        if (isPromise(opening)) {
            return opening.then(() => this);
        }
        return this;
    }
    /** Reads the batch at `index` through a file impl; returns `null` for non-file impls. */
    public readRecordBatch(index: number): RecordBatch<T> | null | Promise<RecordBatch<T> | null> {
        return this._impl.isFile() ? this._impl.readRecordBatch(index) : null;
    }
    public [Symbol.iterator](): IterableIterator<RecordBatch<T>> {
        return (<IterableIterator<RecordBatch<T>>> this._impl)[Symbol.iterator]();
    }
    public [Symbol.asyncIterator](): AsyncIterableIterator<RecordBatch<T>> {
        return (<AsyncIterableIterator<RecordBatch<T>>> this._impl)[Symbol.asyncIterator]();
    }
    public toDOMStream() {
        // Expose this reader as a sync or async iterable depending on the impl
        return streamAdapters.toDOMStream<RecordBatch<T>>(
            (this.isSync()
                ? { [Symbol.iterator]: () => this } as Iterable<RecordBatch<T>>
                : { [Symbol.asyncIterator]: () => this } as AsyncIterable<RecordBatch<T>>));
    }
    public toNodeStream() {
        return streamAdapters.toNodeStream<RecordBatch<T>>(
            (this.isSync()
                ? { [Symbol.iterator]: () => this } as Iterable<RecordBatch<T>>
                : { [Symbol.asyncIterator]: () => this } as AsyncIterable<RecordBatch<T>>),
            { objectMode: true });
    }

    /**
     * Placeholder that always throws in this class.
     * @nocollapse
     */
    // @ts-ignore
    public static throughNode(options?: import('stream').DuplexOptions & { autoDestroy: boolean }): import('stream').Duplex {
        throw new Error(`"throughNode" not available in this environment`);
    }
    /**
     * Placeholder that always throws in this class.
     * @nocollapse
     */
    public static throughDOM<T extends { [key: string]: DataType }>(
        // @ts-ignore
        writableStrategy?: ByteLengthQueuingStrategy,
        // @ts-ignore
        readableStrategy?: { autoDestroy: boolean }
    ): { writable: WritableStream<Uint8Array>; readable: ReadableStream<RecordBatch<T>> } {
        throw new Error(`"throughDOM" not available in this environment`);
    }

    public static from<T extends RecordBatchReader>(source: T): T;
    public static from<T extends { [key: string]: DataType } = any>(source: FromArg0): RecordBatchStreamReader<T>;
    public static from<T extends { [key: string]: DataType } = any>(source: FromArg1): Promise<RecordBatchStreamReader<T>>;
    public static from<T extends { [key: string]: DataType } = any>(source: FromArg2): RecordBatchFileReader<T> | RecordBatchStreamReader<T>;
    public static from<T extends { [key: string]: DataType } = any>(source: FromArg3): Promise<RecordBatchFileReader<T> | RecordBatchStreamReader<T>>;
    public static from<T extends { [key: string]: DataType } = any>(source: FromArg4): Promise<RecordBatchFileReader<T> | AsyncRecordBatchReaders<T>>;
    public static from<T extends { [key: string]: DataType } = any>(source: FromArg5): Promise<AsyncRecordBatchFileReader<T> | AsyncRecordBatchStreamReader<T>>;
    /**
     * Picks a reader for `source` by probing its kind in a fixed order; anything
     * not recognised by the probes is treated as a synchronous byte source.
     * @nocollapse
     */
    public static from<T extends { [key: string]: DataType } = any>(source: any) {
        if (source instanceof RecordBatchReader) {
            return source;
        }
        if (isArrowJSON(source)) {
            return fromArrowJSON<T>(source);
        }
        if (isFileHandle(source)) {
            return fromFileHandle<T>(source);
        }
        if (isPromise<any>(source)) {
            // Wait for the source, then dispatch again on the resolved value
            return (async () => await RecordBatchReader.from<any>(await source))();
        }
        if (isFetchResponse(source) || isReadableDOMStream(source) || isReadableNodeStream(source) || isAsyncIterable(source)) {
            return fromAsyncByteStream<T>(new AsyncByteStream(source));
        }
        return fromByteStream<T>(new ByteStream(source));
    }

    public static readAll<T extends RecordBatchReader>(source: T): T extends RecordBatchReaders ? IterableIterator<T> : AsyncIterableIterator<T>;
    public static readAll<T extends { [key: string]: DataType } = any>(source: FromArg0): IterableIterator<RecordBatchStreamReader<T>>;
    public static readAll<T extends { [key: string]: DataType } = any>(source: FromArg1): AsyncIterableIterator<RecordBatchStreamReader<T>>;
    public static readAll<T extends { [key: string]: DataType } = any>(source: FromArg2): IterableIterator<RecordBatchFileReader<T> | RecordBatchStreamReader<T>>;
    public static readAll<T extends { [key: string]: DataType } = any>(source: FromArg3): AsyncIterableIterator<RecordBatchFileReader<T> | RecordBatchStreamReader<T>>;
    public static readAll<T extends { [key: string]: DataType } = any>(source: FromArg4): AsyncIterableIterator<RecordBatchFileReader<T> | AsyncRecordBatchReaders<T>>;
    public static readAll<T extends { [key: string]: DataType } = any>(source: FromArg5): AsyncIterableIterator<AsyncRecordBatchFileReader<T> | AsyncRecordBatchStreamReader<T>>;
    /**
     * Chooses between `readAllSync` and `readAllAsync` based on the kind of `source`.
     * @nocollapse
     */
    public static readAll<T extends { [key: string]: DataType } = any>(source: any) {
        if (source instanceof RecordBatchReader) {
            return source.isSync() ? readAllSync(source) : readAllAsync(source as AsyncRecordBatchReaders<T>);
        }
        if (isArrowJSON(source) || ArrayBuffer.isView(source) || isIterable<ArrayBufferViewInput>(source) || isIteratorResult(source)) {
            return readAllSync<T>(source) as IterableIterator<RecordBatchReaders<T>>;
        }
        return readAllAsync<T>(source) as AsyncIterableIterator<RecordBatchReaders<T> | AsyncRecordBatchReaders<T>>;
    }
}

//
// Since TS is a structural type system, we define the following subclass stubs
// so that concrete types exist to associate with the interfaces below.
//
// The implementation for each RecordBatchReader is hidden away in the set of
// `RecordBatchReaderImpl` classes in the second half of this file. This allows
// us to export a single RecordBatchReader class, and swap out the impl based
// on the io primitives or underlying arrow (JSON, file, or stream) at runtime.
//
// Async/await makes our job a bit harder, since it forces everything to be
// either fully sync or fully async. This is why the logic for the reader impls
// has been duplicated into both sync and async variants. Since the RBR
// delegates to its impl, an RBR with an AsyncRecordBatchFileReaderImpl for
// example will return async/await-friendly Promises, but one with a (sync)
// RecordBatchStreamReaderImpl will always return values. Nothing should be
// different about their logic, aside from the async handling. This is also why
// this code looks highly structured, as it should be nearly identical and easy
// to follow.
//

/**
 * Synchronous stream reader stub. Iteration is delegated to the wrapped
 * sync impl; the async iterator simply re-yields the sync iterator's batches.
 * @ignore
 */
export class RecordBatchStreamReader<T extends { [key: string]: DataType } = any> extends RecordBatchReader<T> {
    constructor(protected _impl: RecordBatchStreamReaderImpl<T>) { super (_impl); }
    public [Symbol.iterator]() { return (this._impl as IterableIterator<RecordBatch<T>>)[Symbol.iterator](); }
    public async *[Symbol.asyncIterator](): AsyncIterableIterator<RecordBatch<T>> { yield* this[Symbol.iterator](); }
}
/**
 * Asynchronous stream reader stub. Sync iteration is not supported and throws;
 * async iteration is delegated to the wrapped async impl.
 * @ignore
 */
export class AsyncRecordBatchStreamReader<T extends { [key: string]: DataType } = any> extends RecordBatchReader<T> {
    constructor(protected _impl: AsyncRecordBatchStreamReaderImpl<T>) { super (_impl); }
    public [Symbol.iterator](): IterableIterator<RecordBatch<T>> { throw new Error(`AsyncRecordBatchStreamReader is not Iterable`); }
    public [Symbol.asyncIterator]() { return (this._impl as AsyncIterableIterator<RecordBatch<T>>)[Symbol.asyncIterator](); }
}
/**
 * Synchronous file reader stub; narrows the impl type to the file impl.
 * @ignore
 */
export class RecordBatchFileReader<T extends { [key: string]: DataType } = any> extends RecordBatchStreamReader<T> {
    constructor(protected _impl: RecordBatchFileReaderImpl<T>) { super (_impl); }
}
/**
 * Asynchronous file reader stub; narrows the impl type to the async file impl.
 * @ignore
 */
export class AsyncRecordBatchFileReader<T extends { [key: string]: DataType } = any> extends AsyncRecordBatchStreamReader<T> {
    constructor(protected _impl: AsyncRecordBatchFileReaderImpl<T>) { super (_impl); }
}

//
// Now override the return types for each sync/async RecordBatchReader variant
//

/** @ignore */
export interface RecordBatchStreamReader<T extends { [key: string]: DataType } = any> extends RecordBatchReader<T> {
    open(options?: OpenOptions | undefined): this;
    cancel(): void;
    throw(value?: any): IteratorResult<any>;
    return(value?: any): IteratorResult<any>;
    next(value?: any): IteratorResult<RecordBatch<T>>;
}

/** @ignore */
export interface AsyncRecordBatchStreamReader<T extends { [key: string]: DataType } = any> extends RecordBatchReader<T> {
    open(options?: OpenOptions | undefined): Promise<this>;
    cancel(): Promise<void>;
    throw(value?: any): Promise<IteratorResult<any>>;
    return(value?: any): Promise<IteratorResult<any>>;
    next(value?: any): Promise<IteratorResult<RecordBatch<T>>>;
}

/** @ignore */
export interface RecordBatchFileReader<T extends { [key: string]: DataType } = any> extends RecordBatchStreamReader<T> {
    readRecordBatch(index: number): RecordBatch<T> | null;
}

/** @ignore */
export interface AsyncRecordBatchFileReader<T extends { [key: string]: DataType } = any> extends AsyncRecordBatchStreamReader<T> {
    readRecordBatch(index: number): Promise<RecordBatch<T> | null>;
}

/**
 * Union of every concrete reader implementation defined below.
 * @ignore
 */
type RecordBatchReaderImpls<T extends { [key: string]: DataType } = any> =
    RecordBatchJSONReaderImpl<T> |
    RecordBatchFileReaderImpl<T> |
    RecordBatchStreamReaderImpl<T> |
    AsyncRecordBatchFileReaderImpl<T> |
    AsyncRecordBatchStreamReaderImpl<T>;

/**
 * State and type guards shared by all reader implementations.
 * @ignore
 */
interface RecordBatchReaderImpl<T extends { [key: string]: DataType } = any> {

    closed: boolean;
    schema: Schema<T>;
    autoDestroy: boolean;
    dictionaries: Map<number, Vector>;

    isFile(): this is RecordBatchFileReaders<T>;
    isStream(): this is RecordBatchStreamReaders<T>;
    isSync(): this is RecordBatchReaders<T>;
    isAsync(): this is AsyncRecordBatchReaders<T>;

    reset(schema?: Schema<T> | null): this;
}

/** @ignore */
interface RecordBatchStreamReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchReaderImpl<T> {

    open(options?: OpenOptions): this;
    cancel(): void;

    throw(value?: any): IteratorResult<any>;
    return(value?: any): IteratorResult<any>;
    next(value?: any): IteratorResult<RecordBatch<T>>;

    [Symbol.iterator](): IterableIterator<RecordBatch<T>>;
}

/** @ignore */
interface AsyncRecordBatchStreamReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchReaderImpl<T> {

    open(options?: OpenOptions): Promise<this>;
    cancel(): Promise<void>;

    throw(value?: any): Promise<IteratorResult<any>>;
    return(value?: any): Promise<IteratorResult<any>>;
    next(value?: any): Promise<IteratorResult<RecordBatch<T>>>;

    [Symbol.asyncIterator](): AsyncIterableIterator<RecordBatch<T>>;
}

/** @ignore */
interface RecordBatchFileReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchStreamReaderImpl<T> {
    readRecordBatch(index: number): RecordBatch<T> | null;
}

/** @ignore */
interface AsyncRecordBatchFileReaderImpl<T extends { [key: string]: DataType } = any> extends AsyncRecordBatchStreamReaderImpl<T> {
    readRecordBatch(index: number): Promise<RecordBatch<T> | null>;
}

/**
 * Base class for all reader implementations. Holds the schema, the loaded
 * dictionaries, and the running dictionary/record-batch counters, and knows
 * how to turn message headers + bodies into RecordBatches and Vectors.
 * @ignore
 */
abstract class RecordBatchReaderImpl<T extends { [key: string]: DataType } = any> implements RecordBatchReaderImpl<T> {

    public schema!: Schema<T>;
    public closed = false;
    public autoDestroy = true;
    public dictionaries: Map<number, Vector>;

    protected _dictionaryIndex = 0;
    protected _recordBatchIndex = 0;
    public get numDictionaries() { return this._dictionaryIndex; }
    public get numRecordBatches() { return this._recordBatchIndex; }

    constructor(dictionaries = new Map<number, Vector>()) {
        this.dictionaries = dictionaries;
    }

    // The base impl answers `false` to every guard; subclasses opt in.
    public isSync(): this is RecordBatchReaders<T> { return false; }
    public isAsync(): this is AsyncRecordBatchReaders<T> { return false; }
    public isFile(): this is RecordBatchFileReaders<T> { return false; }
    public isStream(): this is RecordBatchStreamReaders<T> { return false; }

    /**
     * Zeroes the counters, replaces the schema with the one given (possibly
     * undefined/null) and starts over with an empty dictionary map.
     */
    public reset(schema?: Schema<T> | null) {
        this._dictionaryIndex = 0;
        this._recordBatchIndex = 0;
        this.schema = <any> schema;
        this.dictionaries = new Map();
        return this;
    }

    /** Builds a RecordBatch for the current schema from a record batch message. */
    protected _loadRecordBatch(header: metadata.RecordBatch, body: any) {
        return new RecordBatch<T>(this.schema, header.length, this._loadVectors(header, body, this.schema.fields));
    }
    /**
     * Produces the dictionary Vector that should be stored for `header.id`.
     * A delta batch is concatenated onto the existing dictionary; a first-time
     * batch yields a fresh Vector; a non-delta batch for an id that is already
     * loaded returns the existing dictionary unchanged.
     */
    protected _loadDictionaryBatch(header: metadata.DictionaryBatch, body: any) {
        const { id, isDelta, data } = header;
        const { dictionaries, schema } = this;
        const dictionary = dictionaries.get(id);
        if (isDelta || !dictionary) {
            const type = schema.dictionaries.get(id)!;
            return (dictionary && isDelta ? dictionary.concat(
                Vector.new(this._loadVectors(data, body, [type])[0])) :
                Vector.new(this._loadVectors(data, body, [type])[0])) as Vector;
        }
        return dictionary;
    }
    /** Decodes the message body into one Data/Vector per requested type. */
    protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) {
        return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries).visitMany(types);
    }
}

/**
 * Synchronous reader over a ByteStream or an Arrow JSON object.
 * @ignore
 */
class RecordBatchStreamReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchReaderImpl<T> implements IterableIterator<RecordBatch<T>> {

    protected _reader: MessageReader;
    protected _handle: ByteStream | ArrowJSONLike;

    constructor(source: ByteStream | ArrowJSONLike, dictionaries?: Map<number, Vector>) {
        super(dictionaries);
        // JSON sources get the JSON message reader, everything else the binary one.
        this._reader = !isArrowJSON(source)
            ? new MessageReader(this._handle = source)
            : new JSONMessageReader(this._handle = source);
    }

    public isSync(): this is RecordBatchReaders<T> { return true; }
    public isStream(): this is RecordBatchStreamReaders<T> { return true; }
    public [Symbol.iterator](): IterableIterator<RecordBatch<T>> {
        return this as IterableIterator<RecordBatch<T>>;
    }
    /** Closes the reader unconditionally (ignores `autoDestroy`) and drops its references. */
    public cancel() {
        if (!this.closed && (this.closed = true)) {
            this.reset()._reader.return();
            this._reader = <any> null;
            this.dictionaries = <any> null;
        }
    }
    /** Applies `options.autoDestroy` and reads the schema if there is none yet; cancels when no schema can be read. */
    public open(options?: OpenOptions) {
        if (!this.closed) {
            this.autoDestroy = shouldAutoDestroy(this, options);
            if (!(this.schema || (this.schema = this._reader.readSchema()!))) {
                this.cancel();
            }
        }
        return this;
    }
    /** Closes and forwards to the message reader only when `autoDestroy` is set. */
    public throw(value?: any): IteratorResult<any> {
        if (!this.closed && this.autoDestroy && (this.closed = true)) {
            return this.reset()._reader.throw(value);
        }
        return ITERATOR_DONE;
    }
    /** Closes and forwards to the message reader only when `autoDestroy` is set. */
    public return(value?: any): IteratorResult<any> {
        if (!this.closed && this.autoDestroy && (this.closed = true)) {
            return this.reset()._reader.return(value);
        }
        return ITERATOR_DONE;
    }
    /**
     * Reads messages until a record batch is produced. Schema messages reset
     * the reader; dictionary batches are loaded into `dictionaries` and skipped.
     */
    public next(): IteratorResult<RecordBatch<T>> {
        if (this.closed) { return ITERATOR_DONE; }
        let message: Message | null;
        const { _reader: reader } = this;
        while (message = this._readNextMessageAndValidate()) {
            if (message.isSchema()) {
                this.reset(message.header());
            } else if (message.isRecordBatch()) {
                this._recordBatchIndex++;
                const header = message.header();
                const buffer = reader.readMessageBody(message.bodyLength);
                const recordBatch = this._loadRecordBatch(header, buffer);
                return { done: false, value: recordBatch };
            } else if (message.isDictionaryBatch()) {
                this._dictionaryIndex++;
                const header = message.header();
                const buffer = reader.readMessageBody(message.bodyLength);
                const vector = this._loadDictionaryBatch(header, buffer);
                this.dictionaries.set(header.id, vector);
            }
        }
        // A schema with no record batches still yields one empty placeholder batch.
        if (this.schema && this._recordBatchIndex === 0) {
            this._recordBatchIndex++;
            return { done: false, value: new _InternalEmptyPlaceholderRecordBatch<T>(this.schema) };
        }
        return this.return();
    }
    protected _readNextMessageAndValidate<T extends MessageHeader>(type?: T | null) {
        return this._reader.readMessage<T>(type);
    }
}

/**
 * Asynchronous counterpart of RecordBatchStreamReaderImpl over an AsyncByteStream.
 * @ignore
 */
class AsyncRecordBatchStreamReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchReaderImpl<T> implements AsyncIterableIterator<RecordBatch<T>> {

    protected _handle: AsyncByteStream;
    protected _reader: AsyncMessageReader;

    constructor(source: AsyncByteStream, dictionaries?: Map<number, Vector>) {
        super(dictionaries);
        this._reader = new AsyncMessageReader(this._handle = source);
    }
    public isAsync(): this is AsyncRecordBatchReaders<T> { return true; }
    public isStream(): this is RecordBatchStreamReaders<T> { return true; }
    public [Symbol.asyncIterator](): AsyncIterableIterator<RecordBatch<T>> {
        return this as AsyncIterableIterator<RecordBatch<T>>;
    }
    /** Closes the reader unconditionally (ignores `autoDestroy`) and drops its references. */
    public async cancel() {
        if (!this.closed && (this.closed = true)) {
            await this.reset()._reader.return();
            this._reader = <any> null;
            this.dictionaries = <any> null;
        }
    }
    /** Applies `options.autoDestroy` and reads the schema if there is none yet; cancels when no schema can be read. */
    public async open(options?: OpenOptions) {
        if (!this.closed) {
            this.autoDestroy = shouldAutoDestroy(this, options);
            if (!(this.schema || (this.schema = (await this._reader.readSchema())!))) {
                await this.cancel();
            }
        }
        return this;
    }
    /** Closes and forwards to the message reader only when `autoDestroy` is set. */
    public async throw(value?: any): Promise<IteratorResult<any>> {
        if (!this.closed && this.autoDestroy && (this.closed = true)) {
            return await this.reset()._reader.throw(value);
        }
        return ITERATOR_DONE;
    }
    /** Closes and forwards to the message reader only when `autoDestroy` is set. */
    public async return(value?: any): Promise<IteratorResult<any>> {
        if (!this.closed && this.autoDestroy && (this.closed = true)) {
            return await this.reset()._reader.return(value);
        }
        return ITERATOR_DONE;
    }
    /**
     * Reads messages until a record batch is produced. Schema messages reset
     * the reader; dictionary batches are loaded into `dictionaries` and skipped.
     */
    public async next() {
        if (this.closed) { return ITERATOR_DONE; }
        let message: Message | null;
        const { _reader: reader } = this;
        while (message = await this._readNextMessageAndValidate()) {
            if (message.isSchema()) {
                await this.reset(message.header());
            } else if (message.isRecordBatch()) {
                this._recordBatchIndex++;
                const header = message.header();
                const buffer = await reader.readMessageBody(message.bodyLength);
                const recordBatch = this._loadRecordBatch(header, buffer);
                return { done: false, value: recordBatch };
            } else if (message.isDictionaryBatch()) {
                this._dictionaryIndex++;
                const header = message.header();
                const buffer = await reader.readMessageBody(message.bodyLength);
                const vector = this._loadDictionaryBatch(header, buffer);
                this.dictionaries.set(header.id, vector);
            }
        }
        // A schema with no record batches still yields one empty placeholder batch.
        if (this.schema && this._recordBatchIndex === 0) {
            this._recordBatchIndex++;
            return { done: false, value: new _InternalEmptyPlaceholderRecordBatch<T>(this.schema) };
        }
        return await this.return();
    }
    protected async _readNextMessageAndValidate<T extends MessageHeader>(type?: T | null) {
        return await this._reader.readMessage<T>(type);
    }
}

/**
 * Synchronous reader for the random-access file format: uses the footer to
 * locate dictionary and record batches instead of scanning sequentially.
 * @ignore
 */
class RecordBatchFileReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchStreamReaderImpl<T> {

    protected _footer?: Footer;
    protected _handle!: RandomAccessFile;
    public get footer() { return this._footer!; }
    public get numDictionaries() { return this._footer ? this._footer.numDictionaries : 0; }
    public get numRecordBatches() { return this._footer ? this._footer.numRecordBatches : 0; }

    constructor(source: RandomAccessFile | ArrayBufferViewInput, dictionaries?: Map<number, Vector>) {
        super(source instanceof RandomAccessFile ? source : new RandomAccessFile(source), dictionaries);
    }
    public isSync(): this is RecordBatchReaders<T> { return true; }
    public isFile(): this is RecordBatchFileReaders<T> { return true; }
    /** On first open, decodes the footer, adopts its schema and loads every dictionary batch it lists. */
    public open(options?: OpenOptions) {
        if (!this.closed && !this._footer) {
            this.schema = (this._footer = this._readFooter()).schema;
            for (const block of this._footer.dictionaryBatches()) {
                block && this._readDictionaryBatch(this._dictionaryIndex++);
            }
        }
        return super.open(options);
    }
    /** Random-access read of the record batch at `index`; returns null when closed or not found. */
    public readRecordBatch(index: number) {
        if (this.closed) { return null; }
        if (!this._footer) { this.open(); }
        const block = this._footer && this._footer.getRecordBatch(index);
        if (block && this._handle.seek(block.offset)) {
            const message = this._reader.readMessage(MessageHeader.RecordBatch);
            if (message?.isRecordBatch()) {
                const header = message.header();
                const buffer = this._reader.readMessageBody(message.bodyLength);
                const recordBatch = this._loadRecordBatch(header, buffer);
                return recordBatch;
            }
        }
        return null;
    }
    /** Seeks to the dictionary block at `index` and stores the decoded dictionary. */
    protected _readDictionaryBatch(index: number) {
        const block = this._footer && this._footer.getDictionaryBatch(index);
        if (block && this._handle.seek(block.offset)) {
            const message = this._reader.readMessage(MessageHeader.DictionaryBatch);
            if (message?.isDictionaryBatch()) {
                const header = message.header();
                const buffer = this._reader.readMessageBody(message.bodyLength);
                const vector = this._loadDictionaryBatch(header, buffer);
                this.dictionaries.set(header.id, vector);
            }
        }
    }
    /** Reads the footer length stored `magicAndPadding` bytes before the end of the file, then decodes the footer preceding it. */
    protected _readFooter() {
        const { _handle } = this;
        const offset = _handle.size - magicAndPadding;
        const length = _handle.readInt32(offset);
        const buffer = _handle.readAt(offset - length, length);
        return Footer.decode(buffer);
    }
    /** Seeks to the next unread record batch block listed in the footer and reads its message. */
    protected _readNextMessageAndValidate<T extends MessageHeader>(type?: T | null): Message<T> | null {
        if (!this._footer) { this.open(); }
        if (this._footer && this._recordBatchIndex < this.numRecordBatches) {
            const block = this._footer && this._footer.getRecordBatch(this._recordBatchIndex);
            if (block && this._handle.seek(block.offset)) {
                return this._reader.readMessage(type);
            }
        }
        return null;
    }
}

/**
 * Asynchronous counterpart of RecordBatchFileReaderImpl over a FileHandle or
 * AsyncRandomAccessFile.
 * @ignore
 */
class AsyncRecordBatchFileReaderImpl<T extends { [key: string]: DataType } = any> extends AsyncRecordBatchStreamReaderImpl<T>
    implements AsyncRecordBatchFileReaderImpl<T> {

    protected _footer?: Footer;
    protected _handle!: AsyncRandomAccessFile;
    public get footer() { return this._footer!; }
    public get numDictionaries() { return this._footer ? this._footer.numDictionaries : 0; }
    public get numRecordBatches() { return this._footer ? this._footer.numRecordBatches : 0; }

    constructor(source: FileHandle, byteLength?: number, dictionaries?: Map<number, Vector>);
    constructor(source: FileHandle | AsyncRandomAccessFile, dictionaries?: Map<number, Vector>);
    constructor(source: FileHandle | AsyncRandomAccessFile, ...rest: any[]) {
        // Only consume the first rest argument as `byteLength` when it really is
        // a number. The previous `!== 'number'` test was inverted: it dropped a
        // numeric byteLength and swallowed a `dictionaries` Map passed in its place.
        const byteLength = typeof rest[0] === 'number' ? <number> rest.shift() : undefined;
        const dictionaries = rest[0] instanceof Map ? <Map<number, Vector>> rest.shift() : undefined;
        super(source instanceof AsyncRandomAccessFile ? source : new AsyncRandomAccessFile(source, byteLength), dictionaries);
    }
    public isFile(): this is RecordBatchFileReaders<T> { return true; }
    public isAsync(): this is AsyncRecordBatchReaders<T> { return true; }
    /** On first open, decodes the footer, adopts its schema and loads every dictionary batch it lists. */
    public async open(options?: OpenOptions) {
        if (!this.closed && !this._footer) {
            this.schema = (this._footer = await this._readFooter()).schema;
            for (const block of this._footer.dictionaryBatches()) {
                block && await this._readDictionaryBatch(this._dictionaryIndex++);
            }
        }
        return await super.open(options);
    }
    /** Random-access read of the record batch at `index`; resolves to null when closed or not found. */
    public async readRecordBatch(index: number) {
        if (this.closed) { return null; }
        if (!this._footer) { await this.open(); }
        const block = this._footer && this._footer.getRecordBatch(index);
        if (block && (await this._handle.seek(block.offset))) {
            const message = await this._reader.readMessage(MessageHeader.RecordBatch);
            if (message?.isRecordBatch()) {
                const header = message.header();
                const buffer = await this._reader.readMessageBody(message.bodyLength);
                const recordBatch = this._loadRecordBatch(header, buffer);
                return recordBatch;
            }
        }
        return null;
    }
    /** Seeks to the dictionary block at `index` and stores the decoded dictionary. */
    protected async _readDictionaryBatch(index: number) {
        const block = this._footer && this._footer.getDictionaryBatch(index);
        if (block && (await this._handle.seek(block.offset))) {
            const message = await this._reader.readMessage(MessageHeader.DictionaryBatch);
            if (message?.isDictionaryBatch()) {
                const header = message.header();
                const buffer = await this._reader.readMessageBody(message.bodyLength);
                const vector = this._loadDictionaryBatch(header, buffer);
                this.dictionaries.set(header.id, vector);
            }
        }
    }
    /** Waits for any pending work on the handle, then reads and decodes the footer from the end of the file. */
    protected async _readFooter() {
        const { _handle } = this;
        _handle._pending && await _handle._pending;
        const offset = _handle.size - magicAndPadding;
        const length = await _handle.readInt32(offset);
        const buffer = await _handle.readAt(offset - length, length);
        return Footer.decode(buffer);
    }
    /** Seeks to the next unread record batch block listed in the footer and reads its message. */
    protected async _readNextMessageAndValidate<T extends MessageHeader>(type?: T | null): Promise<Message<T> | null> {
        if (!this._footer) { await this.open(); }
        if (this._footer && this._recordBatchIndex < this.numRecordBatches) {
            const block = this._footer.getRecordBatch(this._recordBatchIndex);
            if (block && await this._handle.seek(block.offset)) {
                return await this._reader.readMessage(type);
            }
        }
        return null;
    }
}

/**
 * Stream reader over Arrow JSON; identical to the sync stream impl except
 * that vectors are decoded with JSONVectorLoader.
 * @ignore
 */
class RecordBatchJSONReaderImpl<T extends { [key: string]: DataType } = any> extends RecordBatchStreamReaderImpl<T> {
    constructor(source: ArrowJSONLike, dictionaries?: Map<number, Vector>) {
        super(source, dictionaries);
    }
    protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) {
        return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries).visitMany(types);
    }
}

//
// Define some helper functions and static implementations down here. There's
// a bit of branching in the static methods that can lead to the same routines
// being executed, so we've broken those out here for readability.
//

/**
 * Returns `options.autoDestroy` when it is explicitly a boolean, otherwise the
 * reader's current `autoDestroy` value.
 * @ignore
 */
function shouldAutoDestroy(self: { autoDestroy: boolean }, options?: OpenOptions) {
    return options && (typeof options['autoDestroy'] === 'boolean') ? options['autoDestroy'] : self['autoDestroy'];
}

/**
 * Yields the same reader once per stream found in `source`: after each yield
 * the reader is reset and re-opened, and iteration stops once it reports
 * closed. The reader is always cancelled on exit.
 * @ignore
 */
function* readAllSync<T extends { [key: string]: DataType } = any>(source: RecordBatchReaders<T> | FromArg0 | FromArg2) {
    const reader = RecordBatchReader.from<T>(<any> source) as RecordBatchReaders<T>;
    try {
        if (!reader.open({ autoDestroy: false }).closed) {
            do { yield reader; } while (!(reader.reset().open()).closed);
        }
    } finally { reader.cancel(); }
}

/**
 * Async counterpart of readAllSync.
 * @ignore
 */
async function* readAllAsync<T extends { [key: string]: DataType } = any>(source: AsyncRecordBatchReaders<T> | FromArg1 | FromArg3 | FromArg4 | FromArg5) {
    const reader = await RecordBatchReader.from<T>(<any> source) as RecordBatchReader<T>;
    try {
        if (!(await reader.open({ autoDestroy: false })).closed) {
            do { yield reader; } while (!(await reader.reset().open()).closed);
        }
    } finally { await reader.cancel(); }
}

/** @ignore */
function fromArrowJSON<T extends { [key: string]: DataType }>(source: ArrowJSONLike) {
    return new RecordBatchStreamReader(new RecordBatchJSONReaderImpl<T>(source));
}

/**
 * Peeks at the head of the stream: with at least 4 bytes available, the Arrow
 * magic string selects the file reader (over the fully-read source) and its
 * absence selects the stream reader; otherwise an empty stream reader is returned.
 * @ignore
 */
function fromByteStream<T extends { [key: string]: DataType }>(source: ByteStream) {
    const bytes = source.peek((magicLength + 7) & ~7);
    return bytes && bytes.byteLength >= 4 ? !checkForMagicArrowString(bytes)
        ? new RecordBatchStreamReader(new RecordBatchStreamReaderImpl<T>(source))
        : new RecordBatchFileReader(new RecordBatchFileReaderImpl<T>(source.read()))
        : new RecordBatchStreamReader(new RecordBatchStreamReaderImpl<T>(function*(): any {}()));
}

/**
 * Async counterpart of fromByteStream. Note that a file-format source is read
 * fully and handed to the synchronous file reader.
 * @ignore
 */
async function fromAsyncByteStream<T extends { [key: string]: DataType }>(source: AsyncByteStream) {
    const bytes = await source.peek((magicLength + 7) & ~7);
    return bytes && bytes.byteLength >= 4 ? !checkForMagicArrowString(bytes)
        ? new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl<T>(source))
        : new RecordBatchFileReader(new RecordBatchFileReaderImpl<T>(await source.read()))
        : new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl<T>(async function*(): any {}()));
}

/**
 * Stats the file handle and returns an async file reader when the file is at
 * least `magicX2AndPadding` bytes long and starts with the Arrow magic string;
 * otherwise returns an async stream reader over the same file.
 * @ignore
 */
async function fromFileHandle<T extends { [key: string]: DataType }>(source: FileHandle) {
    const { size } = await source.stat();
    const file = new AsyncRandomAccessFile(source, size);
    if (size >= magicX2AndPadding) {
        if (checkForMagicArrowString(await file.readAt(0, (magicLength + 7) & ~7))) {
            return new AsyncRecordBatchFileReader(new AsyncRecordBatchFileReaderImpl<T>(file));
        }
    }
    return new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl<T>(file));
}
diff --git a/src/arrow/js/src/ipc/writer.ts b/src/arrow/js/src/ipc/writer.ts new file mode 100644 index 000000000..12aa83355 --- /dev/null +++ b/src/arrow/js/src/ipc/writer.ts @@ -0,0 +1,492 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Table } from '../table'; +import { MAGIC } from './message'; +import { Vector } from '../vector'; +import { Column } from '../column'; +import { DataType } from '../type'; +import { Schema, Field } from '../schema'; +import { Message } from './metadata/message'; +import * as metadata from './metadata/message'; +import { FileBlock, Footer } from './metadata/file'; +import { MessageHeader, MetadataVersion } from '../enum'; +import { compareSchemas } from '../visitor/typecomparator'; +import { WritableSink, AsyncByteQueue } from '../io/stream'; +import { VectorAssembler } from '../visitor/vectorassembler'; +import { JSONTypeAssembler } from '../visitor/jsontypeassembler'; +import { JSONVectorAssembler } from '../visitor/jsonvectorassembler'; +import { ArrayBufferViewInput, toUint8Array } from '../util/buffer'; +import { RecordBatch, _InternalEmptyPlaceholderRecordBatch } from '../recordbatch'; +import { Writable, ReadableInterop, ReadableDOMStreamOptions } from '../io/interfaces'; +import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream, isIterable, isObject } from '../util/compat'; + +export interface RecordBatchStreamWriterOptions { + /** + * + */ + autoDestroy?: boolean; + /** + * A flag indicating whether the RecordBatchWriter should construct pre-0.15.0 + * encapsulated IPC Messages, which reserves 4 bytes for the Message metadata + * length instead of 8. 
+ * @see https://issues.apache.org/jira/browse/ARROW-6313 + */ + writeLegacyIpcFormat?: boolean; +} + +export class RecordBatchWriter<T extends { [key: string]: DataType } = any> extends ReadableInterop<Uint8Array> implements Writable<RecordBatch<T>> { + + /** @nocollapse */ + // @ts-ignore + public static throughNode(options?: import('stream').DuplexOptions & { autoDestroy: boolean }): import('stream').Duplex { + throw new Error(`"throughNode" not available in this environment`); + } + /** @nocollapse */ + public static throughDOM<T extends { [key: string]: DataType }>( + // @ts-ignore + writableStrategy?: QueuingStrategy<RecordBatch<T>> & { autoDestroy: boolean }, + // @ts-ignore + readableStrategy?: { highWaterMark?: number; size?: any } + ): { writable: WritableStream<Table<T> | RecordBatch<T>>; readable: ReadableStream<Uint8Array> } { + throw new Error(`"throughDOM" not available in this environment`); + } + + constructor(options?: RecordBatchStreamWriterOptions) { + super(); + isObject(options) || (options = { autoDestroy: true, writeLegacyIpcFormat: false }); + this._autoDestroy = (typeof options.autoDestroy === 'boolean') ? options.autoDestroy : true; + this._writeLegacyIpcFormat = (typeof options.writeLegacyIpcFormat === 'boolean') ? 
options.writeLegacyIpcFormat : false; + } + + protected _position = 0; + protected _started = false; + protected _autoDestroy: boolean; + protected _writeLegacyIpcFormat: boolean; + // @ts-ignore + protected _sink = new AsyncByteQueue(); + protected _schema: Schema | null = null; + protected _dictionaryBlocks: FileBlock[] = []; + protected _recordBatchBlocks: FileBlock[] = []; + protected _dictionaryDeltaOffsets = new Map<number, number>(); + + public toString(sync: true): string; + public toString(sync?: false): Promise<string>; + public toString(sync: any = false) { + return this._sink.toString(sync) as Promise<string> | string; + } + public toUint8Array(sync: true): Uint8Array; + public toUint8Array(sync?: false): Promise<Uint8Array>; + public toUint8Array(sync: any = false) { + return this._sink.toUint8Array(sync) as Promise<Uint8Array> | Uint8Array; + } + + public writeAll(input: Table<T> | Iterable<RecordBatch<T>>): this; + public writeAll(input: AsyncIterable<RecordBatch<T>>): Promise<this>; + public writeAll(input: PromiseLike<AsyncIterable<RecordBatch<T>>>): Promise<this>; + public writeAll(input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>): Promise<this>; + public writeAll(input: PromiseLike<any> | Table<T> | Iterable<RecordBatch<T>> | AsyncIterable<RecordBatch<T>>) { + if (isPromise<any>(input)) { + return input.then((x) => this.writeAll(x)); + } else if (isAsyncIterable<RecordBatch<T>>(input)) { + return writeAllAsync(this, input); + } + return writeAll(this, <any> input); + } + + public get closed() { return this._sink.closed; } + public [Symbol.asyncIterator]() { return this._sink[Symbol.asyncIterator](); } + public toDOMStream(options?: ReadableDOMStreamOptions) { return this._sink.toDOMStream(options); } + public toNodeStream(options?: import('stream').ReadableOptions) { return this._sink.toNodeStream(options); } + + public close() { + return this.reset()._sink.close(); + } + public abort(reason?: any) { + return 
this.reset()._sink.abort(reason); + } + public finish() { + this._autoDestroy ? this.close() : this.reset(this._sink, this._schema); + return this; + } + public reset(sink: WritableSink<ArrayBufferViewInput> = this._sink, schema: Schema<T> | null = null) { + if ((sink === this._sink) || (sink instanceof AsyncByteQueue)) { + this._sink = sink as AsyncByteQueue; + } else { + this._sink = new AsyncByteQueue(); + if (sink && isWritableDOMStream(sink)) { + this.toDOMStream({ type: 'bytes' }).pipeTo(sink); + } else if (sink && isWritableNodeStream(sink)) { + this.toNodeStream({ objectMode: false }).pipe(sink); + } + } + + if (this._started && this._schema) { + this._writeFooter(this._schema); + } + + this._started = false; + this._dictionaryBlocks = []; + this._recordBatchBlocks = []; + this._dictionaryDeltaOffsets = new Map(); + + if (!schema || !(compareSchemas(schema, this._schema))) { + if (schema === null) { + this._position = 0; + this._schema = null; + } else { + this._started = true; + this._schema = schema; + this._writeSchema(schema); + } + } + + return this; + } + + public write(payload?: Table<T> | RecordBatch<T> | Iterable<RecordBatch<T>> | null) { + let schema: Schema<T> | null = null; + + if (!this._sink) { + throw new Error(`RecordBatchWriter is closed`); + } else if (payload == null) { + return this.finish() && undefined; + } else if (payload instanceof Table && !(schema = payload.schema)) { + return this.finish() && undefined; + } else if (payload instanceof RecordBatch && !(schema = payload.schema)) { + return this.finish() && undefined; + } + + if (schema && !compareSchemas(schema, this._schema)) { + if (this._started && this._autoDestroy) { + return this.close(); + } + this.reset(this._sink, schema); + } + + if (payload instanceof RecordBatch) { + if (!(payload instanceof _InternalEmptyPlaceholderRecordBatch)) { + this._writeRecordBatch(payload); + } + } else if (payload instanceof Table) { + this.writeAll(payload.chunks); + } else if 
(isIterable(payload)) { + this.writeAll(payload); + } + } + + protected _writeMessage<T extends MessageHeader>(message: Message<T>, alignment = 8) { + const a = alignment - 1; + const buffer = Message.encode(message); + const flatbufferSize = buffer.byteLength; + const prefixSize = !this._writeLegacyIpcFormat ? 8 : 4; + const alignedSize = (flatbufferSize + prefixSize + a) & ~a; + const nPaddingBytes = alignedSize - flatbufferSize - prefixSize; + + if (message.headerType === MessageHeader.RecordBatch) { + this._recordBatchBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position)); + } else if (message.headerType === MessageHeader.DictionaryBatch) { + this._dictionaryBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position)); + } + + // If not in legacy pre-0.15.0 mode, write the stream continuation indicator + if (!this._writeLegacyIpcFormat) { + this._write(Int32Array.of(-1)); + } + // Write the flatbuffer size prefix including padding + this._write(Int32Array.of(alignedSize - prefixSize)); + // Write the flatbuffer + if (flatbufferSize > 0) { this._write(buffer); } + // Write any padding + return this._writePadding(nPaddingBytes); + } + + protected _write(chunk: ArrayBufferViewInput) { + if (this._started) { + const buffer = toUint8Array(chunk); + if (buffer && buffer.byteLength > 0) { + this._sink.write(buffer); + this._position += buffer.byteLength; + } + } + return this; + } + + protected _writeSchema(schema: Schema<T>) { + return this._writeMessage(Message.from(schema)); + } + + // @ts-ignore + protected _writeFooter(schema: Schema<T>) { + // eos bytes + return this._writeLegacyIpcFormat + ? this._write(Int32Array.of(0)) + : this._write(Int32Array.of(-1, 0)); + } + + protected _writeMagic() { + return this._write(MAGIC); + } + + protected _writePadding(nBytes: number) { + return nBytes > 0 ? 
this._write(new Uint8Array(nBytes)) : this; + } + + protected _writeRecordBatch(batch: RecordBatch<T>) { + const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(batch); + const recordBatch = new metadata.RecordBatch(batch.length, nodes, bufferRegions); + const message = Message.from(recordBatch, byteLength); + return this + ._writeDictionaries(batch) + ._writeMessage(message) + ._writeBodyBuffers(buffers); + } + + protected _writeDictionaryBatch(dictionary: Vector, id: number, isDelta = false) { + this._dictionaryDeltaOffsets.set(id, dictionary.length + (this._dictionaryDeltaOffsets.get(id) || 0)); + const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(dictionary); + const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions); + const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta); + const message = Message.from(dictionaryBatch, byteLength); + return this + ._writeMessage(message) + ._writeBodyBuffers(buffers); + } + + protected _writeBodyBuffers(buffers: ArrayBufferView[]) { + let buffer: ArrayBufferView; + let size: number, padding: number; + for (let i = -1, n = buffers.length; ++i < n;) { + if ((buffer = buffers[i]) && (size = buffer.byteLength) > 0) { + this._write(buffer); + if ((padding = ((size + 7) & ~7) - size) > 0) { + this._writePadding(padding); + } + } + } + return this; + } + + protected _writeDictionaries(batch: RecordBatch<T>) { + for (let [id, dictionary] of batch.dictionaries) { + let offset = this._dictionaryDeltaOffsets.get(id) || 0; + if (offset === 0 || (dictionary = dictionary.slice(offset)).length > 0) { + const chunks = 'chunks' in dictionary ? 
(dictionary as any).chunks : [dictionary]; + for (const chunk of chunks) { + this._writeDictionaryBatch(chunk, id, offset > 0); + offset += chunk.length; + } + } + } + return this; + } +} + +/** @ignore */ +export class RecordBatchStreamWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> { + public static writeAll<T extends { [key: string]: DataType } = any>(input: Table<T> | Iterable<RecordBatch<T>>, options?: RecordBatchStreamWriterOptions): RecordBatchStreamWriter<T>; + public static writeAll<T extends { [key: string]: DataType } = any>(input: AsyncIterable<RecordBatch<T>>, options?: RecordBatchStreamWriterOptions): Promise<RecordBatchStreamWriter<T>>; + public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<AsyncIterable<RecordBatch<T>>>, options?: RecordBatchStreamWriterOptions): Promise<RecordBatchStreamWriter<T>>; + public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>, options?: RecordBatchStreamWriterOptions): Promise<RecordBatchStreamWriter<T>>; + /** @nocollapse */ + public static writeAll<T extends { [key: string]: DataType } = any>(input: any, options?: RecordBatchStreamWriterOptions) { + const writer = new RecordBatchStreamWriter<T>(options); + if (isPromise<any>(input)) { + return input.then((x) => writer.writeAll(x)); + } else if (isAsyncIterable<RecordBatch<T>>(input)) { + return writeAllAsync(writer, input); + } + return writeAll(writer, input); + } +} + +/** @ignore */ +export class RecordBatchFileWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> { + public static writeAll<T extends { [key: string]: DataType } = any>(input: Table<T> | Iterable<RecordBatch<T>>): RecordBatchFileWriter<T>; + public static writeAll<T extends { [key: string]: DataType } = any>(input: AsyncIterable<RecordBatch<T>>): Promise<RecordBatchFileWriter<T>>; + public static writeAll<T extends { [key: string]: DataType 
} = any>(input: PromiseLike<AsyncIterable<RecordBatch<T>>>): Promise<RecordBatchFileWriter<T>>; + public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>): Promise<RecordBatchFileWriter<T>>; + /** @nocollapse */ + public static writeAll<T extends { [key: string]: DataType } = any>(input: any) { + const writer = new RecordBatchFileWriter<T>(); + if (isPromise<any>(input)) { + return input.then((x) => writer.writeAll(x)); + } else if (isAsyncIterable<RecordBatch<T>>(input)) { + return writeAllAsync(writer, input); + } + return writeAll(writer, input); + } + + constructor() { + super(); + this._autoDestroy = true; + } + + // @ts-ignore + protected _writeSchema(schema: Schema<T>) { + return this._writeMagic()._writePadding(2); + } + + protected _writeFooter(schema: Schema<T>) { + const buffer = Footer.encode(new Footer( + schema, MetadataVersion.V4, + this._recordBatchBlocks, this._dictionaryBlocks + )); + return super + ._writeFooter(schema) // EOS bytes for sequential readers + ._write(buffer) // Write the flatbuffer + ._write(Int32Array.of(buffer.byteLength)) // then the footer size suffix + ._writeMagic(); // then the magic suffix + } +} + +/** @ignore */ +export class RecordBatchJSONWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> { + + public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: Table<T> | Iterable<RecordBatch<T>>): RecordBatchJSONWriter<T>; + // @ts-ignore + public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: AsyncIterable<RecordBatch<T>>): Promise<RecordBatchJSONWriter<T>>; + public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<AsyncIterable<RecordBatch<T>>>): Promise<RecordBatchJSONWriter<T>>; + public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof 
RecordBatchWriter, input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>): Promise<RecordBatchJSONWriter<T>>; + /** @nocollapse */ + public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: any) { + return new RecordBatchJSONWriter<T>().writeAll(input as any); + } + + private _recordBatches: RecordBatch[]; + private _dictionaries: RecordBatch[]; + + constructor() { + super(); + this._autoDestroy = true; + this._recordBatches = []; + this._dictionaries = []; + } + + protected _writeMessage() { return this; } + // @ts-ignore + protected _writeFooter(schema: Schema<T>) { return this; } + protected _writeSchema(schema: Schema<T>) { + return this._write(`{\n "schema": ${ + JSON.stringify({ fields: schema.fields.map(fieldToJSON) }, null, 2) + }`); + } + protected _writeDictionaries(batch: RecordBatch<T>) { + if (batch.dictionaries.size > 0) { + this._dictionaries.push(batch); + } + return this; + } + protected _writeDictionaryBatch(dictionary: Vector, id: number, isDelta = false) { + this._dictionaryDeltaOffsets.set(id, dictionary.length + (this._dictionaryDeltaOffsets.get(id) || 0)); + this._write(this._dictionaryBlocks.length === 0 ? ` ` : `,\n `); + this._write(`${dictionaryBatchToJSON(dictionary, id, isDelta)}`); + this._dictionaryBlocks.push(new FileBlock(0, 0, 0)); + return this; + } + protected _writeRecordBatch(batch: RecordBatch<T>) { + this._writeDictionaries(batch); + this._recordBatches.push(batch); + return this; + } + public close() { + + if (this._dictionaries.length > 0) { + this._write(`,\n "dictionaries": [\n`); + for (const batch of this._dictionaries) { + super._writeDictionaries(batch); + } + this._write(`\n ]`); + } + + if (this._recordBatches.length > 0) { + for (let i = -1, n = this._recordBatches.length; ++i < n;) { + this._write(i === 0 ? 
`,\n "batches": [\n ` : `,\n `); + this._write(`${recordBatchToJSON(this._recordBatches[i])}`); + this._recordBatchBlocks.push(new FileBlock(0, 0, 0)); + } + this._write(`\n ]`); + } + + if (this._schema) { + this._write(`\n}`); + } + + this._dictionaries = []; + this._recordBatches = []; + + return super.close(); + } +} + +/** Synchronously writes every RecordBatch from input to writer and returns the result of writer.finish(). When input is a Table, its chunks are written and the writer is first reset with the schema of that Table. @ignore */ +function writeAll<T extends { [key: string]: DataType } = any>(writer: RecordBatchWriter<T>, input: Table<T> | Iterable<RecordBatch<T>>) { + let chunks = input as Iterable<RecordBatch<T>>; + if (input instanceof Table) { + chunks = input.chunks; + writer.reset(undefined, input.schema); + } + for (const batch of chunks) { + writer.write(batch); + } + return writer.finish(); +} + +/** Asynchronous counterpart for AsyncIterable input: writes each RecordBatch as it is yielded, then resolves with the result of writer.finish(). @ignore */ +async function writeAllAsync<T extends { [key: string]: DataType } = any>(writer: RecordBatchWriter<T>, batches: AsyncIterable<RecordBatch<T>>) { + for await (const batch of batches) { + writer.write(batch); + } + return writer.finish(); +} + +/** Converts a Field into a plain object with its name, nullable flag, type (as produced by JSONTypeAssembler), recursively converted children, and a dictionary entry that stays undefined unless the type is a Dictionary. @ignore */ +function fieldToJSON({ name, type, nullable }: Field): Record<string, unknown> { + const assembler = new JSONTypeAssembler(); + return { + 'name': name, 'nullable': nullable, + 'type': assembler.visit(type), + 'children': (type.children || []).map(fieldToJSON), + 'dictionary': !DataType.isDictionary(type) ? 
undefined : { + 'id': type.id, + 'isOrdered': type.isOrdered, + 'indexType': assembler.visit(type.indices) + } + }; +} + +/** Serializes one dictionary vector as JSON indented by 2 spaces, holding its id, the isDelta flag, the row count and the assembled columns. The vector is wrapped in a Column whose Field is named after the id and is nullable when the vector contains nulls. @ignore */ +function dictionaryBatchToJSON(dictionary: Vector, id: number, isDelta = false) { + const field = new Field(`${id}`, dictionary.type, dictionary.nullCount > 0); + const columns = JSONVectorAssembler.assemble(new Column(field, [dictionary])); + return JSON.stringify({ + 'id': id, + 'isDelta': isDelta, + 'data': { + 'count': dictionary.length, + 'columns': columns + } + }, null, 2); +} + +/** Serializes a RecordBatch as JSON indented by 2 spaces, holding its row count and the columns produced by JSONVectorAssembler.assemble. @ignore */ +function recordBatchToJSON(records: RecordBatch) { + return JSON.stringify({ + 'count': records.length, + 'columns': JSONVectorAssembler.assemble(records) + }, null, 2); +} diff --git a/src/arrow/js/src/recordbatch.ts b/src/arrow/js/src/recordbatch.ts new file mode 100644 index 000000000..5463a387f --- /dev/null +++ b/src/arrow/js/src/recordbatch.ts @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Data } from './data'; +import { Table } from './table'; +import { Vector } from './vector'; +import { Visitor } from './visitor'; +import { Schema, Field } from './schema'; +import { isIterable } from './util/compat'; +import { Chunked } from './vector/chunked'; +import { selectFieldArgs } from './util/args'; +import { DataType, Struct, Dictionary } from './type'; +import { ensureSameLengthData } from './util/recordbatch'; +import { Clonable, Sliceable, Applicative } from './vector'; +import { StructVector, VectorBuilderOptions, VectorBuilderOptionsAsync } from './vector/index'; + +type VectorMap = { [key: string]: Vector }; +type Fields<T extends { [key: string]: DataType }> = (keyof T)[] | Field<T[keyof T]>[]; +type ChildData<T extends { [key: string]: DataType }> = (Data<T[keyof T]> | Vector<T[keyof T]>)[]; + +export interface RecordBatch<T extends { [key: string]: DataType } = any> { + concat(...others: Vector<Struct<T>>[]): Table<T>; + slice(begin?: number, end?: number): RecordBatch<T>; + clone(data: Data<Struct<T>>, children?: Vector[]): RecordBatch<T>; +} + +export class RecordBatch<T extends { [key: string]: DataType } = any> + extends StructVector<T> + implements Clonable<RecordBatch<T>>, + Sliceable<RecordBatch<T>>, + Applicative<Struct<T>, Table<T>> { + + public static from<T extends { [key: string]: DataType } = any, TNull = any>(options: VectorBuilderOptions<Struct<T>, TNull>): Table<T>; + public static from<T extends { [key: string]: DataType } = any, TNull = any>(options: VectorBuilderOptionsAsync<Struct<T>, TNull>): Promise<Table<T>>; + /** @nocollapse */ + public static from<T extends { [key: string]: DataType } = any, TNull = any>(options: VectorBuilderOptions<Struct<T>, TNull> | VectorBuilderOptionsAsync<Struct<T>, TNull>) { + if (isIterable<(Struct<T>)['TValue'] | TNull>(options['values'])) { + return Table.from(options as VectorBuilderOptions<Struct<T>, TNull>); + } + return Table.from(options as 
VectorBuilderOptionsAsync<Struct<T>, TNull>); + } + + public static new<T extends VectorMap = any>(children: T): RecordBatch<{ [P in keyof T]: T[P]['type'] }>; + public static new<T extends { [key: string]: DataType } = any>(children: ChildData<T>, fields?: Fields<T>): RecordBatch<T>; + /** @nocollapse */ + public static new<T extends { [key: string]: DataType } = any>(...args: any[]) { + const [fs, xs] = selectFieldArgs<T>(args); + const vs = xs.filter((x): x is Vector<T[keyof T]> => x instanceof Vector); + return new RecordBatch(...ensureSameLengthData(new Schema<T>(fs), vs.map((x) => x.data))); + } + + protected _schema: Schema; + protected _dictionaries?: Map<number, Vector>; + + constructor(schema: Schema<T>, length: number, children: (Data | Vector)[]); + constructor(schema: Schema<T>, data: Data<Struct<T>>, children?: Vector[]); + constructor(...args: any[]) { + let data: Data<Struct<T>>; + const schema = args[0] as Schema<T>; + let children: Vector[] | undefined; + if (args[1] instanceof Data) { + [, data, children] = (args as [any, Data<Struct<T>>, Vector<T[keyof T]>[]?]); + } else { + const fields = schema.fields as Field<T[keyof T]>[]; + const [, length, childData] = args as [any, number, Data<T[keyof T]>[]]; + data = Data.Struct(new Struct<T>(fields), 0, length, 0, null, childData); + } + super(data, children); + this._schema = schema; + } + + public clone(data: Data<Struct<T>>, children = this._children) { + return new RecordBatch<T>(this._schema, data, children); + } + + public concat(...others: Vector<Struct<T>>[]): Table<T> { + const schema = this._schema, chunks = Chunked.flatten(this, ...others); + return new Table(schema, chunks.map(({ data }) => new RecordBatch(schema, data))); + } + + public get schema() { return this._schema; } + public get numCols() { return this._schema.fields.length; } + public get dictionaries() { + return this._dictionaries || (this._dictionaries = DictionaryCollector.collect(this)); + } + + public select<K extends keyof 
T = any>(...columnNames: K[]) { + const nameToIndex = this._schema.fields.reduce((m, f, i) => m.set(f.name as K, i), new Map<K, number>()); + return this.selectAt(...columnNames.map((columnName) => nameToIndex.get(columnName)!).filter((x) => x > -1)); + } + public selectAt<K extends T[keyof T] = any>(...columnIndices: number[]) { + const schema = this._schema.selectAt(...columnIndices); + const childData = columnIndices.map((i) => this.data.childData[i]).filter(Boolean); + return new RecordBatch<{ [key: string]: K }>(schema, this.length, childData); + } +} + +/** + * An internal class used by the `RecordBatchReader` and `RecordBatchWriter` + * implementations to differentiate between a stream with valid zero-length + * RecordBatches, and a stream with a Schema message, but no RecordBatches. + * @see https://github.com/apache/arrow/pull/4373 + * @ignore + * @private + */ +/* eslint-disable @typescript-eslint/naming-convention */ +export class _InternalEmptyPlaceholderRecordBatch<T extends { [key: string]: DataType } = any> extends RecordBatch<T> { + constructor(schema: Schema<T>) { + super(schema, 0, schema.fields.map((f) => Data.new(f.type, 0, 0, 0))); + } +} + +/** Visitor that walks the Data tree of a RecordBatch and records each non-empty dictionary vector in a Map keyed by the id of its Dictionary type. @ignore */ +class DictionaryCollector extends Visitor { + public dictionaries = new Map<number, Vector>(); + /** Returns the Map of dictionaries reachable from the batch, visiting its data as a Struct built from the schema fields. */ public static collect<T extends RecordBatch>(batch: T) { + return new DictionaryCollector().visit( + batch.data, new Struct(batch.schema.fields) + ).dictionaries; + } + /** Dispatches Dictionary types to visitDictionary; for every other type recurses into each child Data paired with the type of the matching child field. */ public visit(data: Data, type: DataType) { + if (DataType.isDictionary(type)) { + return this.visitDictionary(data, type); + } else { + data.childData.forEach((child, i) => + this.visit(child, type.children[i].type)); + } + return this; + } + /** Stores data.dictionary under type.id, skipping a missing or zero-length dictionary. */ public visitDictionary(data: Data, type: Dictionary) { + const dictionary = data.dictionary; + if (dictionary && dictionary.length > 0) { + this.dictionaries.set(type.id, dictionary); + } + return this; + } +} diff --git a/src/arrow/js/src/schema.ts b/src/arrow/js/src/schema.ts new file mode 
100644 index 000000000..437ffa228 --- /dev/null +++ b/src/arrow/js/src/schema.ts @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { DataType } from './type'; + +export class Schema<T extends { [key: string]: DataType } = any> { + + public readonly fields: Field<T[keyof T]>[]; + public readonly metadata: Map<string, string>; + public readonly dictionaries: Map<number, DataType>; + + constructor(fields: Field[] = [], + metadata?: Map<string, string> | null, + dictionaries?: Map<number, DataType> | null) { + this.fields = (fields || []) as Field<T[keyof T]>[]; + this.metadata = metadata || new Map(); + if (!dictionaries) { + dictionaries = generateDictionaryMap(fields); + } + this.dictionaries = dictionaries; + } + public get [Symbol.toStringTag]() { return 'Schema'; } + public toString() { + return `Schema<{ ${this.fields.map((f, i) => `${i}: ${f}`).join(', ')} }>`; + } + + public select<K extends keyof T = any>(...columnNames: K[]) { + const names = columnNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + return new Schema<{ [P in K]: T[P] }>(this.fields.filter((f) => names[f.name]), this.metadata); + } + public selectAt<K extends T[keyof T] = any>(...columnIndices: 
number[]) { + return new Schema<{ [key: string]: K }>(columnIndices.map((i) => this.fields[i]).filter(Boolean), this.metadata); + } + + public assign<R extends { [key: string]: DataType } = any>(schema: Schema<R>): Schema<T & R>; + public assign<R extends { [key: string]: DataType } = any>(...fields: (Field<R[keyof R]> | Field<R[keyof R]>[])[]): Schema<T & R>; + public assign<R extends { [key: string]: DataType } = any>(...args: (Schema<R> | Field<R[keyof R]> | Field<R[keyof R]>[])[]) { + + const other = (args[0] instanceof Schema + ? args[0] as Schema<R> + : Array.isArray(args[0]) + ? new Schema<R>(<Field<R[keyof R]>[]> args[0]) + : new Schema<R>(<Field<R[keyof R]>[]> args)); + + const curFields = [...this.fields] as Field[]; + const metadata = mergeMaps(mergeMaps(new Map(), this.metadata), other.metadata); + const newFields = other.fields.filter((f2) => { + const i = curFields.findIndex((f) => f.name === f2.name); + return ~i ? (curFields[i] = f2.clone({ + metadata: mergeMaps(mergeMaps(new Map(), curFields[i].metadata), f2.metadata) + })) && false : true; + }) as Field[]; + + const newDictionaries = generateDictionaryMap(newFields, new Map()); + + return new Schema<T & R>( + [...curFields, ...newFields], metadata, + new Map([...this.dictionaries, ...newDictionaries]) + ); + } +} + +export class Field<T extends DataType = any> { + + public static new<T extends DataType = any>(props: { name: string | number; type: T; nullable?: boolean; metadata?: Map<string, string> | null }): Field<T>; + public static new<T extends DataType = any>(name: string | number | Field<T>, type: T, nullable?: boolean, metadata?: Map<string, string> | null): Field<T>; + /** @nocollapse */ + public static new<T extends DataType = any>(...args: any[]) { + let [name, type, nullable, metadata] = args; + if (args[0] && typeof args[0] === 'object') { + ({ name } = args[0]); + (type === undefined) && (type = args[0].type); + (nullable === undefined) && (nullable = args[0].nullable); + (metadata 
=== undefined) && (metadata = args[0].metadata); + } + return new Field<T>(`${name}`, type, nullable, metadata); + } + + public readonly type: T; + public readonly name: string; + public readonly nullable: boolean; + public readonly metadata: Map<string, string>; + + constructor(name: string, type: T, nullable = false, metadata?: Map<string, string> | null) { + this.name = name; + this.type = type; + this.nullable = nullable; + this.metadata = metadata || new Map(); + } + + public get typeId() { return this.type.typeId; } + public get [Symbol.toStringTag]() { return 'Field'; } + public toString() { return `${this.name}: ${this.type}`; } + public clone<R extends DataType = T>(props: { name?: string | number; type?: R; nullable?: boolean; metadata?: Map<string, string> | null }): Field<R>; + public clone<R extends DataType = T>(name?: string | number | Field<T>, type?: R, nullable?: boolean, metadata?: Map<string, string> | null): Field<R>; + public clone<R extends DataType = T>(...args: any[]) { + let [name, type, nullable, metadata] = args; + (!args[0] || typeof args[0] !== 'object') + ? 
([name = this.name, type = this.type, nullable = this.nullable, metadata = this.metadata] = args) + : ({name = this.name, type = this.type, nullable = this.nullable, metadata = this.metadata} = args[0]); + return Field.new<R>(name, type, nullable, metadata); + } +} + +/** Returns a new Map holding the entries of m1 followed by those of m2, so m2 wins on duplicate keys; a null or undefined argument is treated as an empty Map. Neither input is mutated. @ignore */ +function mergeMaps<TKey, TVal>(m1?: Map<TKey, TVal> | null, m2?: Map<TKey, TVal> | null): Map<TKey, TVal> { + return new Map([...(m1 || new Map()), ...(m2 || new Map())]); +} + +/** Recursively walks fields and their child fields, recording type.dictionary of every Dictionary type under type.id in the given Map, which is mutated and returned. Throws an Error when one id is seen with a type.dictionary that is not the same object as the one already recorded. @ignore */ +function generateDictionaryMap(fields: Field[], dictionaries = new Map<number, DataType>()): Map<number, DataType> { + + for (let i = -1, n = fields.length; ++i < n;) { + const field = fields[i]; + const type = field.type; + if (DataType.isDictionary(type)) { + if (!dictionaries.has(type.id)) { + dictionaries.set(type.id, type.dictionary); + } else if (dictionaries.get(type.id) !== type.dictionary) { + throw new Error(`Cannot create Schema containing two different dictionaries with the same Id`); + } + } + if (type.children && type.children.length > 0) { + generateDictionaryMap(type.children, dictionaries); + } + } + + return dictionaries; +} + +// Add these here so they're picked up by the externs creator +// in the build, and closure-compiler doesn't minify them away +(Schema.prototype as any).fields = null; +(Schema.prototype as any).metadata = null; +(Schema.prototype as any).dictionaries = null; + +(Field.prototype as any).type = null; +(Field.prototype as any).name = null; +(Field.prototype as any).nullable = null; +(Field.prototype as any).metadata = null; diff --git a/src/arrow/js/src/table.ts b/src/arrow/js/src/table.ts new file mode 100644 index 000000000..d5e121de7 --- /dev/null +++ b/src/arrow/js/src/table.ts @@ -0,0 +1,289 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Column } from './column'; +import { Data } from './data'; +import { TypedArray, TypedArrayDataType } from './interfaces'; +import { RecordBatchReader } from './ipc/reader'; +import { RecordBatchFileWriter, RecordBatchStreamWriter } from './ipc/writer'; +import { RecordBatch, _InternalEmptyPlaceholderRecordBatch } from './recordbatch'; +import { Field, Schema } from './schema'; +import { DataType, RowLike, Struct } from './type'; +import { selectArgs, selectColumnArgs } from './util/args'; +import { isAsyncIterable, isIterable, isPromise } from './util/compat'; +import { distributeColumnsIntoRecordBatches, distributeVectorsIntoRecordBatches } from './util/recordbatch'; +import { Applicative, Clonable, Sliceable } from './vector'; +import { Chunked, StructVector, Vector, VectorBuilderOptions, VectorBuilderOptionsAsync } from './vector/index'; + +type VectorMap = { [key: string]: Vector | Exclude<TypedArray, Uint8ClampedArray> }; +type Fields<T extends { [key: string]: DataType }> = (keyof T)[] | Field<T[keyof T]>[]; +type ChildData<T extends { [key: string]: DataType }> = Data<T[keyof T]>[] | Vector<T[keyof T]>[]; +type Columns<T extends { [key: string]: DataType }> = Column<T[keyof T]>[] | Column<T[keyof T]>[][]; + +export interface Table<T extends { [key: string]: DataType } = any> { + + get(index: number): Struct<T>['TValue']; + [Symbol.iterator](): 
IterableIterator<RowLike<T>>; + + slice(begin?: number, end?: number): Table<T>; + concat(...others: Vector<Struct<T>>[]): Table<T>; + clone(chunks?: RecordBatch<T>[], offsets?: Uint32Array): Table<T>; +} + +export class Table<T extends { [key: string]: DataType } = any> + extends Chunked<Struct<T>> + implements Clonable<Table<T>>, + Sliceable<Table<T>>, + Applicative<Struct<T>, Table<T>> { + + /** @nocollapse */ + public static empty<T extends { [key: string]: DataType } = Record<string, never>>(schema = new Schema<T>([])) { return new Table<T>(schema, []); } + + public static from(): Table<Record<string, never>>; + public static from<T extends { [key: string]: DataType } = any>(source: RecordBatchReader<T>): Table<T>; + public static from<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArg0): Table<T>; + public static from<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArg2): Table<T>; + public static from<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArg1): Promise<Table<T>>; + public static from<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArg3): Promise<Table<T>>; + public static from<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArg4): Promise<Table<T>>; + public static from<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArg5): Promise<Table<T>>; + public static from<T extends { [key: string]: DataType } = any>(source: PromiseLike<RecordBatchReader<T>>): Promise<Table<T>>; + public static from<T extends { [key: string]: DataType } = any, TNull = any>(options: VectorBuilderOptions<Struct<T>, TNull>): Table<T>; + public static from<T extends { [key: string]: DataType } = any, TNull = any>(options: VectorBuilderOptionsAsync<Struct<T>, TNull>): Promise<Table<T>>; + /** @nocollapse */ + public static from<T extends { [key: string]: DataType } = any, TNull = 
any>(input?: any) { + + if (!input) { return Table.empty(); } + + if (typeof input === 'object') { + const table = isIterable(input['values']) ? tableFromIterable<T, TNull>(input) + : isAsyncIterable(input['values']) ? tableFromAsyncIterable<T, TNull>(input) + : null; + if (table !== null) { return table; } + } + + let reader = RecordBatchReader.from<T>(input) as RecordBatchReader<T> | Promise<RecordBatchReader<T>>; + + if (isPromise<RecordBatchReader<T>>(reader)) { + return (async () => await Table.from(await reader))(); + } + if (reader.isSync() && (reader = reader.open())) { + return !reader.schema ? Table.empty() : new Table<T>(reader.schema, [...reader]); + } + return (async (opening) => { + const reader = await opening; + const schema = reader.schema; + const batches: RecordBatch[] = []; + if (schema) { + for await (const batch of reader) { + batches.push(batch); + } + return new Table<T>(schema, batches); + } + return Table.empty(); + })(reader.open()); + } + + /** @nocollapse */ + public static async fromAsync<T extends { [key: string]: DataType } = any>(source: import('./ipc/reader').FromArgs): Promise<Table<T>> { + return await Table.from<T>(source as any); + } + + /** @nocollapse */ + public static fromStruct<T extends { [key: string]: DataType } = any>(vector: Vector<Struct<T>>) { + return Table.new<T>(vector.data.childData as Data<T[keyof T]>[], vector.type.children); + } + + /** + * @summary Create a new Table from a collection of Columns or Vectors, + * with an optional list of names or Fields. 
+ * + * + * `Table.new` accepts an Object of + * Columns or Vectors, where the keys will be used as the field names + * for the Schema: + * ```ts + * const i32s = Int32Vector.from([1, 2, 3]); + * const f32s = Float32Vector.from([.1, .2, .3]); + * const table = Table.new({ i32: i32s, f32: f32s }); + * assert(table.schema.fields[0].name === 'i32'); + * ``` + * + * It also accepts a list of Vectors with an optional list of names or + * Fields for the resulting Schema. If the list is omitted or a name is + * missing, the numeric index of each Vector will be used as the name: + * ```ts + * const i32s = Int32Vector.from([1, 2, 3]); + * const f32s = Float32Vector.from([.1, .2, .3]); + * const table = Table.new([i32s, f32s], ['i32']); + * assert(table.schema.fields[0].name === 'i32'); + * assert(table.schema.fields[1].name === '1'); + * ``` + * + * If the supplied arguments are Columns, `Table.new` will infer the Schema + * from the Columns: + * ```ts + * const i32s = Column.new('i32', Int32Vector.from([1, 2, 3])); + * const f32s = Column.new('f32', Float32Vector.from([.1, .2, .3])); + * const table = Table.new(i32s, f32s); + * assert(table.schema.fields[0].name === 'i32'); + * assert(table.schema.fields[1].name === 'f32'); + * ``` + * + * If the supplied Vector or Column lengths are unequal, `Table.new` will + * extend the lengths of the shorter Columns, allocating additional bytes + * to represent the additional null slots. The memory required to allocate + * these additional bitmaps can be computed as: + * ```ts + * let additionalBytes = 0; + * for (let vec of shorter_vectors) { + * additionalBytes += (((longestLength - vec.length) + 63) & ~63) >> 3; + * } + * ``` + * + * For example, an additional null bitmap for one million null values would require + * 125,000 bytes (`((1e6 + 63) & ~63) >> 3`), or approx. 
`0.11MiB` + */ + public static new<T extends { [key: string]: DataType } = any>(...columns: Columns<T>): Table<T>; + public static new<T extends VectorMap = any>(children: T): Table<{ [P in keyof T]: T[P] extends Vector ? T[P]['type'] : T[P] extends Exclude<TypedArray, Uint8ClampedArray> ? TypedArrayDataType<T[P]> : never}>; + public static new<T extends { [key: string]: DataType } = any>(children: ChildData<T>, fields?: Fields<T>): Table<T>; + /** @nocollapse */ + public static new(...cols: any[]) { + return new Table(...distributeColumnsIntoRecordBatches(selectColumnArgs(cols))); + } + + constructor(table: Table<T>); + constructor(batches: RecordBatch<T>[]); + constructor(...batches: RecordBatch<T>[]); + constructor(schema: Schema<T>, batches: RecordBatch<T>[]); + constructor(schema: Schema<T>, ...batches: RecordBatch<T>[]); + constructor(...args: any[]) { + + let schema: Schema<T> = null!; + + if (args[0] instanceof Schema) { schema = args[0]; } + + const chunks = args[0] instanceof Table ? 
(args[0] as Table<T>).chunks : selectArgs<RecordBatch<T>>(RecordBatch, args); + + if (!schema && !(schema = chunks[0]?.schema)) { + throw new TypeError('Table must be initialized with a Schema or at least one RecordBatch'); + } + + chunks[0] || (chunks[0] = new _InternalEmptyPlaceholderRecordBatch(schema)); + + super(new Struct(schema.fields), chunks); + + this._schema = schema; + this._chunks = chunks; + } + + protected _schema: Schema<T>; + // List of inner RecordBatches + protected _chunks: RecordBatch<T>[]; + protected _children?: Column<T[keyof T]>[]; + + public get schema() { return this._schema; } + public get length() { return this._length; } + public get chunks() { return this._chunks; } + public get numCols() { return this._numChildren; } + + public clone(chunks = this._chunks) { + return new Table<T>(this._schema, chunks); + } + + public getColumn<R extends keyof T>(name: R): Column<T[R]> { + return this.getColumnAt(this.getColumnIndex(name)) as Column<T[R]>; + } + public getColumnAt<R extends DataType = any>(index: number): Column<R> | null { + return this.getChildAt(index); + } + public getColumnIndex<R extends keyof T>(name: R) { + return this._schema.fields.findIndex((f) => f.name === name); + } + public getChildAt<R extends DataType = any>(index: number): Column<R> | null { + if (index < 0 || index >= this.numChildren) { return null; } + let field: Field<R>, child: Column<R>; + const fields = (this._schema as Schema<any>).fields; + const columns = this._children || (this._children = []) as Column[]; + if (child = columns[index]) { return child as Column<R>; } + if (field = fields[index]) { + const chunks = this._chunks + .map((chunk) => chunk.getChildAt<R>(index)) + .filter((vec): vec is Vector<R> => vec != null); + if (chunks.length > 0) { + return (columns[index] = new Column<R>(field, chunks)); + } + } + return null; + } + + // @ts-ignore + public serialize(encoding = 'binary', stream = true) { + const Writer = !stream + ? 
RecordBatchFileWriter + : RecordBatchStreamWriter; + return Writer.writeAll(this).toUint8Array(true); + } + public count(): number { + return this._length; + } + public select<K extends keyof T = any>(...columnNames: K[]) { + const nameToIndex = this._schema.fields.reduce((m, f, i) => m.set(f.name as K, i), new Map<K, number>()); + return this.selectAt(...columnNames.map((columnName) => nameToIndex.get(columnName)!).filter((x) => x > -1)); + } + public selectAt<K extends T[keyof T] = any>(...columnIndices: number[]) { + const schema = this._schema.selectAt<K>(...columnIndices); + return new Table(schema, this._chunks.map(({ length, data: { childData } }) => { + return new RecordBatch(schema, length, columnIndices.map((i) => childData[i]).filter(Boolean)); + })); + } + public assign<R extends { [key: string]: DataType } = any>(other: Table<R>) { + + const fields = this._schema.fields; + const [indices, oldToNew] = other.schema.fields.reduce((memo, f2, newIdx) => { + const [indices, oldToNew] = memo; + const i = fields.findIndex((f) => f.name === f2.name); + ~i ? (oldToNew[i] = newIdx) : indices.push(newIdx); + return memo; + }, [[], []] as number[][]); + + const schema = this._schema.assign(other.schema); + const columns = [ + ...fields.map((_f, i, _fs, j = oldToNew[i]) => + (j === undefined ? this.getColumnAt(i) : other.getColumnAt(j))!), + ...indices.map((i) => other.getColumnAt(i)!) 
+ ].filter(Boolean) as Column<(T & R)[keyof T | keyof R]>[]; + + return new Table<T & R>(...distributeVectorsIntoRecordBatches<any>(schema, columns)); + } +} + +/** Builds a Table from builder options whose type is a Struct instance, by creating a StructVector with StructVector.from and converting it with Table.fromStruct. Returns null when input.type is not a Struct. */ function tableFromIterable<T extends { [key: string]: DataType } = any, TNull = any>(input: VectorBuilderOptions<Struct<T>, TNull>) { + const { type } = input; + if (type instanceof Struct) { + return Table.fromStruct(StructVector.from(input as VectorBuilderOptions<Struct<T>, TNull>)); + } + return null; +} + +/** Asynchronous variant: when input.type is a Struct instance, returns the result of StructVector.from chained with then to convert the vector via Table.fromStruct. Returns a plain null, not a Promise, when input.type is not a Struct. */ function tableFromAsyncIterable<T extends { [key: string]: DataType } = any, TNull = any>(input: VectorBuilderOptionsAsync<Struct<T>, TNull>) { + const { type } = input; + if (type instanceof Struct) { + return StructVector.from(input as VectorBuilderOptionsAsync<Struct<T>, TNull>).then((vector) => Table.fromStruct(vector)); + } + return null; +} diff --git a/src/arrow/js/src/type.ts b/src/arrow/js/src/type.ts new file mode 100644 index 000000000..7d5c051ad --- /dev/null +++ b/src/arrow/js/src/type.ts @@ -0,0 +1,613 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
/**
 * Type-level shape shared by every Arrow data type. It is declared under the
 * same name and with the same type parameters as the abstract `DataType`
 * class, so the two declarations merge.
 *
 * Concrete types narrow `TArray`, `TValue` and `ArrayType` through their own
 * merged interfaces (for example `Null` declares `TArray: void; TValue: null`).
 */
export interface DataType<TType extends Type = Type, TChildren extends { [key: string]: DataType } = any> {
    /** The `Type` enum member this data type is parameterised with. */
    readonly TType: TType;
    /** Backing array type; left as `any` here and narrowed by concrete types. */
    readonly TArray: any;
    /** Element value type; left as `any` here and narrowed by concrete types. */
    readonly TValue: any;
    /** Constructor used for the backing array; the base class prototype defaults it to `Array`. */
    readonly ArrayType: any;
    /** Child fields, typed from the values of `TChildren`. */
    readonly children: Field<TChildren[keyof TChildren]>[];
}
export abstract class DataType<TType extends Type = Type, TChildren extends { [key: string]: DataType } = any> {

    // Instance-side tag; the prototype-level value is installed by the static
    // initialiser at the bottom of the class.
    public [Symbol.toStringTag]: string;

    // Type guards. Each one compares `x.typeId` with a single `Type` member.
    // The optional chaining (`x?.`) makes `null` and `undefined` inputs return
    // false instead of throwing.
    /** @nocollapse */ static isNull (x: any): x is Null { return x?.typeId === Type.Null; }
    /** @nocollapse */ static isInt (x: any): x is Int_ { return x?.typeId === Type.Int; }
    /** @nocollapse */ static isFloat (x: any): x is Float { return x?.typeId === Type.Float; }
    /** @nocollapse */ static isBinary (x: any): x is Binary { return x?.typeId === Type.Binary; }
    /** @nocollapse */ static isUtf8 (x: any): x is Utf8 { return x?.typeId === Type.Utf8; }
    /** @nocollapse */ static isBool (x: any): x is Bool { return x?.typeId === Type.Bool; }
    /** @nocollapse */ static isDecimal (x: any): x is Decimal { return x?.typeId === Type.Decimal; }
    /** @nocollapse */ static isDate (x: any): x is Date_ { return x?.typeId === Type.Date; }
    /** @nocollapse */ static isTime (x: any): x is Time_ { return x?.typeId === Type.Time; }
    /** @nocollapse */ static isTimestamp (x: any): x is Timestamp_ { return x?.typeId === Type.Timestamp; }
    /** @nocollapse */ static isInterval (x: any): x is Interval_ { return x?.typeId === Type.Interval; }
    /** @nocollapse */ static isList (x: any): x is List { return x?.typeId === Type.List; }
    /** @nocollapse */ static isStruct (x: any): x is Struct { return x?.typeId === Type.Struct; }
    /** @nocollapse */ static isUnion (x: any): x is Union_ { return x?.typeId === Type.Union; }
    /** @nocollapse */ static isFixedSizeBinary (x: any): x is FixedSizeBinary { return x?.typeId === Type.FixedSizeBinary; }
    /** @nocollapse */ static isFixedSizeList (x: any): x is FixedSizeList { return x?.typeId === Type.FixedSizeList; }
    /** @nocollapse */ static isMap (x: any): x is Map_ { return x?.typeId === Type.Map; }
    /** @nocollapse */ static isDictionary (x: any): x is Dictionary { return x?.typeId === Type.Dictionary; }

    /** Base implementation reports `Type.NONE`; the concrete types in this file override it. */
    public get typeId(): TType { return <any> Type.NONE; }

    // Runs once, while the class is being defined. It puts prototype-level
    // defaults in place (`children = null`, `ArrayType = Array`), tags the
    // prototype as 'DataType', and the string returned by the IIFE becomes the
    // value of this static property.
    protected static [Symbol.toStringTag] = ((proto: DataType) => {
        (<any> proto).children = null;
        (<any> proto).ArrayType = Array;
        return proto[Symbol.toStringTag] = 'DataType';
    })(DataType.prototype);
}
super(); + } + public get typeId() { return Type.Int as T; } + public get ArrayType(): TypedArrayConstructor<IType[T]['TArray']> { + switch (this.bitWidth) { + case 8: return this.isSigned ? Int8Array : Uint8Array; + case 16: return this.isSigned ? Int16Array : Uint16Array; + case 32: return this.isSigned ? Int32Array : Uint32Array; + case 64: return this.isSigned ? Int32Array : Uint32Array; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } + public toString() { return `${this.isSigned ? `I` : `Ui`}nt${this.bitWidth}`; } + protected static [Symbol.toStringTag] = ((proto: Int_) => { + (<any> proto).isSigned = null; + (<any> proto).bitWidth = null; + return proto[Symbol.toStringTag] = 'Int'; + })(Int_.prototype); +} + +export { Int_ as Int }; + +/** @ignore */ +export class Int8 extends Int_<Type.Int8> { constructor() { super(true, 8); } } +/** @ignore */ +export class Int16 extends Int_<Type.Int16> { constructor() { super(true, 16); } } +/** @ignore */ +export class Int32 extends Int_<Type.Int32> { constructor() { super(true, 32); } } +/** @ignore */ +export class Int64 extends Int_<Type.Int64> { constructor() { super(true, 64); } } +/** @ignore */ +export class Uint8 extends Int_<Type.Uint8> { constructor() { super(false, 8); } } +/** @ignore */ +export class Uint16 extends Int_<Type.Uint16> { constructor() { super(false, 16); } } +/** @ignore */ +export class Uint32 extends Int_<Type.Uint32> { constructor() { super(false, 32); } } +/** @ignore */ +export class Uint64 extends Int_<Type.Uint64> { constructor() { super(false, 64); } } + +Object.defineProperty(Int8.prototype, 'ArrayType', { value: Int8Array }); +Object.defineProperty(Int16.prototype, 'ArrayType', { value: Int16Array }); +Object.defineProperty(Int32.prototype, 'ArrayType', { value: Int32Array }); +Object.defineProperty(Int64.prototype, 'ArrayType', { value: Int32Array }); +Object.defineProperty(Uint8.prototype, 'ArrayType', { value: Uint8Array }); 
/**
 * Floating point data type, parameterised by a `Precision`.
 * @ignore
 */
export class Float<T extends Floats = Floats> extends DataType<T> {
    constructor(public readonly precision: Precision) {
        super();
    }

    public get typeId() {
        return Type.Float as T;
    }

    /**
     * Typed-array constructor for this precision: Uint16Array for HALF,
     * Float32Array for SINGLE, Float64Array for DOUBLE.
     * @throws Error when the precision is none of those three.
     */
    public get ArrayType(): TypedArrayConstructor<FType[T]['TArray']> {
        const precision = this.precision;
        if (precision === Precision.HALF) { return Uint16Array; }
        if (precision === Precision.SINGLE) { return Float32Array; }
        if (precision === Precision.DOUBLE) { return Float64Array; }
        // @ts-ignore
        throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`);
    }

    /** Renders as `Float16`, `Float32` or `Float64`. */
    public toString() {
        // precision << 5 gives 0, 32 or 64; a result of 0 is printed as 16.
        const shifted = this.precision << 5;
        const bits = shifted === 0 ? 16 : shifted;
        return `Float${bits}`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: Float) => {
        (prototype as any).precision = null;
        prototype[Symbol.toStringTag] = 'Float';
        return 'Float';
    })(Float.prototype);
}
/**
 * Data type tagged `Type.Binary`. The merged `Binary` interface types both its
 * backing array and its values as `Uint8Array`.
 * @ignore
 */
export class Binary extends DataType<Type.Binary> {
    constructor() {
        super();
    }
    /** Always `Type.Binary`. */
    public get typeId() { return Type.Binary as Type.Binary; }
    public toString() { return `Binary`; }
    // Evaluated once when the class is defined: stores Uint8Array as the
    // prototype-level ArrayType and tags the prototype as 'Binary'.
    protected static [Symbol.toStringTag] = ((proto: Binary) => {
        (<any> proto).ArrayType = Uint8Array;
        return proto[Symbol.toStringTag] = 'Binary';
    })(Binary.prototype);
}
/**
 * Date data type, parameterised by a `DateUnit`.
 * @ignore
 */
export class Date_<T extends Dates = Dates> extends DataType<T> {
    constructor(public readonly unit: DateUnit) {
        super();
    }

    public get typeId() {
        return Type.Date as T;
    }

    /** Renders as `Date<bits><<UNIT>>`, where bits is 32 times (unit + 1). */
    public toString() {
        const bits = (this.unit + 1) * 32;
        const unitName = DateUnit[this.unit];
        return `Date${bits}<${unitName}>`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: Date_) => {
        const untyped = prototype as any;
        untyped.unit = null;
        untyped.ArrayType = Int32Array;
        prototype[Symbol.toStringTag] = 'Date';
        return 'Date';
    })(Date_.prototype);
}
/**
 * Time-of-day data type, described by a `TimeUnit` and a bit width of 32 or 64.
 * @ignore
 */
class Time_<T extends Times = Times> extends DataType<T> {
    constructor(public readonly unit: TimesType[T]['unit'],
                public readonly bitWidth: TimeBitWidth) {
        super();
    }

    public get typeId() {
        return Type.Time as T;
    }

    /** Renders as `Time<bitWidth><<UNIT>>`. */
    public toString() {
        const width = this.bitWidth;
        const unitName = TimeUnit[this.unit];
        return `Time${width}<${unitName}>`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: Time_) => {
        const untyped = prototype as any;
        untyped.unit = null;
        untyped.bitWidth = null;
        untyped.ArrayType = Int32Array;
        prototype[Symbol.toStringTag] = 'Time';
        return 'Time';
    })(Time_.prototype);
}
/**
 * Interval data type, parameterised by an `IntervalUnit`.
 * @ignore
 */
class Interval_<T extends Intervals = Intervals> extends DataType<T> {
    constructor(public readonly unit: IntervalUnit) {
        super();
    }

    public get typeId() {
        return Type.Interval as T;
    }

    /** Renders as `Interval<<UNIT>>` using the enum member's name. */
    public toString() {
        const unitName = IntervalUnit[this.unit];
        return `Interval<${unitName}>`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: Interval_) => {
        const untyped = prototype as any;
        untyped.unit = null;
        untyped.ArrayType = Int32Array;
        prototype[Symbol.toStringTag] = 'Interval';
        return 'Interval';
    })(Interval_.prototype);
}
/**
 * Struct data type: an ordered list of child fields.
 * @ignore
 */
export class Struct<T extends { [key: string]: DataType } = any> extends DataType<Type.Struct, T> {
    public readonly children: Field<T[keyof T]>[];

    constructor(children: Field<T[keyof T]>[]) {
        super();
        this.children = children;
    }

    public get typeId() {
        return Type.Struct as Type.Struct;
    }

    /** Renders as `Struct<{name:type, name:type, ...}>`, one entry per child field. */
    public toString() {
        const describeField = (field: Field<T[keyof T]>) => `${field.name}:${field.type}`;
        const members = this.children.map(describeField).join(`, `);
        return `Struct<{${members}}>`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: Struct) => {
        (prototype as any).children = null;
        prototype[Symbol.toStringTag] = 'Struct';
        return 'Struct';
    })(Struct.prototype);
}
/**
 * Union data type: a mode, the list of type ids, the child fields, and a
 * lookup from each type id to its position in `typeIds`.
 * @ignore
 */
class Union_<T extends Unions = Unions> extends DataType<T> {
    public readonly mode: UnionMode;
    public readonly typeIds: Int32Array;
    public readonly children: Field<any>[];
    public readonly typeIdToChildIndex: { [key: number]: number };

    constructor(mode: UnionMode,
                typeIds: number[] | Int32Array,
                children: Field<any>[]) {
        super();
        this.mode = mode;
        this.children = children;

        // Always copy the ids into a fresh Int32Array.
        const ids = Int32Array.from(typeIds);
        this.typeIds = ids;

        // Prototype-less dictionary; when an id repeats, the last position wins.
        const positionById: { [key: number]: number } = Object.create(null);
        ids.forEach((typeId, position) => {
            positionById[typeId] = position;
        });
        this.typeIdToChildIndex = positionById;
    }

    public get typeId() {
        return Type.Union as T;
    }

    /** Renders as `<tag><childType | childType | ...>`. */
    public toString() {
        const tag = this[Symbol.toStringTag];
        const alternatives = this.children.map((child) => `${child.type}`).join(` | `);
        return `${tag}<${alternatives}>`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: Union_) => {
        const untyped = prototype as any;
        untyped.mode = null;
        untyped.typeIds = null;
        untyped.children = null;
        untyped.typeIdToChildIndex = null;
        untyped.ArrayType = Int8Array;
        prototype[Symbol.toStringTag] = 'Union';
        return 'Union';
    })(Union_.prototype);
}
/**
 * List data type whose instances carry a `listSize` and exactly one child field.
 * @ignore
 */
export class FixedSizeList<T extends DataType = any> extends DataType<Type.FixedSizeList, { [0]: T }> {
    public readonly children: Field<T>[];

    constructor(public readonly listSize: number, child: Field<T>) {
        super();
        this.children = [child];
    }

    public get typeId() {
        return Type.FixedSizeList as Type.FixedSizeList;
    }

    /** Data type of the single child field. */
    public get valueType(): T {
        const onlyChild = this.children[0];
        return onlyChild.type as T;
    }

    /** The single child field itself. */
    public get valueField(): Field<T> {
        const onlyChild = this.children[0];
        return onlyChild as Field<T>;
    }

    /** Delegates to the child type's `ArrayType`. */
    public get ArrayType(): T['ArrayType'] {
        const elementType = this.valueType;
        return elementType.ArrayType;
    }

    /** Renders as `FixedSizeList[<listSize>]<<valueType>>`. */
    public toString() {
        const size = this.listSize;
        const elementType = this.valueType;
        return `FixedSizeList[${size}]<${elementType}>`;
    }

    // One-time prototype setup, executed while the class is being defined.
    protected static [Symbol.toStringTag] = ((prototype: FixedSizeList) => {
        const untyped = prototype as any;
        untyped.children = null;
        untyped.listSize = null;
        prototype[Symbol.toStringTag] = 'FixedSizeList';
        return 'FixedSizeList';
    })(FixedSizeList.prototype);
}
/**
 * Returns the next id in a module-wide counter: 0 on the first call, then
 * 1, 2, ... on each call after that.
 * @ignore
 */
const getId = (() => {
    // Last id handed out; -1 means none has been issued yet.
    let lastIssuedId = -1;
    return () => {
        lastIssuedId += 1;
        return lastIssuedId;
    };
})();
/**
 * Returns the stride associated with a data type, selected by `typeId`:
 * Decimal is 4, Timestamp is 2, Date and Interval are 1 + unit, Int and Time
 * are 2 when wider than 32 bits and 1 otherwise, FixedSizeList is its
 * `listSize`, FixedSizeBinary is its `byteWidth`, and every other type is 1.
 * @ignore
 */
export function strideForType(type: DataType) {
    const typeId = type.typeId;
    const untyped: any = type;

    if (typeId === Type.Decimal) { return 4; }
    if (typeId === Type.Timestamp) { return 2; }
    if (typeId === Type.Date) { return 1 + (untyped as Date_).unit; }
    if (typeId === Type.Interval) { return 1 + (untyped as Interval_).unit; }
    if (typeId === Type.Int) { return (untyped as Int_).bitWidth > 32 ? 2 : 1; }
    if (typeId === Type.Time) { return (untyped as Time_).bitWidth > 32 ? 2 : 1; }
    if (typeId === Type.FixedSizeList) { return (untyped as FixedSizeList).listSize; }
    if (typeId === Type.FixedSizeBinary) { return (untyped as FixedSizeBinary).byteWidth; }
    return 1;
}
/**
 * Type guard: true when `arr` is an ArrayBuffer view that exposes
 * `BYTES_PER_ELEMENT`. A DataView is a view but has no such property, so it
 * is rejected.
 * @ignore
 */
export function isTypedArray(arr: any): arr is TypedArray {
    if (!ArrayBuffer.isView(arr)) {
        return false;
    }
    return 'BYTES_PER_ELEMENT' in arr;
}
/**
 * Walks `vals` depth-first and writes every element that is an instance of
 * `Ctor` into `res`, starting at position `idx`. Nested arrays are flattened;
 * anything else is ignored. Returns `res` itself.
 * @ignore
 */
function _selectArgs<T>(Ctor: any, vals: any[], res: T[], idx: number) {
    let writeIndex = idx;
    const count = vals.length;
    for (let readIndex = 0; readIndex < count; readIndex++) {
        const candidate: any = vals[readIndex];
        if (Array.isArray(candidate)) {
            // The recursive call appends into the same `res`; its length
            // afterwards is the next free write position.
            writeIndex = _selectArgs(Ctor, candidate, res, writeIndex).length;
            continue;
        }
        if (candidate instanceof Ctor) {
            res[writeIndex] = candidate;
            writeIndex += 1;
        }
    }
    return res;
}
let value: any, j = idx; + let i = -1; + const n = vals.length; + while (++i < n) { + if (isArray(value = vals[i])) { + j = _selectChunkArgs(Ctor, value, res, j).length; + } else if (value instanceof Chunked) { + j = _selectChunkArgs(Ctor, value.chunks, res, j).length; + } else if (value instanceof Ctor) { res[j++] = value; } + } + return res; +} + +/** @ignore */ +function _selectVectorChildrenArgs<T extends Vector>(Ctor: RecordBatchCtor, vals: any[], res: T[], idx: number) { + let value: any, j = idx; + let i = -1; + const n = vals.length; + while (++i < n) { + if (isArray(value = vals[i])) { + j = _selectVectorChildrenArgs(Ctor, value, res, j).length; + } else if (value instanceof Ctor) { + j = _selectArgs(Vector, value.schema.fields.map((_, i) => value.getChildAt(i)!), res, j).length; + } else if (value instanceof Vector) { res[j++] = value as T; } + } + return res; +} + +/** @ignore */ +function _selectColumnChildrenArgs<T extends Column>(Ctor: RecordBatchCtor, vals: any[], res: T[], idx: number) { + let value: any, j = idx; + let i = -1; + const n = vals.length; + while (++i < n) { + if (isArray(value = vals[i])) { + j = _selectColumnChildrenArgs(Ctor, value, res, j).length; + } else if (value instanceof Ctor) { + j = _selectArgs(Column, value.schema.fields.map((f, i) => Column.new(f, value.getChildAt(i)!)), res, j).length; + } else if (value instanceof Column) { res[j++] = value as T; } + } + return res; +} + +/** @ignore */ +const toKeysAndValues = (xs: [any[], any[]], [k, v]: [any, any], i: number) => (xs[0][i] = k, xs[1][i] = v, xs); + +/** @ignore */ +function _selectFieldArgs<T extends { [key: string]: DataType }>(vals: any[], ret: [Field<T[keyof T]>[], (Vector<T[keyof T]> | TypedArray)[]]): [Field<T[keyof T]>[], (T[keyof T] | Vector<T[keyof T]> | TypedArray)[]] { + let keys: any[]; + let n: number; + switch (n = vals.length) { + case 0: return ret; + case 1: + keys = ret[0]; + if (!(vals[0])) { return ret; } + if (isArray(vals[0])) { return 
_selectFieldArgs(vals[0], ret); } + if (!(vals[0] instanceof Data || vals[0] instanceof Vector || isTypedArray(vals[0]) || vals[0] instanceof DataType)) { + [keys, vals] = Object.entries(vals[0]).reduce(toKeysAndValues, ret); + } + break; + default: + !isArray(keys = vals[n - 1]) + ? (vals = isArray(vals[0]) ? vals[0] : vals, keys = []) + : (vals = isArray(vals[0]) ? vals[0] : vals.slice(0, n - 1)); + } + + let fieldIndex = -1; + let valueIndex = -1; + let idx = -1; + const len = vals.length; + let field: number | string | Field<T[keyof T]>; + let val: Vector<T[keyof T]> | Data<T[keyof T]>; + const [fields, values] = ret as [Field<T[keyof T]>[], any[]]; + + while (++idx < len) { + val = vals[idx]; + if (val instanceof Column && (values[++valueIndex] = val)) { + fields[++fieldIndex] = val.field.clone(keys[idx], val.type, true); + } else { + ({ [idx]: field = idx } = keys); + if (val instanceof DataType && (values[++valueIndex] = val)) { + fields[++fieldIndex] = Field.new(field, val as DataType, true) as Field<T[keyof T]>; + } else if (val?.type && (values[++valueIndex] = val)) { + val instanceof Data && (values[valueIndex] = val = Vector.new(val) as Vector); + fields[++fieldIndex] = Field.new(field, val.type, true) as Field<T[keyof T]>; + } + } + } + return ret; +} diff --git a/src/arrow/js/src/util/bit.ts b/src/arrow/js/src/util/bit.ts new file mode 100644 index 000000000..e4c3d267e --- /dev/null +++ b/src/arrow/js/src/util/bit.ts @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

/**
 * Report whether bit number `bit` of `byte` is set.
 * `_data` and `_index` are unused; they are present so the function has the
 * accessor shape that `BitIterator` calls (`context, index, byte, bit`).
 * @ignore
 */
export function getBool(_data: any, _index: number, byte: number, bit: number) {
    return (byte & 1 << bit) !== 0;
}

/**
 * Return bit number `bit` of `byte` as the number 0 or 1.
 * `_data` and `_index` are unused (same accessor shape as `getBool`).
 * @ignore
 */
export function getBit(_data: any, _index: number, byte: number, bit: number): 0 | 1 {
    return (byte & 1 << bit) >> bit as (0 | 1);
}

/**
 * Set (truthy `value`) or clear (falsy `value`) the bit at bit-position
 * `index` of `bytes`, in place.
 * @returns `true` when the bit was set, `false` when it was cleared.
 * @ignore
 */
export function setBool(bytes: Uint8Array, index: number, value: any) {
    const mask = 1 << (index % 8);
    if (value) {
        bytes[index >> 3] |= mask;
        return true;
    }
    bytes[index >> 3] &= ~mask;
    return false;
}

/**
 * Return a bitmap whose byte length is a multiple of 8 and whose first bit is
 * the bit at `offset` of the input.
 * The input is returned unchanged when `offset` is 0 and its byte length is
 * already a multiple of 8; otherwise a new zero-padded Uint8Array is returned.
 * @param offset Bit offset of the first bit to keep.
 * @param length Number of bits to repack when `offset` is not byte-aligned.
 * @param bitmap The source bitmap.
 * @ignore
 */
export function truncateBitmap(offset: number, length: number, bitmap: Uint8Array) {
    const alignedSize = (bitmap.byteLength + 7) & ~7;
    if (offset > 0 || bitmap.byteLength < alignedSize) {
        const bytes = new Uint8Array(alignedSize);
        // If the offset is a multiple of 8 bits, it's safe to slice the bitmap
        bytes.set(offset % 8 === 0 ? bitmap.subarray(offset >> 3) :
            // Otherwise iterate each bit from the offset and return a new one
            packBools(new BitIterator(bitmap, offset, length, null, getBool)).subarray(0, alignedSize));
        return bytes;
    }
    return bitmap;
}

/**
 * Pack the truthiness of each value into a bitmap, least-significant bit first.
 * The result always holds at least one byte of data and its length is rounded
 * up to a multiple of 8 bytes (zero-padded).
 * @ignore
 */
export function packBools(values: Iterable<any>) {
    const xs: number[] = [];
    let i = 0, bit = 0, byte = 0;
    for (const value of values) {
        value && (byte |= 1 << bit);
        if (++bit === 8) {
            xs[i++] = byte;
            byte = bit = 0;
        }
    }
    // flush a partially filled byte, or emit a single zero byte for empty input
    if (i === 0 || bit > 0) { xs[i++] = byte; }
    const b = new Uint8Array((xs.length + 7) & ~7);
    b.set(xs);
    return b;
}

/**
 * Iterates `length` bits of `bytes` starting at bit `begin`, yielding
 * `get(context, index, byte, bit)` for each one, where `index` counts from 0.
 * @ignore
 */
export class BitIterator<T> implements IterableIterator<T> {
    bit: number;
    byte: number;
    byteIndex: number;
    index: number;

    constructor(
        private bytes: Uint8Array,
        begin: number,
        private length: number,
        private context: any,
        private get: (context: any, index: number, byte: number, bit: number) => T
    ) {
        this.bit = begin % 8;
        this.byteIndex = begin >> 3;
        this.byte = bytes[this.byteIndex++];
        this.index = 0;
    }

    next(): IteratorResult<T> {
        if (this.index < this.length) {
            // crossed a byte boundary: load the next byte
            if (this.bit === 8) {
                this.bit = 0;
                this.byte = this.bytes[this.byteIndex++];
            }
            return {
                done: false,
                value: this.get(this.context, this.index++, this.byte, this.bit++)
            };
        }
        return { done: true, value: null };
    }

    [Symbol.iterator]() {
        return this;
    }
}

/**
 * Compute the population count (the number of bits set to 1) for a range of bits in a Uint8Array.
 * @param data The Uint8Array of bits for which to compute the population count.
 * @param lhs The range's left-hand side (or start) bit
 * @param rhs The range's right-hand side (or end) bit
 * @returns The number of set bits in the range; 0 when the range is empty.
 * @ignore
 */
export function popcnt_bit_range(data: Uint8Array, lhs: number, rhs: number): number {
    if (rhs - lhs <= 0) { return 0; }
    // If the bit range is less than one byte, sum the 1 bits in the bit range
    if (rhs - lhs < 8) {
        let sum = 0;
        for (const bit of new BitIterator(data, lhs, rhs - lhs, data, getBit)) {
            sum += bit;
        }
        return sum;
    }
    // Get the next lowest multiple of 8 from the right hand side
    const rhsInside = rhs >> 3 << 3;
    // Get the next highest multiple of 8 from the left hand side
    const lhsInside = lhs + (lhs % 8 === 0 ? 0 : 8 - lhs % 8);
    return (
        // Get the popcnt of bits between the left hand side, and the next highest multiple of 8
        popcnt_bit_range(data, lhs, lhsInside) +
        // Get the popcnt of bits between the right hand side, and the next lowest multiple of 8
        popcnt_bit_range(data, rhsInside, rhs) +
        // Get the popcnt of all bits between the left and right hand sides' multiples of 8
        popcnt_array(data, lhsInside >> 3, (rhsInside - lhsInside) >> 3)
    );
}

/**
 * Count the set bits in `byteLength` bytes of `arr`, starting `byteOffset`
 * bytes into the view. Bytes are consumed 4, then 2, then 1 at a time.
 * @param arr The view whose bytes are counted.
 * @param byteOffset Start position within the view; treated as 0 when omitted.
 * @param byteLength Number of bytes to count; when omitted, counting runs to
 * the end of the view.
 * @ignore
 */
export function popcnt_array(arr: ArrayBufferView, byteOffset?: number, byteLength?: number) {
    let cnt = 0, pos = byteOffset! | 0;
    const view = new DataView(arr.buffer, arr.byteOffset, arr.byteLength);
    const len = byteLength === void 0 ? arr.byteLength : pos + byteLength;
    while (len - pos >= 4) {
        cnt += popcnt_uint32(view.getUint32(pos));
        pos += 4;
    }
    while (len - pos >= 2) {
        cnt += popcnt_uint32(view.getUint16(pos));
        pos += 2;
    }
    while (len - pos >= 1) {
        cnt += popcnt_uint32(view.getUint8(pos));
        pos += 1;
    }
    return cnt;
}

/**
 * Count the set bits of a 32-bit integer using parallel bit-sums
 * (pairs, then nibbles, then bytes, folded with a multiply).
 * @ignore
 */
export function popcnt_uint32(uint32: number): number {
    let i = uint32 | 0;
    i = i - ((i >>> 1) & 0x55555555);
    i = (i & 0x33333333) + ((i >>> 2) & 0x33333333);
    return (((i + (i >>> 4)) & 0x0F0F0F0F) * 0x01010101) >>> 24;
}

// diff --git a/src/arrow/js/src/util/bn.ts b/src/arrow/js/src/util/bn.ts
// new file mode 100644
// index 000000000..7c71969a4
// --- /dev/null
// +++ b/src/arrow/js/src/util/bn.ts
// @@ -0,0 +1,231 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { ArrayBufferViewInput, toArrayBufferView } from './buffer';
import { TypedArray, TypedArrayConstructor } from '../interfaces';
import { BigIntArray, BigIntArrayConstructor } from '../interfaces';
import { BigIntAvailable, BigInt64Array, BigUint64Array } from './compat';

/** @ignore */
export const isArrowBigNumSymbol = Symbol.for('isArrowBigNum');

/** @ignore */ type BigNumArray = IntArray | UintArray;
/** @ignore */ type IntArray = Int8Array | Int16Array | Int32Array;
/** @ignore */ type UintArray = Uint8Array | Uint16Array | Uint32Array | Uint8ClampedArray;

/**
 * Shared constructor body for the three BigNum flavours below.
 * Builds a typed array of the flavour's `TypedArray` type and re-parents it
 * onto the flavour's prototype, so the result is a typed array that also
 * carries the BigNum methods.
 * With a single argument the input is converted via `toArrayBufferView`;
 * with more arguments they are forwarded to the typed-array constructor.
 * @ignore
 */
function BigNum(this: any, x: any, ...xs: any) {
    if (xs.length === 0) {
        return Object.setPrototypeOf(toArrayBufferView(this['TypedArray'], x), this.constructor.prototype);
    }
    return Object.setPrototypeOf(new this['TypedArray'](x, ...xs), this.constructor.prototype);
}

BigNum.prototype[isArrowBigNumSymbol] = true;
BigNum.prototype.toJSON = function<T extends BN<BigNumArray>>(this: T) { return `"${bignumToString(this)}"`; };
BigNum.prototype.valueOf = function<T extends BN<BigNumArray>>(this: T) { return bignumToNumber(this); };
BigNum.prototype.toString = function<T extends BN<BigNumArray>>(this: T) { return bignumToString(this); };
BigNum.prototype[Symbol.toPrimitive] = function<T extends BN<BigNumArray>>(this: T, hint: 'string' | 'number' | 'default' = 'default') {
    switch (hint) {
        case 'number': return bignumToNumber(this);
        case 'string': return bignumToString(this);
        case 'default': return bignumToBigInt(this);
    }
    // @ts-ignore
    return bignumToString(this);
};

/** @ignore */
type TypedArrayConstructorArgs =
    [number | void] |
    [Iterable<number> | Iterable<bigint>] |
    [ArrayBufferLike, number | void, number | void] ;

/** @ignore */
function SignedBigNum(this: any, ...args: TypedArrayConstructorArgs) { return BigNum.apply(this, args); }
/** @ignore */
function UnsignedBigNum(this: any, ...args: TypedArrayConstructorArgs) { return BigNum.apply(this, args); }
/** @ignore */
function DecimalBigNum(this: any, ...args: TypedArrayConstructorArgs) { return BigNum.apply(this, args); }

// Each flavour inherits from a 32-bit typed-array prototype and then receives
// the BigNum methods plus its own flavour-specific fields.
Object.setPrototypeOf(SignedBigNum.prototype, Object.create(Int32Array.prototype));
Object.setPrototypeOf(UnsignedBigNum.prototype, Object.create(Uint32Array.prototype));
Object.setPrototypeOf(DecimalBigNum.prototype, Object.create(Uint32Array.prototype));
Object.assign(SignedBigNum.prototype, BigNum.prototype, { 'constructor': SignedBigNum, 'signed': true, 'TypedArray': Int32Array, 'BigIntArray': BigInt64Array });
Object.assign(UnsignedBigNum.prototype, BigNum.prototype, { 'constructor': UnsignedBigNum, 'signed': false, 'TypedArray': Uint32Array, 'BigIntArray': BigUint64Array });
Object.assign(DecimalBigNum.prototype, BigNum.prototype, { 'constructor': DecimalBigNum, 'signed': true, 'TypedArray': Uint32Array, 'BigIntArray': BigUint64Array });

/**
 * Down-convert a BigNum to a JS number.
 *
 * The value is read as `length` 32-bit words, least-significant word first.
 * When `signed` is true the most-significant word is interpreted as
 * two's-complement; every other word is unsigned. Words are accumulated from
 * the most-significant end (`acc * 2**32 + word`) so that each word gets its
 * proper weight for any word count, and so that small-magnitude negative
 * values of wide numbers do not lose their low bits to rounding.
 * Values whose magnitude exceeds 2**53 are rounded.
 * @ignore
 */
function bignumToNumber<T extends BN<BigNumArray>>(bn: T) {
    const { buffer, byteOffset, length, 'signed': signed } = bn;
    const words = new Uint32Array(buffer, byteOffset, length);
    const n = words.length;
    let number = 0;
    for (let i = n; --i >= 0;) {
        // only the top word carries the sign bit
        const word = (signed && i === n - 1) ? (words[i] | 0) : words[i];
        number = number * 4294967296 + word;
    }
    return number;
}

/** @ignore */
export let bignumToString: { <T extends BN<BigNumArray>>(a: T): string };
/** @ignore */
export let bignumToBigInt: { <T extends BN<BigNumArray>>(a: T): bigint };

// Without BigInt support both conversions fall back to the decimal-string
// routine; with it, 8-byte values are read through the flavour's BigIntArray
// and anything else still goes through decimalToString.
if (!BigIntAvailable) {
    bignumToString = decimalToString;
    bignumToBigInt = <any> bignumToString;
} else {
    bignumToBigInt = (<T extends BN<BigNumArray>>(a: T) => a.byteLength === 8 ? new a['BigIntArray'](a.buffer, a.byteOffset, 1)[0] : <any>decimalToString(a));
    bignumToString = (<T extends BN<BigNumArray>>(a: T) => a.byteLength === 8 ? `${new a['BigIntArray'](a.buffer, a.byteOffset, 1)[0]}` : decimalToString(a));
}

/**
 * Render the bytes of `a` as an unsigned base-10 string.
 * Works on a reversed copy of the value's 16-bit words and repeatedly divides
 * that copy by 10, prepending each remainder as the next digit, until the four
 * 32-bit `checks` words over the copy are all zero. The input is not modified.
 * @ignore
 */
function decimalToString<T extends BN<BigNumArray>>(a: T) {
    let digits = '';
    const base64 = new Uint32Array(2);
    let base32 = new Uint16Array(a.buffer, a.byteOffset, a.byteLength / 2);
    const checks = new Uint32Array((base32 = new Uint16Array(base32).reverse()).buffer);
    let i = -1;
    const n = base32.length - 1;
    do {
        for (base64[0] = base32[i = 0]; i < n;) {
            base32[i++] = base64[1] = base64[0] / 10;
            base64[0] = ((base64[0] - base64[1] * 10) << 16) + base32[i];
        }
        base32[i] = base64[1] = base64[0] / 10;
        base64[0] = base64[0] - base64[1] * 10;
        digits = `${base64[0]}${digits}`;
    } while (checks[0] || checks[1] || checks[2] || checks[3]);
    return digits ? digits : `0`;
}

/**
 * Factory/namespace for BigNum instances. `new BN(x)` and `BN.new(x)` are
 * equivalent; both return one of the Signed/Unsigned/Decimal flavours.
 * @ignore
 */
export class BN<T extends BigNumArray> {
    /**
     * Wrap `num`, choosing the flavour from `isSigned` when it is given,
     * otherwise from the input's constructor (signed integer arrays become
     * signed), otherwise Decimal for 16-byte inputs, otherwise unsigned.
     * @nocollapse
     */
    public static new<T extends BigNumArray>(num: T, isSigned?: boolean): (T & BN<T>) {
        switch (isSigned) {
            case true: return new (<any> SignedBigNum)(num) as (T & BN<T>);
            case false: return new (<any> UnsignedBigNum)(num) as (T & BN<T>);
        }
        switch (num.constructor) {
            case Int8Array:
            case Int16Array:
            case Int32Array:
            case BigInt64Array:
                return new (<any> SignedBigNum)(num) as (T & BN<T>);
        }
        if (num.byteLength === 16) {
            return new (<any> DecimalBigNum)(num) as (T & BN<T>);
        }
        return new (<any> UnsignedBigNum)(num) as (T & BN<T>);
    }
    /** @nocollapse */
    public static signed<T extends IntArray>(num: T): (T & BN<T>) {
        return new (<any> SignedBigNum)(num) as (T & BN<T>);
    }
    /** @nocollapse */
    public static unsigned<T extends UintArray>(num: T): (T & BN<T>) {
        return new (<any> UnsignedBigNum)(num) as (T & BN<T>);
    }
    /** @nocollapse */
    public static decimal<T extends UintArray>(num: T): (T & BN<T>) {
        return new (<any> DecimalBigNum)(num) as (T & BN<T>);
    }
    constructor(num: T, isSigned?: boolean) {
        return BN.new(num, isSigned) as any;
    }
}

/** @ignore */
export interface BN<T extends BigNumArray> extends TypedArrayLike<T> {

    new<T extends ArrayBufferViewInput>(buffer: T, signed?: boolean): T;

    readonly signed: boolean;
    readonly TypedArray: TypedArrayConstructor<TypedArray>;
    readonly BigIntArray: BigIntArrayConstructor<BigIntArray>;

    [Symbol.toStringTag]:
        'Int8Array' |
        'Int16Array' |
        'Int32Array' |
        'Uint8Array' |
        'Uint16Array' |
        'Uint32Array' |
        'Uint8ClampedArray';

    /**
     * Convert the bytes to their (positive) decimal representation for printing
     */
    toString(): string;
    /**
     * Down-convert the bytes to a 53-bit precision integer. Invoked by JS for
     * arithmetic operators, like `+`. Easy (and unsafe) way to convert BN to
     * number via `+bn_inst`
     */
    valueOf(): number;
    /**
     * Return the JSON representation of the bytes. Must be wrapped in double-quotes,
     * so it's compatible with JSON.stringify().
     */
    toJSON(): string;
    [Symbol.toPrimitive](hint?: any): number | string | bigint;
}

/** @ignore */
interface TypedArrayLike<T extends BigNumArray> {

    readonly length: number;
    readonly buffer: ArrayBuffer;
    readonly byteLength: number;
    readonly byteOffset: number;
    readonly BYTES_PER_ELEMENT: number;

    includes(searchElement: number, fromIndex?: number | undefined): boolean;
    copyWithin(target: number, start: number, end?: number | undefined): this;
    every(callbackfn: (value: number, index: number, array: T) => boolean, thisArg?: any): boolean;
    fill(value: number, start?: number | undefined, end?: number | undefined): this;
    filter(callbackfn: (value: number, index: number, array: T) => boolean, thisArg?: any): T;
    find(predicate: (value: number, index: number, obj: T) => boolean, thisArg?: any): number | undefined;
    findIndex(predicate: (value: number, index: number, obj: T) => boolean, thisArg?: any): number;
    forEach(callbackfn: (value: number, index: number, array: T) => void, thisArg?: any): void;
    indexOf(searchElement: number, fromIndex?: number | undefined): number;
    join(separator?: string | undefined): string;
    lastIndexOf(searchElement: number, fromIndex?: number | undefined): number;
    map(callbackfn: (value: number, index: number, array: T) => number, thisArg?: any): T;
    reduce(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: T) => number): number;
    reduce(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: T) => number, initialValue: number): number;
    reduce<U>(callbackfn: (previousValue: U, currentValue: number, currentIndex: number, array: T) => U, initialValue: U): U;
    reduceRight(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: T) => number): number;
    reduceRight(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: T) => number, initialValue: number): number;
    reduceRight<U>(callbackfn: (previousValue: U, currentValue: number, currentIndex: number, array: T) => U, initialValue: U): U;
    reverse(): T;
    set(array: ArrayLike<number>, offset?: number | undefined): void;
    slice(start?: number | undefined, end?: number | undefined): T;
    some(callbackfn: (value: number, index: number, array: T) => boolean, thisArg?: any): boolean;
    sort(compareFn?: ((a: number, b: number) => number) | undefined): this;
    subarray(begin: number, end?: number | undefined): T;
    toLocaleString(): string;
    entries(): IterableIterator<[number, number]>;
    keys(): IterableIterator<number>;
    values(): IterableIterator<number>;
}

// diff --git a/src/arrow/js/src/util/buffer.ts b/src/arrow/js/src/util/buffer.ts
// new file mode 100644
// index 000000000..86dae86c6
// --- /dev/null
// +++ b/src/arrow/js/src/util/buffer.ts
// @@ -0,0 +1,235 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.
// The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { flatbuffers } from 'flatbuffers';
import { encodeUtf8 } from '../util/utf8';
import ByteBuffer = flatbuffers.ByteBuffer;
import { TypedArray, TypedArrayConstructor } from '../interfaces';
import { BigIntArray, BigIntArrayConstructor } from '../interfaces';
import { isPromise, isIterable, isAsyncIterable, isIteratorResult, BigInt64Array, BigUint64Array } from './compat';

/** @ignore */
const SharedArrayBuf = (typeof SharedArrayBuffer !== 'undefined' ? SharedArrayBuffer : ArrayBuffer);

/**
 * Merge neighbouring chunks that are views over the same underlying buffer
 * and where the later chunk starts exactly at the byte where the earlier one
 * ends. Such a pair is replaced by one view spanning both, so no copy is
 * needed to join them.
 *
 * Chunks that overlap or leave a gap are NOT merged: merging them would
 * change the bytes (or the byte count) of the concatenated stream.
 * Falsy entries in `chunks` are skipped.
 * @ignore
 */
function collapseContiguousByteRanges(chunks: Uint8Array[]) {
    const result: Uint8Array[] = [];
    for (const chunk of chunks) {
        if (!chunk) { continue; }
        const prev = result[result.length - 1];
        if (prev && prev.buffer === chunk.buffer && (prev.byteOffset + prev.byteLength) === chunk.byteOffset) {
            result[result.length - 1] = new Uint8Array(prev.buffer, prev.byteOffset, prev.byteLength + chunk.byteLength);
        } else {
            result.push(chunk);
        }
    }
    return result;
}

/**
 * Copy bytes from `source` into `target`, byte-wise regardless of view type.
 * @param target Destination view; written in place and returned.
 * @param source Source view.
 * @param targetByteOffset Byte position in `target` at which writing starts.
 * @param sourceByteLength Maximum number of bytes to read from `source`.
 * The number of bytes copied is clamped to the room left in `target` after
 * `targetByteOffset`, so an oversized source is truncated rather than
 * raising a RangeError.
 * @ignore
 */
export function memcpy<TTarget extends ArrayBufferView, TSource extends ArrayBufferView>(target: TTarget, source: TSource, targetByteOffset = 0, sourceByteLength = source.byteLength) {
    const targetByteLength = target.byteLength;
    const dst = new Uint8Array(target.buffer, target.byteOffset, targetByteLength);
    // room left in the target once the write offset is accounted for
    const writableByteLength = Math.max(0, targetByteLength - targetByteOffset);
    const src = new Uint8Array(source.buffer, source.byteOffset, Math.min(sourceByteLength, writableByteLength));
    dst.set(src, targetByteOffset);
    return target;
}

/**
 * Join the leading bytes of `chunks` into a single Uint8Array.
 * @param chunks The byte chunks to join, in order.
 * @param size Maximum number of bytes to join; all bytes when omitted, null or 0.
 * @returns A tuple of: the joined bytes, the chunks (or chunk remainders) that
 * were not consumed, and the number of bytes remaining in those chunks.
 * @ignore
 */
export function joinUint8Arrays(chunks: Uint8Array[], size?: number | null): [Uint8Array, Uint8Array[], number] {
    // collapse chunks that share the same underlying ArrayBuffer and whose byte ranges are
    // exactly adjacent, to avoid unnecessarily copying the bytes to do this buffer join.
    // This is a common case during streaming, where we may be reading partial byte ranges
    // out of the same underlying ArrayBuffer
    const result = collapseContiguousByteRanges(chunks);
    const byteLength = result.reduce((x, b) => x + b.byteLength, 0);
    let source: Uint8Array, sliced: Uint8Array, buffer: Uint8Array | void;
    let offset = 0, index = -1;
    const length = Math.min(size || Infinity, byteLength);
    for (let n = result.length; ++index < n;) {
        source = result[index];
        sliced = source.subarray(0, Math.min(source.length, length - offset));
        if (length <= (offset + sliced.length)) {
            // this chunk completes the request: keep its unread tail (if any) for the caller
            if (sliced.length < source.length) {
                result[index] = source.subarray(sliced.length);
            } else if (sliced.length === source.length) { index++; }
            // if nothing was copied yet, the slice itself is the answer (no copy)
            buffer ? memcpy(buffer, sliced, offset) : (buffer = sliced);
            break;
        }
        memcpy(buffer || (buffer = new Uint8Array(length)), sliced, offset);
        offset += sliced.length;
    }
    return [buffer || new Uint8Array(0), result.slice(index), byteLength - (buffer ? buffer.byteLength : 0)];
}

/** @ignore */
export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable<number> | ArrayLike<number> | ByteBuffer | string | null | undefined |
    IteratorResult<ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable<number> | ArrayLike<number> | ByteBuffer | string | null | undefined> |
    ReadableStreamReadResult<ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable<number> | ArrayLike<number> | ByteBuffer | string | null | undefined> ;

/**
 * Coerce `input` into an instance of `ArrayBufferViewCtor`.
 * Iterator results are unwrapped to their `value` first. Falsy input yields an
 * empty view, strings are UTF-8 encoded, (Shared)ArrayBuffers and other views
 * are wrapped without copying, and any other input goes through the
 * constructor's `from`.
 * @ignore
 */
export function toArrayBufferView<T extends TypedArray>(ArrayBufferViewCtor: TypedArrayConstructor<T>, input: ArrayBufferViewInput): T;
export function toArrayBufferView<T extends BigIntArray>(ArrayBufferViewCtor: BigIntArrayConstructor<T>, input: ArrayBufferViewInput): T;
export function toArrayBufferView(ArrayBufferViewCtor: any, input: ArrayBufferViewInput) {

    let value: any = isIteratorResult(input) ? input.value : input;

    if (value instanceof ArrayBufferViewCtor) {
        if (ArrayBufferViewCtor === Uint8Array) {
            // Node's `Buffer` class passes the `instanceof Uint8Array` check, but we need
            // a real Uint8Array, since Buffer#slice isn't the same as Uint8Array#slice :/
            return new ArrayBufferViewCtor(value.buffer, value.byteOffset, value.byteLength);
        }
        return value;
    }
    if (!value) { return new ArrayBufferViewCtor(0); }
    if (typeof value === 'string') { value = encodeUtf8(value); }
    if (value instanceof ArrayBuffer) { return new ArrayBufferViewCtor(value); }
    if (value instanceof SharedArrayBuf) { return new ArrayBufferViewCtor(value); }
    if (value instanceof ByteBuffer) { return toArrayBufferView(ArrayBufferViewCtor, value.bytes()); }
    return !ArrayBuffer.isView(value) ? ArrayBufferViewCtor.from(value) : value.byteLength <= 0 ? new ArrayBufferViewCtor(0)
        : new ArrayBufferViewCtor(value.buffer, value.byteOffset, value.byteLength / ArrayBufferViewCtor.BYTES_PER_ELEMENT);
}

/** @ignore */ export const toInt8Array = (input: ArrayBufferViewInput) => toArrayBufferView(Int8Array, input);
/** @ignore */ export const toInt16Array = (input: ArrayBufferViewInput) => toArrayBufferView(Int16Array, input);
/** @ignore */ export const toInt32Array = (input: ArrayBufferViewInput) => toArrayBufferView(Int32Array, input);
/** @ignore */ export const toBigInt64Array = (input: ArrayBufferViewInput) => toArrayBufferView(BigInt64Array, input);
/** @ignore */ export const toUint8Array = (input: ArrayBufferViewInput) => toArrayBufferView(Uint8Array, input);
/** @ignore */ export const toUint16Array = (input: ArrayBufferViewInput) => toArrayBufferView(Uint16Array, input);
/** @ignore */ export const toUint32Array = (input: ArrayBufferViewInput) => toArrayBufferView(Uint32Array, input);
/** @ignore */ export const toBigUint64Array = (input: ArrayBufferViewInput) => toArrayBufferView(BigUint64Array, input);
/** @ignore */ export const toFloat32Array = (input: ArrayBufferViewInput) => toArrayBufferView(Float32Array, input);
/** @ignore */ export const toFloat64Array = (input: ArrayBufferViewInput) => toArrayBufferView(Float64Array, input);
/** @ignore */ export const toUint8ClampedArray = (input: ArrayBufferViewInput) => toArrayBufferView(Uint8ClampedArray, input);

/** @ignore */
type ArrayBufferViewIteratorInput = Iterable<ArrayBufferViewInput> | ArrayBufferViewInput;

/**
 * Advance an iterator once and hand it back. Used below to run a generator up
 * to its first `yield` so that the first value sent in by a consumer is received.
 * @ignore
 */
const pump = <T extends Iterator<any> | AsyncIterator<any>>(iterator: T) => { iterator.next(); return iterator; };

/**
 * Iterate `source` as a sequence of `ArrayCtor` views.
 * A string, view, ArrayBuffer, SharedArrayBuffer or any non-iterable value is
 * treated as a single-element sequence. Values the consumer passes to
 * `next()` are forwarded to the source iterator's `next()`.
 * @ignore
 */
export function* toArrayBufferViewIterator<T extends TypedArray>(ArrayCtor: TypedArrayConstructor<T>, source: ArrayBufferViewIteratorInput) {

    const wrap = function*<T>(x: T) { yield x; };
    const buffers: Iterable<ArrayBufferViewInput> =
        (typeof source === 'string') ? wrap(source)
        : (ArrayBuffer.isView(source)) ? wrap(source)
        : (source instanceof ArrayBuffer) ? wrap(source)
        : (source instanceof SharedArrayBuf) ? wrap(source)
        : !isIterable<ArrayBufferViewInput>(source) ? wrap(source) : source;

    yield* pump((function* (it: Iterator<ArrayBufferViewInput, any, number | undefined>): Generator<T, void, number | undefined> {
        let r: IteratorResult<any> = <any> null;
        do {
            // the first yield (r === null) is consumed by `pump`; each later yield
            // emits the converted chunk and forwards the consumer's value upstream
            r = it.next(yield toArrayBufferView(ArrayCtor, r));
        } while (!r.done);
    })(buffers[Symbol.iterator]()));
    return new ArrayCtor();
}

/** @ignore */ export const toInt8ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Int8Array, input);
/** @ignore */ export const toInt16ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Int16Array, input);
/** @ignore */ export const toInt32ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Int32Array, input);
/** @ignore */ export const toUint8ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Uint8Array, input);
/** @ignore */ export const toUint16ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Uint16Array, input);
/** @ignore */ export const toUint32ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Uint32Array, input);
/** @ignore */ export const toFloat32ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Float32Array, input);
/** @ignore */ export const toFloat64ArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Float64Array, input);
/** @ignore */ export const toUint8ClampedArrayIterator = (input: ArrayBufferViewIteratorInput) => toArrayBufferViewIterator(Uint8ClampedArray, input);

/** @ignore */
type ArrayBufferViewAsyncIteratorInput = AsyncIterable<ArrayBufferViewInput> | Iterable<ArrayBufferViewInput> | PromiseLike<ArrayBufferViewInput> | ArrayBufferViewInput;

/**
 * Async counterpart of `toArrayBufferViewIterator`: iterate `source` as a
 * sequence of `ArrayCtor` views. Promises are awaited and their resolved value
 * is iterated; sync iterables are adapted; async iterables are used directly;
 * anything else is treated as a single-element sequence.
 * @ignore
 */
export async function* toArrayBufferViewAsyncIterator<T extends TypedArray>(ArrayCtor: TypedArrayConstructor<T>, source: ArrayBufferViewAsyncIteratorInput): AsyncGenerator<T, T, number | undefined> {

    // if a Promise, unwrap the Promise and iterate the resolved value
    if (isPromise<ArrayBufferViewInput>(source)) {
        return yield* toArrayBufferViewAsyncIterator(ArrayCtor, await source);
    }

    const wrap = async function*<T>(x: T) { yield await x; };
    const emit = async function* <T extends Iterable<any>>(source: T) {
        yield* pump((function*(it: Iterator<any>) {
            let r: IteratorResult<any> = <any> null;
            do {
                r = it.next(yield r?.value);
            } while (!r.done);
        })(source[Symbol.iterator]()));
    };

    const buffers: AsyncIterable<ArrayBufferViewInput> =
        (typeof source === 'string') ? wrap(source) // if string, wrap in an AsyncIterableIterator
        : (ArrayBuffer.isView(source)) ? wrap(source) // if TypedArray, wrap in an AsyncIterableIterator
        : (source instanceof ArrayBuffer) ? wrap(source) // if ArrayBuffer, wrap in an AsyncIterableIterator
        : (source instanceof SharedArrayBuf) ? wrap(source) // if SharedArrayBuffer, wrap in an AsyncIterableIterator
        : isIterable<ArrayBufferViewInput>(source) ? emit(source) // If Iterable, wrap in an AsyncIterableIterator and compose the `next` values
        : !isAsyncIterable<ArrayBufferViewInput>(source) ? wrap(source) // If not an AsyncIterable, treat as a sentinel and wrap in an AsyncIterableIterator
        : source; // otherwise if AsyncIterable, use it

    yield* pump((async function* (it: AsyncIterator<ArrayBufferViewInput, any, number | undefined>): AsyncGenerator<T, void, number | undefined> {
        let r: IteratorResult<any> = <any> null;
        do {
            r = await it.next(yield toArrayBufferView(ArrayCtor, r));
        } while (!r.done);
    })(buffers[Symbol.asyncIterator]()));
    return new ArrayCtor();
}

/** @ignore */ export const toInt8ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Int8Array, input);
/** @ignore */ export const toInt16ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Int16Array, input);
/** @ignore */ export const toInt32ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Int32Array, input);
/** @ignore */ export const toUint8ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint8Array, input);
/** @ignore */ export const toUint16ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint16Array, input);
/** @ignore */ export const toUint32ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint32Array, input);
/** @ignore */ export const toFloat32ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Float32Array, input);
/** @ignore */ export const toFloat64ArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Float64Array, input);
/** @ignore */ export const toUint8ClampedArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint8ClampedArray, input);

/**
 * Return value offsets shifted by `offset`.
 * When `offset` is 0 the input array is returned as-is. Otherwise the first
 * `length + 1` entries are copied and `offset` is added to each entry of the
 * copy; the input array is not modified.
 * @ignore
 */
export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array) {
    // If we have a non-zero offset, create a new offsets array with the values
    // shifted by the start offset, such that the new start offset is 0
    if (offset !== 0) {
        valueOffsets = valueOffsets.slice(0, length + 1);
        for (let i = -1; ++i <= length;) {
            valueOffsets[i] += offset;
        }
    }
    return valueOffsets;
}

/**
 * Element-wise strict-equality (`===`) comparison of two array-likes.
 * @returns `true` when both have the same length and every element matches.
 * @ignore
 */
export function compareArrayLike<T extends ArrayLike<any>>(a: T, b: T) {
    let i = 0;
    const n = a.length;
    if (n !== b.length) { return false; }
    if (n > 0) {
        do { if (a[i] !== b[i]) { return false; } } while (++i < n);
    }
    return true;
}

// diff --git a/src/arrow/js/src/util/compat.ts b/src/arrow/js/src/util/compat.ts
// new file mode 100644
// index 000000000..62fcb772e
// --- /dev/null
// +++ b/src/arrow/js/src/util/compat.ts
// @@ -0,0 +1,178 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { ReadableInterop, ArrowJSONLike } from '../io/interfaces';

/** @ignore */
type FSReadStream = import('fs').ReadStream;
/** @ignore */
type FileHandle = import('fs').promises.FileHandle;

/** @ignore */
export interface Subscription {
    unsubscribe: () => void;
}

/** @ignore */
export interface Observer<T> {
    closed?: boolean;
    next: (value: T) => void;
    error: (err: any) => void;
    complete: () => void;
}

/** @ignore */
export interface Observable<T> {
    subscribe: (observer: Observer<T>) => Subscription;
}

// Each of the three tuples below pairs a constructor with an availability
// flag. When the native global is missing, a stand-in is exported whose
// every entry point throws a descriptive Error.

/** @ignore */
const [BigIntCtor, BigIntAvailable] = (() => {
    if (typeof BigInt !== 'undefined') {
        return [BigInt, true];
    }
    const BigIntUnavailableError = () => { throw new Error('BigInt is not available in this environment'); };
    function BigIntUnavailable() { throw BigIntUnavailableError(); }
    BigIntUnavailable.asIntN = () => { throw BigIntUnavailableError(); };
    BigIntUnavailable.asUintN = () => { throw BigIntUnavailableError(); };
    return [<any> BigIntUnavailable, false];
})() as [BigIntConstructor, boolean];

/** @ignore */
const [BigInt64ArrayCtor, BigInt64ArrayAvailable] = (() => {
    if (typeof BigInt64Array !== 'undefined') {
        return [BigInt64Array, true];
    }
    const BigInt64ArrayUnavailableError = () => { throw new Error('BigInt64Array is not available in this environment'); };
    class BigInt64ArrayUnavailable {
        static get BYTES_PER_ELEMENT() { return 8; }
        static of() { throw BigInt64ArrayUnavailableError(); }
        static from() { throw BigInt64ArrayUnavailableError(); }
        constructor() { throw BigInt64ArrayUnavailableError(); }
    }
    return [<any> BigInt64ArrayUnavailable, false];
})() as [BigInt64ArrayConstructor, boolean];

/** @ignore */
const [BigUint64ArrayCtor, BigUint64ArrayAvailable] = (() => {
    if (typeof BigUint64Array !== 'undefined') {
        return [BigUint64Array, true];
    }
    const BigUint64ArrayUnavailableError = () => { throw new Error('BigUint64Array is not available in this environment'); };
    class BigUint64ArrayUnavailable {
        static get BYTES_PER_ELEMENT() { return 8; }
        static of() { throw BigUint64ArrayUnavailableError(); }
        static from() { throw BigUint64ArrayUnavailableError(); }
        constructor() { throw BigUint64ArrayUnavailableError(); }
    }
    return [<any> BigUint64ArrayUnavailable, false];
})() as [BigUint64ArrayConstructor, boolean];

export { BigIntCtor as BigInt, BigIntAvailable };
export { BigInt64ArrayCtor as BigInt64Array, BigInt64ArrayAvailable };
export { BigUint64ArrayCtor as BigUint64Array, BigUint64ArrayAvailable };

// Private `typeof` helpers shared by the duck-typing predicates below.
/** @ignore */
function isNumber(x: any) { return typeof x === 'number'; }
/** @ignore */
function isBoolean(x: any) { return typeof x === 'boolean'; }
/** @ignore */
function isFunction(x: any) { return typeof x === 'function'; }

/**
 * True for anything that is its own `Object(...)` wrapper, i.e. objects and
 * functions but not primitives, `null` or `undefined`.
 * @ignore
 */
// eslint-disable-next-line @typescript-eslint/ban-types
export const isObject = (x: any): x is Object => x != null && Object(x) === x;

/** Object with a callable `then`. @ignore */
export const isPromise = <T = any>(x: any): x is PromiseLike<T> =>
    isObject(x) && isFunction(x.then);

/** Object with a callable `subscribe`. @ignore */
export const isObservable = <T = any>(x: any): x is Observable<T> =>
    isObject(x) && isFunction(x.subscribe);

/** Object with a callable `[Symbol.iterator]`. @ignore */
export const isIterable = <T = any>(x: any): x is Iterable<T> =>
    isObject(x) && isFunction(x[Symbol.iterator]);

/** Object with a callable `[Symbol.asyncIterator]`. @ignore */
export const isAsyncIterable = <T = any>(x: any): x is AsyncIterable<T> =>
    isObject(x) && isFunction(x[Symbol.asyncIterator]);

/** Object whose `schema` property is itself an object. @ignore */
export const isArrowJSON = (x: any): x is ArrowJSONLike =>
    isObject(x) && isObject(x['schema']);

/** Object with a numeric `length`. @ignore */
export const isArrayLike = <T = any>(x: any): x is ArrayLike<T> =>
    isObject(x) && isNumber(x['length']);

/** Object carrying both a `done` and a `value` key. @ignore */
export const isIteratorResult = <T = any>(x: any): x is IteratorResult<T> =>
    isObject(x) && ('done' in x) && ('value' in x);

/** Object with callable `abort`, `close`, `start` and `write`. @ignore */
export const isUnderlyingSink = <T = any>(x: any): x is UnderlyingSink<T> =>
    isObject(x) &&
    isFunction(x['abort']) &&
    isFunction(x['close']) &&
    isFunction(x['start']) &&
    isFunction(x['write']);

/** Object with a callable `stat` and a numeric `fd`. @ignore */
export const isFileHandle = (x: any): x is FileHandle =>
    isObject(x) && isFunction(x['stat']) && isNumber(x['fd']);

/** Readable node stream that also has a numeric `bytesRead`. @ignore */
export const isFSReadStream = (x: any): x is FSReadStream =>
    isReadableNodeStream(x) && isNumber((<any> x)['bytesRead']);

/** Object whose `body` passes `isReadableDOMStream`. @ignore */
export const isFetchResponse = (x: any): x is Response =>
    isObject(x) && isReadableDOMStream(x['body']);

/** Has `abort` and `getWriter`, and is not a `ReadableInterop`. @ignore */
export const isWritableDOMStream = <T = any>(x: any): x is WritableStream<T> =>
    isObject(x) &&
    isFunction(x['abort']) &&
    isFunction(x['getWriter']) &&
    !(x instanceof ReadableInterop);

/** Has `cancel` and `getReader`, and is not a `ReadableInterop`. @ignore */
export const isReadableDOMStream = <T = any>(x: any): x is ReadableStream<T> =>
    isObject(x) &&
    isFunction(x['cancel']) &&
    isFunction(x['getReader']) &&
    !(x instanceof ReadableInterop);

/** Has `end`, `write`, a boolean `writable`, and is not a `ReadableInterop`. @ignore */
export const isWritableNodeStream = (x: any): x is NodeJS.WritableStream =>
    isObject(x) &&
    isFunction(x['end']) &&
    isFunction(x['write']) &&
    isBoolean(x['writable']) &&
    !(x instanceof ReadableInterop);

/** Has `read`, `pipe`, a boolean `readable`, and is not a `ReadableInterop`. @ignore */
export const isReadableNodeStream = (x: any): x is NodeJS.ReadableStream =>
    isObject(x) &&
    isFunction(x['read']) &&
    isFunction(x['pipe']) &&
    isBoolean(x['readable']) &&
    !(x instanceof ReadableInterop);
/**
 * Adapt a `visit(node)` function into a zero-argument method: the returned
 * function forwards its own `this` as `node`.
 * @ignore
 */
export function partial0<T>(visit: (node: T) => any) {
    return function (this: T) {
        const node = this;
        return visit(node);
    };
}

/**
 * Adapt a `visit(node, a)` function into a one-argument method: the returned
 * function forwards its own `this` as `node`, followed by `a`.
 * @ignore
 */
export function partial1<T>(visit: (node: T, a: any) => any) {
    return function (this: T, a: any) {
        const node = this;
        return visit(node, a);
    };
}

/**
 * Adapt a `visit(node, a, b)` function into a two-argument method: the
 * returned function forwards its own `this` as `node`, followed by `a`, `b`.
 * @ignore
 */
export function partial2<T>(visit: (node: T, a: any, b: any) => any) {
    return function (this: T, a: any, b: any) {
        const node = this;
        return visit(node, a, b);
    };
}
/** Value of bit 16; used to re-inject a lost 32-bit carry into the high word. @ignore */
const carryBit16 = 1 << 16;

/**
 * Format one 32-bit word as a `0x`-prefixed lowercase hex string.
 * Negative inputs are first mapped to their unsigned 32-bit equivalent.
 * @ignore
 */
function intAsHex(value: number): string {
    if (value < 0) {
        value = 0xFFFFFFFF + value + 1;
    }
    return `0x${value.toString(16)}`;
}

/** Number of decimal digits consumed per chunk when parsing strings. @ignore */
const kInt32DecimalDigits = 8;
/** `kPowersOfTen[n] === 10 ** n` for `n` in `0..kInt32DecimalDigits`. @ignore */
const kPowersOfTen = [1,
                      10,
                      100,
                      1000,
                      10000,
                      100000,
                      1000000,
                      10000000,
                      100000000];

/**
 * 64-bit integer stored as two 32-bit words: `buffer[0]` is the low word and
 * `buffer[1]` the high word. All arithmetic mutates the receiver in place.
 * @ignore
 */
export class BaseInt64 {
    constructor (protected buffer: Uint32Array) {}

    public high(): number { return this.buffer[1]; }
    public low (): number { return this.buffer[0]; }

    /** Multiply in place by `other`, keeping the low 64 bits of the product. */
    protected _times(other: BaseInt64) {
        // Break the left and right numbers into 16 bit chunks
        // so that we can multiply them without overflow.
        const L = new Uint32Array([
            this.buffer[1] >>> 16,
            this.buffer[1] & 0xFFFF,
            this.buffer[0] >>> 16,
            this.buffer[0] & 0xFFFF
        ]);

        const R = new Uint32Array([
            other.buffer[1] >>> 16,
            other.buffer[1] & 0xFFFF,
            other.buffer[0] >>> 16,
            other.buffer[0] & 0xFFFF
        ]);

        let product = L[3] * R[3];
        this.buffer[0] = product & 0xFFFF;

        let sum = product >>> 16;

        product = L[2] * R[3];
        sum += product;

        product = (L[3] * R[2]) >>> 0;
        sum += product;

        this.buffer[0] += sum << 16;

        // `sum` is a double and may exceed 32 bits; if its 32-bit truncation is
        // smaller than the last addend it wrapped, so bit 32 of `sum` (bit 16
        // of the high word) has to be restored explicitly.
        this.buffer[1] = (sum >>> 0 < product ? carryBit16 : 0);

        this.buffer[1] += sum >>> 16;
        this.buffer[1] += L[1] * R[3] + L[2] * R[2] + L[3] * R[1];
        this.buffer[1] += (L[0] * R[3] + L[1] * R[2] + L[2] * R[1] + L[3] * R[0]) << 16;

        return this;
    }

    /** Add `other` in place, wrapping modulo 2^64. */
    protected _plus(other: BaseInt64) {
        const sum = (this.buffer[0] + other.buffer[0]) >>> 0;
        this.buffer[1] += other.buffer[1];
        // The low word wrapped iff the truncated sum is smaller than an addend.
        if (sum < (this.buffer[0] >>> 0)) {
            ++this.buffer[1];
        }
        this.buffer[0] = sum;
    }

    /** Unsigned comparison: high words first, then low words. */
    public lessThan(other: BaseInt64): boolean {
        return this.buffer[1] < other.buffer[1] ||
            (this.buffer[1] === other.buffer[1] && this.buffer[0] < other.buffer[0]);
    }

    public equals(other: BaseInt64): boolean {
        return this.buffer[1] === other.buffer[1] && this.buffer[0] === other.buffer[0];
    }

    public greaterThan(other: BaseInt64): boolean {
        return other.lessThan(this);
    }

    /** `"<high> <low>"`, each word formatted by `intAsHex`. */
    public hex(): string {
        return `${intAsHex(this.buffer[1])} ${intAsHex(this.buffer[0])}`;
    }
}

/** Unsigned 64-bit integer. @ignore */
export class Uint64 extends BaseInt64 {
    public times(other: Uint64): Uint64 {
        this._times(other);
        return this;
    }

    public plus(other: Uint64): Uint64 {
        this._plus(other);
        return this;
    }

    /**
     * Parse `val` (stringified first if it is not a string) into `out_buffer`.
     * @nocollapse
     */
    public static from(val: any, out_buffer = new Uint32Array(2)): Uint64 {
        return Uint64.fromString(
            typeof(val) === 'string' ? val : val.toString(),
            out_buffer
        );
    }

    /** @nocollapse */
    public static fromNumber(num: number, out_buffer = new Uint32Array(2)): Uint64 {
        // Always parse numbers as strings - pulling out high and low bits
        // directly seems to lose precision sometimes
        // For example:
        //     > -4613034156400212000 >>> 0
        //     721782784
        // The correct lower 32-bits are 721782752
        return Uint64.fromString(num.toString(), out_buffer);
    }

    /**
     * Parse a decimal string, accumulating into `out_buffer`, which is
     * expected to start out zeroed. Digits are consumed in chunks of up to
     * `kInt32DecimalDigits` so each chunk fits in one 32-bit word.
     * @nocollapse
     */
    public static fromString(str: string, out_buffer = new Uint32Array(2)): Uint64 {
        const length = str.length;

        const out = new Uint64(out_buffer);
        for (let posn = 0; posn < length;) {
            const group = kInt32DecimalDigits < length - posn ?
                          kInt32DecimalDigits : length - posn;
            const chunk = new Uint64(new Uint32Array([parseInt(str.substr(posn, group), 10), 0]));
            const multiple = new Uint64(new Uint32Array([kPowersOfTen[group], 0]));

            out.times(multiple);
            out.plus(chunk);

            posn += group;
        }

        return out;
    }

    /**
     * Convert each value into two consecutive 32-bit words (low, high) of
     * the returned array.
     * @nocollapse
     */
    public static convertArray(values: (string|number)[]): Uint32Array {
        const data = new Uint32Array(values.length * 2);
        for (let i = -1, n = values.length; ++i < n;) {
            Uint64.from(values[i], new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2));
        }
        return data;
    }

    /** Non-mutating product: operates on a copy of `left`'s words. @nocollapse */
    public static multiply(left: Uint64, right: Uint64): Uint64 {
        const rtrn = new Uint64(new Uint32Array(left.buffer));
        return rtrn.times(right);
    }

    /** Non-mutating sum: operates on a copy of `left`'s words. @nocollapse */
    public static add(left: Uint64, right: Uint64): Uint64 {
        const rtrn = new Uint64(new Uint32Array(left.buffer));
        return rtrn.plus(right);
    }
}

/** Signed (two's complement) 64-bit integer. @ignore */
export class Int64 extends BaseInt64 {
    /** Two's complement negation in place. */
    public negate(): Int64 {
        this.buffer[0] = ~this.buffer[0] + 1;
        this.buffer[1] = ~this.buffer[1];

        // The +1 carried out of the low word iff the low word wrapped to zero.
        if (this.buffer[0] === 0) { ++this.buffer[1]; }
        return this;
    }

    public times(other: Int64): Int64 {
        this._times(other);
        return this;
    }

    public plus(other: Int64): Int64 {
        this._plus(other);
        return this;
    }

    /** Signed comparison: high words as int32, low words as uint32. */
    public lessThan(other: Int64): boolean {
        // force high bytes to be signed
        const this_high = this.buffer[1] << 0;
        const other_high = other.buffer[1] << 0;
        return this_high < other_high ||
            (this_high === other_high && this.buffer[0] < other.buffer[0]);
    }

    /** @nocollapse */
    public static from(val: any, out_buffer = new Uint32Array(2)): Int64 {
        return Int64.fromString(
            typeof(val) === 'string' ? val : val.toString(),
            out_buffer
        );
    }

    /** @nocollapse */
    public static fromNumber(num: number, out_buffer = new Uint32Array(2)): Int64 {
        // Always parse numbers as strings - pulling out high and low bits
        // directly seems to lose precision sometimes
        // For example:
        //     > -4613034156400212000 >>> 0
        //     721782784
        // The correct lower 32-bits are 721782752
        return Int64.fromString(num.toString(), out_buffer);
    }

    /**
     * Parse an optionally `-`-prefixed decimal string, accumulating into
     * `out_buffer`, which is expected to start out zeroed.
     * @nocollapse
     */
    public static fromString(str: string, out_buffer = new Uint32Array(2)): Int64 {
        // TODO: Assert that out_buffer is 0 and length = 2
        const negate = str.startsWith('-');
        const length = str.length;

        const out = new Int64(out_buffer);
        for (let posn = negate ? 1 : 0; posn < length;) {
            const group = kInt32DecimalDigits < length - posn ?
                          kInt32DecimalDigits : length - posn;
            const chunk = new Int64(new Uint32Array([parseInt(str.substr(posn, group), 10), 0]));
            const multiple = new Int64(new Uint32Array([kPowersOfTen[group], 0]));

            out.times(multiple);
            out.plus(chunk);

            posn += group;
        }
        return negate ? out.negate() : out;
    }

    /** @nocollapse */
    public static convertArray(values: (string|number)[]): Uint32Array {
        const data = new Uint32Array(values.length * 2);
        for (let i = -1, n = values.length; ++i < n;) {
            Int64.from(values[i], new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2));
        }
        return data;
    }

    /** Non-mutating product: operates on a copy of `left`'s words. @nocollapse */
    public static multiply(left: Int64, right: Int64): Int64 {
        const rtrn = new Int64(new Uint32Array(left.buffer));
        return rtrn.times(right);
    }

    /** Non-mutating sum: operates on a copy of `left`'s words. @nocollapse */
    public static add(left: Int64, right: Int64): Int64 {
        const rtrn = new Int64(new Uint32Array(left.buffer));
        return rtrn.plus(right);
    }
}

/**
 * Signed (two's complement) 128-bit integer stored as four 32-bit words.
 * All instance arithmetic mutates the receiver in place.
 * @ignore
 */
export class Int128 {
    constructor (private buffer: Uint32Array) {
        // buffer[3] MSB (high)
        // buffer[2]
        // buffer[1]
        // buffer[0] LSB (low)
    }

    /** View (not a copy) over the two most significant words. */
    public high(): Int64 {
        return new Int64(new Uint32Array(this.buffer.buffer, this.buffer.byteOffset + 8, 2));
    }

    /** View (not a copy) over the two least significant words. */
    public low(): Int64 {
        return new Int64(new Uint32Array(this.buffer.buffer, this.buffer.byteOffset, 2));
    }

    /**
     * Two's complement negation in place: invert every word, then add one.
     *
     * The +1 is rippled with an explicit carry. Testing "is the lower word
     * zero?" is not enough for words above the first: an inverted word can be
     * zero without any carry having reached it (e.g. negating 2^64 - 1).
     */
    public negate(): Int128 {
        let carry = 1;
        for (let i = 0; i < 4; ++i) {
            // At most 2^32, so exact as a double; the store wraps modulo 2^32.
            const wide = (~this.buffer[i] >>> 0) + carry;
            this.buffer[i] = wide;
            carry = wide > 0xFFFFFFFF ? 1 : 0;
        }
        return this;
    }

    /** Multiply in place by `other`, keeping the low 128 bits of the product. */
    public times(other: Int128): Int128 {
        // Break the left and right numbers into 32 bit chunks
        // so that we can multiply them without overflow.
        const L0 = new Uint64(new Uint32Array([this.buffer[3],  0]));
        const L1 = new Uint64(new Uint32Array([this.buffer[2],  0]));
        const L2 = new Uint64(new Uint32Array([this.buffer[1],  0]));
        const L3 = new Uint64(new Uint32Array([this.buffer[0],  0]));

        const R0 = new Uint64(new Uint32Array([other.buffer[3], 0]));
        const R1 = new Uint64(new Uint32Array([other.buffer[2], 0]));
        const R2 = new Uint64(new Uint32Array([other.buffer[1], 0]));
        const R3 = new Uint64(new Uint32Array([other.buffer[0], 0]));

        let product = Uint64.multiply(L3, R3);
        this.buffer[0] = product.low();

        const sum = new Uint64(new Uint32Array([product.high(), 0]));

        product = Uint64.multiply(L2, R3);
        sum.plus(product);

        product = Uint64.multiply(L3, R2);
        sum.plus(product);

        this.buffer[1] = sum.low();

        // If the 64-bit `sum` wrapped on the last addition it is now smaller
        // than that addend; the lost bit belongs to the top word.
        this.buffer[3] = (sum.lessThan(product) ? 1 : 0);

        this.buffer[2] = sum.high();
        // Accumulate the word-2 partial products directly into words 2 and 3.
        const high = new Uint64(new Uint32Array(this.buffer.buffer, this.buffer.byteOffset + 8, 2));

        high.plus(Uint64.multiply(L1, R3))
            .plus(Uint64.multiply(L2, R2))
            .plus(Uint64.multiply(L3, R1));
        this.buffer[3] += Uint64.multiply(L0, R3)
                        .plus(Uint64.multiply(L1, R2))
                        .plus(Uint64.multiply(L2, R1))
                        .plus(Uint64.multiply(L3, R0)).low();

        return this;
    }

    /**
     * Add `other` in place, wrapping modulo 2^128.
     *
     * Each word is summed together with the incoming carry as a double
     * (at most 2^33 - 1, so exact), which makes the carry-out unambiguous
     * even when a word wraps to zero only because of the incoming carry.
     */
    public plus(other: Int128): Int128 {
        // Sum into a scratch array first so aliased operands are read intact.
        const sums = new Uint32Array(4);
        let carry = 0;
        for (let i = 0; i < 4; ++i) {
            const wide = this.buffer[i] + other.buffer[i] + carry;
            sums[i] = wide;
            carry = wide > 0xFFFFFFFF ? 1 : 0;
        }

        this.buffer[3] = sums[3];
        this.buffer[2] = sums[2];
        this.buffer[1] = sums[1];
        this.buffer[0] = sums[0];

        return this;
    }

    /** Four words, most significant first, each formatted by `intAsHex`. */
    public hex(): string {
        return `${intAsHex(this.buffer[3])} ${intAsHex(this.buffer[2])} ${intAsHex(this.buffer[1])} ${intAsHex(this.buffer[0])}`;
    }

    /** Non-mutating product: operates on a copy of `left`'s words. @nocollapse */
    public static multiply(left: Int128, right: Int128): Int128 {
        const rtrn = new Int128(new Uint32Array(left.buffer));
        return rtrn.times(right);
    }

    /** Non-mutating sum: operates on a copy of `left`'s words. @nocollapse */
    public static add(left: Int128, right: Int128): Int128 {
        const rtrn = new Int128(new Uint32Array(left.buffer));
        return rtrn.plus(right);
    }

    /** @nocollapse */
    public static from(val: any, out_buffer = new Uint32Array(4)): Int128 {
        return Int128.fromString(
            typeof(val) === 'string' ? val : val.toString(),
            out_buffer
        );
    }

    /** @nocollapse */
    public static fromNumber(num: number, out_buffer = new Uint32Array(4)): Int128 {
        // Always parse numbers as strings - pulling out high and low bits
        // directly seems to lose precision sometimes
        // For example:
        //     > -4613034156400212000 >>> 0
        //     721782784
        // The correct lower 32-bits are 721782752
        return Int128.fromString(num.toString(), out_buffer);
    }

    /**
     * Parse an optionally `-`-prefixed decimal string, accumulating into
     * `out_buffer`, which is expected to start out zeroed.
     * @nocollapse
     */
    public static fromString(str: string, out_buffer = new Uint32Array(4)): Int128 {
        // TODO: Assert that out_buffer is 0 and length = 4
        const negate = str.startsWith('-');
        const length = str.length;

        const out = new Int128(out_buffer);
        for (let posn = negate ? 1 : 0; posn < length;) {
            const group = kInt32DecimalDigits < length - posn ?
                          kInt32DecimalDigits : length - posn;
            const chunk = new Int128(new Uint32Array([parseInt(str.substr(posn, group), 10), 0, 0, 0]));
            const multiple = new Int128(new Uint32Array([kPowersOfTen[group], 0, 0, 0]));

            out.times(multiple);
            out.plus(chunk);

            posn += group;
        }

        return negate ? out.negate() : out;
    }

    /**
     * Convert each value into four consecutive 32-bit words (least
     * significant first) of the returned array.
     * @nocollapse
     */
    public static convertArray(values: (string|number)[]): Uint32Array {
        // TODO: Distinguish between string and number at compile-time
        const data = new Uint32Array(values.length * 4);
        for (let i = -1, n = values.length; ++i < n;) {
            Int128.from(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4));
        }
        return data;
    }
}
// Shared scratch buffers: a float64 is written through `f64` and its raw
// bits are read back as two 32-bit words through `u32`.
const f64 = new Float64Array(1);
const u32 = new Uint32Array(f64.buffer);

/**
 * Convert uint16 (logically a float16) to a JS float64. Inspired by numpy's `npy_half_to_double`:
 * https://github.com/numpy/numpy/blob/5a5987291dc95376bb098be8d8e5391e89e77a2c/numpy/core/src/npymath/halffloat.c#L29
 * @param h {number} the uint16 to convert
 * @private
 * @ignore
 */
export function uint16ToFloat64(h: number) {
    const expo = (h & 0x7C00) >> 10;
    const sigf = (h & 0x03FF) / 1024;
    const sign = (-1) ** ((h & 0x8000) >> 15);
    switch (expo) {
        case 0x1F: return sign * (sigf ? NaN : 1 / 0);
        case 0x00: return sign * (sigf ? 6.103515625e-5 * sigf : 0);
    }
    return sign * (2 ** (expo - 15)) * (1 + sigf);
}

/**
 * Convert a float64 to uint16 (assuming the float64 is logically a float16). Inspired by numpy's `npy_double_to_half`:
 * https://github.com/numpy/numpy/blob/5a5987291dc95376bb098be8d8e5391e89e77a2c/numpy/core/src/npymath/halffloat.c#L43
 *
 * Behaviour by magnitude:
 *  - NaN                      -> 0x7E00 (quiet NaN; sign and payload are dropped)
 *  - |d| >= 2^16 or infinite  -> signed Infinity (0x7C00 / 0xFC00)
 *  - |d| <  2^-25             -> signed zero
 *  - 2^-25 <= |d| < 2^-14     -> subnormal half (may round up to the smallest normal)
 *  - otherwise                -> normal half; a mantissa that rounds up past
 *                                1.111... carries into the exponent, and past
 *                                the largest finite half becomes Infinity
 *
 * Rounding is "half up", decided on the upper 20 mantissa bits only; the
 * lower 32 mantissa bits of the double are not consulted.
 *
 * @param d {number} The float64 to convert
 * @returns the 16 raw bits of the half-precision value
 * @private
 * @ignore
 */
export function float64ToUint16(d: number) {

    if (d !== d) { return 0x7E00; } // NaN

    f64[0] = d;

    // NOTE(review): like the original, this reads u32[1] as the word that
    // holds sign/exponent, which presumes a little-endian host.
    const hi = u32[1];

    // Magic numbers:
    // 0x80000000 = 10000000 00000000 00000000 00000000 -- masks the 32nd bit
    // 0x7ff00000 = 01111111 11110000 00000000 00000000 -- masks the 21st-31st bits
    // 0x000fffff = 00000000 00001111 11111111 11111111 -- masks the 1st-20th bit
    const sign = (hi & 0x80000000) >> 16 & 0xFFFF;
    const expo = (hi & 0x7ff00000);
    const frac = (hi & 0x000fffff);

    // 0x40F00000 is the biased exponent of 2^16, the first magnitude a half
    // cannot hold. NaN was handled above, so everything here is +/-Infinity.
    if (expo >= 0x40f00000) {
        return sign | 0x7C00;
    }

    // 0x3F000000 is the biased exponent of 2^-15: the result is a subnormal
    // half or a signed zero.
    if (expo <= 0x3f000000) {
        // 0x3E600000 is the biased exponent of 2^-25, half of the smallest
        // subnormal. Anything smaller rounds to zero; it must not reach the
        // shift below, whose count would be negative (JS masks it to 5 bits).
        if (expo < 0x3e600000) {
            return sign;
        }
        const shift = (expo >> 20) - 998; // 0..10
        // Restore the implicit leading 1, align to the subnormal scale, add
        // half an ulp (0x100000) and drop the 21 guard bits. The shift must be
        // unsigned: the sum can reach bit 31.
        const sigf = (0x100000 + ((0x100000 + frac) << shift)) >>> 21;
        return sign | sigf;
    }

    // No overflow or underflow: rebase the exponent into bits 10-14 and round
    // the mantissa to 10 bits (0x200 is half an ulp). The two parts are ADDED
    // so that a mantissa rounding up to 0x400 increments the exponent.
    const rounded = ((expo - 0x3f000000) >> 10) + ((frac + 0x200) >> 10);

    return sign | rounded;
}
/** @ignore */ const undf = void (0);

/**
 * Render a value for display.
 *
 *  - `null` / `undefined` become the words `null` / `undefined`
 *  - numbers and bigints use their default string form
 *  - strings are wrapped in double quotes (no escaping)
 *  - anything exposing `[Symbol.toPrimitive]` is asked for its `'string'` form
 *  - typed-array views print as `[a,b,c]`
 *  - everything else goes through `JSON.stringify`
 * @ignore
 */
export function valueToString(x: any) {
    if (x === null) {
        return 'null';
    }
    if (x === undf) {
        return 'undefined';
    }

    const kind = typeof x;
    if (kind === 'number' || kind === 'bigint') {
        return `${x}`;
    }
    if (kind === 'string') {
        return `"${x}"`;
    }

    // Prefer a custom primitive conversion over JSON.stringify() so that
    // wrapper types print in their native representation.
    if (typeof x[Symbol.toPrimitive] === 'function') {
        return x[Symbol.toPrimitive]('string');
    }

    if (ArrayBuffer.isView(x)) {
        return `[${x}]`;
    }
    return JSON.stringify(x);
}
import { Column } from '../column';
import { Vector } from '../vector';
import { DataType } from '../type';
import { Data, Buffers } from '../data';
import { Schema, Field } from '../schema';
import { Chunked } from '../vector/chunked';
import { RecordBatch } from '../recordbatch';

// Shared zero-length placeholder used for every buffer slot that is not filled in.
const noopBuf = new Uint8Array(0);
// Builds a four-slot buffer list in which only slot 2 is populated, with a
// zero-filled Uint8Array of `bitmapLength` bytes.
// NOTE(review): slot 2 is presumably the validity (null) bitmap, all bits
// cleared meaning "every row is null" -- confirm against the Buffers layout.
const nullBufs = (bitmapLength: number) => <unknown> [
    noopBuf, noopBuf, new Uint8Array(bitmapLength), noopBuf
] as Buffers<any>;

/**
 * Make every entry of `chunks` exactly `batchLength` long.
 *
 * Entries that already have that length are passed through. Any other entry
 * has its field switched to nullable (when it is not already) and is either
 * resized via `_changeLengthAndBackfillNullBitmap`, or -- when the entry is
 * missing -- replaced by a placeholder `Data` built from `nullBufs`.
 *
 * @param schema source of the fields; it is not mutated, a copy of its field list is edited
 * @param chunks one `Data` per field (entries may be missing)
 * @param batchLength target length; defaults to the longest chunk
 * @returns `[schema rebuilt from the edited fields, batchLength, adjusted data]`
 * @ignore
 */
export function ensureSameLengthData<T extends { [key: string]: DataType } = any>(
    schema: Schema<T>,
    chunks: Data<T[keyof T]>[],
    batchLength = chunks.reduce((l, c) => Math.max(l, c.length), 0)
) {
    let data: Data<T[keyof T]>;
    let field: Field<T[keyof T]>;
    let i = -1;
    const n = chunks.length;
    const fields = [...schema.fields];
    const batchData = [] as Data<T[keyof T]>[];
    // Round batchLength up to a multiple of 64 (bits), then convert to bytes.
    const bitmapLength = ((batchLength + 63) & ~63) >> 3;
    while (++i < n) {
        if ((data = chunks[i]) && data.length === batchLength) {
            batchData[i] = data;
        } else {
            // Replace the field with a nullable clone unless it is nullable already.
            (field = fields[i]).nullable || (fields[i] = fields[i].clone({ nullable: true }) as Field<T[keyof T]>);
            // NOTE(review): Data.new arguments are presumably
            // (type, offset, length, nullCount, buffers) -- confirm.
            batchData[i] = data ? data._changeLengthAndBackfillNullBitmap(batchLength)
                : Data.new(field.type, 0, batchLength, batchLength, nullBufs(bitmapLength)) as Data<T[keyof T]>;
        }
    }
    return [new Schema<T>(fields), batchLength, batchData] as [Schema<T>, number, Data<T[keyof T]>[]];
}

/**
 * Build a schema from the columns' own fields and delegate to
 * `distributeVectorsIntoRecordBatches`.
 * @ignore
 */
export function distributeColumnsIntoRecordBatches<T extends { [key: string]: DataType } = any>(columns: Column<T[keyof T]>[]): [Schema<T>, RecordBatch<T>[]] {
    return distributeVectorsIntoRecordBatches<T>(new Schema<T>(columns.map(({ field }) => field)), columns);
}

/**
 * Flatten each vector into a list of `Data` chunks (a `Chunked` contributes
 * one entry per chunk, any other vector a single entry) and delegate to
 * `uniformlyDistributeChunksAcrossRecordBatches`.
 * @ignore
 */
export function distributeVectorsIntoRecordBatches<T extends { [key: string]: DataType } = any>(schema: Schema<T>, vecs: (Vector<T[keyof T]> | Chunked<T[keyof T]>)[]): [Schema<T>, RecordBatch<T>[]] {
    return uniformlyDistributeChunksAcrossRecordBatches<T>(schema, vecs.map((v) => v instanceof Chunked ? v.chunks.map((c) => c.data) : [v.data]));
}

/**
 * Consume the per-column chunk queues in `columns` (they are mutated via
 * `shift`/`unshift`) and emit record batches whose columns all share one
 * length.
 *
 * Each round takes the head chunk of every column, uses the shortest present
 * head as the batch length, and lets `distributeChildData` trim or pad the
 * heads to that length. Zero-length rounds produce no batch.
 *
 * @returns `[schema rebuilt from the possibly-nullable fields, record batches]`
 * @ignore
 */
function uniformlyDistributeChunksAcrossRecordBatches<T extends { [key: string]: DataType } = any>(schema: Schema<T>, columns: Data<T[keyof T]>[][]): [Schema<T>, RecordBatch<T>[]] {

    const fields = [...schema.fields];
    const batchArgs = [] as [number, Data<T[keyof T]>[]][];
    // Remaining rounds; starts at the longest queue and is raised by
    // distributeChildData whenever a remainder is pushed back.
    const memo = { numBatches: columns.reduce((n, c) => Math.max(n, c.length), 0) };

    let numBatches = 0, batchLength = 0;
    let i = -1;
    const numColumns = columns.length;
    let child: Data<T[keyof T]>, childData: Data<T[keyof T]>[] = [];

    while (memo.numBatches-- > 0) {

        // Pop one chunk per column; batchLength becomes the minimum length
        // among the chunks that exist (exhausted columns leave it unchanged).
        for (batchLength = Number.POSITIVE_INFINITY, i = -1; ++i < numColumns;) {
            childData[i] = child = columns[i].shift()!;
            batchLength = Math.min(batchLength, child ? child.length : batchLength);
        }

        // Still Infinity means no column had a chunk left this round.
        if (isFinite(batchLength)) {
            childData = distributeChildData(fields, batchLength, childData, columns, memo);
            if (batchLength > 0) {
                // Copy: childData is reused and overwritten on the next round.
                batchArgs[numBatches++] = [batchLength, childData.slice()];
            }
        }
    }
    return [
        schema = new Schema<T>(fields, schema.metadata),
        batchArgs.map((xs) => new RecordBatch(schema, ...xs))
    ];
}

/**
 * Bring every entry of `childData` to exactly `batchLength` for one round.
 *
 *  - a chunk of exactly `batchLength` is kept as is;
 *  - a longer chunk is split: the first `batchLength` rows stay, the rest is
 *    put back at the front of that column's queue, and `memo.numBatches` is
 *    raised to at least the new queue length;
 *  - a shorter or missing chunk makes its field nullable and is resized, or
 *    replaced by a placeholder `Data` built from `nullBufs`.
 *
 * Mutates `fields`, `childData`, `columns` and `memo`.
 * @returns the same `childData` array
 * @ignore
 */
function distributeChildData<T extends { [key: string]: DataType } = any>(fields: Field<T[keyof T]>[], batchLength: number, childData: Data<T[keyof T]>[], columns: Data<T[keyof T]>[][], memo: { numBatches: number }) {
    let data: Data<T[keyof T]>;
    let field: Field<T[keyof T]>;
    let length = 0, i = -1;
    const n = columns.length;
    // Round batchLength up to a multiple of 64 (bits), then convert to bytes.
    const bitmapLength = ((batchLength + 63) & ~63) >> 3;
    while (++i < n) {
        if ((data = childData[i]) && ((length = data.length) >= batchLength)) {
            if (length === batchLength) {
                childData[i] = data;
            } else {
                childData[i] = data.slice(0, batchLength);
                // NOTE(review): this only yields the remainder if Data.slice
                // takes (offset, length) rather than (begin, end) -- confirm.
                data = data.slice(batchLength, length - batchLength);
                // Array.prototype.unshift returns the new queue length.
                memo.numBatches = Math.max(memo.numBatches, columns[i].unshift(data));
            }
        } else {
            (field = fields[i]).nullable || (fields[i] = field.clone({ nullable: true }) as Field<T[keyof T]>);
            childData[i] = data ? data._changeLengthAndBackfillNullBitmap(batchLength)
                : Data.new(field.type, 0, batchLength, batchLength, nullBufs(bitmapLength)) as Data<T[keyof T]>;
        }
    }
    return childData;
}
// The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// A single decoder is created when the module loads and reused by every call.
const utf8Decoder = new TextDecoder('utf-8');

/**
 * Decodes the bytes in `buffer` as UTF-8 text.
 *
 * @param buffer Bytes to decode; forwarded unchanged to `TextDecoder.decode`.
 * @returns The decoded string.
 * @ignore
 */
export const decodeUtf8 = (buffer?: BufferSource) => {
    return utf8Decoder.decode(buffer);
};

// A single encoder is created when the module loads and reused by every call.
const utf8Encoder = new TextEncoder();

/**
 * Encodes `value` as UTF-8 bytes.
 *
 * @param value Text to encode; forwarded unchanged to `TextEncoder.encode`.
 * @returns The encoded bytes.
 * @ignore
 */
export const encodeUtf8 = (value?: string) => {
    return utf8Encoder.encode(value);
};

// diff --git a/src/arrow/js/src/util/vector.ts b/src/arrow/js/src/util/vector.ts
// new file mode 100644
// index 000000000..a6cfd0373
// --- /dev/null
// +++ b/src/arrow/js/src/util/vector.ts
// @@ -0,0 +1,198 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Vector } from '../vector';
import { MapRow, StructRow } from '../vector/row';
import { compareArrayLike } from '../util/buffer';
import { BigInt, BigIntAvailable } from './compat';

/** @ignore */
type RangeLike = { length: number; stride?: number };
/** @ignore */
type ClampThen<T extends RangeLike> = (source: T, index: number) => any;
/**
 * Continuation invoked by `clampRange` with the normalized `begin` and `end`
 * positions (the third argument is an end position, not a length).
 * @ignore
 */
type ClampRangeThen<T extends RangeLike> = (source: T, begin: number, end: number) => any;

/**
 * Wraps a negative position into `[0, length)`.
 *
 * The double modulo keeps exact negative multiples of `length` (e.g. `-length`)
 * at 0 instead of `length`, and an empty source yields 0 rather than the `NaN`
 * that `% 0` would produce.
 */
function wrapNegative(position: number, length: number): number {
    return length > 0 ? ((position % length) + length) % length : 0;
}

export function clampIndex<T extends RangeLike>(source: T, index: number): number;
export function clampIndex<T extends RangeLike, N extends ClampThen<T> = ClampThen<T>>(source: T, index: number, then: N): ReturnType<N>;
/**
 * Normalizes `index` against `source.length`. Indices greater than -1 are
 * passed through untouched; other indices wrap around from the end.
 *
 * @param source Any object with a `length`.
 * @param index The requested index.
 * @param then Optional continuation; when given, its result for
 * `(source, adjustedIndex)` is returned instead of the adjusted index.
 * @ignore
 */
export function clampIndex<T extends RangeLike, N extends ClampThen<T> = ClampThen<T>>(source: T, index: number, then?: N) {
    const adjusted = index > -1 ? index : wrapNegative(index, source.length);
    return then ? then(source, adjusted) : adjusted;
}

export function clampRange<T extends RangeLike>(source: T, begin: number | undefined, end: number | undefined): [number, number];
export function clampRange<T extends RangeLike, N extends ClampRangeThen<T> = ClampRangeThen<T>>(source: T, begin: number | undefined, end: number | undefined, then: N): ReturnType<N>;
/**
 * Adjusts `begin`/`end` similar to `Array.prototype.slice`: missing values
 * default to `0` and `source.length`, negative positions wrap around, the pair
 * is swapped when `end < begin` (e.g. `slice(5, -1)`), and `end` is capped at
 * the length.
 *
 * @param source Any object with a `length` (a missing length is treated as 0).
 * @param begin Requested start position, or a non-number for "from the start".
 * @param end Requested end position, or a non-number for "to the end".
 * @param then Optional continuation; when given, its result for
 * `(source, begin, end)` is returned instead of the `[begin, end]` tuple.
 * @ignore
 */
export function clampRange<T extends RangeLike, N extends ClampRangeThen<T> = ClampRangeThen<T>>(source: T, begin: number | undefined, end: number | undefined, then?: N) {
    const { length: len = 0 } = source;
    let lhs = typeof begin !== 'number' ? 0 : begin;
    let rhs = typeof end !== 'number' ? len : end;
    // wrap around on negative start/end positions
    if (lhs < 0) { lhs = wrapNegative(lhs, len); }
    if (rhs < 0) { rhs = wrapNegative(rhs, len); }
    // ensure lhs <= rhs (swap locally; no shared module-level scratch variable)
    if (rhs < lhs) { [lhs, rhs] = [rhs, lhs]; }
    // ensure rhs <= length
    if (rhs > len) { rhs = len; }

    return then ? then(source, lhs, rhs) : [lhs, rhs];
}

const big0 = BigIntAvailable ? BigInt(0) : 0;
const isNaNFast = (value: any) => value !== value;

/**
 * Builds a predicate that reports whether a candidate value equals `search`.
 * The comparison strategy is chosen once, from the runtime type of `search`.
 *
 * @param search The value to look for.
 * @returns A function `(value) => boolean`.
 * @ignore
 */
export function createElementComparator(search: any) {
    const typeofSearch = typeof search;
    // Compare primitives
    if (typeofSearch !== 'object' || search === null) {
        // Compare NaN (the only value that is not equal to itself)
        if (isNaNFast(search)) {
            return isNaNFast;
        }
        if (typeofSearch !== 'bigint') {
            return (value: any) => value === search;
        }
        return (value: any) => {
            if (typeof value === 'bigint') { return value === search; }
            // Only objects are coerced through `+`. Adding a bigint to null,
            // undefined, a number or a boolean throws a TypeError, and none of
            // those could be strictly equal to a bigint anyway.
            if (value !== null && typeof value === 'object') {
                return (big0 + value) === search;
            }
            return false;
        };
    }
    // Compare Dates
    if (search instanceof Date) {
        const valueOfSearch = search.valueOf();
        return (value: any) => value instanceof Date ? (value.valueOf() === valueOfSearch) : false;
    }
    // Compare TypedArrays
    if (ArrayBuffer.isView(search)) {
        return (value: any) => value ? compareArrayLike(search, value) : false;
    }
    // Compare Maps and Rows
    if (search instanceof Map) { return createMapComparator(search); }
    // Compare Array-likes
    if (Array.isArray(search)) { return createArrayLikeComparator(search); }
    // Compare Vectors
    if (search instanceof Vector) { return createVectorComparator(search); }
    // Compare non-empty Objects
    return createObjectComparator(search);
}

/** Builds one element comparator per entry of `lhs`. @ignore */
function createArrayLikeComparator(lhs: ArrayLike<any>) {
    const comparators = [] as ((x: any) => boolean)[];
    for (let i = -1, n = lhs.length; ++i < n;) {
        comparators[i] = createElementComparator(lhs[i]);
    }
    return createSubElementsComparator(comparators);
}

/** Builds one element comparator per Map value, in the Map's iteration order. @ignore */
function createMapComparator(lhs: Map<any, any>) {
    let i = -1;
    const comparators = [] as ((x: any) => boolean)[];
    lhs.forEach((v) => comparators[++i] = createElementComparator(v));
    return createSubElementsComparator(comparators);
}

/** Builds one element comparator per Vector element. @ignore */
function createVectorComparator(lhs: Vector<any>) {
    const comparators = [] as ((x: any) => boolean)[];
    for (let i = -1, n = lhs.length; ++i < n;) {
        comparators[i] = createElementComparator(lhs.get(i));
    }
    return createSubElementsComparator(comparators);
}

/** Builds one element comparator per own enumerable key of `lhs`. @ignore */
function createObjectComparator(lhs: any) {
    const keys = Object.keys(lhs);
    // Only compare non-empty Objects
    if (keys.length === 0) { return () => false; }
    const comparators = [] as ((x: any) => boolean)[];
    for (let i = -1, n = keys.length; ++i < n;) {
        comparators[i] = createElementComparator(lhs[keys[i]]);
    }
    return createSubElementsComparator(comparators, keys);
}

/**
 * Returns a predicate that applies `comparators` element-by-element to a
 * candidate container, dispatching on the candidate's constructor.
 *
 * @param comparators One predicate per expected element, in order.
 * @param keys Expected keys, used when the candidate is a plain object.
 */
function createSubElementsComparator(comparators: ((x: any) => boolean)[], keys?: Iterable<string>) {
    return (rhs: any) => {
        if (!rhs || typeof rhs !== 'object') {
            return false;
        }
        switch (rhs.constructor) {
            case Array: return compareArray(comparators, rhs);
            case Map:
            case MapRow:
            case StructRow:
                return compareObject(comparators, rhs, rhs.keys());
            case Object:
            case undefined: // support `Object.create(null)` objects
                return compareObject(comparators, rhs, keys || Object.keys(rhs));
        }
        return rhs instanceof Vector ? compareVector(comparators, rhs) : false;
    };
}

/** True when `arr` has exactly one element per comparator and every comparator accepts its element. */
function compareArray(comparators: ((x: any) => boolean)[], arr: any[]) {
    const n = comparators.length;
    if (arr.length !== n) { return false; }
    for (let i = -1; ++i < n;) {
        if (!(comparators[i](arr[i]))) { return false; }
    }
    return true;
}

/** True when `vec` has exactly one element per comparator and every comparator accepts its element. */
function compareVector(comparators: ((x: any) => boolean)[], vec: Vector) {
    const n = comparators.length;
    if (vec.length !== n) { return false; }
    for (let i = -1; ++i < n;) {
        if (!(comparators[i](vec.get(i)))) { return false; }
    }
    return true;
}

/**
 * Walks the expected `keys` and the candidate's keys/values in lock-step.
 * Succeeds only when every key matches, every comparator accepts its value,
 * and all three iterators are exhausted after exactly `comparators.length` steps.
 */
function compareObject(comparators: ((x: any) => boolean)[], obj: Map<any, any>, keys: Iterable<string>) {

    const lKeyItr = keys[Symbol.iterator]();
    const rKeyItr = obj instanceof Map ? obj.keys() : Object.keys(obj)[Symbol.iterator]();
    const rValItr = obj instanceof Map ? obj.values() : Object.values(obj)[Symbol.iterator]();

    let i = 0;
    const n = comparators.length;
    let rVal = rValItr.next();
    let lKey = lKeyItr.next();
    let rKey = rKeyItr.next();

    for (; i < n && !lKey.done && !rKey.done && !rVal.done;
        ++i, lKey = lKeyItr.next(), rKey = rKeyItr.next(), rVal = rValItr.next()) {
        if (lKey.value !== rKey.value || !comparators[i](rVal.value)) {
            break;
        }
    }
    if (i === n && lKey.done && rKey.done && rVal.done) {
        return true;
    }
    // Stopped early: give each iterator the chance to release its resources.
    lKeyItr.return && lKeyItr.return();
    rKeyItr.return && rKeyItr.return();
    rValItr.return && rValItr.return();
    return false;
}

// diff --git a/src/arrow/js/src/vector.ts b/src/arrow/js/src/vector.ts
// new file mode 100644
// index 000000000..bd7838cdf
// --- /dev/null
// +++ b/src/arrow/js/src/vector.ts
// @@ -0,0 +1,73 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.
// See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from './data';
import { DataType } from './type';
import { Chunked } from './vector/chunked';

/**
 * Something that can produce a copy of itself as an `R`.
 * @ignore
 */
export interface Clonable<R extends AbstractVector> {
    clone(...args: any[]): R;
}

/**
 * Something that can produce a sub-range of itself as an `R`.
 * @ignore
 */
export interface Sliceable<R extends AbstractVector> {
    slice(begin?: number, end?: number): R;
}

/**
 * Something that can be concatenated with other vectors into an `R`.
 * @ignore
 */
export interface Applicative<T extends DataType, R extends Chunked> {
    concat(...others: Vector<T>[]): R;
    readonly [Symbol.isConcatSpreadable]: boolean;
}

/**
 * Type-level half of `AbstractVector` (merged with the class of the same name):
 * adds the clone/slice/concat contracts and the `TType`/`TArray`/`TValue`
 * members taken from the `DataType` parameter.
 */
export interface AbstractVector<T extends DataType = any>
    extends Clonable<AbstractVector<T>>,
        Sliceable<AbstractVector<T>>,
        Applicative<T, Chunked<T>> {

    readonly TType: T['TType'];
    readonly TArray: T['TArray'];
    readonly TValue: T['TValue'];
}

/**
 * Abstract base of every vector: an iterable of `T['TValue'] | null` values.
 * Every member is abstract; implementations live in subclasses.
 */
export abstract class AbstractVector<T extends DataType = any> implements Iterable<T['TValue'] | null> {

    // --- descriptive properties ---
    public abstract readonly data: Data<T>;
    public abstract readonly type: T;
    public abstract readonly typeId: T['TType'];
    public abstract readonly length: number;
    public abstract readonly stride: number;
    public abstract readonly nullCount: number;
    public abstract readonly byteLength: number;
    public abstract readonly numChildren: number;

    public abstract readonly ArrayType: T['ArrayType'];

    // --- element access ---
    public abstract isValid(index: number): boolean;
    public abstract get(index: number): T['TValue'] | null;
    public abstract set(index: number, value: T['TValue'] | null): void;
    public abstract indexOf(value: T['TValue'] | null, fromIndex?: number): number;
    public abstract [Symbol.iterator](): IterableIterator<T['TValue'] | null>;

    // --- conversion and children ---
    public abstract toArray(): T['TArray'];
    public abstract getChildAt<R extends DataType = any>(index: number): Vector<R> | null;
}

// Put a `data` property, set to null, on the prototype itself.
(AbstractVector.prototype as any).data = null;

export { AbstractVector as Vector };

// diff --git a/src/arrow/js/src/vector/base.ts b/src/arrow/js/src/vector/base.ts
// new file mode 100644
// index 000000000..2ceecdda4
// --- /dev/null
// +++ b/src/arrow/js/src/vector/base.ts
// @@ -0,0 +1,111 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { Type } from '../enum';
import { DataType } from '../type';
import { Chunked } from './chunked';
import { clampRange } from '../util/vector';
import { VectorType as V } from '../interfaces';
import { AbstractVector, Vector, Clonable, Sliceable, Applicative } from '../vector';

/** @ignore */
export interface BaseVector<T extends DataType = any> extends Clonable<V<T>>, Sliceable<V<T>>, Applicative<T, Chunked<T>> {
    slice(begin?: number, end?: number): V<T>;
    concat(...others: Vector<T>[]): Chunked<T>;
    clone<R extends DataType = T>(data: Data<R>, children?: Vector<R>[]): V<R>;
}

/**
 * Vector backed by a single `Data` instance. Most properties simply forward
 * to the corresponding property of `data`.
 * @ignore
 */
export abstract class BaseVector<T extends DataType = any> extends AbstractVector<T>
    implements Clonable<V<T>>, Sliceable<V<T>>, Applicative<T, Chunked<T>> {

    public readonly data: Data<T>;
    public readonly numChildren: number;

    /** Child vectors, filled in lazily by `getChildAt` unless supplied to the constructor. */
    protected _children?: Vector[];

    /**
     * @param data The backing data.
     * @param children Optional, already-constructed child vectors.
     */
    constructor(data: Data<T>, children?: Vector[]) {
        super();
        this._children = children;
        this.numChildren = data.childData.length;
        this.data = data;
        this._bindDataAccessors(data);
    }

    public get type() { return this.data.type; }
    public get typeId() { return this.data.typeId; }
    public get length() { return this.data.length; }
    public get offset() { return this.data.offset; }
    public get stride() { return this.data.stride; }
    public get nullCount() { return this.data.nullCount; }
    public get byteLength() { return this.data.byteLength; }
    public get VectorName() { return `${Type[this.typeId]}Vector`; }

    public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; }

    public get values() { return this.data.values; }
    public get typeIds() { return this.data.typeIds; }
    public get nullBitmap() { return this.data.nullBitmap; }
    public get valueOffsets() { return this.data.valueOffsets; }

    public get [Symbol.toStringTag]() { return `${this.VectorName}<${this.type[Symbol.toStringTag]}>`; }

    /** Creates a new vector over `data`, reusing this vector's children unless others are given. */
    public clone<R extends DataType = T>(data: Data<R>, children = this._children) {
        return Vector.new<R>(data, children) as any;
    }

    /** Concatenates this vector and `others` into a `Chunked`. */
    public concat(...others: Vector<T>[]) {
        return Chunked.concat<T>(this, ...others);
    }

    /**
     * Returns the `[begin, end)` sub-range of this vector. Arguments are
     * normalized by `clampRange` before being handed to `_sliceInternal`.
     */
    public slice(begin?: number, end?: number) {
        return clampRange(this, begin, end, this._sliceInternal);
    }

    /** Reports whether the validity bit for `index` is set; always true when `nullCount` is not positive. */
    public isValid(index: number): boolean {
        if (!(this.nullCount > 0)) {
            return true;
        }
        const bit = this.offset + index;
        const byte = this.nullBitmap[bit >> 3];
        return (byte & (1 << (bit % 8))) !== 0;
    }

    /** Returns the child vector at `index` (created and cached on first access), or null when out of range. */
    public getChildAt<R extends DataType = any>(index: number): Vector<R> | null {
        if (index < 0 || index >= this.numChildren) {
            return null;
        }
        const children = this._children || (this._children = []);
        const child = children[index] ||
            (children[index] = Vector.new<R>(this.data.childData[index] as Data<R>));
        return child as Vector<R>;
    }

    public toJSON() { return [...this]; }

    // Receives the vector as `self` because `slice` passes this method
    // to `clampRange` unbound.
    protected _sliceInternal(self: this, begin: number, end: number) {
        const length = end - begin;
        return self.clone(self.data.slice(begin, length), null!);
    }

    // @ts-ignore
    protected _bindDataAccessors(data: Data<T>) {
        // Implementation in src/vector/index.ts due to circular dependency/packaging shenanigans
    }
}

(BaseVector.prototype as any)[Symbol.isConcatSpreadable] = true;

// diff --git a/src/arrow/js/src/vector/binary.ts b/src/arrow/js/src/vector/binary.ts
// new file mode 100644
// index 000000000..603187a78
// --- /dev/null
// +++ b/src/arrow/js/src/vector/binary.ts
// @@ -0,0 +1,27 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.
// The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Vector } from '../vector';
import { BaseVector } from './base';
import { Binary, Utf8 } from '../type';

/** @ignore */
export class BinaryVector extends BaseVector<Binary> {
    /** Returns a vector over a clone of this vector's data, retyped as `Utf8`. */
    public asUtf8() {
        const utf8Data = this.data.clone(new Utf8());
        return Vector.new(utf8Data);
    }
}

// diff --git a/src/arrow/js/src/vector/bool.ts b/src/arrow/js/src/vector/bool.ts
// new file mode 100644
// index 000000000..b555f4692
// --- /dev/null
// +++ b/src/arrow/js/src/vector/bool.ts
// @@ -0,0 +1,35 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Bool } from '../type';
import { Chunked } from './chunked';
import { BaseVector } from './base';
import { VectorBuilderOptions, VectorBuilderOptionsAsync, vectorFromValuesWithType } from './index';

/** Every input shape accepted by `BoolVector.from`. @ignore */
type BoolVectorInput<TNull> =
    | Iterable<boolean | TNull>
    | AsyncIterable<boolean | TNull>
    | VectorBuilderOptions<Bool, boolean | TNull>
    | VectorBuilderOptionsAsync<Bool, boolean | TNull>;

/** @ignore */
export class BoolVector extends BaseVector<Bool> {
    public static from<TNull = any>(input: Iterable<boolean | TNull>): BoolVector;
    public static from<TNull = any>(input: AsyncIterable<boolean | TNull>): Promise<BoolVector>;
    public static from<TNull = any>(input: VectorBuilderOptions<Bool, boolean | TNull>): Chunked<Bool>;
    public static from<TNull = any>(input: VectorBuilderOptionsAsync<Bool, boolean | TNull>): Promise<Chunked<Bool>>;
    /**
     * Builds a Bool vector from `input` by delegating to
     * `vectorFromValuesWithType` with a `Bool` type factory.
     * @nocollapse
     */
    public static from<TNull = any>(input: BoolVectorInput<TNull>) {
        const createType = () => new Bool();
        return vectorFromValuesWithType(createType, input);
    }
}

// diff --git a/src/arrow/js/src/vector/chunked.ts b/src/arrow/js/src/vector/chunked.ts
// new file mode 100644
// index 000000000..656c4a1b6
// --- /dev/null
// +++ b/src/arrow/js/src/vector/chunked.ts
// @@ -0,0 +1,320 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
// See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { Field } from '../schema';
import { clampRange } from '../util/vector';
import { DataType, Dictionary } from '../type';
import { selectChunkArgs } from '../util/args';
import { DictionaryVector } from './dictionary';
import { AbstractVector, Vector } from '../vector';
import { Clonable, Sliceable, Applicative } from '../vector';

/** @ignore */
type ChunkedDict<T extends DataType> = T extends Dictionary ? Vector<T['dictionary']> : null | never;
/** @ignore */
type ChunkedKeys<T extends DataType> = T extends Dictionary ? Vector<T['indices']> | Chunked<T['indices']> : null | never;

/** @ignore */
export type SearchContinuation<T extends Chunked> = (column: T, chunkIndex: number, valueIndex: number) => any;

/**
 * Iterates the values of a list of chunks, one chunk after another.
 * @ignore
 */
class ChunkedIterator<T extends DataType> implements IterableIterator<T['TValue'] | null> {
    private chunkIndex = 0;
    // Created on demand so that an empty chunk list never dereferences `chunks[0]`.
    private chunkIterator: IterableIterator<T['TValue'] | null> | null = null;

    constructor(
        private chunks: Vector<T>[],
    ) {}

    next(): IteratorResult<T['TValue'] | null> {
        while (this.chunkIndex < this.chunks.length) {
            const iterator = this.chunkIterator || (this.chunkIterator = this.getChunkIterator());
            const next = iterator.next();

            if (!next.done) {
                return next;
            }

            // Current chunk is exhausted: advance and open the next one lazily.
            this.chunkIterator = null;
            ++this.chunkIndex;
        }

        return {done: true, value: null};
    }

    getChunkIterator() {
        return this.chunks[this.chunkIndex][Symbol.iterator]();
    }

    [Symbol.iterator]() {
        return this;
    }
}

/**
 * A vector made of zero or more same-typed chunks, addressed through a table
 * of cumulative chunk offsets.
 * @ignore
 */
export class Chunked<T extends DataType = any>
    extends AbstractVector<T>
    implements Clonable<Chunked<T>>,
        Sliceable<Chunked<T>>,
        Applicative<T, Chunked<T>> {

    /** @nocollapse */
    public static flatten<T extends DataType>(...vectors: (Vector<T> | Vector<T>[])[]) {
        return selectChunkArgs<Vector<T>>(Vector, vectors);
    }

    /**
     * Flattens `vectors` and wraps them in a `Chunked` typed after the first chunk.
     *
     * @throws TypeError when no chunk is supplied, since the type is taken from the first chunk.
     * @nocollapse
     */
    public static concat<T extends DataType>(...vectors: (Vector<T> | Vector<T>[])[]) {
        const chunks = Chunked.flatten<T>(...vectors);
        if (chunks.length === 0) {
            throw new TypeError('Chunked.concat requires at least one vector');
        }
        return new Chunked<T>(chunks[0].type, chunks);
    }

    protected _type: T;
    protected _length: number;
    protected _chunks: Vector<T>[];
    protected _numChildren: number;
    protected _children?: Chunked[];
    /** Cached null count; -1 until first computed. */
    protected _nullCount = -1;
    /** `_chunkOffsets[i]` is the absolute index of chunk i's first value; the last entry is the total length. */
    protected _chunkOffsets: Uint32Array;

    constructor(type: T, chunks: Vector<T>[] = [], offsets = calculateOffsets(chunks)) {
        super();
        this._type = type;
        this._chunks = chunks;
        this._chunkOffsets = offsets;
        this._length = offsets[offsets.length - 1];
        this._numChildren = (this._type.children || []).length;
    }

    public get type() { return this._type; }
    public get length() { return this._length; }
    public get chunks() { return this._chunks; }
    public get typeId(): T['TType'] { return this._type.typeId; }
    public get VectorName() { return `Chunked<${this._type}>`; }
    /** Data of the first chunk, or null when there are no chunks. */
    public get data(): Data<T> {
        return this._chunks[0] ? this._chunks[0].data : <any> null;
    }

    public get ArrayType() { return this._type.ArrayType; }
    public get numChildren() { return this._numChildren; }
    public get stride() { return this._chunks[0] ? this._chunks[0].stride : 1; }
    public get byteLength(): number {
        return this._chunks.reduce((byteLength, chunk) => byteLength + chunk.byteLength, 0);
    }
    /** Sum of the chunks' null counts, computed once and cached. */
    public get nullCount() {
        let nullCount = this._nullCount;
        if (nullCount < 0) {
            this._nullCount = nullCount = this._chunks.reduce((x, { nullCount }) => x + nullCount, 0);
        }
        return nullCount;
    }

    protected _indices?: ChunkedKeys<T>;
    /** For dictionary types, the chunks' indices (concatenated when there are several); otherwise null. */
    public get indices(): ChunkedKeys<T> | null {
        if (DataType.isDictionary(this._type)) {
            if (!this._indices) {
                const chunks = (<any> this._chunks) as DictionaryVector<T, any>[];
                // No chunks means no indices to expose.
                if (chunks.length === 0) { return null; }
                this._indices = (chunks.length === 1
                    ? chunks[0].indices
                    : Chunked.concat(...chunks.map((x) => x.indices))) as ChunkedKeys<T>;
            }
            return this._indices;
        }
        return null;
    }
    /** For dictionary types, the dictionary of the last chunk; otherwise (or when empty) null. */
    public get dictionary(): ChunkedDict<T> | null {
        if (DataType.isDictionary(this._type)) {
            const last = this._chunks[this._chunks.length - 1];
            return last ? last.data.dictionary as ChunkedDict<T> : null;
        }
        return null;
    }

    public [Symbol.iterator](): IterableIterator<T['TValue'] | null> {
        return new ChunkedIterator(this._chunks);
    }

    public clone(chunks = this._chunks): Chunked<T> {
        return new Chunked(this._type, chunks);
    }

    public concat(...others: Vector<T>[]): Chunked<T> {
        return this.clone(Chunked.flatten(this, ...others));
    }

    public slice(begin?: number, end?: number): Chunked<T> {
        return clampRange(this, begin, end, this._sliceInternal);
    }

    /** Returns a `Chunked` over the chunks' children at `index` (cached), or null when unavailable. */
    public getChildAt<R extends DataType = any>(index: number): Chunked<R> | null {

        if (index < 0 || index >= this._numChildren) { return null; }

        const columns = this._children || (this._children = []);
        let child: Chunked<R>, field: Field<R>, chunks: Vector<R>[];

        if (child = columns[index]) { return child; }
        if (field = ((this._type.children || [])[index] as Field<R>)) {
            chunks = this._chunks
                .map((vector) => vector.getChildAt<R>(index))
                .filter((vec): vec is Vector<R> => vec != null);
            if (chunks.length > 0) {
                return (columns[index] = new Chunked<R>(field.type, chunks));
            }
        }

        return null;
    }

    /**
     * Locates the chunk holding absolute position `index`.
     *
     * @returns `[chunkIndex, indexWithinChunk]`, or the result of `then` when
     * given; null when `index` is out of bounds.
     */
    public search(index: number): [number, number] | null;
    public search<N extends SearchContinuation<Chunked<T>>>(index: number, then?: N): ReturnType<N>;
    public search<N extends SearchContinuation<Chunked<T>>>(index: number, then?: N) {
        const idx = index;
        // binary search to find the child vector and value indices
        const offsets = this._chunkOffsets;
        let rhs = offsets.length - 1;
        // return early if out of bounds, or if there's just one child
        if (idx < 0) { return null; }
        if (idx >= offsets[rhs]) { return null; }
        if (rhs <= 1) { return then ? then(this, 0, idx) : [0, idx]; }
        let lhs = 0, pos = 0, mid = 0;
        do {
            if (lhs + 1 === rhs) {
                return then ? then(this, lhs, idx - pos) : [lhs, idx - pos];
            }
            mid = lhs + ((rhs - lhs) / 2) | 0;
            idx >= offsets[mid] ? (lhs = mid) : (rhs = mid);
        } while (idx < offsets[rhs] && idx >= (pos = offsets[lhs]));
        return null;
    }

    public isValid(index: number): boolean {
        return !!this.search(index, this.isValidInternal);
    }

    public get(index: number): T['TValue'] | null {
        return this.search(index, this.getInternal);
    }

    public set(index: number, value: T['TValue'] | null): void {
        this.search(index, ({ chunks }, i, j) => chunks[i].set(j, value));
    }

    /**
     * Returns the absolute index of the first occurrence of `element` at or
     * after `offset`, or -1 when there is none.
     *
     * A positive `offset` starts the scan inside the chunk that contains it;
     * an `offset` past the end yields -1; any other `offset` scans from the start.
     */
    public indexOf(element: T['TValue'], offset?: number): number {
        if (offset && typeof offset === 'number' && offset > 0) {
            const found = this.search(offset, (self, i, j) => this.indexOfInternal(self, i, j, element));
            // `search` yields null when `offset` is out of bounds.
            return typeof found === 'number' ? found : -1;
        }
        return this.indexOfInternal(this, 0, Math.max(0, offset || 0), element);
    }

    /** Copies every chunk's `toArray()` result into one array of the combined length. */
    public toArray(): T['TArray'] {
        const { chunks } = this;
        const n = chunks.length;
        let ArrayType: any = this._type.ArrayType;
        if (n <= 0) { return new ArrayType(0); }
        if (n <= 1) { return chunks[0].toArray(); }
        let len = 0;
        const src = new Array(n);
        for (let i = -1; ++i < n;) {
            len += (src[i] = chunks[i].toArray()).length;
        }
        // Follow whatever array kind the first chunk actually produced.
        if (ArrayType !== src[0].constructor) {
            ArrayType = src[0].constructor;
        }
        const dst = new ArrayType(len);
        const set: any = ArrayType === Array ? arraySet : typedSet;
        for (let i = -1, idx = 0; ++i < n;) {
            idx = set(src[i], dst, idx);
        }
        return dst;
    }

    protected getInternal({ _chunks }: Chunked<T>, i: number, j: number) { return _chunks[i].get(j); }
    protected isValidInternal({ _chunks }: Chunked<T>, i: number, j: number) { return _chunks[i].isValid(j); }
    /**
     * Scans chunks from `chunkIndex` onwards, starting at `fromIndex` within
     * the first of them, and returns the absolute index of `element` or -1.
     */
    protected indexOfInternal({ _chunks, _chunkOffsets }: Chunked<T>, chunkIndex: number, fromIndex: number, element: T['TValue']) {
        let start = fromIndex;
        for (let i = chunkIndex, n = _chunks.length; i < n; ++i) {
            const found = _chunks[i].indexOf(element, start);
            if (found !== -1) {
                // Chunk-relative hit: rebase on the chunk's absolute start so the
                // result is correct even when the scan began in a later chunk.
                return _chunkOffsets[i] + found;
            }
            start = 0;
        }
        return -1;
    }

    protected _sliceInternal(self: Chunked<T>, begin: number, end: number) {
        const slices: Vector<T>[] = [];
        const { chunks, _chunkOffsets: chunkOffsets } = self;
        for (let i = -1, n = chunks.length; ++i < n;) {
            const chunk = chunks[i];
            const chunkLength = chunk.length;
            const chunkOffset = chunkOffsets[i];
            // If the child is to the right of the slice boundary, we can stop
            if (chunkOffset >= end) { break; }
            // If the child is to the left of the slice boundary, exclude
            if (begin >= chunkOffset + chunkLength) { continue; }
            // If the child is between both left and right boundaries, include w/o slicing
            if (chunkOffset >= begin && (chunkOffset + chunkLength) <= end) {
                slices.push(chunk);
                continue;
            }
            // If the child overlaps one of the slice boundaries, include that slice
            const from = Math.max(0, begin - chunkOffset);
            const to = Math.min(end - chunkOffset, chunkLength);
            slices.push(chunk.slice(from, to) as Vector<T>);
        }
        return self.clone(slices);
    }
}

/**
 * Builds the cumulative offsets table for `vectors`: entry 0 is 0 and entry
 * `i` is the summed length of the first `i` vectors.
 * @ignore
 */
function calculateOffsets<T extends DataType>(vectors: Vector<T>[]) {
    const offsets = new Uint32Array((vectors || []).length + 1);
    let offset = offsets[0] = 0;
    const length = offsets.length;
    for (let index = 0; ++index < length;) {
        offsets[index] = (offset += vectors[index - 1].length);
    }
    return offsets;
}

/** Copies `src` into `dst` at `offset`; returns the next write position. @ignore */
const typedSet = (src: TypedArray, dst: TypedArray, offset: number) => {
    dst.set(src, offset);
    return (offset + src.length);
};

/** Copies `src` into `dst` element by element from `offset`; returns the next write position. @ignore */
const arraySet = (src: any[], dst: any[], offset: number) => {
    let idx = offset;
    for (let i = -1, n = src.length; ++i < n;) {
        dst[idx++] = src[i];
    }
    return idx;
};

/** @ignore */
interface TypedArray extends ArrayBufferView {
    readonly length: number;
    readonly [n: number]: number;
    set(array: ArrayLike<number>, offset?: number): void;
}

// diff --git a/src/arrow/js/src/vector/date.ts b/src/arrow/js/src/vector/date.ts
// new file mode 100644
// index 000000000..8c2b7a563
// --- /dev/null
// +++ b/src/arrow/js/src/vector/date.ts
// @@ -0,0 +1,51 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { DateUnit } from '../enum';
import { Chunked } from './chunked';
import { BaseVector } from './base';
import { VectorType as V } from '../interfaces';
import { VectorBuilderOptions } from './index';
import { vectorFromValuesWithType } from './index';
import { VectorBuilderOptionsAsync } from './index';
import { Date_, DateDay, DateMillisecond } from '../type';

/** @ignore */
type FromArgs<T extends Date_> = [Iterable<Date>, T['unit']];

/** @ignore */
export class DateVector<T extends Date_ = Date_> extends BaseVector<T> {
    public static from<T extends DateUnit.DAY>(...args: FromArgs<DateDay>): V<DateDay>;
    public static from<T extends DateUnit.MILLISECOND>(...args: FromArgs<DateMillisecond>): V<DateMillisecond>;
    public static from<T extends Date_, TNull = any>(input: Iterable<Date | TNull>): V<T>;
    public static from<T extends Date_, TNull = any>(input: AsyncIterable<Date | TNull>): Promise<V<T>>;
    public static from<T extends Date_, TNull = any>(input: VectorBuilderOptions<T, Date | TNull>): Chunked<T>;
    public static from<T extends Date_, TNull = any>(input: VectorBuilderOptionsAsync<T, Date | TNull>): Promise<Chunked<T>>;
    /**
     * Builds a date vector by delegating to `vectorFromValuesWithType`.
     * With two arguments, the second selects the unit: `DateUnit.DAY` gives
     * `DateDay`, anything else `DateMillisecond`. With one argument the type
     * is always `DateMillisecond`.
     * @nocollapse
     */
    public static from<T extends Date_, TNull = any>(...args: FromArgs<T> | [Iterable<Date | TNull> | AsyncIterable<Date | TNull> | VectorBuilderOptions<T, Date | TNull> | VectorBuilderOptionsAsync<T, Date | TNull>]) {
        if (args.length === 2) {
            const unit = args[1];
            const createTypeForUnit = () => unit === DateUnit.DAY ? new DateDay() : new DateMillisecond() as T;
            return vectorFromValuesWithType(createTypeForUnit, args[0]);
        }
        const createDefaultType = () => new DateMillisecond() as T;
        return vectorFromValuesWithType(createDefaultType, args[0]);
    }
}

/** @ignore */
export class DateDayVector extends DateVector<DateDay> {}

/** @ignore */
export class DateMillisecondVector extends DateVector<DateMillisecond> {}

// diff --git a/src/arrow/js/src/vector/decimal.ts b/src/arrow/js/src/vector/decimal.ts
// new file mode 100644
// index 000000000..a1056fd4f
// --- /dev/null
// +++ b/src/arrow/js/src/vector/decimal.ts
// @@ -0,0 +1,22 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Decimal } from '../type';
import { BaseVector } from './base';

/** Vector of `Decimal` values; adds nothing beyond `BaseVector`. @ignore */
export class DecimalVector extends BaseVector<Decimal> {}

// diff --git a/src/arrow/js/src/vector/dictionary.ts b/src/arrow/js/src/vector/dictionary.ts
// new file mode 100644
// index 000000000..4b39dbe97
// --- /dev/null
// +++ b/src/arrow/js/src/vector/dictionary.ts
// @@ -0,0 +1,60 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from '../data'; +import { Vector } from '../vector'; +import { BaseVector } from './base'; +import { VectorType as V } from '../interfaces'; +import { VectorBuilderOptions } from './index'; +import { vectorFromValuesWithType } from './index'; +import { VectorBuilderOptionsAsync } from './index'; +import { DataType, Dictionary, TKeys } from '../type'; + +/** @ignore */ +type FromArgs<T extends DataType = any, TKey extends TKeys = TKeys> = [Vector<T>, TKey, ArrayLike<number> | TKey['TArray']]; + +/** @ignore */ +export class DictionaryVector<T extends DataType = any, TKey extends TKeys = TKeys> extends BaseVector<Dictionary<T, TKey>> { + public static from<T extends DataType = any, TKey extends TKeys = TKeys>(...args: FromArgs<T, TKey>): V<Dictionary<T, TKey>>; + public static from<T extends DataType = any, TKey extends TKeys = TKeys>(input: VectorBuilderOptions<Dictionary<T, TKey>>): Vector<Dictionary<T, TKey>>; + public static from<T extends DataType = any, TKey extends TKeys = TKeys>(input: VectorBuilderOptionsAsync<Dictionary<T, TKey>>): Promise<Vector<Dictionary<T, TKey>>>; + /** @nocollapse */ + public static from<T extends DataType = any, TKey extends TKeys = TKeys>(...args: any[]) { + if (args.length === 3) { + const [values, indices, keys] = args as FromArgs<T, TKey>; + const type = new Dictionary(values.type, indices, null, null); + return 
Vector.new(Data.Dictionary(type, 0, keys.length, 0, null, keys, values)); + } + return vectorFromValuesWithType(() => args[0].type, args[0]); + } + + constructor(data: Data<Dictionary<T, TKey>>) { + super(data); + this.indices = Vector.new(data.clone(this.type.indices)); + } + + public readonly indices: V<TKey>; + + public get dictionary() { return <Vector<T>> this.data.dictionary; } + public reverseLookup(value: T) { return this.dictionary.indexOf(value); } + public getKey(idx: number): TKey['TValue'] | null { return this.indices.get(idx); } + public getValue(key: number): T['TValue'] | null { return this.dictionary.get(key); } + public setKey(idx: number, key: TKey['TValue'] | null) { return this.indices.set(idx, key); } + public setValue(key: number, value: T['TValue'] | null) { return this.dictionary.set(key, value); } +} + +(DictionaryVector.prototype as any).indices = null; diff --git a/src/arrow/js/src/vector/fixedsizebinary.ts b/src/arrow/js/src/vector/fixedsizebinary.ts new file mode 100644 index 000000000..779be19ff --- /dev/null +++ b/src/arrow/js/src/vector/fixedsizebinary.ts @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { BaseVector } from './base'; +import { FixedSizeBinary } from '../type'; + +/** @ignore */ +export class FixedSizeBinaryVector extends BaseVector<FixedSizeBinary> {} diff --git a/src/arrow/js/src/vector/fixedsizelist.ts b/src/arrow/js/src/vector/fixedsizelist.ts new file mode 100644 index 000000000..13637021f --- /dev/null +++ b/src/arrow/js/src/vector/fixedsizelist.ts @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BaseVector } from './base'; +import { DataType, FixedSizeList } from '../type'; + +/** @ignore */ +export class FixedSizeListVector<T extends DataType = any> extends BaseVector<FixedSizeList<T>> {} diff --git a/src/arrow/js/src/vector/float.ts b/src/arrow/js/src/vector/float.ts new file mode 100644 index 000000000..8260d2b27 --- /dev/null +++ b/src/arrow/js/src/vector/float.ts @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from '../data'; +import { Vector } from '../vector'; +import { Chunked } from './chunked'; +import { BaseVector } from './base'; +import { VectorBuilderOptions } from './index'; +import { vectorFromValuesWithType } from './index'; +import { VectorBuilderOptionsAsync } from './index'; +import { Float, Float16, Float32, Float64, FloatArray } from '../type'; +import { VectorType as V, TypedArrayConstructor } from '../interfaces'; + +/** @ignore */ +type FloatVectorConstructors = + typeof FloatVector | + typeof Float16Vector | + typeof Float32Vector | + typeof Float64Vector ; + +/** @ignore */ +type FromInput<T extends Float, TNull = any> = + FloatArray | + Iterable<T['TValue'] | TNull> | + AsyncIterable<T['TValue'] | TNull> | + VectorBuilderOptions<T, TNull> | + VectorBuilderOptionsAsync<T, TNull> ; + +/** @ignore */ +export type FloatArrayCtor = TypedArrayConstructor<FloatArray>; + +/** @ignore */ +export class FloatVector<T extends Float = Float> extends BaseVector<T> { + + // Guaranteed zero-copy variants + public static from(this: typeof FloatVector, input: Uint16Array): Float16Vector; + public static from(this: typeof FloatVector, input: Float32Array): Float32Vector; + public static from(this: typeof FloatVector, input: Float64Array): Float64Vector; + + // Zero-copy if input is a TypedArray of the same type as the + // Vector that from is called on, otherwise uses 
the Builders + public static from<TNull = any>(this: typeof Float16Vector, input: FromInput<Float16, TNull>): Float16Vector; + public static from<TNull = any>(this: typeof Float32Vector, input: FromInput<Float32, TNull>): Float32Vector; + public static from<TNull = any>(this: typeof Float64Vector, input: FromInput<Float64, TNull>): Float64Vector; + + // Not zero-copy + public static from<T extends Float, TNull = any>(this: typeof FloatVector, input: Iterable<T['TValue'] | TNull>): V<T>; + public static from<T extends Float, TNull = any>(this: typeof FloatVector, input: AsyncIterable<T['TValue'] | TNull>): Promise<V<T>>; + public static from<T extends Float, TNull = any>(this: typeof FloatVector, input: VectorBuilderOptions<T, TNull>): Chunked<T>; + public static from<T extends Float, TNull = any>(this: typeof FloatVector, input: VectorBuilderOptionsAsync<T, TNull>): Promise<Chunked<T>>; + /** @nocollapse */ + public static from<T extends Float, TNull = any>(this: FloatVectorConstructors, input: FromInput<T, TNull>) { + + let ArrowType = vectorTypeToDataType(this); + + if ((input instanceof ArrayBuffer) || ArrayBuffer.isView(input)) { + const InputType = arrayTypeToDataType(input.constructor as FloatArrayCtor) || ArrowType; + // Special case, infer the Arrow DataType from the input if calling the base + // FloatVector.from with a TypedArray, e.g. `FloatVector.from(new Float32Array())` + if (ArrowType === null) { + ArrowType = InputType; + } + // If the DataType inferred from the Vector constructor matches the + // DataType inferred from the input arguments, return zero-copy view + if (ArrowType && ArrowType === InputType) { + const type = new ArrowType(); + const length = input.byteLength / type.ArrayType.BYTES_PER_ELEMENT; + // If the ArrowType is Float16 but the input type isn't a Uint16Array, + // let the Float16Builder handle casting the input values to Uint16s. 
+ if (!convertTo16Bit(ArrowType, input.constructor)) { + return Vector.new(Data.Float(type, 0, length, 0, null, input as FloatArray)); + } + } + } + + if (ArrowType) { + // If the DataType inferred from the Vector constructor is different than + // the DataType inferred from the input TypedArray, or if input isn't a + // TypedArray, use the Builders to construct the result Vector + return vectorFromValuesWithType(() => new ArrowType!() as T, input); + } + + if ((input instanceof DataView) || (input instanceof ArrayBuffer)) { + throw new TypeError(`Cannot infer float type from instance of ${input.constructor.name}`); + } + + throw new TypeError('Unrecognized FloatVector input'); + } +} + +/** @ignore */ +export class Float16Vector extends FloatVector<Float16> { + // Since JS doesn't have half floats, `toArray()` returns a zero-copy slice + // of the underlying Uint16Array data. This behavior ensures we don't incur + // extra compute or copies if you're calling `toArray()` in order to create + // a buffer for something like WebGL. But if you're using JS and want typed + // arrays of 4-to-8-byte precision, these methods will enumerate the values + // and clamp to the desired byte lengths. 
+ public toFloat32Array() { return new Float32Array(this as Iterable<number>); } + public toFloat64Array() { return new Float64Array(this as Iterable<number>); } +} + +/** @ignore */ +export class Float32Vector extends FloatVector<Float32> {} +/** @ignore */ +export class Float64Vector extends FloatVector<Float64> {} + +const convertTo16Bit = (typeCtor: any, dataCtor: any) => { + return (typeCtor === Float16) && (dataCtor !== Uint16Array); +}; + +/** @ignore */ +const arrayTypeToDataType = (ctor: FloatArrayCtor) => { + switch (ctor) { + case Uint16Array: return Float16; + case Float32Array: return Float32; + case Float64Array: return Float64; + default: return null; + } +}; + +/** @ignore */ +const vectorTypeToDataType = (ctor: FloatVectorConstructors) => { + switch (ctor) { + case Float16Vector: return Float16; + case Float32Vector: return Float32; + case Float64Vector: return Float64; + default: return null; + } +}; diff --git a/src/arrow/js/src/vector/index.ts b/src/arrow/js/src/vector/index.ts new file mode 100644 index 000000000..30f5e3cfa --- /dev/null +++ b/src/arrow/js/src/vector/index.ts @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +export { Vector } from '../vector'; +export { BaseVector } from './base'; +export { BinaryVector } from './binary'; +export { BoolVector } from './bool'; +export { Chunked } from './chunked'; +export { DateVector, DateDayVector, DateMillisecondVector } from './date'; +export { DecimalVector } from './decimal'; +export { DictionaryVector } from './dictionary'; +export { FixedSizeBinaryVector } from './fixedsizebinary'; +export { FixedSizeListVector } from './fixedsizelist'; +export { FloatVector, Float16Vector, Float32Vector, Float64Vector } from './float'; +export { IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector } from './interval'; +export { IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector } from './int'; +export { ListVector } from './list'; +export { MapVector } from './map'; +export { NullVector } from './null'; +export { StructVector } from './struct'; +export { TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector } from './timestamp'; +export { TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector } from './time'; +export { UnionVector, DenseUnionVector, SparseUnionVector } from './union'; +export { Utf8Vector } from './utf8'; +export { MapRow, StructRow } from './row'; + +import * as fn from '../util/fn'; +import { Data } from '../data'; +import { Type } from '../enum'; +import { Vector } from '../vector'; +import { DataType } from '../type'; +import { Chunked } from './chunked'; +import { BaseVector } from './base'; +import { setBool } from '../util/bit'; +import { isIterable, isAsyncIterable } from '../util/compat'; +import { Builder, IterableBuilderOptions } from '../builder'; +import { VectorType as V, VectorCtorArgs } from '../interfaces'; +import { instance as getVisitor } from '../visitor/get'; +import { instance as setVisitor } from '../visitor/set'; 
+import { instance as indexOfVisitor } from '../visitor/indexof'; +import { instance as toArrayVisitor } from '../visitor/toarray'; +import { instance as iteratorVisitor } from '../visitor/iterator'; +import { instance as byteWidthVisitor } from '../visitor/bytewidth'; +import { instance as getVectorConstructor } from '../visitor/vectorctor'; + +declare module '../vector' { + namespace Vector { + export { newVector as new }; + export { vectorFrom as from }; + } +} + +declare module './base' { + namespace BaseVector { + export { vectorFrom as from }; + } + interface BaseVector<T extends DataType> { + get(index: number): T['TValue'] | null; + set(index: number, value: T['TValue'] | null): void; + indexOf(value: T['TValue'] | null, fromIndex?: number): number; + toArray(): T['TArray']; + getByteWidth(): number; + [Symbol.iterator](): IterableIterator<T['TValue'] | null>; + } +} + +/** @nocollapse */ +Vector.new = newVector; + +/** @nocollapse */ +Vector.from = vectorFrom; + +/** @ignore */ +function newVector<T extends DataType>(data: Data<T>, ...args: VectorCtorArgs<V<T>>): V<T> { + return new (getVectorConstructor.getVisitFn<T>(data)())(data, ...args) as V<T>; +} + +/** @ignore */ +export interface VectorBuilderOptions<T extends DataType, TNull = any> extends IterableBuilderOptions<T, TNull> { values: Iterable<T['TValue'] | TNull> } +/** @ignore */ +export interface VectorBuilderOptionsAsync<T extends DataType, TNull = any> extends IterableBuilderOptions<T, TNull> { values: AsyncIterable<T['TValue'] | TNull> } + +/** @ignore */ +export function vectorFromValuesWithType<T extends DataType, TNull = any>(newDataType: () => T, input: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull> | VectorBuilderOptions<T, TNull> | VectorBuilderOptionsAsync<T, TNull>) { + if (isIterable(input)) { + return Vector.from({ 'nullValues': [null, undefined], type: newDataType(), 'values': input }) as V<T>; + } else if (isAsyncIterable(input)) { + return Vector.from({ 
'nullValues': [null, undefined], type: newDataType(), 'values': input }) as Promise<V<T>>; + } + const { + 'values': values = [], + 'type': type = newDataType(), + 'nullValues': nullValues = [null, undefined], + } = { ...input }; + return isIterable(values) + ? Vector.from({ nullValues, ...input, type } as VectorBuilderOptions<T, TNull>) + : Vector.from({ nullValues, ...input, type } as VectorBuilderOptionsAsync<T, TNull>); +} + +/** @ignore */ +function vectorFrom<T extends DataType = any, TNull = any>(input: VectorBuilderOptions<T, TNull>): Vector<T>; +function vectorFrom<T extends DataType = any, TNull = any>(input: VectorBuilderOptionsAsync<T, TNull>): Promise<Vector<T>>; +function vectorFrom<T extends DataType = any, TNull = any>(input: VectorBuilderOptions<T, TNull> | VectorBuilderOptionsAsync<T, TNull>) { + const { 'values': values = [], ...options } = { 'nullValues': [null, undefined], ...input } as VectorBuilderOptions<T, TNull> | VectorBuilderOptionsAsync<T, TNull>; + if (isIterable<T['TValue'] | TNull>(values)) { + const chunks = [...Builder.throughIterable(options)(values)]; + return (chunks.length === 1 ? chunks[0] : Chunked.concat<T>(chunks)) as Vector<T>; + } + return (async (chunks: V<T>[]) => { + const transform = Builder.throughAsyncIterable(options); + for await (const chunk of transform(values)) { + chunks.push(chunk); + } + return (chunks.length === 1 ? chunks[0] : Chunked.concat<T>(chunks)) as Vector<T>; + })([]); +} + +// +// We provide the following method implementations for code navigability purposes only. +// They're overridden at runtime below with the specific Visitor implementation for each type, +// short-circuiting the usual Visitor traversal and reducing intermediate lookups and calls. +// This comment is here to remind you to not set breakpoints in these function bodies, or to inform +// you why the breakpoints you have already set are not being triggered. Have a great day! 
+// + +BaseVector.prototype.get = function baseVectorGet<T extends DataType>(this: BaseVector<T>, index: number): T['TValue'] | null { + return getVisitor.visit(this, index); +}; + +BaseVector.prototype.set = function baseVectorSet<T extends DataType>(this: BaseVector<T>, index: number, value: T['TValue'] | null): void { + return setVisitor.visit(this, index, value); +}; + +BaseVector.prototype.indexOf = function baseVectorIndexOf<T extends DataType>(this: BaseVector<T>, value: T['TValue'] | null, fromIndex?: number): number { + return indexOfVisitor.visit(this, value, fromIndex); +}; + +BaseVector.prototype.toArray = function baseVectorToArray<T extends DataType>(this: BaseVector<T>): T['TArray'] { + return toArrayVisitor.visit(this); +}; + +BaseVector.prototype.getByteWidth = function baseVectorGetByteWidth<T extends DataType>(this: BaseVector<T>): number { + return byteWidthVisitor.visit(this.type); +}; + +BaseVector.prototype[Symbol.iterator] = function baseVectorSymbolIterator<T extends DataType>(this: BaseVector<T>): IterableIterator<T['TValue'] | null> { + return iteratorVisitor.visit(this); +}; + +(BaseVector.prototype as any)._bindDataAccessors = bindBaseVectorDataAccessors; + +// Perf: bind and assign the operator Visitor methods to each of the Vector subclasses for each Type +(Object.keys(Type) as any[]) + .map((T: any) => Type[T] as any) + .filter((T: any): T is Type => typeof T === 'number') + .filter((typeId) => typeId !== Type.NONE) + .forEach((typeId) => { + const VectorCtor = getVectorConstructor.visit(typeId); + VectorCtor.prototype['get'] = fn.partial1(getVisitor.getVisitFn(typeId)); + VectorCtor.prototype['set'] = fn.partial2(setVisitor.getVisitFn(typeId)); + VectorCtor.prototype['indexOf'] = fn.partial2(indexOfVisitor.getVisitFn(typeId)); + VectorCtor.prototype['toArray'] = fn.partial0(toArrayVisitor.getVisitFn(typeId)); + VectorCtor.prototype['getByteWidth'] = partialType0(byteWidthVisitor.getVisitFn(typeId)); + 
VectorCtor.prototype[Symbol.iterator] = fn.partial0(iteratorVisitor.getVisitFn(typeId)); + }); + +/** @ignore */ +function partialType0<T extends Vector>(visit: (node: T['type']) => any) { + return function(this: T) { return visit(this.type); }; +} + +/** @ignore */ +function wrapNullableGet<T extends DataType, V extends Vector<T>, F extends (i: number) => any>(fn: F): (...args: Parameters<F>) => ReturnType<F> { + return function(this: V, i: number) { return this.isValid(i) ? fn.call(this, i) : null; }; +} + +/** @ignore */ +function wrapNullableSet<T extends DataType, V extends BaseVector<T>, F extends (i: number, a: any) => void>(fn: F): (...args: Parameters<F>) => void { + return function(this: V, i: number, a: any) { + if (setBool(this.nullBitmap, this.offset + i, !((a == null)))) { + fn.call(this, i, a); + } + }; +} + +/** @ignore */ +function bindBaseVectorDataAccessors<T extends DataType>(this: BaseVector<T>) { + const nullBitmap = this.nullBitmap; + if (nullBitmap && nullBitmap.byteLength > 0) { + this.get = wrapNullableGet(this.get); + this.set = wrapNullableSet(this.set); + } +} diff --git a/src/arrow/js/src/vector/int.ts b/src/arrow/js/src/vector/int.ts new file mode 100644 index 000000000..dbfba58c9 --- /dev/null +++ b/src/arrow/js/src/vector/int.ts @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from '../data'; +import { Vector } from '../vector'; +import { Chunked } from './chunked'; +import { BaseVector } from './base'; +import { VectorBuilderOptions } from './index'; +import { vectorFromValuesWithType } from './index'; +import { VectorBuilderOptionsAsync } from './index'; +import { BigInt64Array, BigUint64Array } from '../util/compat'; +import { toBigInt64Array, toBigUint64Array } from '../util/buffer'; +import { Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, IntArray } from '../type'; +import { VectorType as V, TypedArrayConstructor, BigIntArrayConstructor, BigIntArray } from '../interfaces'; + +/** + * Constructor types of `IntVector` and of each of its subclasses declared in this file; used as the `this` type of the `IntVector.from` implementation. + * @ignore + */ +type IntVectorConstructors = + typeof IntVector | + typeof Int8Vector | + typeof Int16Vector | + typeof Int32Vector | + typeof Uint8Vector | + typeof Uint16Vector | + typeof Uint32Vector | + typeof Int64Vector | + typeof Uint64Vector ; + +/** + * Inputs accepted by `IntVector.from`: an integer or BigInt typed array, a sync or async iterable of `T['TValue'] | TNull` items, or sync/async builder options. + * @ignore + */ +type FromInput<T extends Int, TNull = any> = + IntArray | BigIntArray | + Iterable<T['TValue'] | TNull> | + AsyncIterable<T['TValue'] | TNull> | + VectorBuilderOptions<T, TNull> | + VectorBuilderOptionsAsync<T, TNull> ; + +/** + * Argument tuple of the `IntVector.from` implementation: the input and an optional boolean (read there as `is64bit`). + * @ignore + */ +type FromArgs<T extends Int, TNull = any> = [FromInput<T, TNull>, boolean?]; + +/** + * Constructor type of an integer typed array or of a BigInt typed array. + * @ignore + */ +export type IntArrayCtor = TypedArrayConstructor<IntArray> | BigIntArrayConstructor<BigIntArray>; + +/** + * Vector of Arrow `Int` values. The static `from` overloads return a zero-copy view when given a typed array whose inferred type matches, and otherwise build the vector through the builders. + * @ignore + */ +export class IntVector<T extends Int = Int> extends BaseVector<T> { + + // Guaranteed zero-copy variants + public static from(this: typeof IntVector, input: Int8Array): Int8Vector; + public static 
from(this: typeof IntVector, input: Int16Array): Int16Vector; + public static from(this: typeof IntVector, input: Int32Array): Int32Vector; + public static from(this: typeof IntVector, input: BigInt64Array): Int64Vector; + public static from(this: typeof IntVector, input: Int32Array, is64bit: true): Int64Vector; + public static from(this: typeof IntVector, input: Uint8Array): Uint8Vector; + public static from(this: typeof IntVector, input: Uint16Array): Uint16Vector; + public static from(this: typeof IntVector, input: Uint32Array): Uint32Vector; + public static from(this: typeof IntVector, input: BigUint64Array): Uint64Vector; + public static from(this: typeof IntVector, input: Uint32Array, is64bit: true): Uint64Vector; + + // Zero-copy if input is a TypedArray of the same type as the + // Vector that from is called on, otherwise uses the Builders + public static from<TNull = any>(this: typeof Int8Vector, input: FromInput<Int8, TNull>): Int8Vector; + public static from<TNull = any>(this: typeof Int16Vector, input: FromInput<Int16, TNull>): Int16Vector; + public static from<TNull = any>(this: typeof Int32Vector, input: FromInput<Int32, TNull>): Int32Vector; + public static from<TNull = any>(this: typeof Int64Vector, input: FromInput<Int64, TNull>): Int64Vector; + public static from<TNull = any>(this: typeof Uint8Vector, input: FromInput<Uint8, TNull>): Uint8Vector; + public static from<TNull = any>(this: typeof Uint16Vector, input: FromInput<Uint16, TNull>): Uint16Vector; + public static from<TNull = any>(this: typeof Uint32Vector, input: FromInput<Uint32, TNull>): Uint32Vector; + public static from<TNull = any>(this: typeof Uint64Vector, input: FromInput<Uint64, TNull>): Uint64Vector; + + // Not zero-copy + public static from<T extends Int, TNull = any>(this: typeof IntVector, input: Iterable<T['TValue'] | TNull>): V<T>; + public static from<T extends Int, TNull = any>(this: typeof IntVector, input: AsyncIterable<T['TValue'] | TNull>): Promise<V<T>>; + public 
static from<T extends Int, TNull = any>(this: typeof IntVector, input: VectorBuilderOptions<T, TNull>): Chunked<T>; + public static from<T extends Int, TNull = any>(this: typeof IntVector, input: VectorBuilderOptionsAsync<T, TNull>): Promise<Chunked<T>>; + /** + * Implementation behind every overload. Resolves the Arrow type from the constructor `from` was called on (and `is64bit`); an `ArrayBuffer` or view input whose inferred type matches is returned as a zero-copy view, any other input with a resolved type goes through `vectorFromValuesWithType`, and a `TypeError` is thrown when no type can be resolved. + * @nocollapse + */ + public static from<T extends Int, TNull = any>(this: IntVectorConstructors, ...args: FromArgs<T, TNull>) { + + const [input, is64bit = false] = args; + let ArrowType = vectorTypeToDataType(this, is64bit); + + if ((input instanceof ArrayBuffer) || ArrayBuffer.isView(input)) { + const InputType = arrayTypeToDataType(input.constructor as IntArrayCtor, is64bit) || ArrowType; + // Special case, infer the Arrow DataType from the input if calling the base + // IntVector.from with a TypedArray, e.g. `IntVector.from(new Int32Array())` + if (ArrowType === null) { + ArrowType = InputType; + } + // If the DataType inferred from the Vector constructor matches the + // DataType inferred from the input arguments, return zero-copy view + if (ArrowType && ArrowType === InputType) { + const type = new ArrowType(); + let length = input.byteLength / type.ArrayType.BYTES_PER_ELEMENT; + // If the ArrowType is 64bit but the input type is 32bit pairs, update the logical length + if (convert32To64Bit(ArrowType, input.constructor)) { + length *= 0.5; + } + return Vector.new(Data.Int(type, 0, length, 0, null, input as IntArray)); + } + } + + if (ArrowType) { + // If the DataType inferred from the Vector constructor is different than + // the DataType inferred from the input TypedArray, or if input isn't a + // TypedArray, use the Builders to construct the result Vector + return vectorFromValuesWithType(() => new ArrowType!() as T, input); + } + + if ((input instanceof DataView) || (input instanceof ArrayBuffer)) { + throw new TypeError(`Cannot infer integer type from instance of ${input.constructor.name}`); + } + + throw new TypeError('Unrecognized IntVector input'); + } +} + +/** @ignore */ +export class Int8Vector 
extends IntVector<Int8> {} +/** @ignore */ +export class Int16Vector extends IntVector<Int16> {} +/** @ignore */ +export class Int32Vector extends IntVector<Int32> {} +/** + * `IntVector` of `Int64` values. `toBigInt64Array` converts `this.values` with the `toBigInt64Array` helper; `values64` returns that conversion, computing it on first access and caching it in `_values64`. + * @ignore + */ +export class Int64Vector extends IntVector<Int64> { + public toBigInt64Array() { + return toBigInt64Array(this.values); + } + private _values64!: BigInt64Array; + public get values64(): BigInt64Array { + return this._values64 || (this._values64 = this.toBigInt64Array()); + } +} + +/** @ignore */ +export class Uint8Vector extends IntVector<Uint8> {} +/** @ignore */ +export class Uint16Vector extends IntVector<Uint16> {} +/** @ignore */ +export class Uint32Vector extends IntVector<Uint32> {} +/** + * `IntVector` of `Uint64` values. `toBigUint64Array` converts `this.values` with the `toBigUint64Array` helper; `values64` returns that conversion, computing it on first access and caching it in `_values64`. + * @ignore + */ +export class Uint64Vector extends IntVector<Uint64> { + public toBigUint64Array() { + return toBigUint64Array(this.values); + } + private _values64!: BigUint64Array; + public get values64(): BigUint64Array { + return this._values64 || (this._values64 = this.toBigUint64Array()); + } +} + +/** True when `typeCtor` is `Int64` or `Uint64` and `dataCtor` is `Int32Array` or `Uint32Array`. */ const convert32To64Bit = (typeCtor: any, dataCtor: any) => { + return (typeCtor === Int64 || typeCtor === Uint64) && + (dataCtor === Int32Array || dataCtor === Uint32Array); +}; + +/** + * Maps a typed-array constructor to an Arrow integer type class. `Int32Array` and `Uint32Array` map to `Int64` and `Uint64` when `is64bit` is true; unlisted constructors yield `null`. + * @ignore + */ +const arrayTypeToDataType = (ctor: IntArrayCtor, is64bit: boolean) => { + switch (ctor) { + case Int8Array: return Int8; + case Int16Array: return Int16; + case Int32Array: return is64bit ? Int64 : Int32; + case BigInt64Array: return Int64; + case Uint8Array: return Uint8; + case Uint16Array: return Uint16; + case Uint32Array: return is64bit ? Uint64 : Uint32; + case BigUint64Array: return Uint64; + default: return null; + } +}; + +/** + * Maps an `IntVector` subclass constructor to its Arrow integer type class. `Int32Vector` and `Uint32Vector` map to `Int64` and `Uint64` when `is64bit` is true; any other constructor, including the base `IntVector`, yields `null`. + * @ignore + */ +const vectorTypeToDataType = (ctor: IntVectorConstructors, is64bit: boolean) => { + switch (ctor) { + case Int8Vector: return Int8; + case Int16Vector: return Int16; + case Int32Vector: return is64bit ? Int64 : Int32; + case Int64Vector: return Int64; + case Uint8Vector: return Uint8; + case Uint16Vector: return Uint16; + case Uint32Vector: return is64bit ? 
Uint64 : Uint32; + case Uint64Vector: return Uint64; + default: return null; + } +}; diff --git a/src/arrow/js/src/vector/interval.ts b/src/arrow/js/src/vector/interval.ts new file mode 100644 index 000000000..70384ab97 --- /dev/null +++ b/src/arrow/js/src/vector/interval.ts @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BaseVector } from './base'; +import { Interval, IntervalDayTime, IntervalYearMonth } from '../type'; + +/** + * Vector whose type parameter is an Arrow `Interval`; declares no members of its own. + * @ignore + */ +export class IntervalVector<T extends Interval = Interval> extends BaseVector<T> {} +/** @ignore */ +export class IntervalDayTimeVector extends IntervalVector<IntervalDayTime> {} +/** @ignore */ +export class IntervalYearMonthVector extends IntervalVector<IntervalYearMonth> {} diff --git a/src/arrow/js/src/vector/list.ts b/src/arrow/js/src/vector/list.ts new file mode 100644 index 000000000..6ea189044 --- /dev/null +++ b/src/arrow/js/src/vector/list.ts @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BaseVector } from './base'; +import { DataType, List } from '../type'; + +/** + * Vector whose type is `List<T>`; declares no members beyond those of `BaseVector`. + * @ignore + */ +export class ListVector<T extends DataType = any> extends BaseVector<List<T>> {} diff --git a/src/arrow/js/src/vector/map.ts b/src/arrow/js/src/vector/map.ts new file mode 100644 index 000000000..9975919f7 --- /dev/null +++ b/src/arrow/js/src/vector/map.ts @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { MapRow } from './row'; +import { Field } from '../schema'; +import { Vector } from '../vector'; +import { BaseVector } from './base'; +import { DataType, Map_, Struct, List } from '../type'; + +/** + * Vector whose type is `Map_<K, V>`. + * `asList` returns a new vector over a clone of this vector's data retyped as a `List` of `{ key, value }` structs, built from the map type's first child field. + * `bind` returns a `MapRow` over the slice of child 0 delimited by `valueOffsets[index]` and `valueOffsets[index + 1]`. + * @ignore + */ +export class MapVector<K extends DataType = any, V extends DataType = any> extends BaseVector<Map_<K, V>> { + public asList() { + const child = this.type.children[0] as Field<Struct<{ key: K; value: V }>>; + return Vector.new(this.data.clone(new List<Struct<{ key: K; value: V }>>(child))); + } + public bind(index: number): Map_<K, V>['TValue'] { + const child = this.getChildAt<Struct<{ key: K; value: V }>>(0)!; + const { [index]: begin, [index + 1]: end } = this.valueOffsets; + return new MapRow(child.slice(begin, end)); + } +} diff --git a/src/arrow/js/src/vector/null.ts b/src/arrow/js/src/vector/null.ts new file mode 100644 index 000000000..ffa3d0576 --- /dev/null +++ b/src/arrow/js/src/vector/null.ts @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Null } from '../type'; +import { BaseVector } from './base'; + +/** + * Vector whose type is `Null`; declares no members beyond those of `BaseVector`. + * @ignore + */ +export class NullVector extends BaseVector<Null> {} diff --git a/src/arrow/js/src/vector/row.ts b/src/arrow/js/src/vector/row.ts new file mode 100644 index 000000000..23d1b5440 --- /dev/null +++ b/src/arrow/js/src/vector/row.ts @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from '../vector'; +import { StructVector } from './struct'; +import { valueToString } from '../util/pretty'; +import { DataType, Struct, RowLike } from '../type'; + +/** @ignore */ const kParent = Symbol.for('parent'); +/** @ignore */ const kRowIndex = Symbol.for('rowIndex'); +/** @ignore */ const kKeyToIdx = Symbol.for('keyToIdx'); +/** @ignore */ const kIdxToVal = Symbol.for('idxToVal'); +/** @ignore */ const kCustomInspect = Symbol.for('nodejs.util.inspect.custom'); + +/** + * Map-shaped base class for row views over a parent vector. + * `get` resolves a key to a positional index (via `getIndex`) and then to a value (via `getValue`), caching both in `[kKeyToIdx]` and `[kIdxToVal]`. + * `set` forwards to `setValue` and stores that call's return value in the value-cache slot. + * Iteration and `forEach` walk `keys()` and `values()` in step and refresh both caches as they go. + * `clear` and `delete` always throw. + */ abstract class Row<K extends PropertyKey = any, V = any> implements Map<K, V> { + + public readonly size: number; + public readonly [Symbol.toStringTag]: string; + + protected [kRowIndex]: number; + protected [kParent]: Vector<Struct>; + protected [kKeyToIdx]: Map<K, number>; + protected [kIdxToVal]: V[]; + + constructor(parent: Vector<Struct>, numKeys: number) { + this[kParent] = parent; + this.size = numKeys; + } + + public abstract keys(): IterableIterator<K>; + public abstract values(): IterableIterator<V>; + public abstract getKey(idx: number): K; + public abstract getIndex(key: K): number; + public abstract getValue(idx: number): V; + public abstract setValue(idx: number, val: V): void; + + public entries() { return this[Symbol.iterator](); } + + public has(key: K) { return this.get(key) !== undefined; } + + public get(key: K) { + let val = undefined; + if (key != null) { + const ktoi = this[kKeyToIdx] || (this[kKeyToIdx] = new Map()); + let idx = ktoi.get(key); + if (idx !== undefined) { + const itov = this[kIdxToVal] || (this[kIdxToVal] = new Array(this.size)); + ((val = itov[idx]) !== undefined) || (itov[idx] = val = this.getValue(idx)); + } else if ((idx = this.getIndex(key)) > -1) { + ktoi.set(key, idx); + const itov = this[kIdxToVal] || (this[kIdxToVal] = new Array(this.size)); + ((val = itov[idx]) !== undefined) || (itov[idx] = val = this.getValue(idx)); + } + } + return val; + } + + public set(key: K, val: V) { + if (key != null) { + const ktoi = this[kKeyToIdx] 
|| (this[kKeyToIdx] = new Map()); + let idx = ktoi.get(key); + if (idx === undefined) { + ktoi.set(key, idx = this.getIndex(key)); + } + if (idx > -1) { + const itov = this[kIdxToVal] || (this[kIdxToVal] = new Array(this.size)); + itov[idx] = <any> this.setValue(idx, val); + } + } + return this; + } + + public clear(): void { throw new Error(`Clearing ${this[Symbol.toStringTag]} not supported.`); } + + public delete(_: K): boolean { throw new Error(`Deleting ${this[Symbol.toStringTag]} values not supported.`); } + + public *[Symbol.iterator](): IterableIterator<[K, V]> { + + const ki = this.keys(); + const vi = this.values(); + const ktoi = this[kKeyToIdx] || (this[kKeyToIdx] = new Map()); + const itov = this[kIdxToVal] || (this[kIdxToVal] = new Array(this.size)); + + for (let k: K, v: V, i = 0, kr: IteratorResult<K>, vr: IteratorResult<V>; + !((kr = ki.next()).done || (vr = vi.next()).done); + ++i + ) { + k = kr.value; + v = vr.value; + itov[i] = v; + ktoi.has(k) || ktoi.set(k, i); + yield [k, v]; + } + } + + public forEach(callbackfn: (value: V, key: K, map: Map<K, V>) => void, thisArg?: any): void { + + const ki = this.keys(); + const vi = this.values(); + const callback = thisArg === undefined ? 
callbackfn : + (v: V, k: K, m: Map<K, V>) => callbackfn.call(thisArg, v, k, m); + const ktoi = this[kKeyToIdx] || (this[kKeyToIdx] = new Map()); + const itov = this[kIdxToVal] || (this[kIdxToVal] = new Array(this.size)); + + for (let k: K, v: V, i = 0, kr: IteratorResult<K>, vr: IteratorResult<V>; + !((kr = ki.next()).done || (vr = vi.next()).done); + ++i + ) { + k = kr.value; + v = vr.value; + itov[i] = v; + ktoi.has(k) || ktoi.set(k, i); + callback(v, k, this); + } + } + + public toArray() { return [...this.values()]; } + public toJSON() { + const obj = {} as any; + this.forEach((val, key) => obj[key] = val); + return obj; + } + + public inspect() { return this.toString(); } + public [kCustomInspect]() { return this.toString(); } + public toString() { + const str: string[] = []; + this.forEach((val, key) => { + key = valueToString(key); + val = valueToString(val); + str.push(`${key}: ${val}`); + }); + return `{ ${str.join(', ')} }`; + } + + protected static [Symbol.toStringTag] = ((proto: Row) => { + Object.defineProperties(proto, { + 'size': { writable: true, enumerable: false, configurable: false, value: 0 }, + [kParent]: { writable: true, enumerable: false, configurable: false, value: null }, + [kRowIndex]: { writable: true, enumerable: false, configurable: false, value: -1 }, + }); + return (proto as any)[Symbol.toStringTag] = 'Row'; + })(Row.prototype); +} + +/** Row whose keys and values are children 0 and 1 of the parent vector; the constructor returns `createRowProxy(this)`. */ export class MapRow<K extends DataType = any, V extends DataType = any> extends Row<K['TValue'], V['TValue'] | null> { + constructor(slice: Vector<Struct<{ key: K; value: V }>>) { + super(slice, slice.length); + return createRowProxy(this); + } + public keys() { + return this[kParent].getChildAt(0)![Symbol.iterator](); + } + public values() { + return this[kParent].getChildAt(1)![Symbol.iterator](); + } + public getKey(idx: number): K['TValue'] { + return this[kParent].getChildAt(0)!.get(idx); + } + public getIndex(key: K['TValue']): number { + return this[kParent].getChildAt(0)!.indexOf(key); + } + 
public getValue(index: number): V['TValue'] | null { + return this[kParent].getChildAt(1)!.get(index); + } + public setValue(index: number, value: V['TValue'] | null): void { + this[kParent].getChildAt(1)!.set(index, value); + } +} + +/** Row keyed by the field names of the parent's struct type; values are read from and written to each child at `this[kRowIndex]`, and the constructor returns `defineRowProxyProperties(this)`. */ export class StructRow<T extends { [key: string]: DataType } = any> extends Row<keyof T, T[keyof T]['TValue'] | null> { + constructor(parent: StructVector<T>) { + super(parent, parent.type.children.length); + return defineRowProxyProperties(this); + } + public *keys() { + for (const field of this[kParent].type.children) { + yield field.name as keyof T; + } + } + public *values() { + for (const field of this[kParent].type.children) { + yield (this as RowLike<T>)[field.name]; + } + } + public getKey(idx: number): keyof T { + return this[kParent].type.children[idx].name as keyof T; + } + public getIndex(key: keyof T): number { + return this[kParent].type.children.findIndex((f) => f.name === key); + } + public getValue(index: number): T[keyof T]['TValue'] | null { + return this[kParent].getChildAt(index)!.get(this[kRowIndex]); + } + public setValue(index: number, value: T[keyof T]['TValue'] | null): void { + return this[kParent].getChildAt(index)!.set(this[kRowIndex], value); + } +} + +Object.setPrototypeOf(Row.prototype, Map.prototype); + +/** + * For every key of `row`, records the key's position in the row's key-to-index cache and defines two accessor properties that forward to `get(key)` and `set(key, val)` on the receiver: an enumerable one named by the key and a non-enumerable one named by the position. A name that is already an own property of `row` is left alone. Returns `row`. + * One descriptor object is reused for all definitions and its accessors are cleared afterwards. + * @ignore + */ +const defineRowProxyProperties = (() => { + const desc = { enumerable: true, configurable: false, get: null as any, set: null as any }; + return <T extends Row>(row: T) => { + let idx = -1; + const ktoi = row[kKeyToIdx] || (row[kKeyToIdx] = new Map()); + const getter = (key: any) => function(this: T) { return this.get(key); }; + const setter = (key: any) => function(this: T, val: any) { return this.set(key, val); }; + for (const key of row.keys()) { + ktoi.set(key, ++idx); + desc.get = getter(key); + desc.set = setter(key); + Object.prototype.hasOwnProperty.call(row, key) || (desc.enumerable = true, Object.defineProperty(row, key, desc)); + Object.prototype.hasOwnProperty.call(row, idx) 
|| (desc.enumerable = false, Object.defineProperty(row, idx, desc)); + } + desc.get = desc.set = null; + return row; + }; +})(); + +/** + * Returns a function that wraps a row in a `Proxy`; when `Proxy` is undefined it is `defineRowProxyProperties` instead. + * In the handler, the names listed in each `switch` (the Row and Object API names plus the internal symbols) are answered from the target: `has` reports true, `get` uses `Reflect.get`, and `set` uses `Reflect.set` for the four internal cache/parent symbols and returns false for the rest. + * Every other property key is treated as a row key and routed through the row's `has`, `get` and `set`. + * @ignore + */ +const createRowProxy = (() => { + if (typeof Proxy === 'undefined') { + return defineRowProxyProperties; + } + const has = Row.prototype.has; + const get = Row.prototype.get; + const set = Row.prototype.set; + const getKey = Row.prototype.getKey; + const RowProxyHandler: ProxyHandler<Row> = { + isExtensible() { return false; }, + deleteProperty() { return false; }, + preventExtensions() { return true; }, + ownKeys(row: Row) { return [...row.keys()].map((x) => `${x}`); }, + has(row: Row, key: PropertyKey) { + switch (key) { + case 'getKey': case 'getIndex': case 'getValue': case 'setValue': case 'toArray': case 'toJSON': case 'inspect': + case 'constructor': case 'isPrototypeOf': case 'propertyIsEnumerable': case 'toString': case 'toLocaleString': case 'valueOf': + case 'size': case 'has': case 'get': case 'set': case 'clear': case 'delete': case 'keys': case 'values': case 'entries': case 'forEach': + case '__proto__': case '__defineGetter__': case '__defineSetter__': case 'hasOwnProperty': case '__lookupGetter__': case '__lookupSetter__': + case Symbol.iterator: case Symbol.toStringTag: case kParent: case kRowIndex: case kIdxToVal: case kKeyToIdx: case kCustomInspect: + return true; + } + if (typeof key === 'number' && !row.has(key)) { + key = row.getKey(key); + } + return row.has(key); + }, + get(row: Row, key: PropertyKey, receiver: any) { + switch (key) { + case 'getKey': case 'getIndex': case 'getValue': case 'setValue': case 'toArray': case 'toJSON': case 'inspect': + case 'constructor': case 'isPrototypeOf': case 'propertyIsEnumerable': case 'toString': case 'toLocaleString': case 'valueOf': + case 'size': case 'has': case 'get': case 'set': case 'clear': case 'delete': case 'keys': case 'values': case 'entries': case 'forEach': + case '__proto__': case '__defineGetter__': case '__defineSetter__': case 
'hasOwnProperty': case '__lookupGetter__': case '__lookupSetter__': + case Symbol.iterator: case Symbol.toStringTag: case kParent: case kRowIndex: case kIdxToVal: case kKeyToIdx: case kCustomInspect: + return Reflect.get(row, key, receiver); + } + if (typeof key === 'number' && !has.call(receiver, key)) { + key = getKey.call(receiver, key); + } + return get.call(receiver, key); + }, + set(row: Row, key: PropertyKey, val: any, receiver: any) { + switch (key) { + case kParent: case kRowIndex: case kIdxToVal: case kKeyToIdx: + return Reflect.set(row, key, val, receiver); + case 'getKey': case 'getIndex': case 'getValue': case 'setValue': case 'toArray': case 'toJSON': case 'inspect': + case 'constructor': case 'isPrototypeOf': case 'propertyIsEnumerable': case 'toString': case 'toLocaleString': case 'valueOf': + case 'size': case 'has': case 'get': case 'set': case 'clear': case 'delete': case 'keys': case 'values': case 'entries': case 'forEach': + case '__proto__': case '__defineGetter__': case '__defineSetter__': case 'hasOwnProperty': case '__lookupGetter__': case '__lookupSetter__': + case Symbol.iterator: case Symbol.toStringTag: + return false; + } + if (typeof key === 'number' && !has.call(receiver, key)) { + key = getKey.call(receiver, key); + } + return has.call(receiver, key) ? !!set.call(receiver, key, val) : false; + }, + }; + return <T extends Row>(row: T) => new Proxy(row, RowProxyHandler) as T; +})(); diff --git a/src/arrow/js/src/vector/struct.ts b/src/arrow/js/src/vector/struct.ts new file mode 100644 index 000000000..b825f092e --- /dev/null +++ b/src/arrow/js/src/vector/struct.ts @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { StructRow } from './row'; +import { BaseVector } from './base'; +import { DataType, Struct } from '../type'; + +/** @ignore */ const kRowIndex = Symbol.for('rowIndex'); +/** + * Vector whose type is `Struct<T>`. + * `bind` creates a single `StructRow` for this vector on first use and keeps it in `_row`; each call then returns a fresh object whose prototype is that row and whose `[kRowIndex]` is set to `index`. + * @ignore + */ +export class StructVector<T extends { [key: string]: DataType } = any> extends BaseVector<Struct<T>> { + private _row!: StructRow<T>; + public bind(index: number): Struct<T>['TValue'] { + const proto = this._row || (this._row = new StructRow<T>(this)); + const bound = Object.create(proto); + bound[kRowIndex] = index; + return bound; + } +} diff --git a/src/arrow/js/src/vector/time.ts b/src/arrow/js/src/vector/time.ts new file mode 100644 index 000000000..0abded940 --- /dev/null +++ b/src/arrow/js/src/vector/time.ts @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BaseVector } from './base'; +import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond } from '../type'; + +/** + * Vector whose type parameter is an Arrow `Time`; declares no members of its own. The four subclasses below fix the type parameter to `TimeSecond`, `TimeMillisecond`, `TimeMicrosecond` and `TimeNanosecond`. + * @ignore + */ +export class TimeVector<T extends Time = Time> extends BaseVector<T> {} +/** @ignore */ +export class TimeSecondVector extends TimeVector<TimeSecond> {} +/** @ignore */ +export class TimeMillisecondVector extends TimeVector<TimeMillisecond> {} +/** @ignore */ +export class TimeMicrosecondVector extends TimeVector<TimeMicrosecond> {} +/** @ignore */ +export class TimeNanosecondVector extends TimeVector<TimeNanosecond> {} diff --git a/src/arrow/js/src/vector/timestamp.ts b/src/arrow/js/src/vector/timestamp.ts new file mode 100644 index 000000000..caff0bd6f --- /dev/null +++ b/src/arrow/js/src/vector/timestamp.ts @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BaseVector } from './base'; +import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond } from '../type'; + +/** + * Vector whose type parameter is an Arrow `Timestamp`; declares no members of its own. The four subclasses below fix the type parameter to `TimestampSecond`, `TimestampMillisecond`, `TimestampMicrosecond` and `TimestampNanosecond`. + * @ignore + */ +export class TimestampVector<T extends Timestamp = Timestamp> extends BaseVector<T> {} +/** @ignore */ +export class TimestampSecondVector extends TimestampVector<TimestampSecond> {} +/** @ignore */ +export class TimestampMillisecondVector extends TimestampVector<TimestampMillisecond> {} +/** @ignore */ +export class TimestampMicrosecondVector extends TimestampVector<TimestampMicrosecond> {} +/** @ignore */ +export class TimestampNanosecondVector extends TimestampVector<TimestampNanosecond> {} diff --git a/src/arrow/js/src/vector/union.ts b/src/arrow/js/src/vector/union.ts new file mode 100644 index 000000000..854519c57 --- /dev/null +++ b/src/arrow/js/src/vector/union.ts @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { BaseVector } from './base'; +import { Union, DenseUnion, SparseUnion} from '../type'; + +/** + * Vector whose type parameter is an Arrow `Union`. `typeIdToChildIndex` exposes the property of the same name from `this.data.type`. + * @ignore + */ +export class UnionVector<T extends Union = Union> extends BaseVector<T> { + public get typeIdToChildIndex() { return this.data.type.typeIdToChildIndex; } +} + +/** + * `UnionVector` of `DenseUnion`. `valueOffsets` returns `this.data.valueOffsets`, asserted non-null. + * @ignore + */ +export class DenseUnionVector extends UnionVector<DenseUnion> { + public get valueOffsets() { return this.data.valueOffsets!; } +} + +/** @ignore */ +export class SparseUnionVector extends UnionVector<SparseUnion> {} diff --git a/src/arrow/js/src/vector/utf8.ts b/src/arrow/js/src/vector/utf8.ts new file mode 100644 index 000000000..a891c0dc5 --- /dev/null +++ b/src/arrow/js/src/vector/utf8.ts @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from '../vector'; +import { Chunked } from './chunked'; +import { BaseVector } from './base'; +import { Binary, Utf8 } from '../type'; +import { VectorBuilderOptions } from './index'; +import { vectorFromValuesWithType } from './index'; +import { VectorBuilderOptionsAsync } from './index'; + +/** + * Vector whose type is `Utf8`. + * `from` accepts a sync or async iterable of strings (or `TNull` items) or sync/async builder options and delegates to `vectorFromValuesWithType` with a factory that creates a new `Utf8` type. + * `asBinary` returns a new vector over a clone of this vector's data retyped as `Binary`. + * @ignore + */ +export class Utf8Vector extends BaseVector<Utf8> { + public static from<TNull = any>(input: Iterable<string | TNull>): Utf8Vector; + public static from<TNull = any>(input: AsyncIterable<string | TNull>): Promise<Utf8Vector>; + public static from<TNull = any>(input: VectorBuilderOptions<Utf8, string | TNull>): Chunked<Utf8>; + public static from<TNull = any>(input: VectorBuilderOptionsAsync<Utf8, string | TNull>): Promise<Chunked<Utf8>>; + /** @nocollapse */ + public static from<TNull = any>(input: Iterable<string | TNull> | AsyncIterable<string | TNull> | VectorBuilderOptions<Utf8, string | TNull> | VectorBuilderOptionsAsync<Utf8, string | TNull>) { + return vectorFromValuesWithType(() => new Utf8(), input); + } + public asBinary() { + return Vector.new(this.data.clone(new Binary())); + } +} diff --git a/src/arrow/js/src/visitor.ts b/src/arrow/js/src/visitor.ts new file mode 100644 index 000000000..3a63c93f9 --- /dev/null +++ b/src/arrow/js/src/visitor.ts @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from './data'; +import { Vector } from './vector'; +import { Type, Precision, DateUnit, TimeUnit, IntervalUnit, UnionMode } from './enum'; +import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, } from './type'; + +/** + * Base class for visitors dispatched on Arrow type. + * `visit` looks up a handler for its first argument with `getVisitFn(args[0], false)` and applies it, with this visitor as `this`, to all of the arguments. + * `visitMany` visits each node together with the i-th element of every extra argument array and returns the mapped results. + * `getVisitFn` delegates to the module-level `getVisitFn`; `throwIfNotFound` defaults to true. + * Every `visitXxx` method declared here returns `null`. + */ export abstract class Visitor { + public visitMany(nodes: any[], ...args: any[][]) { + return nodes.map((node, i) => this.visit(node, ...args.map((x) => x[i]))); + } + public visit(...args: any[]) { + return this.getVisitFn(args[0], false).apply(this, args); + } + public getVisitFn(node: any, throwIfNotFound = true) { + return getVisitFn(this, node, throwIfNotFound); + } + public visitNull (_node: any, ..._args: any[]): any { return null; } + public visitBool (_node: any, ..._args: any[]): any { return null; } + public visitInt (_node: any, ..._args: any[]): any { return null; } + public visitFloat (_node: any, ..._args: any[]): any { return null; } + public visitUtf8 (_node: any, ..._args: any[]): any { return null; } + public visitBinary (_node: any, ..._args: any[]): any { return null; } + public visitFixedSizeBinary (_node: any, ..._args: any[]): any { return null; } + public visitDate (_node: any, ..._args: any[]): any { return null; } + public visitTimestamp (_node: any, ..._args: any[]): any { return null; } + public visitTime (_node: any, ..._args: any[]): any { return null; } + public visitDecimal (_node: any, ..._args: any[]): any { return null; } + public visitList (_node: any, ..._args: any[]): any { return null; } + public visitStruct (_node: any, ..._args: any[]): any { return null; } + 
public visitUnion (_node: any, ..._args: any[]): any { return null; } + public visitDictionary (_node: any, ..._args: any[]): any { return null; } + public visitInterval (_node: any, ..._args: any[]): any { return null; } + public visitFixedSizeList (_node: any, ..._args: any[]): any { return null; } + public visitMap (_node: any, ..._args: any[]): any { return null; } +} + +/** @ignore */ +function getVisitFn<T extends DataType>(visitor: Visitor, node: any, throwIfNotFound = true) { + let fn: any = null; + let dtype: T['TType'] = Type.NONE; + if (node instanceof Data ) dtype = inferDType(node.type as T); + else if (node instanceof Vector ) dtype = inferDType(node.type as T); + else if (node instanceof DataType) dtype = inferDType(node as T); + else if (typeof (dtype = node) !== 'number') dtype = Type[node] as any as T['TType']; + + switch (dtype) { + case Type.Null: fn = visitor.visitNull; break; + case Type.Bool: fn = visitor.visitBool; break; + case Type.Int: fn = visitor.visitInt; break; + case Type.Int8: fn = visitor.visitInt8 || visitor.visitInt; break; + case Type.Int16: fn = visitor.visitInt16 || visitor.visitInt; break; + case Type.Int32: fn = visitor.visitInt32 || visitor.visitInt; break; + case Type.Int64: fn = visitor.visitInt64 || visitor.visitInt; break; + case Type.Uint8: fn = visitor.visitUint8 || visitor.visitInt; break; + case Type.Uint16: fn = visitor.visitUint16 || visitor.visitInt; break; + case Type.Uint32: fn = visitor.visitUint32 || visitor.visitInt; break; + case Type.Uint64: fn = visitor.visitUint64 || visitor.visitInt; break; + case Type.Float: fn = visitor.visitFloat; break; + case Type.Float16: fn = visitor.visitFloat16 || visitor.visitFloat; break; + case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; + case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; + case Type.Utf8: fn = visitor.visitUtf8; break; + case Type.Binary: fn = visitor.visitBinary; break; + case Type.FixedSizeBinary: fn = 
visitor.visitFixedSizeBinary; break; + case Type.Date: fn = visitor.visitDate; break; + case Type.DateDay: fn = visitor.visitDateDay || visitor.visitDate; break; + case Type.DateMillisecond: fn = visitor.visitDateMillisecond || visitor.visitDate; break; + case Type.Timestamp: fn = visitor.visitTimestamp; break; + case Type.TimestampSecond: fn = visitor.visitTimestampSecond || visitor.visitTimestamp; break; + case Type.TimestampMillisecond: fn = visitor.visitTimestampMillisecond || visitor.visitTimestamp; break; + case Type.TimestampMicrosecond: fn = visitor.visitTimestampMicrosecond || visitor.visitTimestamp; break; + case Type.TimestampNanosecond: fn = visitor.visitTimestampNanosecond || visitor.visitTimestamp; break; + case Type.Time: fn = visitor.visitTime; break; + case Type.TimeSecond: fn = visitor.visitTimeSecond || visitor.visitTime; break; + case Type.TimeMillisecond: fn = visitor.visitTimeMillisecond || visitor.visitTime; break; + case Type.TimeMicrosecond: fn = visitor.visitTimeMicrosecond || visitor.visitTime; break; + case Type.TimeNanosecond: fn = visitor.visitTimeNanosecond || visitor.visitTime; break; + case Type.Decimal: fn = visitor.visitDecimal; break; + case Type.List: fn = visitor.visitList; break; + case Type.Struct: fn = visitor.visitStruct; break; + case Type.Union: fn = visitor.visitUnion; break; + case Type.DenseUnion: fn = visitor.visitDenseUnion || visitor.visitUnion; break; + case Type.SparseUnion: fn = visitor.visitSparseUnion || visitor.visitUnion; break; + case Type.Dictionary: fn = visitor.visitDictionary; break; + case Type.Interval: fn = visitor.visitInterval; break; + case Type.IntervalDayTime: fn = visitor.visitIntervalDayTime || visitor.visitInterval; break; + case Type.IntervalYearMonth: fn = visitor.visitIntervalYearMonth || visitor.visitInterval; break; + case Type.FixedSizeList: fn = visitor.visitFixedSizeList; break; + case Type.Map: fn = visitor.visitMap; break; + } + if (typeof fn === 'function') return fn; + if 
(!throwIfNotFound) return () => null; + throw new Error(`Unrecognized type '${Type[dtype]}'`); +} + +/** @ignore */ +function inferDType<T extends DataType>(type: T): Type { + switch (type.typeId) { + case Type.Null: return Type.Null; + case Type.Int: { + const { bitWidth, isSigned } = (type as any as Int); + switch (bitWidth) { + case 8: return isSigned ? Type.Int8 : Type.Uint8 ; + case 16: return isSigned ? Type.Int16 : Type.Uint16; + case 32: return isSigned ? Type.Int32 : Type.Uint32; + case 64: return isSigned ? Type.Int64 : Type.Uint64; + } + // @ts-ignore + return Type.Int; + } + case Type.Float: + switch((type as any as Float).precision) { + case Precision.HALF: return Type.Float16; + case Precision.SINGLE: return Type.Float32; + case Precision.DOUBLE: return Type.Float64; + } + // @ts-ignore + return Type.Float; + case Type.Binary: return Type.Binary; + case Type.Utf8: return Type.Utf8; + case Type.Bool: return Type.Bool; + case Type.Decimal: return Type.Decimal; + case Type.Time: + switch ((type as any as Time).unit) { + case TimeUnit.SECOND: return Type.TimeSecond; + case TimeUnit.MILLISECOND: return Type.TimeMillisecond; + case TimeUnit.MICROSECOND: return Type.TimeMicrosecond; + case TimeUnit.NANOSECOND: return Type.TimeNanosecond; + } + // @ts-ignore + return Type.Time; + case Type.Timestamp: + switch ((type as any as Timestamp).unit) { + case TimeUnit.SECOND: return Type.TimestampSecond; + case TimeUnit.MILLISECOND: return Type.TimestampMillisecond; + case TimeUnit.MICROSECOND: return Type.TimestampMicrosecond; + case TimeUnit.NANOSECOND: return Type.TimestampNanosecond; + } + // @ts-ignore + return Type.Timestamp; + case Type.Date: + switch ((type as any as Date_).unit) { + case DateUnit.DAY: return Type.DateDay; + case DateUnit.MILLISECOND: return Type.DateMillisecond; + } + // @ts-ignore + return Type.Date; + case Type.Interval: + switch ((type as any as Interval).unit) { + case IntervalUnit.DAY_TIME: return Type.IntervalDayTime; + case 
IntervalUnit.YEAR_MONTH: return Type.IntervalYearMonth; + } + // @ts-ignore + return Type.Interval; + case Type.Map: return Type.Map; + case Type.List: return Type.List; + case Type.Struct: return Type.Struct; + case Type.Union: + switch ((type as any as Union).mode) { + case UnionMode.Dense: return Type.DenseUnion; + case UnionMode.Sparse: return Type.SparseUnion; + } + // @ts-ignore + return Type.Union; + case Type.FixedSizeBinary: return Type.FixedSizeBinary; + case Type.FixedSizeList: return Type.FixedSizeList; + case Type.Dictionary: return Type.Dictionary; + } + throw new Error(`Unrecognized type '${Type[type.typeId]}'`); +} + +export interface Visitor { + visitNull (node: any, ...args: any[]): any; + visitBool (node: any, ...args: any[]): any; + visitInt (node: any, ...args: any[]): any; + visitInt8? (node: any, ...args: any[]): any; + visitInt16? (node: any, ...args: any[]): any; + visitInt32? (node: any, ...args: any[]): any; + visitInt64? (node: any, ...args: any[]): any; + visitUint8? (node: any, ...args: any[]): any; + visitUint16? (node: any, ...args: any[]): any; + visitUint32? (node: any, ...args: any[]): any; + visitUint64? (node: any, ...args: any[]): any; + visitFloat (node: any, ...args: any[]): any; + visitFloat16? (node: any, ...args: any[]): any; + visitFloat32? (node: any, ...args: any[]): any; + visitFloat64? (node: any, ...args: any[]): any; + visitUtf8 (node: any, ...args: any[]): any; + visitBinary (node: any, ...args: any[]): any; + visitFixedSizeBinary (node: any, ...args: any[]): any; + visitDate (node: any, ...args: any[]): any; + visitDateDay? (node: any, ...args: any[]): any; + visitDateMillisecond? (node: any, ...args: any[]): any; + visitTimestamp (node: any, ...args: any[]): any; + visitTimestampSecond? (node: any, ...args: any[]): any; + visitTimestampMillisecond? (node: any, ...args: any[]): any; + visitTimestampMicrosecond? (node: any, ...args: any[]): any; + visitTimestampNanosecond? 
(node: any, ...args: any[]): any; + visitTime (node: any, ...args: any[]): any; + visitTimeSecond? (node: any, ...args: any[]): any; + visitTimeMillisecond? (node: any, ...args: any[]): any; + visitTimeMicrosecond? (node: any, ...args: any[]): any; + visitTimeNanosecond? (node: any, ...args: any[]): any; + visitDecimal (node: any, ...args: any[]): any; + visitList (node: any, ...args: any[]): any; + visitStruct (node: any, ...args: any[]): any; + visitUnion (node: any, ...args: any[]): any; + visitDenseUnion? (node: any, ...args: any[]): any; + visitSparseUnion? (node: any, ...args: any[]): any; + visitDictionary (node: any, ...args: any[]): any; + visitInterval (node: any, ...args: any[]): any; + visitIntervalDayTime? (node: any, ...args: any[]): any; + visitIntervalYearMonth? (node: any, ...args: any[]): any; + visitFixedSizeList (node: any, ...args: any[]): any; + visitMap (node: any, ...args: any[]): any; +} + +// Add these here so they're picked up by the externs creator +// in the build, and closure-compiler doesn't minify them away +(Visitor.prototype as any).visitInt8 = null; +(Visitor.prototype as any).visitInt16 = null; +(Visitor.prototype as any).visitInt32 = null; +(Visitor.prototype as any).visitInt64 = null; +(Visitor.prototype as any).visitUint8 = null; +(Visitor.prototype as any).visitUint16 = null; +(Visitor.prototype as any).visitUint32 = null; +(Visitor.prototype as any).visitUint64 = null; +(Visitor.prototype as any).visitFloat16 = null; +(Visitor.prototype as any).visitFloat32 = null; +(Visitor.prototype as any).visitFloat64 = null; +(Visitor.prototype as any).visitDateDay = null; +(Visitor.prototype as any).visitDateMillisecond = null; +(Visitor.prototype as any).visitTimestampSecond = null; +(Visitor.prototype as any).visitTimestampMillisecond = null; +(Visitor.prototype as any).visitTimestampMicrosecond = null; +(Visitor.prototype as any).visitTimestampNanosecond = null; +(Visitor.prototype as any).visitTimeSecond = null; +(Visitor.prototype 
as any).visitTimeMillisecond = null; +(Visitor.prototype as any).visitTimeMicrosecond = null; +(Visitor.prototype as any).visitTimeNanosecond = null; +(Visitor.prototype as any).visitDenseUnion = null; +(Visitor.prototype as any).visitSparseUnion = null; +(Visitor.prototype as any).visitIntervalDayTime = null; +(Visitor.prototype as any).visitIntervalYearMonth = null; diff --git a/src/arrow/js/src/visitor/builderctor.ts b/src/arrow/js/src/visitor/builderctor.ts new file mode 100644 index 000000000..ac35a9874 --- /dev/null +++ b/src/arrow/js/src/visitor/builderctor.ts @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
import { Data } from '../data';
import { Type } from '../enum';
import { DataType } from '../type';
import { Visitor } from '../visitor';
import { VectorType, BuilderCtor } from '../interfaces';
import { BinaryBuilder } from '../builder/binary';
import { BoolBuilder } from '../builder/bool';
import { DateBuilder, DateDayBuilder, DateMillisecondBuilder } from '../builder/date';
import { DecimalBuilder } from '../builder/decimal';
import { DictionaryBuilder } from '../builder/dictionary';
import { FixedSizeBinaryBuilder } from '../builder/fixedsizebinary';
import { FixedSizeListBuilder } from '../builder/fixedsizelist';
import { FloatBuilder, Float16Builder, Float32Builder, Float64Builder } from '../builder/float';
import { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from '../builder/interval';
import { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from '../builder/int';
import { ListBuilder } from '../builder/list';
import { MapBuilder } from '../builder/map';
import { NullBuilder } from '../builder/null';
import { StructBuilder } from '../builder/struct';
import { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from '../builder/timestamp';
import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from '../builder/time';
import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union';
import { Utf8Builder } from '../builder/utf8';

/**
 * Typed call signatures for `GetBuilderCtor` (merged with the class below).
 * @ignore
 */
export interface GetBuilderCtor extends Visitor {
    visit<T extends Type>(type: T): BuilderCtor<T>;
    visitMany<T extends Type>(types: T[]): BuilderCtor<T>[];
    getVisitFn<T extends Type>(type: T): () => BuilderCtor<T>;
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): () => BuilderCtor<T>;
}

/**
 * Visitor whose every `visit*` method returns the Builder class for the
 * corresponding type. The methods take no arguments.
 * @ignore
 */
export class GetBuilderCtor extends Visitor {
    public visitNull() { return NullBuilder; }
    public visitBool() { return BoolBuilder; }

    // Integers
    public visitInt() { return IntBuilder; }
    public visitInt8() { return Int8Builder; }
    public visitInt16() { return Int16Builder; }
    public visitInt32() { return Int32Builder; }
    public visitInt64() { return Int64Builder; }
    public visitUint8() { return Uint8Builder; }
    public visitUint16() { return Uint16Builder; }
    public visitUint32() { return Uint32Builder; }
    public visitUint64() { return Uint64Builder; }

    // Floats
    public visitFloat() { return FloatBuilder; }
    public visitFloat16() { return Float16Builder; }
    public visitFloat32() { return Float32Builder; }
    public visitFloat64() { return Float64Builder; }

    // Strings and bytes
    public visitUtf8() { return Utf8Builder; }
    public visitBinary() { return BinaryBuilder; }
    public visitFixedSizeBinary() { return FixedSizeBinaryBuilder; }

    // Dates, timestamps and times
    public visitDate() { return DateBuilder; }
    public visitDateDay() { return DateDayBuilder; }
    public visitDateMillisecond() { return DateMillisecondBuilder; }
    public visitTimestamp() { return TimestampBuilder; }
    public visitTimestampSecond() { return TimestampSecondBuilder; }
    public visitTimestampMillisecond() { return TimestampMillisecondBuilder; }
    public visitTimestampMicrosecond() { return TimestampMicrosecondBuilder; }
    public visitTimestampNanosecond() { return TimestampNanosecondBuilder; }
    public visitTime() { return TimeBuilder; }
    public visitTimeSecond() { return TimeSecondBuilder; }
    public visitTimeMillisecond() { return TimeMillisecondBuilder; }
    public visitTimeMicrosecond() { return TimeMicrosecondBuilder; }
    public visitTimeNanosecond() { return TimeNanosecondBuilder; }

    public visitDecimal() { return DecimalBuilder; }

    // Nested types
    public visitList() { return ListBuilder; }
    public visitStruct() { return StructBuilder; }
    public visitUnion() { return UnionBuilder; }
    public visitDenseUnion() { return DenseUnionBuilder; }
    public visitSparseUnion() { return SparseUnionBuilder; }
    public visitDictionary() { return DictionaryBuilder; }

    // Intervals
    public visitInterval() { return IntervalBuilder; }
    public visitIntervalDayTime() { return IntervalDayTimeBuilder; }
    public visitIntervalYearMonth() { return IntervalYearMonthBuilder; }

    public visitFixedSizeList() { return FixedSizeListBuilder; }
    public visitMap() { return MapBuilder; }
}

/**
 * Module-level `GetBuilderCtor` instance.
 * @ignore
 */
export const instance = new GetBuilderCtor();

// diff --git a/src/arrow/js/src/visitor/bytewidth.ts b/src/arrow/js/src/visitor/bytewidth.ts
// new file mode 100644
// index 000000000..8be7c7a64
// --- /dev/null
// +++ b/src/arrow/js/src/visitor/bytewidth.ts
// @@ -0,0 +1,68 @@
/* istanbul ignore file */

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
import { Data } from '../data';
import { Visitor } from '../visitor';
import { VectorType } from '../interfaces';
import { Type } from '../enum';
import { Schema, Field } from '../schema';
import {
    DataType, Dictionary,
    Float, Int, Date_, Interval, Time, Timestamp,
    Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary,
    List, FixedSizeList, Map_, Struct, Union,
} from '../type';

/** Reducer that adds two widths. @ignore */
const sum = (x: number, y: number) => x + y;
/** Builds the error text thrown for types that have no fixed byte width. @ignore */
const variableWidthColumnErrorMessage = (type: DataType) => `Cannot compute the byte width of variable-width column ${type}`;

/**
 * Typed call signatures for `ByteWidthVisitor` (merged with the class below).
 * @ignore
 */
export interface ByteWidthVisitor extends Visitor {
    visit<T extends DataType>(node: T): number;
    visitMany<T extends DataType>(nodes: T[]): number[];
    getVisitFn<T extends Type>    (node: T): (type: DataType<T>) => number;
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (type: T) => number;
}

/**
 * Visitor that returns the number of bytes one value of a type occupies.
 *
 * `Binary`, `Utf8` and `List` have no fixed width, so visiting them throws.
 * Nested types are computed from the widths of their child fields.
 * @ignore
 */
export class ByteWidthVisitor extends Visitor {
    public visitNull(____: Null) { return 0; }
    public visitInt(type: Int) { return type.bitWidth / 8; }
    public visitFloat(type: Float) { return type.ArrayType.BYTES_PER_ELEMENT; }
    /** @throws Error always: Binary is variable-width. */
    public visitBinary(type: Binary) { throw new Error(variableWidthColumnErrorMessage(type)); }
    /** @throws Error always: Utf8 is variable-width. */
    public visitUtf8(type: Utf8) { throw new Error(variableWidthColumnErrorMessage(type)); }
    // One bit per value, expressed as a fraction of a byte.
    public visitBool(____: Bool) { return 1 / 8; }
    public visitDecimal(____: Decimal) { return 16; }
    // Width is derived from the unit's enum value: (unit + 1) * 4 bytes.
    public visitDate(type: Date_) { return (type.unit + 1) * 4; }
    public visitTime(type: Time) { return type.bitWidth / 8; }
    // Timestamps are 64-bit values for every unit (seconds included), so the
    // width does not depend on `type.unit`. Returning 4 for TimeUnit.SECOND
    // understated the width by half.
    public visitTimestamp(____: Timestamp) { return 8; }
    // Width is derived from the unit's enum value: (unit + 1) * 4 bytes.
    public visitInterval(type: Interval) { return (type.unit + 1) * 4; }
    /** @throws Error always: List is variable-width. */
    public visitList(type: List) { throw new Error(variableWidthColumnErrorMessage(type)); }
    public visitStruct(type: Struct) { return this.visitFields(type.children).reduce(sum, 0); }
    public visitUnion(type: Union) { return this.visitFields(type.children).reduce(sum, 0); }
    public visitFixedSizeBinary(type: FixedSizeBinary) { return type.byteWidth; }
    public visitFixedSizeList(type: FixedSizeList) { return type.listSize * this.visitFields(type.children).reduce(sum, 0); }
    public visitMap(type: Map_) { return this.visitFields(type.children).reduce(sum, 0); }
    // A dictionary column is measured by its index type.
    public visitDictionary(type: Dictionary) { return this.visit(type.indices); }
    /** Returns the byte width of each field's type; a missing `fields` array is treated as empty. */
    public visitFields(fields: Field[]) { return (fields || []).map((field) => this.visit(field.type)); }
    /** Returns the summed byte width of every field in the schema. */
    public visitSchema(schema: Schema) { return this.visitFields(schema.fields).reduce(sum, 0); }
}

/**
 * Module-level `ByteWidthVisitor` instance.
 * @ignore
 */
export const instance = new ByteWidthVisitor();

// diff --git a/src/arrow/js/src/visitor/get.ts b/src/arrow/js/src/visitor/get.ts
// new file mode 100644
// index 000000000..733418c0a
// --- /dev/null
// +++ b/src/arrow/js/src/visitor/get.ts
// @@ -0,0 +1,321 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
// See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { BN } from '../util/bn';
import { Visitor } from '../visitor';
import { decodeUtf8 } from '../util/utf8';
import { VectorType } from '../interfaces';
import { uint16ToFloat64 } from '../util/math';
import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum';
import {
    DataType, Dictionary,
    Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
    Float, Float16, Float32, Float64,
    Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64,
    Date_, DateDay, DateMillisecond,
    Interval, IntervalDayTime, IntervalYearMonth,
    Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond,
    Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
    Union, DenseUnion, SparseUnion,
} from '../type';

/**
 * Typed call signatures for `GetVisitor` (merged with the class below).
 * Every `visit*` method takes a vector and an element index and returns the
 * value at that index.
 * @ignore
 */
export interface GetVisitor extends Visitor {
    visit<T extends VectorType>(node: T, index: number): T['TValue'];
    visitMany<T extends VectorType>(nodes: T[], indices: number[]): T['TValue'][];
    getVisitFn<T extends Type>(node: T): (vector: VectorType<T>, index: number) => VectorType<T>['TValue'];
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (vector: VectorType<T>, index: number) => VectorType<T>['TValue'];
    visitNull<T extends Null>(vector: VectorType<T>, index: number): T['TValue'];
    visitBool<T extends Bool>(vector: VectorType<T>, index: number): T['TValue'];
    visitInt<T extends Int>(vector: VectorType<T>, index: number): T['TValue'];
    visitInt8<T extends Int8>(vector: VectorType<T>, index: number): T['TValue'];
    visitInt16<T extends Int16>(vector: VectorType<T>, index: number): T['TValue'];
    visitInt32<T extends Int32>(vector: VectorType<T>, index: number): T['TValue'];
    visitInt64<T extends Int64>(vector: VectorType<T>, index: number): T['TValue'];
    visitUint8<T extends Uint8>(vector: VectorType<T>, index: number): T['TValue'];
    visitUint16<T extends Uint16>(vector: VectorType<T>, index: number): T['TValue'];
    visitUint32<T extends Uint32>(vector: VectorType<T>, index: number): T['TValue'];
    visitUint64<T extends Uint64>(vector: VectorType<T>, index: number): T['TValue'];
    visitFloat<T extends Float>(vector: VectorType<T>, index: number): T['TValue'];
    visitFloat16<T extends Float16>(vector: VectorType<T>, index: number): T['TValue'];
    visitFloat32<T extends Float32>(vector: VectorType<T>, index: number): T['TValue'];
    visitFloat64<T extends Float64>(vector: VectorType<T>, index: number): T['TValue'];
    visitUtf8<T extends Utf8>(vector: VectorType<T>, index: number): T['TValue'];
    visitBinary<T extends Binary>(vector: VectorType<T>, index: number): T['TValue'];
    visitFixedSizeBinary<T extends FixedSizeBinary>(vector: VectorType<T>, index: number): T['TValue'];
    visitDate<T extends Date_>(vector: VectorType<T>, index: number): T['TValue'];
    visitDateDay<T extends DateDay>(vector: VectorType<T>, index: number): T['TValue'];
    visitDateMillisecond<T extends DateMillisecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimestamp<T extends Timestamp>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimestampSecond<T extends TimestampSecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimestampMillisecond<T extends TimestampMillisecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimestampMicrosecond<T extends TimestampMicrosecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimestampNanosecond<T extends TimestampNanosecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTime<T extends Time>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimeSecond<T extends TimeSecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimeMillisecond<T extends TimeMillisecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimeMicrosecond<T extends TimeMicrosecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitTimeNanosecond<T extends TimeNanosecond>(vector: VectorType<T>, index: number): T['TValue'];
    visitDecimal<T extends Decimal>(vector: VectorType<T>, index: number): T['TValue'];
    visitList<T extends List>(vector: VectorType<T>, index: number): T['TValue'];
    visitStruct<T extends Struct>(vector: VectorType<T>, index: number): T['TValue'];
    visitUnion<T extends Union>(vector: VectorType<T>, index: number): T['TValue'];
    visitDenseUnion<T extends DenseUnion>(vector: VectorType<T>, index: number): T['TValue'];
    visitSparseUnion<T extends SparseUnion>(vector: VectorType<T>, index: number): T['TValue'];
    visitDictionary<T extends Dictionary>(vector: VectorType<T>, index: number): T['TValue'];
    visitInterval<T extends Interval>(vector: VectorType<T>, index: number): T['TValue'];
    visitIntervalDayTime<T extends IntervalDayTime>(vector: VectorType<T>, index: number): T['TValue'];
    visitIntervalYearMonth<T extends IntervalYearMonth>(vector: VectorType<T>, index: number): T['TValue'];
    visitFixedSizeList<T extends FixedSizeList>(vector: VectorType<T>, index: number): T['TValue'];
    visitMap<T extends Map_>(vector: VectorType<T>, index: number): T['TValue'];
}

/**
 * Visitor that reads a single element out of a vector. The class body is
 * empty; its `visit*` methods are assigned on the prototype further down.
 * @ignore
 */
export class GetVisitor extends Visitor {}

/** Scales the day count stored at `index` to milliseconds. @ignore */
const epochDaysToMs = (data: Int32Array, index: number) => {
    return 86400000 * data[index];
};
/**
 * Combines the two 32-bit words at `index` (low word, read as unsigned) and
 * `index + 1` (high word) into a single number.
 * @ignore
 */
const epochMillisecondsLongToMs = (data: Int32Array, index: number) => {
    const high = data[index + 1];
    const low = data[index] >>> 0;
    return 4294967296 * high + low;
};
/** Same word layout as above, with each word divided by 1000 before combining. @ignore */
const epochMicrosecondsLongToMs = (data: Int32Array, index: number) => {
    const high = data[index + 1] / 1000;
    const low = (data[index] >>> 0) / 1000;
    return 4294967296 * high + low;
};
/** Same word layout as above, with each word divided by 1000000 before combining. @ignore */
const epochNanosecondsLongToMs = (data: Int32Array, index: number) => {
    const high = data[index + 1] / 1000000;
    const low = (data[index] >>> 0) / 1000000;
    return 4294967296 * high + low;
};

/** Wraps an epoch-millisecond number in a `Date`. @ignore */
const epochMillisecondsToDate = (epochMs: number) => {
    return new Date(epochMs);
};
/** Reads a day count at `index` and converts it to a `Date`. @ignore */
const epochDaysToDate = (data: Int32Array, index: number) => {
    const ms = epochDaysToMs(data, index);
    return epochMillisecondsToDate(ms);
};
/** Reads a two-word millisecond value at `index` and converts it to a `Date`. @ignore */
const epochMillisecondsLongToDate = (data: Int32Array, index: number) => {
    const ms = epochMillisecondsLongToMs(data, index);
    return epochMillisecondsToDate(ms);
};

/** Always yields `null`. @ignore */
const getNull = <T extends Null>(_vector: VectorType<T>, _index: number): T['TValue'] => {
    return null;
};
/**
 * Returns the slice of `values` bounded by `valueOffsets[index]` and
 * `valueOffsets[index + 1]`, or `null` when either offset is null/undefined.
 * @ignore
 */
const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number) => {
    const begin = valueOffsets[index];
    const end = valueOffsets[index + 1];
    if (begin == null || end == null) {
        return null as any;
    }
    return values.subarray(begin, end);
};

/** Tests the bit at position `offset + index` of the packed `values` bytes. @ignore */
const getBool = <T extends Bool>({ offset, values }: VectorType<T>, index: number): T['TValue'] => {
    const bit = offset + index;
    const mask = 1 << (bit % 8);
    return (values[bit >> 3] & mask) !== 0;
};

/** Numeric types read as one array element per value. @ignore */
type Numeric1X = Int8 | Int16 | Int32 | Uint8 | Uint16 | Uint32 | Float32 | Float64;
/** 64-bit integer types, read as a sub-array per value. @ignore */
type Numeric2X = Int64 | Uint64;

/** Reads a day-unit date as a `Date`. @ignore */
const getDateDay = <T extends DateDay>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return epochDaysToDate(values, index);
};
/** Reads a millisecond-unit date (two words per value) as a `Date`. @ignore */
const getDateMillisecond = <T extends DateMillisecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return epochMillisecondsLongToDate(values, index * 2);
};
/** Reads the array element at `stride * index`. @ignore */
const getNumeric = <T extends Numeric1X>({ stride, values }: VectorType<T>, index: number): T['TValue'] => {
    return values[stride * index];
};
/** Reads the element at `stride * index` and converts it with `uint16ToFloat64`. @ignore */
const getFloat16 = <T extends Float16>({ stride, values }: VectorType<T>, index: number): T['TValue'] => {
    return uint16ToFloat64(values[stride * index]);
};
/** Wraps the `stride`-wide sub-array for `index` in a `BN`, passing the type's signedness. @ignore */
const getBigInts = <T extends Numeric2X>({ stride, values, type }: VectorType<T>, index: number): T['TValue'] => {
    const words = values.subarray(stride * index, stride * (index + 1));
    return BN.new(words, type.isSigned) as any;
};
/** Returns the `stride`-wide sub-array for `index`. @ignore */
const getFixedSizeBinary = <T extends FixedSizeBinary>({ stride, values }: VectorType<T>, index: number): T['TValue'] => {
    return values.subarray(stride * index, stride * (index + 1));
};

/** Returns the bytes of the value at `index`, or `null`. @ignore */
const getBinary = <T extends Binary>({ values, valueOffsets }: VectorType<T>, index: number): T['TValue'] => {
    return getVariableWidthBytes(values, valueOffsets, index);
};
/** Decodes the bytes of the value at `index` as UTF-8, or returns `null`. @ignore */
const getUtf8 = <T extends Utf8>({ values, valueOffsets }: VectorType<T>, index: number): T['TValue'] => {
    const bytes = getVariableWidthBytes(values, valueOffsets, index);
    if (bytes === null) {
        return null as any;
    }
    return decodeUtf8(bytes);
};

/* istanbul ignore next */
/** Generic integer getter: widths below 64 bits read one element, 64-bit reads a `BN`. @ignore */
const getInt = <T extends Int>(vector: VectorType<T>, index: number): T['TValue'] => {
    if (vector.type.bitWidth < 64) {
        return getNumeric(vector as VectorType<Numeric1X>, index);
    }
    return getBigInts(vector as VectorType<Numeric2X>, index);
};

/* istanbul ignore next */
/** Generic float getter: half precision is converted, other precisions are read directly. @ignore */
const getFloat = <T extends Float>(vector: VectorType<T>, index: number): T['TValue'] => {
    if (vector.type.precision !== Precision.HALF) {
        return getNumeric(vector as VectorType<Numeric1X>, index);
    }
    return getFloat16(vector as VectorType<Float16>, index);
};

/* istanbul ignore next */
/** Generic date getter: dispatches on the type's unit. @ignore */
const getDate = <T extends Date_>(vector: VectorType<T>, index: number): T['TValue'] => {
    if (vector.type.unit === DateUnit.DAY) {
        return getDateDay(vector as VectorType<DateDay>, index);
    }
    return getDateMillisecond(vector as VectorType<DateMillisecond>, index);
};

/** Reads a two-word second timestamp and multiplies it by 1000. @ignore */
const getTimestampSecond = <T extends TimestampSecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return 1000 * epochMillisecondsLongToMs(values, index * 2);
};
/** Reads a two-word millisecond timestamp. @ignore */
const getTimestampMillisecond = <T extends TimestampMillisecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return epochMillisecondsLongToMs(values, index * 2);
};
/** Reads a two-word microsecond timestamp, scaled down by 1000. @ignore */
const getTimestampMicrosecond = <T extends TimestampMicrosecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return epochMicrosecondsLongToMs(values, index * 2);
};
/** Reads a two-word nanosecond timestamp, scaled down by 1000000. @ignore */
const getTimestampNanosecond = <T extends TimestampNanosecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return epochNanosecondsLongToMs(values, index * 2);
};
/* istanbul ignore next */
/** Generic timestamp getter: dispatches on the type's unit. @ignore */
const getTimestamp = <T extends Timestamp>(vector: VectorType<T>, index: number): T['TValue'] => {
    switch (vector.type.unit) {
        case TimeUnit.SECOND:
            return getTimestampSecond(vector as VectorType<TimestampSecond>, index);
        case TimeUnit.MILLISECOND:
            return getTimestampMillisecond(vector as VectorType<TimestampMillisecond>, index);
        case TimeUnit.MICROSECOND:
            return getTimestampMicrosecond(vector as VectorType<TimestampMicrosecond>, index);
        case TimeUnit.NANOSECOND:
            return getTimestampNanosecond(vector as VectorType<TimestampNanosecond>, index);
    }
};

/** Reads the element at `stride * index`. @ignore */
const getTimeSecond = <T extends TimeSecond>({ values, stride }: VectorType<T>, index: number): T['TValue'] => {
    return values[stride * index];
};
/** Reads the element at `stride * index`. @ignore */
const getTimeMillisecond = <T extends TimeMillisecond>({ values, stride }: VectorType<T>, index: number): T['TValue'] => {
    return values[stride * index];
};
/** Wraps the two words for `index` in a signed `BN`. @ignore */
const getTimeMicrosecond = <T extends TimeMicrosecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    const words = values.subarray(2 * index, 2 * (index + 1));
    return BN.signed(words);
};
/** @ignore */
const getTimeNanosecond = <T extends TimeNanosecond>({ values }: VectorType<T>, index: number): T['TValue'] => {
    // Each value spans two array elements; wrap them in a signed BN.
    const words = values.subarray(2 * index, 2 * (index + 1));
    return BN.signed(words);
};
/* istanbul ignore next */
/** Generic time getter: dispatches on the type's unit. @ignore */
const getTime = <T extends Time>(vector: VectorType<T>, index: number): T['TValue'] => {
    switch (vector.type.unit) {
        case TimeUnit.SECOND:
            return getTimeSecond(vector as VectorType<TimeSecond>, index);
        case TimeUnit.MILLISECOND:
            return getTimeMillisecond(vector as VectorType<TimeMillisecond>, index);
        case TimeUnit.MICROSECOND:
            return getTimeMicrosecond(vector as VectorType<TimeMicrosecond>, index);
        case TimeUnit.NANOSECOND:
            return getTimeNanosecond(vector as VectorType<TimeNanosecond>, index);
    }
};

/** Wraps the four array elements for `index` in a decimal `BN`. @ignore */
const getDecimal = <T extends Decimal>({ values }: VectorType<T>, index: number): T['TValue'] => {
    const words = values.subarray(4 * index, 4 * (index + 1));
    return BN.decimal(words);
};

/**
 * Slices the first child vector between the two consecutive offsets found at
 * `index * stride` in `valueOffsets`.
 * @ignore
 */
const getList = <T extends List>(vector: VectorType<T>, index: number): T['TValue'] => {
    const child = vector.getChildAt(0)!;
    const { valueOffsets, stride } = vector;
    const begin = valueOffsets[index * stride];
    const end = valueOffsets[(index * stride) + 1];
    return child.slice(begin, end) as T['TValue'];
};

/** Returns `vector.bind(index)`. @ignore */
const getMap = <T extends Map_>(vector: VectorType<T>, index: number): T['TValue'] => {
    const bound = vector.bind(index);
    return bound as T['TValue'];
};

/** Returns `vector.bind(index)`. @ignore */
const getStruct = <T extends Struct>(vector: VectorType<T>, index: number): T['TValue'] => {
    const bound = vector.bind(index);
    return bound as T['TValue'];
};

/* istanbul ignore next */
/** Generic union getter: dispatches on the union's mode. @ignore */
const getUnion = <
    V extends VectorType<Union> | VectorType<DenseUnion> | VectorType<SparseUnion>
>(vector: V, index: number): V['TValue'] => {
    if (vector.type.mode === UnionMode.Dense) {
        return getDenseUnion(vector as VectorType<DenseUnion>, index);
    }
    return getSparseUnion(vector as VectorType<SparseUnion>, index);
};

/**
 * Finds the child for the type id at `index` and reads from it at
 * `valueOffsets[index]`. Yields `null` when there is no such child.
 * @ignore
 */
const getDenseUnion = <T extends DenseUnion>(vector: VectorType<T>, index: number): T['TValue'] => {
    const typeId = vector.typeIds[index];
    const child = vector.getChildAt(vector.typeIdToChildIndex[typeId]);
    if (!child) {
        return null;
    }
    return child.get(vector.valueOffsets[index]);
};

/**
 * Finds the child for the type id at `index` and reads from it at the same
 * `index`. Yields `null` when there is no such child.
 * @ignore
 */
const getSparseUnion = <T extends SparseUnion>(vector: VectorType<T>, index: number): T['TValue'] => {
    const typeId = vector.typeIds[index];
    const child = vector.getChildAt(vector.typeIdToChildIndex[typeId]);
    if (!child) {
        return null;
    }
    return child.get(index);
};

/** Looks up the key at `index`, then returns the value stored for that key. @ignore */
const getDictionary = <T extends Dictionary>(vector: VectorType<T>, index: number): T['TValue'] => {
    const key = vector.getKey(index)!;
    return vector.getValue(key);
};

/* istanbul ignore next */
/** Generic interval getter: dispatches on the type's unit. @ignore */
const getInterval = <T extends Interval>(vector: VectorType<T>, index: number): T['TValue'] => {
    if (vector.type.unit === IntervalUnit.DAY_TIME) {
        return getIntervalDayTime(vector as VectorType<IntervalDayTime>, index);
    }
    return getIntervalYearMonth(vector as VectorType<IntervalYearMonth>, index);
};

/** Returns the two-element sub-array for `index`. @ignore */
const getIntervalDayTime = <T extends IntervalDayTime>({ values }: VectorType<T>, index: number): T['TValue'] => {
    return values.subarray(2 * index, 2 * (index + 1));
};

/**
 * Splits the stored number at `index` into a new two-element Int32Array of
 * `[years, months]`, dividing by 12.
 * @ignore
 */
const getIntervalYearMonth = <T extends IntervalYearMonth>({ values }: VectorType<T>, index: number): T['TValue'] => {
    const totalMonths = values[index];
    const years = totalMonths / 12 | 0;
    const months = totalMonths % 12 | 0;
    const yearMonth = new Int32Array(2);
    yearMonth[0] = years;
    yearMonth[1] = months;
    return yearMonth;
};

/** Slices the first child vector to the `stride`-wide window for `index`. @ignore */
const getFixedSizeList = <T extends FixedSizeList>(vector: VectorType<T>, index: number): T['TValue'] => {
    const child = vector.getChildAt(0)!;
    const { stride } = vector;
    const begin = index * stride;
    const end = (index + 1) * stride;
    return child.slice(begin, end) as T['TValue'];
};

// Install the getters on the visitor's prototype.
GetVisitor.prototype.visitNull = getNull;
GetVisitor.prototype.visitBool = getBool;
GetVisitor.prototype.visitInt = getInt;
GetVisitor.prototype.visitInt8 = getNumeric;
GetVisitor.prototype.visitInt16 = getNumeric;
GetVisitor.prototype.visitInt32 = getNumeric;
GetVisitor.prototype.visitInt64 = getBigInts;
GetVisitor.prototype.visitUint8 = getNumeric;
GetVisitor.prototype.visitUint16 = getNumeric;
GetVisitor.prototype.visitUint32 = getNumeric;
GetVisitor.prototype.visitUint64 = getBigInts;
GetVisitor.prototype.visitFloat = getFloat;
GetVisitor.prototype.visitFloat16 = getFloat16;
GetVisitor.prototype.visitFloat32 = getNumeric;
GetVisitor.prototype.visitFloat64 = getNumeric;
GetVisitor.prototype.visitUtf8 = getUtf8;
GetVisitor.prototype.visitBinary = getBinary;
GetVisitor.prototype.visitFixedSizeBinary = getFixedSizeBinary;
GetVisitor.prototype.visitDate = getDate;
GetVisitor.prototype.visitDateDay = getDateDay;
GetVisitor.prototype.visitDateMillisecond = getDateMillisecond;
GetVisitor.prototype.visitTimestamp = getTimestamp;
+GetVisitor.prototype.visitTimestampSecond = getTimestampSecond; +GetVisitor.prototype.visitTimestampMillisecond = getTimestampMillisecond; +GetVisitor.prototype.visitTimestampMicrosecond = getTimestampMicrosecond; +GetVisitor.prototype.visitTimestampNanosecond = getTimestampNanosecond; +GetVisitor.prototype.visitTime = getTime; +GetVisitor.prototype.visitTimeSecond = getTimeSecond; +GetVisitor.prototype.visitTimeMillisecond = getTimeMillisecond; +GetVisitor.prototype.visitTimeMicrosecond = getTimeMicrosecond; +GetVisitor.prototype.visitTimeNanosecond = getTimeNanosecond; +GetVisitor.prototype.visitDecimal = getDecimal; +GetVisitor.prototype.visitList = getList; +GetVisitor.prototype.visitStruct = getStruct; +GetVisitor.prototype.visitUnion = getUnion; +GetVisitor.prototype.visitDenseUnion = getDenseUnion; +GetVisitor.prototype.visitSparseUnion = getSparseUnion; +GetVisitor.prototype.visitDictionary = getDictionary; +GetVisitor.prototype.visitInterval = getInterval; +GetVisitor.prototype.visitIntervalDayTime = getIntervalDayTime; +GetVisitor.prototype.visitIntervalYearMonth = getIntervalYearMonth; +GetVisitor.prototype.visitFixedSizeList = getFixedSizeList; +GetVisitor.prototype.visitMap = getMap; + +/** @ignore */ +export const instance = new GetVisitor(); diff --git a/src/arrow/js/src/visitor/indexof.ts b/src/arrow/js/src/visitor/indexof.ts new file mode 100644 index 000000000..ab4678aed --- /dev/null +++ b/src/arrow/js/src/visitor/indexof.ts @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { Type } from '../enum';
import { Visitor } from '../visitor';
import { VectorType } from '../interfaces';
import { getBool, BitIterator } from '../util/bit';
import { createElementComparator } from '../util/vector';
import {
    DataType, Dictionary,
    Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
    Float, Float16, Float32, Float64,
    Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64,
    Date_, DateDay, DateMillisecond,
    Interval, IntervalDayTime, IntervalYearMonth,
    Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond,
    Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
    Union, DenseUnion, SparseUnion,
} from '../type';

/**
 * Visitor whose visit methods search a vector for a value and return the index
 * of the first match (or -1), optionally starting from a given index.
 * @ignore
 */
export interface IndexOfVisitor extends Visitor {
    visit<T extends VectorType>(node: T, value: T['TValue'] | null, index?: number): number;
    visitMany<T extends VectorType>(nodes: T[], values: (T['TValue'] | null)[], indices: (number | undefined)[]): number[];
    getVisitFn<T extends Type>(node: T): (vector: VectorType<T>, value: VectorType<T>['TValue'] | null, index?: number) => number;
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (vector: VectorType<T>, value: T['TValue'] | null, index?: number) => number;
    visitNull<T extends Null>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitBool<T extends Bool>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitInt<T extends Int>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitInt8<T extends Int8>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitInt16<T extends Int16>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitInt32<T extends Int32>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitInt64<T extends Int64>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitUint8<T extends Uint8>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitUint16<T extends Uint16>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitUint32<T extends Uint32>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitUint64<T extends Uint64>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitFloat<T extends Float>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitFloat16<T extends Float16>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitFloat32<T extends Float32>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitFloat64<T extends Float64>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitUtf8<T extends Utf8>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitBinary<T extends Binary>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitFixedSizeBinary<T extends FixedSizeBinary>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitDate<T extends Date_>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitDateDay<T extends DateDay>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitDateMillisecond<T extends DateMillisecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimestamp<T extends Timestamp>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimestampSecond<T extends TimestampSecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimestampMillisecond<T extends TimestampMillisecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimestampMicrosecond<T extends TimestampMicrosecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimestampNanosecond<T extends TimestampNanosecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTime<T extends Time>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimeSecond<T extends TimeSecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimeMillisecond<T extends TimeMillisecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimeMicrosecond<T extends TimeMicrosecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitTimeNanosecond<T extends TimeNanosecond>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitDecimal<T extends Decimal>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitList<T extends List>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitStruct<T extends Struct>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitUnion<T extends Union>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitDenseUnion<T extends DenseUnion>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitSparseUnion<T extends SparseUnion>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitDictionary<T extends Dictionary>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitInterval<T extends Interval>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitIntervalDayTime<T extends IntervalDayTime>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitIntervalYearMonth<T extends IntervalYearMonth>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitFixedSizeList<T extends FixedSizeList>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
    visitMap<T extends Map_>(vector: VectorType<T>, value: T['TValue'] | null, index?: number): number;
}

/** @ignore */
export class IndexOfVisitor extends Visitor {}

/**
 * indexOf for Null vectors.
 *
 * @param vector the Null vector being searched
 * @param searchElement the value being looked for
 * @returns 0 when searching for `null` in a non-empty vector, otherwise -1
 * @ignore
 */
function nullIndexOf(vector: VectorType<Null>, searchElement?: null) {
    // if you're looking for nulls and the vector isn't empty, we've got 'em!
    return searchElement === null && vector.length > 0 ? 0 : -1;
}

/**
 * Finds the first null slot at or after `fromIndex` by scanning the validity bitmap.
 *
 * The returned index is absolute (relative to the start of the vector), matching
 * what `indexOfValue` returns for non-null search elements, and the scan never
 * reads bits beyond the vector's last element.
 *
 * @param vector the vector whose `data.nullBitmap` is inspected
 * @param fromIndex index to start searching from; missing or negative values start at 0
 * @returns the absolute index of the first null at or after `fromIndex`, or -1
 * @ignore
 */
function indexOfNull<T extends DataType>(vector: VectorType<T>, fromIndex?: number): number {
    const { nullBitmap } = vector.data;
    if (!nullBitmap || vector.nullCount <= 0) {
        return -1;
    }
    // Clamp the start so we never address bits before the vector's own offset.
    const start = Math.max(0, fromIndex || 0);
    // Only the slots in [start, length) belong to this vector; scanning `length`
    // bits from `offset + start` would read past its end.
    const remaining = vector.length - start;
    if (remaining <= 0) {
        return -1;
    }
    // Count from `start` so the result is an index into the vector, not into the scan window.
    let i = start;
    for (const isValid of new BitIterator(nullBitmap, vector.data.offset + start, remaining, nullBitmap, getBool)) {
        if (!isValid) { return i; }
        ++i;
    }
    return -1;
}

/**
 * Generic indexOf: linear scan comparing each element against `searchElement`.
 *
 * @param vector the vector being searched
 * @param searchElement the value to find; `undefined` never matches, `null` is
 *        resolved through the validity bitmap via `indexOfNull`
 * @param fromIndex index to start searching from (defaults to 0)
 * @returns the index of the first matching element, or -1
 * @ignore
 */
function indexOfValue<T extends DataType>(vector: VectorType<T>, searchElement?: T['TValue'] | null, fromIndex?: number): number {
    if (searchElement === undefined) { return -1; }
    if (searchElement === null) { return indexOfNull(vector, fromIndex); }
    const compare = createElementComparator(searchElement);
    for (let i = (fromIndex || 0) - 1, n = vector.length; ++i < n;) {
        if (compare(vector.get(i))) {
            return i;
        }
    }
    return -1;
}

/**
 * indexOf for Union vectors: always a linear scan over `vector.get(i)`, even
 * when searching for `null`.
 *
 * @param vector the union vector being searched
 * @param searchElement the value to find
 * @param fromIndex index to start searching from (defaults to 0)
 * @returns the index of the first matching element, or -1
 * @ignore
 */
function indexOfUnion<T extends DataType>(vector: VectorType<T>, searchElement?: T['TValue'] | null, fromIndex?: number): number {
    // Unions are special -- they do have a nullBitmap, but so can their children.
    // If the searchElement is null, we don't know whether it came from the Union's
    // bitmap or one of its childrens'. So we don't interrogate the Union's bitmap,
    // since that will report the wrong index if a child has a null before the Union.
    const compare = createElementComparator(searchElement);
    for (let i = (fromIndex || 0) - 1, n = vector.length; ++i < n;) {
        if (compare(vector.get(i))) {
            return i;
        }
    }
    return -1;
}

IndexOfVisitor.prototype.visitNull = nullIndexOf;
IndexOfVisitor.prototype.visitBool = indexOfValue;
IndexOfVisitor.prototype.visitInt = indexOfValue;
IndexOfVisitor.prototype.visitInt8 = indexOfValue;
IndexOfVisitor.prototype.visitInt16 = indexOfValue;
IndexOfVisitor.prototype.visitInt32 = indexOfValue;
IndexOfVisitor.prototype.visitInt64 = indexOfValue;
IndexOfVisitor.prototype.visitUint8 = indexOfValue;
IndexOfVisitor.prototype.visitUint16 = indexOfValue;
IndexOfVisitor.prototype.visitUint32 = indexOfValue;
IndexOfVisitor.prototype.visitUint64 = indexOfValue;
IndexOfVisitor.prototype.visitFloat = indexOfValue;
IndexOfVisitor.prototype.visitFloat16 = indexOfValue;
IndexOfVisitor.prototype.visitFloat32 = indexOfValue;
IndexOfVisitor.prototype.visitFloat64 = indexOfValue;
IndexOfVisitor.prototype.visitUtf8 = indexOfValue;
IndexOfVisitor.prototype.visitBinary = indexOfValue;
IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue;
IndexOfVisitor.prototype.visitDate = indexOfValue;
IndexOfVisitor.prototype.visitDateDay = indexOfValue;
IndexOfVisitor.prototype.visitDateMillisecond = indexOfValue;
IndexOfVisitor.prototype.visitTimestamp = indexOfValue;
IndexOfVisitor.prototype.visitTimestampSecond = indexOfValue;
IndexOfVisitor.prototype.visitTimestampMillisecond = indexOfValue;
IndexOfVisitor.prototype.visitTimestampMicrosecond = indexOfValue;
IndexOfVisitor.prototype.visitTimestampNanosecond = indexOfValue;
IndexOfVisitor.prototype.visitTime = indexOfValue;
IndexOfVisitor.prototype.visitTimeSecond = indexOfValue;
IndexOfVisitor.prototype.visitTimeMillisecond = indexOfValue;
IndexOfVisitor.prototype.visitTimeMicrosecond = indexOfValue;
IndexOfVisitor.prototype.visitTimeNanosecond = indexOfValue;
IndexOfVisitor.prototype.visitDecimal = indexOfValue;
IndexOfVisitor.prototype.visitList = indexOfValue;
IndexOfVisitor.prototype.visitStruct = indexOfValue;
// NOTE(review): the generic Union entry uses indexOfValue while the Dense/Sparse
// entries use indexOfUnion -- confirm this difference is intentional.
IndexOfVisitor.prototype.visitUnion = indexOfValue;
IndexOfVisitor.prototype.visitDenseUnion = indexOfUnion;
IndexOfVisitor.prototype.visitSparseUnion = indexOfUnion;
IndexOfVisitor.prototype.visitDictionary = indexOfValue;
IndexOfVisitor.prototype.visitInterval = indexOfValue;
IndexOfVisitor.prototype.visitIntervalDayTime = indexOfValue;
IndexOfVisitor.prototype.visitIntervalYearMonth = indexOfValue;
IndexOfVisitor.prototype.visitFixedSizeList = indexOfValue;
IndexOfVisitor.prototype.visitMap = indexOfValue;

/** @ignore */
export const instance = new IndexOfVisitor();
diff --git a/src/arrow/js/src/visitor/iterator.ts b/src/arrow/js/src/visitor/iterator.ts new file mode 100644 index 000000000..4a8e6b5b6 --- /dev/null +++ b/src/arrow/js/src/visitor/iterator.ts @@ -0,0 +1,193 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { Data } from '../data';
import { Type } from '../enum';
import { Visitor } from '../visitor';
import { VectorType } from '../interfaces';
import { BitIterator } from '../util/bit';
import { instance as getVisitor } from './get';
import {
    DataType, Dictionary,
    Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
    Float, Float16, Float32, Float64,
    Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64,
    Date_, DateDay, DateMillisecond,
    Interval, IntervalDayTime, IntervalYearMonth,
    Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond,
    Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
    Union, DenseUnion, SparseUnion,
} from '../type';

/**
 * Visitor whose visit methods turn a vector into an iterator over its
 * elements (`null` for null slots).
 * @ignore
 */
export interface IteratorVisitor extends Visitor {
    visit<T extends VectorType>(node: T): IterableIterator<T['TValue'] | null>;
    visitMany<T extends VectorType>(nodes: T[]): IterableIterator<T['TValue'] | null>[];
    getVisitFn<T extends Type>(node: T): (vector: VectorType<T>) => IterableIterator<VectorType<T>['TValue'] | null>;
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (vector: VectorType<T>) => IterableIterator<VectorType<T>['TValue'] | null>;
    visitNull<T extends Null>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitBool<T extends Bool>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitInt<T extends Int>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitInt8<T extends Int8>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitInt16<T extends Int16>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitInt32<T extends Int32>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitInt64<T extends Int64>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitUint8<T extends Uint8>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitUint16<T extends Uint16>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitUint32<T extends Uint32>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitUint64<T extends Uint64>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitFloat<T extends Float>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitFloat16<T extends Float16>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitFloat32<T extends Float32>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitFloat64<T extends Float64>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitUtf8<T extends Utf8>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitBinary<T extends Binary>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitFixedSizeBinary<T extends FixedSizeBinary>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitDate<T extends Date_>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitDateDay<T extends DateDay>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitDateMillisecond<T extends DateMillisecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimestamp<T extends Timestamp>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimestampSecond<T extends TimestampSecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimestampMillisecond<T extends TimestampMillisecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimestampMicrosecond<T extends TimestampMicrosecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimestampNanosecond<T extends TimestampNanosecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTime<T extends Time>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimeSecond<T extends TimeSecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimeMillisecond<T extends TimeMillisecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimeMicrosecond<T extends TimeMicrosecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitTimeNanosecond<T extends TimeNanosecond>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitDecimal<T extends Decimal>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitList<T extends List>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitStruct<T extends Struct>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitUnion<T extends Union>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitDenseUnion<T extends DenseUnion>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitSparseUnion<T extends SparseUnion>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitDictionary<T extends Dictionary>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitInterval<T extends Interval>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitIntervalDayTime<T extends IntervalDayTime>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitIntervalYearMonth<T extends IntervalYearMonth>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitFixedSizeList<T extends FixedSizeList>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
    visitMap<T extends Map_>(vector: VectorType<T>): IterableIterator<T['TValue'] | null>;
}

/** @ignore */
export class IteratorVisitor extends Visitor {}

/**
 * Builds an iterator for a vector that contains nulls: walks the validity
 * bitmap and, per slot, yields either the element (bit set) or `null` (bit clear).
 * @ignore
 */
function nullableIterator<T extends DataType>(vector: VectorType<T>): IterableIterator<T['TValue'] | null> {
    const getFn = getVisitor.getVisitFn(vector);
    const readSlot = (vec: VectorType<T>, idx: number, nullByte: number, nullBit: number) => {
        const isValid = (nullByte & 1 << nullBit) !== 0;
        return isValid ? getFn(vec, idx) : null;
    };
    const { nullBitmap, offset } = vector.data;
    return new BitIterator<T['TValue'] | null>(nullBitmap, offset, vector.length, vector, readSlot);
}

/**
 * Index-based iterator that reads each element through the supplied accessor.
 * @ignore
 */
class VectorIterator<T extends DataType> implements IterableIterator<T['TValue'] | null> {
    private index = 0;

    constructor(
        private vector: VectorType<T>,
        private getFn: (vector: VectorType<T>, index: number) => VectorType<T>['TValue']
    ) {}

    next(): IteratorResult<T['TValue'] | null> {
        // Past the last element: signal completion.
        if (!(this.index < this.vector.length)) {
            return {done: true, value: null};
        }

        const value = this.getFn(this.vector, this.index++);
        return { value };
    }

    [Symbol.iterator]() {
        return this;
    }
}

/**
 * Chooses an iteration strategy for `vector`:
 * - a bitmap-driven iterator when the vector has nulls,
 * - the typed array's own iterator for the stride-1 types listed below,
 * - otherwise an index-based iterator over the get-visitor's accessor.
 * @ignore
 */
function vectorIterator<T extends DataType>(vector: VectorType<T>): IterableIterator<T['TValue'] | null> {

    // Vectors with nulls must consult the validity bitmap for every slot
    if (vector.nullCount > 0) {
        return nullableIterator<T>(vector);
    }

    const { type, typeId, length } = vector;

    // Types whose values can be handed out straight from the backing typed array
    const canUseNativeIterator = vector.stride === 1 && (
        (typeId === Type.Timestamp) ||
        (typeId === Type.Int && (type as Int).bitWidth !== 64) ||
        (typeId === Type.Time && (type as Time).bitWidth !== 64) ||
        (typeId === Type.Float && (type as Float).precision > 0 /* Precision.HALF */)
    );

    if (canUseNativeIterator) {
        const window = vector.data.values.subarray(0, length);
        return window[Symbol.iterator]();
    }

    // Everything else goes element by element through the get visitor
    const getFn = getVisitor.getVisitFn(vector);
    return new VectorIterator(vector, getFn);
}

// Every visit method shares the same strategy-selecting implementation.
IteratorVisitor.prototype.visitNull = vectorIterator;
IteratorVisitor.prototype.visitBool = vectorIterator;
IteratorVisitor.prototype.visitInt = vectorIterator;
IteratorVisitor.prototype.visitInt8 = vectorIterator;
IteratorVisitor.prototype.visitInt16 = vectorIterator;
IteratorVisitor.prototype.visitInt32 = vectorIterator;
IteratorVisitor.prototype.visitInt64 = vectorIterator;
IteratorVisitor.prototype.visitUint8 = vectorIterator;
IteratorVisitor.prototype.visitUint16 = vectorIterator;
IteratorVisitor.prototype.visitUint32 = vectorIterator;
IteratorVisitor.prototype.visitUint64 = vectorIterator;
IteratorVisitor.prototype.visitFloat = vectorIterator;
IteratorVisitor.prototype.visitFloat16 = vectorIterator;
IteratorVisitor.prototype.visitFloat32 = vectorIterator;
IteratorVisitor.prototype.visitFloat64 = vectorIterator;
IteratorVisitor.prototype.visitUtf8 = vectorIterator;
IteratorVisitor.prototype.visitBinary = vectorIterator;
IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator;
IteratorVisitor.prototype.visitDate = vectorIterator;
IteratorVisitor.prototype.visitDateDay = vectorIterator;
IteratorVisitor.prototype.visitDateMillisecond = vectorIterator;
IteratorVisitor.prototype.visitTimestamp = vectorIterator;
IteratorVisitor.prototype.visitTimestampSecond = vectorIterator;
IteratorVisitor.prototype.visitTimestampMillisecond = vectorIterator;
IteratorVisitor.prototype.visitTimestampMicrosecond = vectorIterator;
IteratorVisitor.prototype.visitTimestampNanosecond = vectorIterator;
IteratorVisitor.prototype.visitTime = vectorIterator;
IteratorVisitor.prototype.visitTimeSecond = vectorIterator;
IteratorVisitor.prototype.visitTimeMillisecond = vectorIterator;
IteratorVisitor.prototype.visitTimeMicrosecond = vectorIterator;
IteratorVisitor.prototype.visitTimeNanosecond = vectorIterator;
IteratorVisitor.prototype.visitDecimal = vectorIterator;
IteratorVisitor.prototype.visitList = vectorIterator;
IteratorVisitor.prototype.visitStruct = vectorIterator;
IteratorVisitor.prototype.visitUnion = vectorIterator;
IteratorVisitor.prototype.visitDenseUnion = vectorIterator;
IteratorVisitor.prototype.visitSparseUnion = vectorIterator;
IteratorVisitor.prototype.visitDictionary = vectorIterator;
IteratorVisitor.prototype.visitInterval = vectorIterator;
IteratorVisitor.prototype.visitIntervalDayTime = vectorIterator;
IteratorVisitor.prototype.visitIntervalYearMonth = vectorIterator;
IteratorVisitor.prototype.visitFixedSizeList = vectorIterator;
IteratorVisitor.prototype.visitMap = vectorIterator;

/** @ignore */
export const instance = new IteratorVisitor();
diff --git a/src/arrow/js/src/visitor/jsontypeassembler.ts b/src/arrow/js/src/visitor/jsontypeassembler.ts new file mode 100644 index 000000000..54f046f64 --- /dev/null +++ b/src/arrow/js/src/visitor/jsontypeassembler.ts @@ -0,0 +1,91 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import * as type from '../type';
import { Visitor } from '../visitor';
import { Type as ArrowType } from '../fb/Schema';
import { Precision, DateUnit, TimeUnit, IntervalUnit, UnionMode } from '../enum';

/** @ignore */
export interface JSONTypeAssembler extends Visitor {
    visit<T extends type.DataType>(node: T): Record<string, unknown> | undefined;
}

/**
 * Lower-cased `ArrowType` enum member name for `typeId`, as used for the
 * `'name'` entry of every assembled type description.
 * @ignore
 */
function jsonTypeName(typeId: number): string {
    return ArrowType[typeId].toLowerCase();
}

/**
 * Visitor that describes a DataType as a plain object: a `'name'` entry plus
 * whatever type-specific entries each visit method adds.
 * @ignore
 */
export class JSONTypeAssembler extends Visitor {
    /** Describes `node`, or returns `undefined` when `node` is null/undefined. */
    public visit<T extends type.DataType>(node: T): Record<string, unknown> | undefined {
        if (node == null) {
            return undefined;
        }
        return super.visit(node);
    }
    public visitNull<T extends type.Null>(node: T) {
        return { 'name': jsonTypeName(node.typeId) };
    }
    public visitInt<T extends type.Int>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'bitWidth': node.bitWidth, 'isSigned': node.isSigned };
    }
    /** `precision` is emitted as the `Precision` enum member name. */
    public visitFloat<T extends type.Float>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'precision': Precision[node.precision] };
    }
    public visitBinary<T extends type.Binary>(node: T) {
        return { 'name': jsonTypeName(node.typeId) };
    }
    public visitBool<T extends type.Bool>(node: T) {
        return { 'name': jsonTypeName(node.typeId) };
    }
    public visitUtf8<T extends type.Utf8>(node: T) {
        return { 'name': jsonTypeName(node.typeId) };
    }
    public visitDecimal<T extends type.Decimal>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'scale': node.scale, 'precision': node.precision };
    }
    /** `unit` is emitted as the `DateUnit` enum member name. */
    public visitDate<T extends type.Date_>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'unit': DateUnit[node.unit] };
    }
    /** `unit` is emitted as the `TimeUnit` enum member name. */
    public visitTime<T extends type.Time>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'unit': TimeUnit[node.unit], 'bitWidth': node.bitWidth };
    }
    /** `unit` is emitted as the `TimeUnit` enum member name. */
    public visitTimestamp<T extends type.Timestamp>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'unit': TimeUnit[node.unit], 'timezone': node.timezone };
    }
    /** `unit` is emitted as the `IntervalUnit` enum member name. */
    public visitInterval<T extends type.Interval>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'unit': IntervalUnit[node.unit] };
    }
    public visitList<T extends type.List>(node: T) {
        return { 'name': jsonTypeName(node.typeId) };
    }
    public visitStruct<T extends type.Struct>(node: T) {
        return { 'name': jsonTypeName(node.typeId) };
    }
    /** `mode` is emitted as the `UnionMode` enum member name; `typeIds` is copied into a plain array. */
    public visitUnion<T extends type.Union>(node: T) {
        const name = jsonTypeName(node.typeId);
        const mode = UnionMode[node.mode];
        const typeIds = Array.from(node.typeIds);
        return { 'name': name, 'mode': mode, 'typeIds': typeIds };
    }
    /** A dictionary type is described by its `dictionary` type. */
    public visitDictionary<T extends type.Dictionary>(node: T) {
        const { dictionary } = node;
        return this.visit(dictionary);
    }
    public visitFixedSizeBinary<T extends type.FixedSizeBinary>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'byteWidth': node.byteWidth };
    }
    public visitFixedSizeList<T extends type.FixedSizeList>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'listSize': node.listSize };
    }
    public visitMap<T extends type.Map_>(node: T) {
        return { 'name': jsonTypeName(node.typeId), 'keysSorted': node.keysSorted };
    }
}
diff --git a/src/arrow/js/src/visitor/jsonvectorassembler.ts b/src/arrow/js/src/visitor/jsonvectorassembler.ts new file mode 100644 index 000000000..f3c013344 --- /dev/null +++ b/src/arrow/js/src/visitor/jsonvectorassembler.ts @@ -0,0 +1,177 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+ +import { BN } from '../util/bn'; +import { Column } from '../column'; +import { Vector } from '../vector'; +import { Visitor } from '../visitor'; +import { BufferType } from '../enum'; +import { RecordBatch } from '../recordbatch'; +import { VectorType as V } from '../interfaces'; +import { UnionMode, DateUnit, TimeUnit } from '../enum'; +import { BitIterator, getBit, getBool } from '../util/bit'; +import { selectColumnChildrenArgs } from '../util/args'; +import { + DataType, + Float, Int, Date_, Interval, Time, Timestamp, Union, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, +} from '../type'; + +/** @ignore */ +export interface JSONVectorAssembler extends Visitor { + + visit <T extends Column> (node: T ): Record<string, unknown>; + visitMany <T extends Column> (cols: T[]): Record<string, unknown>[]; + getVisitFn<T extends DataType>(node: Column<T>): (column: Column<T>) => { name: string; count: number; VALIDITY: (0 | 1)[]; DATA?: any[]; OFFSET?: number[]; TYPE?: number[]; children?: any[] }; + + visitNull <T extends Null> (vector: V<T>): Record<string, never>; + visitBool <T extends Bool> (vector: V<T>): { DATA: boolean[] }; + visitInt <T extends Int> (vector: V<T>): { DATA: (number | string)[] }; + visitFloat <T extends Float> (vector: V<T>): { DATA: number[] }; + visitUtf8 <T extends Utf8> (vector: V<T>): { DATA: string[]; OFFSET: number[] }; + visitBinary <T extends Binary> (vector: V<T>): { DATA: string[]; OFFSET: number[] }; + visitFixedSizeBinary <T extends FixedSizeBinary> (vector: V<T>): { DATA: string[] }; + visitDate <T extends Date_> (vector: V<T>): { DATA: number[] }; + visitTimestamp <T extends Timestamp> (vector: V<T>): { DATA: string[] }; + visitTime <T extends Time> (vector: V<T>): { DATA: number[] }; + visitDecimal <T extends Decimal> (vector: V<T>): { DATA: string[] }; + visitList <T extends List> (vector: V<T>): { children: any[]; OFFSET: number[] }; + visitStruct <T extends Struct> (vector: V<T>): { 
/**
 * Visitor that turns Columns into plain JavaScript objects.
 *
 * Every visited column yields `name`, `count` and `VALIDITY`, merged with the
 * type-specific keys (`DATA`, `OFFSET`, `TYPE`, `children`) produced by the
 * matching `visit<Type>` method below.
 * NOTE(review): the key names look like the Arrow integration-test JSON
 * layout -- confirm against the consumer of this output.
 * @ignore
 */
export class JSONVectorAssembler extends Visitor {

    /**
     * Convenience entry point: normalises the arguments with
     * `selectColumnChildrenArgs(RecordBatch, args)` and visits each result
     * with a fresh assembler.
     * @nocollapse
     */
    public static assemble<T extends Column | RecordBatch>(...args: (T | T[])[]) {
        return new JSONVectorAssembler().visitMany(selectColumnChildrenArgs(RecordBatch, args));
    }

    /**
     * Assembles a single column.
     *
     * Dictionary-typed columns are serialized using their `indices` type.
     * The validity buffer slot is blanked out in the cloned data handed to
     * the type-specific visitor; validity is reported separately through the
     * `VALIDITY` key instead.
     */
    public visit<T extends Column>(column: T) {
        const { data, name, length } = column;
        const { offset, nullCount, nullBitmap } = data;
        const type = DataType.isDictionary(column.type) ? column.type.indices : column.type;
        // Copy of the buffer list with the VALIDITY entry replaced by undefined.
        const buffers = Object.assign([], data.buffers, { [BufferType.VALIDITY]: undefined });
        return {
            'name': name,
            'count': length,
            // Null type: no validity; no nulls: all ones; otherwise read the bits of nullBitmap.
            'VALIDITY': DataType.isNull(type) ? undefined
                : nullCount <= 0 ? Array.from({ length }, () => 1)
                : [...new BitIterator(nullBitmap, offset, length, null, getBit)],
            // NOTE(review): the literal 0 passed to clone() is presumably the null count -- confirm.
            ...super.visit(Vector.new(data.clone(type, offset, length, 0, buffers)))
        };
    }
    /** Null columns contribute no type-specific keys. */
    public visitNull() { return {}; }
    /** Expands the bit-packed values into one entry per element via `getBool`. */
    public visitBool<T extends Bool>({ values, offset, length }: V<T>) {
        return { 'DATA': [...new BitIterator(values, offset, length, null, getBool)] };
    }
    /** Integers narrower than 64 bits are emitted as-is; wider ones as strings built from pairs of 32-bit words. */
    public visitInt<T extends Int>(vector: V<T>) {
        return {
            'DATA': vector.type.bitWidth < 64
                ? [...vector.values]
                : [...bigNumsToStrings(vector.values as (Int32Array | Uint32Array), 2)]
        };
    }
    /** Emits the raw values array. */
    public visitFloat<T extends Float>(vector: V<T>) {
        return { 'DATA': [...vector.values] };
    }
    /** Emits the iterated vector elements plus the value offsets. */
    public visitUtf8<T extends Utf8>(vector: V<T>) {
        return { 'DATA': [...vector], 'OFFSET': [...vector.valueOffsets] };
    }
    /** Emits each element as an upper-case hex string plus the value offsets. */
    public visitBinary<T extends Binary>(vector: V<T>) {
        return { 'DATA': [...binaryToString(vector)], OFFSET: [...vector.valueOffsets] };
    }
    /** Emits each element as an upper-case hex string. */
    public visitFixedSizeBinary<T extends FixedSizeBinary>(vector: V<T>) {
        return { 'DATA': [...binaryToString(vector)] };
    }
    /** DAY dates are emitted as raw values; any other unit as strings built from pairs of 32-bit words. */
    public visitDate<T extends Date_>(vector: V<T>) {
        return {
            'DATA': vector.type.unit === DateUnit.DAY
                ? [...vector.values]
                : [...bigNumsToStrings(vector.values, 2)]
        };
    }
    /** Timestamps are always emitted as strings built from pairs of 32-bit words. */
    public visitTimestamp<T extends Timestamp>(vector: V<T>) {
        return { 'DATA': [...bigNumsToStrings(vector.values, 2)] };
    }
    /** Units below MICROSECOND are emitted as raw values; the others as strings built from pairs of 32-bit words. */
    public visitTime<T extends Time>(vector: V<T>) {
        return {
            'DATA': vector.type.unit < TimeUnit.MICROSECOND
                ? [...vector.values]
                : [...bigNumsToStrings(vector.values, 2)]
        };
    }
    /** Decimals are emitted as strings built from groups of four 32-bit words. */
    public visitDecimal<T extends Decimal>(vector: V<T>) {
        return { 'DATA': [...bigNumsToStrings(vector.values, 4)] };
    }
    /** Emits the value offsets and recursively assembles every child as its own Column. */
    public visitList<T extends List>(vector: V<T>) {
        return {
            'OFFSET': [...vector.valueOffsets],
            'children': vector.type.children.map((f, i) =>
                this.visit(new Column(f, [vector.getChildAt(i)!])))
        };
    }
    /** Recursively assembles every child as its own Column. */
    public visitStruct<T extends Struct>(vector: V<T>) {
        return {
            'children': vector.type.children.map((f, i) =>
                this.visit(new Column(f, [vector.getChildAt(i)!])))
        };
    }
    /** Emits the type ids, the value offsets (Dense mode only) and the recursively assembled children. */
    public visitUnion<T extends Union>(vector: V<T>) {
        return {
            'TYPE': [...vector.typeIds],
            'OFFSET': vector.type.mode === UnionMode.Dense ? [...vector.valueOffsets] : undefined,
            'children': vector.type.children.map((f, i) => this.visit(new Column(f, [vector.getChildAt(i)!])))
        };
    }
    /** Emits the raw values array. */
    public visitInterval<T extends Interval>(vector: V<T>) {
        return { 'DATA': [...vector.values] };
    }
    /** Recursively assembles every child as its own Column. */
    public visitFixedSizeList<T extends FixedSizeList>(vector: V<T>) {
        return {
            'children': vector.type.children.map((f, i) =>
                this.visit(new Column(f, [vector.getChildAt(i)!])))
        };
    }
    /** Emits the value offsets and recursively assembles every child as its own Column. */
    public visitMap<T extends Map_>(vector: V<T>) {
        return {
            'OFFSET': [...vector.valueOffsets],
            'children': vector.type.children.map((f, i) =>
                this.visit(new Column(f, [vector.getChildAt(i)!])))
        };
    }
}

/**
 * Yields one upper-case hexadecimal string per element of `vector`, two
 * digits per byte (single-digit bytes are left-padded with '0').
 * @ignore
 */
function* binaryToString(vector: Vector<Binary> | Vector<FixedSizeBinary>) {
    for (const octets of vector as Iterable<Uint8Array>) {
        yield octets.reduce((str, byte) => {
            return `${str}${('0' + (byte & 0xFF).toString(16)).slice(-2)}`;
        }, '').toUpperCase();
    }
}

/**
 * Yields one string per group of `stride` consecutive 32-bit words in
 * `values`; each group is wrapped with `BN.new(words, false)` and stringified
 * through a template literal.
 * @ignore
 */
function* bigNumsToStrings(values: Uint32Array | Int32Array, stride: number) {
    for (let i = -1, n = values.length / stride; ++i < n;) {
        yield `${BN.new(values.subarray((i + 0) * stride, (i + 1) * stride), false)}`;
    }
}
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from '../data'; +import { Field } from '../schema'; +import { Vector } from '../vector'; +import { Visitor } from '../visitor'; +import { encodeUtf8 } from '../util/utf8'; +import { VectorType } from '../interfaces'; +import { float64ToUint16 } from '../util/math'; +import { toArrayBufferView } from '../util/buffer'; +import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum'; +import { + DataType, Dictionary, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Float, Float16, Float32, Float64, + Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, + Date_, DateDay, DateMillisecond, + Interval, IntervalDayTime, IntervalYearMonth, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Union, DenseUnion, SparseUnion, +} from '../type'; + +/** @ignore */ +export interface SetVisitor extends Visitor { + visit<T extends VectorType>(node: T, index: number, value: T['TValue']): void; + visitMany<T extends VectorType>(nodes: T[], indices: number[], values: T['TValue'][]): void[]; + getVisitFn<T extends Type>(node: T): (vector: VectorType<T>, index: number, value: VectorType<T>['TValue']) => void; + getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (vector: VectorType<T>, index: number, value: VectorType<T>['TValue']) => void; + visitNull <T extends Null> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitBool <T 
extends Bool> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitInt <T extends Int> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitInt8 <T extends Int8> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitInt16 <T extends Int16> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitInt32 <T extends Int32> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitInt64 <T extends Int64> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitUint8 <T extends Uint8> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitUint16 <T extends Uint16> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitUint32 <T extends Uint32> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitUint64 <T extends Uint64> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitFloat <T extends Float> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitFloat16 <T extends Float16> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitFloat32 <T extends Float32> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitFloat64 <T extends Float64> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitUtf8 <T extends Utf8> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitBinary <T extends Binary> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitFixedSizeBinary <T extends FixedSizeBinary> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitDate <T extends Date_> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitDateDay <T extends DateDay> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitDateMillisecond <T extends DateMillisecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimestamp <T 
extends Timestamp> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimestampSecond <T extends TimestampSecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimestampMillisecond <T extends TimestampMillisecond>(vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimestampMicrosecond <T extends TimestampMicrosecond>(vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimestampNanosecond <T extends TimestampNanosecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTime <T extends Time> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimeSecond <T extends TimeSecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimeMillisecond <T extends TimeMillisecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimeMicrosecond <T extends TimeMicrosecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitTimeNanosecond <T extends TimeNanosecond> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitDecimal <T extends Decimal> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitList <T extends List> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitStruct <T extends Struct> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitUnion <T extends Union> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitDenseUnion <T extends DenseUnion> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitSparseUnion <T extends SparseUnion> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitDictionary <T extends Dictionary> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitInterval <T extends Interval> (vector: VectorType<T>, index: number, value: T['TValue']): void; + visitIntervalDayTime <T extends IntervalDayTime> 
/**
 * Writes `value` into `data` as a 64-bit two's-complement integer split over
 * two 32-bit slots: the low word at `index`, the high word at `index + 1`.
 *
 * Any fractional part of `value` is dropped (toward zero) first. The high
 * word is then derived with `Math.floor`, not truncation: for a negative
 * value the low word stores the value modulo 2^32, so the high word must be
 * the floored quotient. Truncating would leave it one too large, e.g. -1
 * would be stored as 0x00000000_FFFFFFFF instead of 0xFFFFFFFF_FFFFFFFF.
 * @ignore
 */
const setInt64FromNumber = (data: Int32Array, index: number, value: number) => {
    const whole = Math.trunc(value);
    data[index] = (whole % 4294967296) | 0;
    data[index + 1] = Math.floor(whole / 4294967296) | 0;
};

/**
 * Stores a millisecond epoch as a whole number of days.
 *
 * Floors rather than truncates so that an instant before the epoch maps to
 * the day that contains it (e.g. -1 ms is day -1, not day 0).
 * @ignore
 */
const setEpochMsToDays = (data: Int32Array, index: number, epochMs: number) => { data[index] = Math.floor(epochMs / 86400000); };
/**
 * Stores `epochMs` unchanged as a 64-bit integer at slots `index`, `index + 1`.
 * @ignore
 */
const setEpochMsToMillisecondsLong = (data: Int32Array, index: number, epochMs: number) => {
    setInt64FromNumber(data, index, epochMs);
};
/**
 * Stores `epochMs * 1000` as a 64-bit integer at slots `index`, `index + 1`.
 * @ignore
 */
const setEpochMsToMicrosecondsLong = (data: Int32Array, index: number, epochMs: number) => {
    setInt64FromNumber(data, index, epochMs * 1000);
};
/**
 * Stores `epochMs * 1000000` as a 64-bit integer at slots `index`, `index + 1`.
 *
 * The product is computed in double precision, so results beyond 2^53 are
 * only as exact as a JavaScript number allows.
 * @ignore
 */
const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epochMs: number) => {
    setInt64FromNumber(data, index, epochMs * 1000000);
};
/**
 * Writes one 64-bit integer element of `vector` at `index`.
 *
 * Accepted value forms:
 *  - `bigint`: stored directly through the vector's `values64` view.
 *  - `number`: fractional part dropped, then written as a full 64-bit
 *    two's-complement value. Both 32-bit words are written; writing only the
 *    low word would leave whatever high word the slot held before, so
 *    negative numbers would never be sign-extended.
 *  - anything else: converted to the vector's `ArrayType` with
 *    `toArrayBufferView`, and its first `stride` words are copied in.
 * @ignore
 */
const setNumericX2 = <T extends Numeric2X>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    switch (typeof value) {
        case 'bigint': vector.values64[index] = value; break;
        case 'number': {
            const { stride, values } = vector;
            const whole = Math.trunc(value);
            // The typed-array store wraps modulo 2^32, which yields the low word.
            values[stride * index] = whole;
            // Floored quotient = high word (gives -1 for small negative values).
            // NOTE(review): assumes an element spans two 32-bit words (stride === 2),
            // as the separate values64 view implies -- confirm.
            values[stride * index + 1] = Math.floor(whole / 4294967296);
            break;
        }
        default: {
            const val = value as T['TArray'];
            const { stride, ArrayType } = vector;
            const long = toArrayBufferView<T['TArray']>(ArrayType, val);
            vector.values.set(long.subarray(0, stride), stride * index);
        }
    }
};
/* istanbul ignore next */
/**
 * Generic integer setter: widths below 64 bits go through `setNumeric`,
 * everything else through `setNumericX2`.
 * @ignore
 */
const setInt = <T extends Int>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    if (vector.type.bitWidth < 64) {
        setNumeric(vector as VectorType<Numeric1X>, index, value as Numeric1X['TValue']);
    } else {
        setNumericX2(vector as VectorType<Numeric2X>, index, value as Numeric2X['TValue']);
    }
};

/* istanbul ignore next */
/**
 * Generic float setter: HALF precision goes through `setFloat16`, every
 * other precision through `setNumeric`.
 * @ignore
 */
const setFloat = <T extends Float>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    if (vector.type.precision === Precision.HALF) {
        setFloat16(vector as VectorType<Float16>, index, value);
    } else {
        setNumeric(vector as VectorType<Numeric1X>, index, value);
    }
};

/* istanbul ignore next */
/**
 * Generic date setter: DAY-unit vectors go through `setDateDay`, every other
 * unit through `setDateMillisecond`.
 * @ignore
 */
const setDate = <T extends Date_> (vector: VectorType<T>, index: number, value: T['TValue']): void => {
    if (vector.type.unit === DateUnit.DAY) {
        setDateDay(vector as VectorType<DateDay>, index, value);
    } else {
        setDateMillisecond(vector as VectorType<DateMillisecond>, index, value);
    }
};
/* istanbul ignore next */
/**
 * Generic timestamp setter: forwards to the unit-specific setter selected by
 * `vector.type.unit`. A unit matching none of the four TimeUnit members
 * checked here results in no write.
 * @ignore
 */
const setTimestamp = <T extends Timestamp>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    const unit = vector.type.unit;
    if (unit === TimeUnit.SECOND) {
        setTimestampSecond(vector as VectorType<TimestampSecond>, index, value);
    } else if (unit === TimeUnit.MILLISECOND) {
        setTimestampMillisecond(vector as VectorType<TimestampMillisecond>, index, value);
    } else if (unit === TimeUnit.MICROSECOND) {
        setTimestampMicrosecond(vector as VectorType<TimestampMicrosecond>, index, value);
    } else if (unit === TimeUnit.NANOSECOND) {
        setTimestampNanosecond(vector as VectorType<TimestampNanosecond>, index, value);
    }
};
/**
 * Overwrites the child slots that belong to list element `index`.
 *
 * The slot range is `[valueOffsets[index], valueOffsets[index + 1])`; the
 * k-th slot of the range receives `value.get(k)`.
 * @ignore
 */
const setList = <T extends List>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    const child = vector.getChildAt(0)!;
    const offsets = vector.valueOffsets;
    const begin = offsets[index];
    const end = offsets[index + 1];
    for (let slot = begin; slot < end; ++slot) {
        child.set(slot, value.get(slot - begin));
    }
};

/**
 * Overwrites the child slots that belong to map element `index`.
 *
 * `value` may be a `Map` or a plain object; either way it is flattened to an
 * array of `[key, value]` entries, and the k-th slot of the range
 * `[valueOffsets[index], valueOffsets[index + 1])` receives the k-th entry.
 * @ignore
 */
const setMap = <T extends Map_>(vector: VectorType<T>, index: number, value: T['TValue']) => {
    const child = vector.getChildAt(0)!;
    const offsets = vector.valueOffsets;
    const entries = value instanceof Map ? Array.from(value) : Object.entries(value);
    const begin = offsets[index];
    const end = offsets[index + 1];
    for (let slot = begin; slot < end; ++slot) {
        child.set(slot, entries[slot - begin]);
    }
};
/* istanbul ignore next */
/**
 * Generic union setter: Dense-mode vectors go through `setDenseUnion`, all
 * others through `setSparseUnion`.
 * @ignore
 */
const setUnion = <
    V extends VectorType<Union> | VectorType<DenseUnion> | VectorType<SparseUnion>
>(vector: V, index: number, value: V['TValue']) => {
    if (vector.type.mode === UnionMode.Dense) {
        setDenseUnion(vector as VectorType<DenseUnion>, index, value);
    } else {
        setSparseUnion(vector as VectorType<SparseUnion>, index, value);
    }
};

/**
 * Dense union: looks up the child selected by the element's type id and
 * writes `value` at the child position given by `valueOffsets[index]`.
 * Does nothing when the child lookup comes back empty.
 * @ignore
 */
const setDenseUnion = <T extends DenseUnion>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    const typeId = vector.typeIds[index];
    const target = vector.getChildAt(vector.typeIdToChildIndex[typeId]);
    if (target) {
        target.set(vector.valueOffsets[index], value);
    }
};

/**
 * Sparse union: looks up the child selected by the element's type id and
 * writes `value` at the same `index` in that child.
 * Does nothing when the child lookup comes back empty.
 * @ignore
 */
const setSparseUnion = <T extends SparseUnion>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    const typeId = vector.typeIds[index];
    const target = vector.getChildAt(vector.typeIdToChildIndex[typeId]);
    if (target) {
        target.set(index, value);
    }
};
/**
 * Overwrites the `stride` child slots that belong to fixed-size-list element
 * `index`: slot `index * stride + k` receives `value.get(k)`.
 * @ignore
 */
const setFixedSizeList = <T extends FixedSizeList>(vector: VectorType<T>, index: number, value: T['TValue']): void => {
    const items = vector.getChildAt(0)!;
    const width = vector.stride;
    const base = index * width;
    for (let k = 0; k < width; ++k) {
        items.set(base + k, value.get(k));
    }
};
+SetVisitor.prototype.visitTimestampMillisecond = setTimestampMillisecond; +SetVisitor.prototype.visitTimestampMicrosecond = setTimestampMicrosecond; +SetVisitor.prototype.visitTimestampNanosecond = setTimestampNanosecond; +SetVisitor.prototype.visitTime = setTime; +SetVisitor.prototype.visitTimeSecond = setTimeSecond; +SetVisitor.prototype.visitTimeMillisecond = setTimeMillisecond; +SetVisitor.prototype.visitTimeMicrosecond = setTimeMicrosecond; +SetVisitor.prototype.visitTimeNanosecond = setTimeNanosecond; +SetVisitor.prototype.visitDecimal = setDecimal; +SetVisitor.prototype.visitList = setList; +SetVisitor.prototype.visitStruct = setStruct; +SetVisitor.prototype.visitUnion = setUnion; +SetVisitor.prototype.visitDenseUnion = setDenseUnion; +SetVisitor.prototype.visitSparseUnion = setSparseUnion; +SetVisitor.prototype.visitDictionary = setDictionary; +SetVisitor.prototype.visitInterval = setIntervalValue; +SetVisitor.prototype.visitIntervalDayTime = setIntervalDayTime; +SetVisitor.prototype.visitIntervalYearMonth = setIntervalYearMonth; +SetVisitor.prototype.visitFixedSizeList = setFixedSizeList; +SetVisitor.prototype.visitMap = setMap; + +/** @ignore */ +export const instance = new SetVisitor(); diff --git a/src/arrow/js/src/visitor/toarray.ts b/src/arrow/js/src/visitor/toarray.ts new file mode 100644 index 000000000..395e9943c --- /dev/null +++ b/src/arrow/js/src/visitor/toarray.ts @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from '../data'; +import { Type } from '../enum'; +import { Visitor } from '../visitor'; +import { VectorType } from '../interfaces'; +import { instance as iteratorVisitor } from './iterator'; +import { + DataType, Dictionary, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Float, Float16, Float32, Float64, + Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, + Date_, DateDay, DateMillisecond, + Interval, IntervalDayTime, IntervalYearMonth, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Union, DenseUnion, SparseUnion, +} from '../type'; + +/** @ignore */ +export interface ToArrayVisitor extends Visitor { + visit<T extends VectorType>(node: T): T['TArray']; + visitMany<T extends VectorType>(nodes: T[]): T['TArray'][]; + getVisitFn<T extends Type>(node: T): (vector: VectorType<T>) => VectorType<T>['TArray']; + getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (vector: VectorType<T>) => VectorType<T>['TArray']; + visitNull <T extends Null> (vector: VectorType<T>): VectorType<T>['TArray']; + visitBool <T extends Bool> (vector: VectorType<T>): VectorType<T>['TArray']; + visitInt <T extends Int> (vector: VectorType<T>): VectorType<T>['TArray']; + visitInt8 <T extends Int8> (vector: VectorType<T>): VectorType<T>['TArray']; + visitInt16 <T extends Int16> (vector: VectorType<T>): VectorType<T>['TArray']; + visitInt32 <T extends Int32> (vector: 
VectorType<T>): VectorType<T>['TArray']; + visitInt64 <T extends Int64> (vector: VectorType<T>): VectorType<T>['TArray']; + visitUint8 <T extends Uint8> (vector: VectorType<T>): VectorType<T>['TArray']; + visitUint16 <T extends Uint16> (vector: VectorType<T>): VectorType<T>['TArray']; + visitUint32 <T extends Uint32> (vector: VectorType<T>): VectorType<T>['TArray']; + visitUint64 <T extends Uint64> (vector: VectorType<T>): VectorType<T>['TArray']; + visitFloat <T extends Float> (vector: VectorType<T>): VectorType<T>['TArray']; + visitFloat16 <T extends Float16> (vector: VectorType<T>): VectorType<T>['TArray']; + visitFloat32 <T extends Float32> (vector: VectorType<T>): VectorType<T>['TArray']; + visitFloat64 <T extends Float64> (vector: VectorType<T>): VectorType<T>['TArray']; + visitUtf8 <T extends Utf8> (vector: VectorType<T>): VectorType<T>['TArray']; + visitBinary <T extends Binary> (vector: VectorType<T>): VectorType<T>['TArray']; + visitFixedSizeBinary <T extends FixedSizeBinary> (vector: VectorType<T>): VectorType<T>['TArray']; + visitDate <T extends Date_> (vector: VectorType<T>): VectorType<T>['TArray']; + visitDateDay <T extends DateDay> (vector: VectorType<T>): VectorType<T>['TArray']; + visitDateMillisecond <T extends DateMillisecond> (vector: VectorType<T>): VectorType<T>['TArray']; + visitTimestamp <T extends Timestamp> (vector: VectorType<T>): VectorType<T>['TArray']; + visitTimestampSecond <T extends TimestampSecond> (vector: VectorType<T>): VectorType<T>['TArray']; + visitTimestampMillisecond <T extends TimestampMillisecond>(vector: VectorType<T>): VectorType<T>['TArray']; + visitTimestampMicrosecond <T extends TimestampMicrosecond>(vector: VectorType<T>): VectorType<T>['TArray']; + visitTimestampNanosecond <T extends TimestampNanosecond> (vector: VectorType<T>): VectorType<T>['TArray']; + visitTime <T extends Time> (vector: VectorType<T>): VectorType<T>['TArray']; + visitTimeSecond <T extends TimeSecond> (vector: VectorType<T>): 
/**
 * Shared implementation behind every ToArrayVisitor method.
 *
 * Int, Float, Decimal, Time and Timestamp vectors return a `subarray` over
 * the first `length * stride` entries of `vector.data.values`; every other
 * type is materialised element by element through the iterator visitor.
 * @ignore
 */
function arrayOfVector<T extends DataType>(vector: VectorType<T>): T['TArray'] {

    const { type, length, stride } = vector;
    const id = type.typeId;

    // Primitive-backed types: hand back a view, no per-element work.
    const backedByValues =
        id === Type.Int ||
        id === Type.Float || id === Type.Decimal ||
        id === Type.Time || id === Type.Timestamp;

    if (backedByValues) {
        return vector.data.values.subarray(0, length * stride);
    }

    // Everything else: collect the iterator's output into a fresh array.
    return [...iteratorVisitor.visit(vector)] as T['TArray'];
}
+ToArrayVisitor.prototype.visitUnion = arrayOfVector; +ToArrayVisitor.prototype.visitDenseUnion = arrayOfVector; +ToArrayVisitor.prototype.visitSparseUnion = arrayOfVector; +ToArrayVisitor.prototype.visitDictionary = arrayOfVector; +ToArrayVisitor.prototype.visitInterval = arrayOfVector; +ToArrayVisitor.prototype.visitIntervalDayTime = arrayOfVector; +ToArrayVisitor.prototype.visitIntervalYearMonth = arrayOfVector; +ToArrayVisitor.prototype.visitFixedSizeList = arrayOfVector; +ToArrayVisitor.prototype.visitMap = arrayOfVector; + +/** @ignore */ +export const instance = new ToArrayVisitor(); diff --git a/src/arrow/js/src/visitor/typeassembler.ts b/src/arrow/js/src/visitor/typeassembler.ts new file mode 100644 index 000000000..4cd65d926 --- /dev/null +++ b/src/arrow/js/src/visitor/typeassembler.ts @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 

import { flatbuffers } from 'flatbuffers';
import Long = flatbuffers.Long;
import Builder = flatbuffers.Builder;

import * as type from '../type';
import { Visitor } from '../visitor';

import {
    Null,
    Int,
    FloatingPoint,
    Binary,
    Bool,
    Utf8,
    Decimal,
    Date,
    Time,
    Timestamp,
    Interval,
    List,
    Struct_ as Struct,
    Union,
    DictionaryEncoding,
    FixedSizeBinary,
    FixedSizeList,
    Map as Map_,
} from '../fb/Schema';

/** @ignore */
export interface TypeAssembler extends Visitor {
    visit<T extends type.DataType>(node: T, builder: Builder): number | undefined;
}

/**
 * Visitor that writes the flatbuffer description of a `DataType` into a
 * `Builder`. Each `visit*` method runs the matching `start*` / `add*` / `end*`
 * sequence from `../fb/Schema` and returns whatever the `end*` call returns
 * (presumably the offset of the finished table; confirm against the generated
 * bindings).
 *
 * @ignore
 */
export class TypeAssembler extends Visitor {
    /** Dispatches on `node`; yields `undefined` when either argument is null/undefined. */
    public visit<T extends type.DataType>(node: T, builder: Builder): number | undefined {
        if (node == null || builder == null) {
            return undefined;
        }
        return super.visit(node, builder);
    }

    /** Null carries no fields. */
    public visitNull<T extends type.Null>(_node: T, b: Builder) {
        Null.startNull(b);
        return Null.endNull(b);
    }

    /** Int records its bit width and signedness. */
    public visitInt<T extends type.Int>(node: T, b: Builder) {
        Int.startInt(b);
        Int.addBitWidth(b, node.bitWidth);
        Int.addIsSigned(b, node.isSigned);
        return Int.endInt(b);
    }

    /** Float records its precision. */
    public visitFloat<T extends type.Float>(node: T, b: Builder) {
        FloatingPoint.startFloatingPoint(b);
        FloatingPoint.addPrecision(b, node.precision);
        return FloatingPoint.endFloatingPoint(b);
    }

    /** Binary carries no fields. */
    public visitBinary<T extends type.Binary>(_node: T, b: Builder) {
        Binary.startBinary(b);
        return Binary.endBinary(b);
    }

    /** Bool carries no fields. */
    public visitBool<T extends type.Bool>(_node: T, b: Builder) {
        Bool.startBool(b);
        return Bool.endBool(b);
    }

    /** Utf8 carries no fields. */
    public visitUtf8<T extends type.Utf8>(_node: T, b: Builder) {
        Utf8.startUtf8(b);
        return Utf8.endUtf8(b);
    }

    /** Decimal records scale and precision. */
    public visitDecimal<T extends type.Decimal>(node: T, b: Builder) {
        Decimal.startDecimal(b);
        Decimal.addScale(b, node.scale);
        Decimal.addPrecision(b, node.precision);
        return Decimal.endDecimal(b);
    }

    /** Date records its unit. */
    public visitDate<T extends type.Date_>(node: T, b: Builder) {
        Date.startDate(b);
        Date.addUnit(b, node.unit);
        return Date.endDate(b);
    }

    /** Time records its unit and bit width. */
    public visitTime<T extends type.Time>(node: T, b: Builder) {
        Time.startTime(b);
        Time.addUnit(b, node.unit);
        Time.addBitWidth(b, node.bitWidth);
        return Time.endTime(b);
    }

    /** Timestamp records its unit and, when one is set, its timezone string. */
    public visitTimestamp<T extends type.Timestamp>(node: T, b: Builder) {
        // The string must be created before the Timestamp table is started.
        // A falsy timezone, or a falsy result from createString, means "no timezone".
        let timezone: number | undefined;
        if (node.timezone) {
            timezone = b.createString(node.timezone) || undefined;
        }
        Timestamp.startTimestamp(b);
        Timestamp.addUnit(b, node.unit);
        if (timezone !== undefined) {
            Timestamp.addTimezone(b, timezone);
        }
        return Timestamp.endTimestamp(b);
    }

    /** Interval records its unit. */
    public visitInterval<T extends type.Interval>(node: T, b: Builder) {
        Interval.startInterval(b);
        Interval.addUnit(b, node.unit);
        return Interval.endInterval(b);
    }

    /** List writes no fields of its own here. */
    public visitList<T extends type.List>(_node: T, b: Builder) {
        List.startList(b);
        return List.endList(b);
    }

    /** Struct writes no fields of its own here. */
    public visitStruct<T extends type.Struct>(_node: T, b: Builder) {
        Struct.startStruct_(b);
        return Struct.endStruct_(b);
    }

    /** Union records its mode and its typeIds vector. */
    public visitUnion<T extends type.Union>(node: T, b: Builder) {
        // NOTE(review): createTypeIdsVector presumably starts its own vector, which
        // would make this explicit start redundant. Confirm before removing, since
        // dropping it could change the emitted bytes.
        Union.startTypeIdsVector(b, node.typeIds.length);
        const typeIds = Union.createTypeIdsVector(b, node.typeIds);
        Union.startUnion(b);
        Union.addMode(b, node.mode);
        Union.addTypeIds(b, typeIds);
        return Union.endUnion(b);
    }

    /** Dictionary records id, ordering flag and (when available) the assembled index type. */
    public visitDictionary<T extends type.Dictionary>(node: T, b: Builder) {
        // The index type is assembled first, before the encoding table is started.
        const indexType = this.visit(node.indices, b);
        DictionaryEncoding.startDictionaryEncoding(b);
        // NOTE(review): only the low word of the Long is populated from node.id.
        DictionaryEncoding.addId(b, new Long(node.id, 0));
        DictionaryEncoding.addIsOrdered(b, node.isOrdered);
        if (indexType !== undefined) {
            DictionaryEncoding.addIndexType(b, indexType);
        }
        return DictionaryEncoding.endDictionaryEncoding(b);
    }

    /** FixedSizeBinary records its byte width. */
    public visitFixedSizeBinary<T extends type.FixedSizeBinary>(node: T, b: Builder) {
        FixedSizeBinary.startFixedSizeBinary(b);
        FixedSizeBinary.addByteWidth(b, node.byteWidth);
        return FixedSizeBinary.endFixedSizeBinary(b);
    }

    /** FixedSizeList records its list size. */
    public visitFixedSizeList<T extends type.FixedSizeList>(node: T, b: Builder) {
        FixedSizeList.startFixedSizeList(b);
        FixedSizeList.addListSize(b, node.listSize);
        return FixedSizeList.endFixedSizeList(b);
    }

    /** Map records whether its keys are sorted. */
    public visitMap<T extends type.Map_>(node: T, b: Builder) {
        Map_.startMap(b);
        Map_.addKeysSorted(b, node.keysSorted);
        return Map_.endMap(b);
    }
}

/** @ignore */
export const instance = new TypeAssembler();

// diff --git a/src/arrow/js/src/visitor/typecomparator.ts b/src/arrow/js/src/visitor/typecomparator.ts
// new file mode 100644
// index 000000000..478b505f8
// --- /dev/null
// +++ b/src/arrow/js/src/visitor/typecomparator.ts
// @@ -0,0 +1,280 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { Visitor } from '../visitor';
import { VectorType } from '../interfaces';
import { Schema, Field } from '../schema';
import {
    DataType, Dictionary,
    Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
    Float, Float16, Float32, Float64,
    Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64,
    Date_, DateDay, DateMillisecond,
    Interval, IntervalDayTime, IntervalYearMonth,
    Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond,
    Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
    Union, DenseUnion, SparseUnion,
} from '../type';

/** @ignore */
export interface TypeComparator extends Visitor {
    visit<T extends DataType>(type: T, other?: DataType | null): other is T;
    visitMany<T extends DataType>(nodes: T[], others?: DataType[] | null): boolean[];
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): (other?: DataType | null) => other is T;
    visitNull<T extends Null>(type: T, other?: DataType | null): other is T;
    visitBool<T extends Bool>(type: T, other?: DataType | null): other is T;
    visitInt<T extends Int>(type: T, other?: DataType | null): other is T;
    visitInt8<T extends Int8>(type: T, other?: DataType | null): other is T;
    visitInt16<T extends Int16>(type: T, other?: DataType | null): other is T;
    visitInt32<T extends Int32>(type: T, other?: DataType | null): other is T;
    visitInt64<T extends Int64>(type: T, other?: DataType | null): other is T;
    visitUint8<T extends Uint8>(type: T, other?: DataType | null): other is T;
    visitUint16<T extends Uint16>(type: T, other?: DataType | null): other is T;
    visitUint32<T extends Uint32>(type: T, other?: DataType | null): other is T;
    visitUint64<T extends Uint64>(type: T, other?: DataType | null): other is T;
    visitFloat<T extends Float>(type: T, other?: DataType | null): other is T;
    visitFloat16<T extends Float16>(type: T, other?: DataType | null): other is T;
    visitFloat32<T extends Float32>(type: T, other?: DataType | null): other is T;
    visitFloat64<T extends Float64>(type: T, other?: DataType | null): other is T;
    visitUtf8<T extends Utf8>(type: T, other?: DataType | null): other is T;
    visitBinary<T extends Binary>(type: T, other?: DataType | null): other is T;
    visitFixedSizeBinary<T extends FixedSizeBinary>(type: T, other?: DataType | null): other is T;
    visitDate<T extends Date_>(type: T, other?: DataType | null): other is T;
    visitDateDay<T extends DateDay>(type: T, other?: DataType | null): other is T;
    visitDateMillisecond<T extends DateMillisecond>(type: T, other?: DataType | null): other is T;
    visitTimestamp<T extends Timestamp>(type: T, other?: DataType | null): other is T;
    visitTimestampSecond<T extends TimestampSecond>(type: T, other?: DataType | null): other is T;
    visitTimestampMillisecond<T extends TimestampMillisecond>(type: T, other?: DataType | null): other is T;
    visitTimestampMicrosecond<T extends TimestampMicrosecond>(type: T, other?: DataType | null): other is T;
    visitTimestampNanosecond<T extends TimestampNanosecond>(type: T, other?: DataType | null): other is T;
    visitTime<T extends Time>(type: T, other?: DataType | null): other is T;
    visitTimeSecond<T extends TimeSecond>(type: T, other?: DataType | null): other is T;
    visitTimeMillisecond<T extends TimeMillisecond>(type: T, other?: DataType | null): other is T;
    visitTimeMicrosecond<T extends TimeMicrosecond>(type: T, other?: DataType | null): other is T;
    visitTimeNanosecond<T extends TimeNanosecond>(type: T, other?: DataType | null): other is T;
    visitDecimal<T extends Decimal>(type: T, other?: DataType | null): other is T;
    visitList<T extends List>(type: T, other?: DataType | null): other is T;
    visitStruct<T extends Struct>(type: T, other?: DataType | null): other is T;
    visitUnion<T extends Union>(type: T, other?: DataType | null): other is T;
    visitDenseUnion<T extends DenseUnion>(type: T, other?: DataType | null): other is T;
    visitSparseUnion<T extends SparseUnion>(type: T, other?: DataType | null): other is T;
    visitDictionary<T extends Dictionary>(type: T, other?: DataType | null): other is T;
    visitInterval<T extends Interval>(type: T, other?: DataType | null): other is T;
    visitIntervalDayTime<T extends IntervalDayTime>(type: T, other?: DataType | null): other is T;
    visitIntervalYearMonth<T extends IntervalYearMonth>(type: T, other?: DataType | null): other is T;
    visitFixedSizeList<T extends FixedSizeList>(type: T, other?: DataType | null): other is T;
    visitMap<T extends Map_>(type: T, other?: DataType | null): other is T;
}

/**
 * Visitor that decides whether two schemas, fields or data types are
 * structurally equal. The per-type `visit*` methods are attached to the
 * prototype further down in this file.
 *
 * @ignore
 */
export class TypeComparator extends Visitor {
    /** True when `other` is the same schema, or a schema of the same constructor with equal fields. */
    compareSchemas<T extends { [key: string]: DataType }>(schema: Schema<T>, other?: Schema | null): other is Schema<T> {
        return (schema === other) || (
            other instanceof schema.constructor &&
            this.compareManyFields(schema.fields, other.fields)
        );
    }
    /** True when both are arrays of equal length whose fields are pairwise equal. */
    compareManyFields<T extends { [key: string]: DataType }>(fields: Field<T[keyof T]>[], others?: Field[] | null): others is Field<T[keyof T]>[] {
        return (fields === others) || (
            Array.isArray(fields) &&
            Array.isArray(others) &&
            fields.length === others.length &&
            fields.every((f, i) => this.compareFields(f, others[i]))
        );
    }
    /** True when name, nullability and type of the two fields all match. */
    compareFields<T extends DataType = any>(field: Field<T>, other?: Field | null): other is Field<T> {
        return (field === other) || (
            other instanceof field.constructor &&
            field.name === other.name &&
            field.nullable === other.nullable &&
            this.visit(field.type, other.type)
        );
    }
}

/** True when `other` is an instance of `type`'s constructor (null/undefined never match). */
function compareConstructor<T extends DataType>(type: T, other?: DataType | null): other is T {
    return other instanceof type.constructor;
}

/** Comparison for types that are distinguished by constructor alone. */
function compareAny<T extends DataType>(type: T, other?: DataType | null): other is T {
    return (type === other) || compareConstructor(type, other);
}

/** Ints match on constructor, bit width and signedness. */
function compareInt<T extends Int>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.bitWidth === other.bitWidth &&
        type.isSigned === other.isSigned
    );
}

/** Floats match on constructor and precision. */
function compareFloat<T extends Float>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.precision === other.precision
    );
}

/** FixedSizeBinary matches on constructor and byte width. */
function compareFixedSizeBinary<T extends FixedSizeBinary>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.byteWidth === other.byteWidth
    );
}

/** Dates match on constructor and unit. */
function compareDate<T extends Date_>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.unit === other.unit
    );
}

/** Timestamps match on constructor, unit and timezone. */
function compareTimestamp<T extends Timestamp>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.unit === other.unit &&
        type.timezone === other.timezone
    );
}

/** Times match on constructor, unit and bit width. */
function compareTime<T extends Time>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.unit === other.unit &&
        type.bitWidth === other.bitWidth
    );
}

/** Lists match on constructor and on their child fields. */
function compareList<T extends List>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.children.length === other.children.length &&
        instance.compareManyFields(type.children, other.children)
    );
}

/** Structs match on constructor and on their child fields. */
function compareStruct<T extends Struct>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.children.length === other.children.length &&
        instance.compareManyFields(type.children, other.children)
    );
}

/** Unions match on constructor, mode, the full typeIds sequence and their child fields. */
function compareUnion<T extends Union>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.mode === other.mode &&
        // Compare lengths first: `every` only walks `type.typeIds`, so without this
        // an `other` with extra trailing ids would pass the element check.
        type.typeIds.length === other.typeIds.length &&
        type.typeIds.every((x, i) => x === other.typeIds[i]) &&
        instance.compareManyFields(type.children, other.children)
    );
}

/** Dictionaries match on constructor, id, ordering flag, index type and value type. */
function compareDictionary<T extends Dictionary>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.id === other.id &&
        type.isOrdered === other.isOrdered &&
        instance.visit(<any> type.indices, other.indices) &&
        instance.visit(type.dictionary, other.dictionary)
    );
}

/** Intervals match on constructor and unit. */
function compareInterval<T extends Interval>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.unit === other.unit
    );
}

/** FixedSizeLists match on constructor, list size and child fields. */
function compareFixedSizeList<T extends FixedSizeList>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.listSize === other.listSize &&
        type.children.length === other.children.length &&
        instance.compareManyFields(type.children, other.children)
    );
}

/** Maps match on constructor, keysSorted flag and child fields. */
function compareMap<T extends Map_>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.keysSorted === other.keysSorted &&
        type.children.length === other.children.length &&
        instance.compareManyFields(type.children, other.children)
    );
}

TypeComparator.prototype.visitNull = compareAny;
TypeComparator.prototype.visitBool = compareAny;
TypeComparator.prototype.visitInt = compareInt;
TypeComparator.prototype.visitInt8 = compareInt;
TypeComparator.prototype.visitInt16 = compareInt;
TypeComparator.prototype.visitInt32 = compareInt;
TypeComparator.prototype.visitInt64 = compareInt;
TypeComparator.prototype.visitUint8 = compareInt;
TypeComparator.prototype.visitUint16 = compareInt;
TypeComparator.prototype.visitUint32 = compareInt;
TypeComparator.prototype.visitUint64 = compareInt;
/**
 * Decimals match on constructor, scale and precision. Both fields are part of
 * the serialized type, so two decimals that differ in either are not
 * interchangeable.
 */
function compareDecimal<T extends Decimal>(type: T, other?: DataType | null): other is T {
    return (type === other) || (
        compareConstructor(type, other) &&
        type.scale === other.scale &&
        type.precision === other.precision
    );
}

TypeComparator.prototype.visitFloat = compareFloat;
TypeComparator.prototype.visitFloat16 = compareFloat;
TypeComparator.prototype.visitFloat32 = compareFloat;
TypeComparator.prototype.visitFloat64 = compareFloat;
TypeComparator.prototype.visitUtf8 = compareAny;
TypeComparator.prototype.visitBinary = compareAny;
TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary;
TypeComparator.prototype.visitDate = compareDate;
TypeComparator.prototype.visitDateDay = compareDate;
TypeComparator.prototype.visitDateMillisecond = compareDate;
TypeComparator.prototype.visitTimestamp = compareTimestamp;
TypeComparator.prototype.visitTimestampSecond = compareTimestamp;
TypeComparator.prototype.visitTimestampMillisecond = compareTimestamp;
TypeComparator.prototype.visitTimestampMicrosecond = compareTimestamp;
TypeComparator.prototype.visitTimestampNanosecond = compareTimestamp;
TypeComparator.prototype.visitTime = compareTime;
TypeComparator.prototype.visitTimeSecond = compareTime;
TypeComparator.prototype.visitTimeMillisecond = compareTime;
TypeComparator.prototype.visitTimeMicrosecond = compareTime;
TypeComparator.prototype.visitTimeNanosecond = compareTime;
// Previously bound to compareAny, which treated decimals of any scale/precision as equal.
TypeComparator.prototype.visitDecimal = compareDecimal;
TypeComparator.prototype.visitList = compareList;
TypeComparator.prototype.visitStruct = compareStruct;
TypeComparator.prototype.visitUnion = compareUnion;
TypeComparator.prototype.visitDenseUnion = compareUnion;
TypeComparator.prototype.visitSparseUnion = compareUnion;
TypeComparator.prototype.visitDictionary = compareDictionary;
TypeComparator.prototype.visitInterval = compareInterval;
TypeComparator.prototype.visitIntervalDayTime = compareInterval;
TypeComparator.prototype.visitIntervalYearMonth = compareInterval;
TypeComparator.prototype.visitFixedSizeList = compareFixedSizeList;
TypeComparator.prototype.visitMap = compareMap;

/** @ignore */
export const instance = new TypeComparator();

/** Compares two schemas using the shared `TypeComparator` instance. */
export function compareSchemas<T extends { [key: string]: DataType }>(schema: Schema<T>, other?: Schema | null): other is Schema<T> {
    return instance.compareSchemas(schema, other);
}

/** Compares two fields (name, nullability, type) using the shared `TypeComparator` instance. */
export function compareFields<T extends DataType = any>(field: Field<T>, other?: Field | null): other is Field<T> {
    return instance.compareFields(field, other);
}

/** Compares two data types using the shared `TypeComparator` instance; `null`/`undefined` never match. */
export function compareTypes<A extends DataType = any>(type: A, other?: DataType | null): other is A {
    return instance.visit(type, other);
}

// diff --git a/src/arrow/js/src/visitor/typector.ts b/src/arrow/js/src/visitor/typector.ts
// new file mode 100644
// index 000000000..9d0a9f17d
// --- /dev/null
// +++ b/src/arrow/js/src/visitor/typector.ts
// @@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { Type } from '../enum';
import * as type from '../type';
import { DataType } from '../type';
import { Visitor } from '../visitor';
import { VectorType } from '../interfaces';
import { DataTypeCtor } from '../interfaces';

/** @ignore */
export interface GetDataTypeConstructor extends Visitor {
    visit<T extends Type>(node: T): DataTypeCtor<T>;
    visitMany<T extends Type>(nodes: T[]): DataTypeCtor<T>[];
    getVisitFn<T extends Type>(node: T): () => DataTypeCtor<T>;
    getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): () => DataTypeCtor<T>;
}

/**
 * Visitor whose `visit*` methods each return the `DataType` class of the same
 * name from `../type`, e.g. `visitInt8()` returns `type.Int8`.
 *
 * @ignore
 */
export class GetDataTypeConstructor extends Visitor {
    // Null / Bool
    public visitNull() { return type.Null; }
    public visitBool() { return type.Bool; }

    // Integers
    public visitInt() { return type.Int; }
    public visitInt8() { return type.Int8; }
    public visitInt16() { return type.Int16; }
    public visitInt32() { return type.Int32; }
    public visitInt64() { return type.Int64; }
    public visitUint8() { return type.Uint8; }
    public visitUint16() { return type.Uint16; }
    public visitUint32() { return type.Uint32; }
    public visitUint64() { return type.Uint64; }

    // Floats
    public visitFloat() { return type.Float; }
    public visitFloat16() { return type.Float16; }
    public visitFloat32() { return type.Float32; }
    public visitFloat64() { return type.Float64; }

    // Variable- and fixed-width binary
    public visitUtf8() { return type.Utf8; }
    public visitBinary() { return type.Binary; }
    public visitFixedSizeBinary() { return type.FixedSizeBinary; }

    // Dates
    public visitDate() { return type.Date_; }
    public visitDateDay() { return type.DateDay; }
    public visitDateMillisecond() { return type.DateMillisecond; }

    // Timestamps
    public visitTimestamp() { return type.Timestamp; }
    public visitTimestampSecond() { return type.TimestampSecond; }
    public visitTimestampMillisecond() { return type.TimestampMillisecond; }
    public visitTimestampMicrosecond() { return type.TimestampMicrosecond; }
    public visitTimestampNanosecond() { return type.TimestampNanosecond; }

    // Times
    public visitTime() { return type.Time; }
    public visitTimeSecond() { return type.TimeSecond; }
    public visitTimeMillisecond() { return type.TimeMillisecond; }
    public visitTimeMicrosecond() { return type.TimeMicrosecond; }
    public visitTimeNanosecond() { return type.TimeNanosecond; }

    // Decimal and nested types
    public visitDecimal() { return type.Decimal; }
    public visitList() { return type.List; }
    public visitStruct() { return type.Struct; }
    public visitUnion() { return type.Union; }
    public visitDenseUnion() { return type.DenseUnion; }
    public visitSparseUnion() { return type.SparseUnion; }
    public visitDictionary() { return type.Dictionary; }

    // Intervals
    public visitInterval() { return type.Interval; }
    public visitIntervalDayTime() { return type.IntervalDayTime; }
    public visitIntervalYearMonth() { return type.IntervalYearMonth; }

    // Remaining nested types
    public visitFixedSizeList() { return type.FixedSizeList; }
    public visitMap() { return type.Map_; }
}

/** @ignore */
export const instance = new GetDataTypeConstructor();

// diff --git a/src/arrow/js/src/visitor/vectorassembler.ts b/src/arrow/js/src/visitor/vectorassembler.ts
// new file mode 100644
// index 000000000..e324bc02e
// --- /dev/null
// +++ b/src/arrow/js/src/visitor/vectorassembler.ts
// @@ -0,0 +1,234 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { Data } from '../data';
import { Vector } from '../vector';
import { Visitor } from '../visitor';
import { Type, UnionMode } from '../enum';
import { RecordBatch } from '../recordbatch';
import { VectorType as V } from '../interfaces';
import { rebaseValueOffsets } from '../util/buffer';
import { packBools, truncateBitmap } from '../util/bit';
import { selectVectorChildrenArgs } from '../util/args';
import { BufferRegion, FieldNode } from '../ipc/metadata/message';
import {
    DataType, Dictionary,
    Float, Int, Date_, Interval, Time, Timestamp, Union,
    Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
} from '../type';

/** @ignore */
export interface VectorAssembler extends Visitor {
    visit<T extends Vector>(node: T): this;
    visitMany<T extends Vector>(nodes: T[]): this[];
    getVisitFn<T extends Type>(node: T): (vector: V<T>) => this;
    getVisitFn<T extends DataType>(node: V<T> | Data<T> | T): (vector: V<T>) => this;

    visitBool<T extends Bool>(vector: V<T>): this;
    visitInt<T extends Int>(vector: V<T>): this;
    visitFloat<T extends Float>(vector: V<T>): this;
    visitUtf8<T extends Utf8>(vector: V<T>): this;
    visitBinary<T extends Binary>(vector: V<T>): this;
    visitFixedSizeBinary<T extends FixedSizeBinary>(vector: V<T>): this;
    visitDate<T extends Date_>(vector: V<T>): this;
    visitTimestamp<T extends Timestamp>(vector: V<T>): this;
    visitTime<T extends Time>(vector: V<T>): this;
    visitDecimal<T extends Decimal>(vector: V<T>): this;
    visitList<T extends List>(vector: V<T>): this;
    visitStruct<T extends Struct>(vector: V<T>): this;
    visitUnion<T extends Union>(vector: V<T>): this;
    visitInterval<T extends Interval>(vector: V<T>): this;
    visitFixedSizeList<T extends FixedSizeList>(vector: V<T>): this;
    visitMap<T extends Map_>(vector: V<T>): this;
}

/**
 * Visitor that walks vectors and accumulates, in visit order, the field
 * nodes, buffers and buffer regions describing them, along with the running
 * (8-byte padded) byte length.
 *
 * @ignore
 */
export class VectorAssembler extends Visitor {

    /**
     * Assembles the given vectors / record batches with a fresh assembler.
     * Returns the first `visitMany` result, or the (empty) assembler itself
     * when that result is undefined.
     *
     * @nocollapse
     */
    public static assemble<T extends Vector | RecordBatch>(...args: (T | T[])[]) {
        const assembler = new VectorAssembler();
        const vectorChildren = selectVectorChildrenArgs(RecordBatch, args);
        const visited = assembler.visitMany(vectorChildren);
        const first = visited[0];
        return first === undefined ? assembler : first;
    }

    private constructor() { super(); }

    /**
     * Records the validity buffer and field node for `vector` (except for
     * dictionary vectors, which are handled in `visitDictionary`), then
     * dispatches to the type-specific visit method.
     *
     * @throws RangeError when the vector is longer than 2^31 - 1.
     */
    public visit<T extends Vector>(vector: T): this {
        if (DataType.isDictionary(vector.type)) {
            return super.visit(vector);
        }
        const { data, length, nullCount } = vector;
        if (length > 2147483647) {
            /* istanbul ignore next */
            throw new RangeError('Cannot write arrays larger than 2^31 - 1 in length');
        }
        // Null vectors get a field node but no validity buffer.
        if (!DataType.isNull(vector.type)) {
            const validity = nullCount <= 0
                ? new Uint8Array(0) // placeholder validity buffer
                : truncateBitmap(data.offset, length, data.nullBitmap);
            addBuffer.call(this, validity);
        }
        this.nodes.push(new FieldNode(length, nullCount));
        return super.visit(vector);
    }

    /** Null vectors contribute no buffers beyond what `visit` already recorded. */
    public visitNull<T extends Null>(_nullV: V<T>) {
        return this;
    }

    /** Only the indices are assembled here; the dictionary is assembled separately. */
    public visitDictionary<T extends Dictionary>(vector: V<T>) {
        return this.visit(vector.indices);
    }

    public get nodes() { return this._nodes; }
    public get buffers() { return this._buffers; }
    public get byteLength() { return this._byteLength; }
    public get bufferRegions() { return this._bufferRegions; }

    protected _byteLength = 0;
    protected _nodes: FieldNode[] = [];
    protected _buffers: ArrayBufferView[] = [];
    protected _bufferRegions: BufferRegion[] = [];
}

/**
 * Appends `values` to the assembler's buffers, records a region starting at
 * the current byte length, and advances the byte length by the buffer's size
 * rounded up to a multiple of 8.
 *
 * @ignore
 */
function addBuffer(this: VectorAssembler, values: ArrayBufferView) {
    const paddedLength = (values.byteLength + 7) & ~7; // Round up to a multiple of 8
    this.buffers.push(values);
    this.bufferRegions.push(new BufferRegion(this._byteLength, paddedLength));
    this._byteLength += paddedLength;
    return this;
}

/**
 * Assembles a Union vector: typeIds first, then (dense only) valueOffsets,
 * then the children. A sliced dense union gets rebuilt offsets and sliced
 * children.
 *
 * @ignore
 */
function assembleUnion<T extends Union>(this: VectorAssembler, vector: V<T>) {
    const { type, length, typeIds, valueOffsets } = vector;
    // All Union Vectors have a typeIds buffer
    addBuffer.call(this, typeIds);

    // A Sparse Union is treated like all other Nested types
    if (type.mode === UnionMode.Sparse) {
        return assembleNestedVector.call(this, vector);
    }
    if (type.mode !== UnionMode.Dense) {
        return this;
    }

    // Dense Union that hasn't been sliced: write the existing valueOffsets,
    // then treat it like all other Nested types
    if (vector.offset <= 0) {
        addBuffer.call(this, valueOffsets);
        return assembleNestedVector.call(this, vector);
    }

    // A sliced Dense Union is an unpleasant case. Because the offsets are different for
    // each child vector, we need to "rebase" the valueOffsets for each child.
    // Union typeIds are not necessarily 0-indexed
    const maxChildTypeId = typeIds.reduce((x, y) => Math.max(x, y), typeIds[0]);
    const childLengths = new Int32Array(maxChildTypeId + 1);
    // Set all to -1 to indicate that we haven't observed a first occurrence of a particular child yet
    const childOffsets = new Int32Array(maxChildTypeId + 1).fill(-1);
    const shiftedOffsets = new Int32Array(length);
    // If we have a non-zero offset, then the value offsets do not start at
    // zero. We must a) create a new offsets array with shifted offsets and
    // b) slice the values array accordingly
    const unshiftedOffsets = rebaseValueOffsets(-valueOffsets[0], length, valueOffsets);
    for (let index = 0; index < length; ++index) {
        const typeId = typeIds[index];
        let shift = childOffsets[typeId];
        if (shift === -1) {
            // NOTE(review): this reads unshiftedOffsets at `typeId`, not at `index`.
            // The "first occurrence" comment above suggests `index` was intended.
            // Left as-is to preserve behavior; confirm against the writer tests.
            shift = childOffsets[typeId] = unshiftedOffsets[typeId];
        }
        shiftedOffsets[index] = unshiftedOffsets[index] - shift;
        ++childLengths[typeId];
    }
    addBuffer.call(this, shiftedOffsets);

    // Slice and visit children accordingly
    const numChildren = type.children.length;
    for (let childIndex = 0; childIndex < numChildren; ++childIndex) {
        const child: Vector | null = vector.getChildAt(childIndex);
        if (child) {
            const typeId = type.typeIds[childIndex];
            const childLength = Math.min(length, childLengths[typeId]);
            // NOTE(review): the second argument is a length; if `slice` takes
            // (begin, end) this is only right when the begin offset is 0. Confirm.
            this.visit(child.slice(childOffsets[typeId], childLength));
        }
    }
    return this;
}

/**
 * Assembles a Bool vector, whose data buffer must stay bit-packed.
 *
 * @ignore
 */
function assembleBoolVector<T extends Bool>(this: VectorAssembler, vector: V<T>) {
    // If all values are null, just insert a placeholder empty data buffer (fastest path)
    if (vector.nullCount >= vector.length) {
        return addBuffer.call(this, new Uint8Array(0));
    }
    // If values is already a Uint8Array, slice the bitmap (fast path)
    const values = vector.values;
    if (values instanceof Uint8Array) {
        return addBuffer.call(this, truncateBitmap(vector.offset, vector.length, values));
    }
    // Otherwise if the underlying data *isn't* a Uint8Array, enumerate the
    // values as bools and re-pack them into a Uint8Array. This code isn't
    // reachable unless you're trying to manipulate the Data internals;
    // we're only doing this for safety.
    /* istanbul ignore next */
    return addBuffer.call(this, packBools(vector));
}

/**
 * Assembles a fixed-width vector by adding the first `length * stride`
 * elements of its values buffer.
 *
 * @ignore
 */
function assembleFlatVector<T extends Int | Float | FixedSizeBinary | Date_ | Timestamp | Time | Decimal | Interval>(this: VectorAssembler, vector: V<T>) {
    const elementCount = vector.length * vector.stride;
    return addBuffer.call(this, vector.values.subarray(0, elementCount));
}

/**
 * Assembles a Utf8/Binary vector: rebased valueOffsets first, then the slice
 * of the values buffer those offsets cover.
 *
 * @ignore
 */
function assembleFlatListVector<T extends Utf8 | Binary>(this: VectorAssembler, vector: V<T>) {
    const { length, values, valueOffsets } = vector;
    const firstOffset = valueOffsets[0];
    const lastOffset = valueOffsets[length];
    // Never read past the end of the values buffer.
    const byteLength = Math.min(lastOffset - firstOffset, values.byteLength - firstOffset);
    // Push in the order FlatList types read their buffers
    const rebasedOffsets = rebaseValueOffsets(-firstOffset, length, valueOffsets);
    addBuffer.call(this, rebasedOffsets); // valueOffsets buffer first
    addBuffer.call(this, values.subarray(firstOffset, firstOffset + byteLength)); // sliced values buffer second
    return this;
}

/**
 * Assembles a Map/List/FixedSizeList vector: valueOffsets (when present)
 * followed by the first child.
 *
 * @ignore
 */
function assembleListVector<T extends Map_ | List | FixedSizeList>(this: VectorAssembler, vector: V<T>) {
    const { length, valueOffsets } = vector;
    // If we have valueOffsets (MapVector, ListVector), push that buffer first
    if (valueOffsets) {
        // NOTE(review): assembleFlatListVector passes `-valueOffsets[0]` to
        // rebaseValueOffsets, while this passes `+valueOffsets[0]`, and the child
        // below is not sliced. Looks inconsistent for sliced lists; confirm.
        addBuffer.call(this, rebaseValueOffsets(valueOffsets[0], length, valueOffsets));
    }
    // Then insert the List's values child
    return this.visit(vector.getChildAt(0)!);
}

/** @ignore */
function assembleNestedVector<T extends Struct | Union>(this: VectorAssembler, vector: V<T>) {
    return this.visitMany(vector.type.children.map((_, i) =>
vector.getChildAt(i)!).filter(Boolean))[0]; +} + +VectorAssembler.prototype.visitBool = assembleBoolVector; +VectorAssembler.prototype.visitInt = assembleFlatVector; +VectorAssembler.prototype.visitFloat = assembleFlatVector; +VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitBinary = assembleFlatListVector; +VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; +VectorAssembler.prototype.visitDate = assembleFlatVector; +VectorAssembler.prototype.visitTimestamp = assembleFlatVector; +VectorAssembler.prototype.visitTime = assembleFlatVector; +VectorAssembler.prototype.visitDecimal = assembleFlatVector; +VectorAssembler.prototype.visitList = assembleListVector; +VectorAssembler.prototype.visitStruct = assembleNestedVector; +VectorAssembler.prototype.visitUnion = assembleUnion; +VectorAssembler.prototype.visitInterval = assembleFlatVector; +VectorAssembler.prototype.visitFixedSizeList = assembleListVector; +VectorAssembler.prototype.visitMap = assembleListVector; diff --git a/src/arrow/js/src/visitor/vectorctor.ts b/src/arrow/js/src/visitor/vectorctor.ts new file mode 100644 index 000000000..5db268c00 --- /dev/null +++ b/src/arrow/js/src/visitor/vectorctor.ts @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from '../data'; +import { Type } from '../enum'; +import { DataType } from '../type'; +import { Visitor } from '../visitor'; +import { VectorType, VectorCtor } from '../interfaces'; + +import { BinaryVector } from '../vector/binary'; +import { BoolVector } from '../vector/bool'; +import { DateVector, DateDayVector, DateMillisecondVector } from '../vector/date'; +import { DecimalVector } from '../vector/decimal'; +import { DictionaryVector } from '../vector/dictionary'; +import { FixedSizeBinaryVector } from '../vector/fixedsizebinary'; +import { FixedSizeListVector } from '../vector/fixedsizelist'; +import { FloatVector, Float16Vector, Float32Vector, Float64Vector } from '../vector/float'; +import { IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector } from '../vector/interval'; +import { IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector } from '../vector/int'; +import { ListVector } from '../vector/list'; +import { MapVector } from '../vector/map'; +import { NullVector } from '../vector/null'; +import { StructVector } from '../vector/struct'; +import { TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector } from '../vector/timestamp'; +import { TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector } from '../vector/time'; +import { UnionVector, DenseUnionVector, SparseUnionVector } from '../vector/union'; +import { Utf8Vector } from '../vector/utf8'; + +/** @ignore */ +export interface GetVectorConstructor extends Visitor { + visit<T extends Type>(node: T): VectorCtor<T>; + visitMany <T extends Type>(nodes: T[]): VectorCtor<T>[]; + getVisitFn<T extends Type>(node: T): () => VectorCtor<T>; + getVisitFn<T extends DataType>(node: VectorType<T> | Data<T> | T): 
() => VectorCtor<T>; +} + +/** @ignore */ +export class GetVectorConstructor extends Visitor { + public visitNull () { return NullVector; } + public visitBool () { return BoolVector; } + public visitInt () { return IntVector; } + public visitInt8 () { return Int8Vector; } + public visitInt16 () { return Int16Vector; } + public visitInt32 () { return Int32Vector; } + public visitInt64 () { return Int64Vector; } + public visitUint8 () { return Uint8Vector; } + public visitUint16 () { return Uint16Vector; } + public visitUint32 () { return Uint32Vector; } + public visitUint64 () { return Uint64Vector; } + public visitFloat () { return FloatVector; } + public visitFloat16 () { return Float16Vector; } + public visitFloat32 () { return Float32Vector; } + public visitFloat64 () { return Float64Vector; } + public visitUtf8 () { return Utf8Vector; } + public visitBinary () { return BinaryVector; } + public visitFixedSizeBinary () { return FixedSizeBinaryVector; } + public visitDate () { return DateVector; } + public visitDateDay () { return DateDayVector; } + public visitDateMillisecond () { return DateMillisecondVector; } + public visitTimestamp () { return TimestampVector; } + public visitTimestampSecond () { return TimestampSecondVector; } + public visitTimestampMillisecond () { return TimestampMillisecondVector; } + public visitTimestampMicrosecond () { return TimestampMicrosecondVector; } + public visitTimestampNanosecond () { return TimestampNanosecondVector; } + public visitTime () { return TimeVector; } + public visitTimeSecond () { return TimeSecondVector; } + public visitTimeMillisecond () { return TimeMillisecondVector; } + public visitTimeMicrosecond () { return TimeMicrosecondVector; } + public visitTimeNanosecond () { return TimeNanosecondVector; } + public visitDecimal () { return DecimalVector; } + public visitList () { return ListVector; } + public visitStruct () { return StructVector; } + public visitUnion () { return UnionVector; } + public 
visitDenseUnion () { return DenseUnionVector; } + public visitSparseUnion () { return SparseUnionVector; } + public visitDictionary () { return DictionaryVector; } + public visitInterval () { return IntervalVector; } + public visitIntervalDayTime () { return IntervalDayTimeVector; } + public visitIntervalYearMonth () { return IntervalYearMonthVector; } + public visitFixedSizeList () { return FixedSizeListVector; } + public visitMap () { return MapVector; } +} + +/** @ignore */ +export const instance = new GetVectorConstructor(); diff --git a/src/arrow/js/src/visitor/vectorloader.ts b/src/arrow/js/src/visitor/vectorloader.ts new file mode 100644 index 000000000..0a7bb41d8 --- /dev/null +++ b/src/arrow/js/src/visitor/vectorloader.ts @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 

import { Data } from '../data';
import * as type from '../type';
import { Field } from '../schema';
import { Vector } from '../vector';
import { DataType } from '../type';
import { Visitor } from '../visitor';
import { packBools } from '../util/bit';
import { encodeUtf8 } from '../util/utf8';
import { Int64, Int128 } from '../util/int';
import { UnionMode, DateUnit } from '../enum';
import { toArrayBufferView } from '../util/buffer';
import { BufferRegion, FieldNode } from '../ipc/metadata/message';

/**
 * Typed overloads for the loader below: visiting a Field or DataType yields
 * the `Data` for that type.
 * @ignore
 */
export interface VectorLoader extends Visitor {
    visit<T extends DataType>(node: Field<T> | T): Data<T>;
    visitMany<T extends DataType>(nodes: (Field<T> | T)[]): Data<T>[];
}

/**
 * Visitor that builds `Data` instances for a type tree out of a byte array
 * plus flat lists of `FieldNode`s and `BufferRegion`s.
 *
 * Both lists are consumed through cursors (`nodesIndex`, `buffersIndex`) that
 * only move forward. Each `visitX` method takes its FieldNode via a default
 * parameter (`this.nextFieldNode()`), then pulls buffers in the order the
 * read calls appear in its body, and only then visits children. That call
 * order is what keeps nodes and buffers paired with the right types, so
 * statements in these methods must not be reordered.
 * @ignore
 */
export class VectorLoader extends Visitor {
    private bytes: Uint8Array;
    private nodes: FieldNode[];
    private nodesIndex = -1;
    private buffers: BufferRegion[];
    private buffersIndex = -1;
    private dictionaries: Map<number, Vector<any>>;
    /**
     * @param bytes bytes that every BufferRegion's offset/length refers into
     * @param nodes FieldNodes (length / nullCount), consumed in visit order
     * @param buffers BufferRegions, consumed in visit order
     * @param dictionaries dictionary vectors, looked up by `Dictionary#id`
     */
    constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map<number, Vector<any>>) {
        super();
        this.bytes = bytes;
        this.nodes = nodes;
        this.buffers = buffers;
        this.dictionaries = dictionaries;
    }

    /** Visit a DataType, or the `type` of a Field. */
    public visit<T extends DataType>(node: Field<T> | T): Data<T> {
        return super.visit(node instanceof Field ? node.type : node);
    }

    public visitNull <T extends type.Null> (type: T, { length, } = this.nextFieldNode()) { return Data.Null(type, 0, length); }
    public visitBool <T extends type.Bool> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Bool(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitInt <T extends type.Int> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Int(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitFloat <T extends type.Float> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Float(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitUtf8 <T extends type.Utf8> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Utf8(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type)); }
    public visitBinary <T extends type.Binary> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Binary(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type)); }
    public visitFixedSizeBinary <T extends type.FixedSizeBinary> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.FixedSizeBinary(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitDate <T extends type.Date_> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Date(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitTimestamp <T extends type.Timestamp> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Timestamp(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitTime <T extends type.Time> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Time(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitDecimal <T extends type.Decimal> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Decimal(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitList <T extends type.List> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.List(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children[0])); }
    public visitStruct <T extends type.Struct> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Struct(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.visitMany(type.children)); }
    // Dispatches on the union mode; the FieldNode is consumed by the mode-specific method.
    public visitUnion <T extends type.Union> (type: T ) { return type.mode === UnionMode.Sparse ? this.visitSparseUnion(type as type.SparseUnion) : this.visitDenseUnion(type as type.DenseUnion); }
    public visitDenseUnion <T extends type.DenseUnion> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Union(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.readOffsets(type), this.visitMany(type.children)); }
    public visitSparseUnion <T extends type.SparseUnion> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Union(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.visitMany(type.children)); }
    // Reads the indices buffer using the indices type; the dictionary itself comes from the `dictionaries` map.
    public visitDictionary <T extends type.Dictionary> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Dictionary(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type.indices), this.readDictionary(type)); }
    public visitInterval <T extends type.Interval> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Interval(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readData(type)); }
    public visitFixedSizeList <T extends type.FixedSizeList> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.FixedSizeList(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.visit(type.children[0])); }
    public visitMap <T extends type.Map_> (type: T, { length, nullCount } = this.nextFieldNode()) { return Data.Map(type, 0, length, nullCount, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children[0])); }

    /** Advance the node cursor and return the FieldNode it now points at. */
    protected nextFieldNode() { return this.nodes[++this.nodesIndex]; }
    /** Advance the buffer cursor and return the BufferRegion it now points at. */
    protected nextBufferRange() { return this.buffers[++this.buffersIndex]; }
    /**
     * Read the validity bitmap. The buffer region is always consumed (it is a
     * default parameter), but its bytes are only read when `nullCount > 0`;
     * otherwise an empty Uint8Array is returned.
     */
    protected readNullBitmap<T extends DataType>(type: T, nullCount: number, buffer = this.nextBufferRange()) {
        return nullCount > 0 && this.readData(type, buffer) || new Uint8Array(0);
    }
    /** Read the next buffer as value offsets (delegates to `readData`). */
    protected readOffsets<T extends DataType>(type: T, buffer?: BufferRegion) { return this.readData(type, buffer); }
    /** Read the next buffer as union type ids (delegates to `readData`). */
    protected readTypeIds<T extends DataType>(type: T, buffer?: BufferRegion) { return this.readData(type, buffer); }
    /** Return the view of `bytes` covered by the given (or next) buffer region. */
    protected readData<T extends DataType>(_type: T, { length, offset } = this.nextBufferRange()) {
        return this.bytes.subarray(offset, offset + length);
    }
    /** Look up the dictionary vector registered under `type.id`. */
    protected readDictionary<T extends type.Dictionary>(type: T): Vector<T['dictionary']> {
        return this.dictionaries.get(type.id)!;
    }
}

/**
 * VectorLoader variant whose buffers are JS arrays rather than bytes.
 *
 * Here a BufferRegion's `offset` is used as an index into `sources`, and each
 * source array is converted into a Uint8Array according to the type being read.
 * @ignore
 */
export class JSONVectorLoader extends VectorLoader {
    private sources: any[][];
    constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map<number, Vector<any>>) {
        // The base class' byte array is unused; all reads are overridden below.
        super(new Uint8Array(0), nodes, buffers, dictionaries);
        this.sources = sources;
    }
    /** Pack the source array into a bitmap, or return an empty one when there are no nulls. */
    protected readNullBitmap<T extends DataType>(_type: T, nullCount: number, { offset } = this.nextBufferRange()) {
        return nullCount <= 0 ? new Uint8Array(0) : packBools(this.sources[offset]);
    }
    /** Offsets are converted through an Int32Array and then viewed as bytes. */
    protected readOffsets<T extends DataType>(_type: T, { offset } = this.nextBufferRange()) {
        return toArrayBufferView(Uint8Array, toArrayBufferView(Int32Array, this.sources[offset]));
    }
    /** Type ids are converted through the type's own ArrayType and then viewed as bytes. */
    protected readTypeIds<T extends DataType>(type: T, { offset } = this.nextBufferRange()) {
        return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, this.sources[offset]));
    }
    /**
     * Convert the source array for `type` into bytes:
     * Timestamp, 64-bit Int/Time and millisecond Date go through
     * `Int64.convertArray`; Decimal through `Int128.convertArray`;
     * Binary/FixedSizeBinary are hex-decoded; Bool is bit-packed; Utf8 is the
     * UTF-8 encoding of the joined strings; everything else is coerced to
     * numbers and stored in the type's ArrayType.
     */
    protected readData<T extends DataType>(type: T, { offset } = this.nextBufferRange()) {
        const { sources } = this;
        if (DataType.isTimestamp(type)) {
            return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[]));
        } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) {
            return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[]));
        } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) {
            return toArrayBufferView(Uint8Array, Int64.convertArray(sources[offset] as string[]));
        } else if (DataType.isDecimal(type)) {
            return toArrayBufferView(Uint8Array, Int128.convertArray(sources[offset] as string[]));
        } else if (DataType.isBinary(type) || DataType.isFixedSizeBinary(type)) {
            return binaryDataFromJSON(sources[offset] as string[]);
        } else if (DataType.isBool(type)) {
            return packBools(sources[offset] as number[]);
        } else if (DataType.isUtf8(type)) {
            return encodeUtf8((sources[offset] as string[]).join(''));
        }
        return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x)));
    }
}

/**
 * Decode an array of hex strings into one contiguous byte array.
 *
 * The strings are concatenated and every two hex digits become one byte.
 * A trailing unpaired digit, if any, is ignored.
 * @ignore
 */
function binaryDataFromJSON(values: string[]) {
    // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"]
    // There are definitely more efficient ways to do this... but it gets the
    // job done.
    const joined = values.join('');
    const data = new Uint8Array(joined.length >> 1);
    for (let i = 0; i < data.length; ++i) {
        // `substring` replaces the deprecated `substr`; the two digits of byte i start at 2 * i
        data[i] = parseInt(joined.substring(i * 2, i * 2 + 2), 16);
    }
    return data;
}