1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { vecs } from '../generate-test-data';
import * as generate from '../generate-test-data';
import { Schema, Field, Dictionary } from '../Arrow';
// Names used below as keys into `vecs` (from '../generate-test-data'), grouped by category.

// List-like generators.
const listVectorGeneratorNames = [
    'list',
    'fixedSizeList',
];

// Nested generators.
const nestedVectorGeneratorNames = [
    'struct',
    'denseUnion',
    'sparseUnion',
    'map',
];

// Generators whose types are used as the key (index) type of Dictionary columns.
const dictionaryKeyGeneratorNames = [
    'int8', 'int16', 'int32',
    'uint8', 'uint16', 'uint32',
];

// Value generators; also the set of dictionary value types (minus 'dictionary' itself).
const valueVectorGeneratorNames = [
    'null_', 'bool',
    'int8', 'int16', 'int32', 'int64',
    'uint8', 'uint16', 'uint32', 'uint64',
    'float16', 'float32', 'float64',
    'utf8', 'binary', 'fixedSizeBinary',
    'dateDay', 'dateMillisecond',
    'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond',
    'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond',
    'decimal', 'dictionary',
    'intervalDayTime', 'intervalYearMonth',
];

// Every generator name: value names first, then list names, then nested names.
const vectorGeneratorNames = valueVectorGeneratorNames.concat(
    listVectorGeneratorNames,
    nestedVectorGeneratorNames,
);
/**
 * Yields a series of tables whose columns are drawn, without repetition, from a
 * shuffled list of every name in `vectorGeneratorNames`. Each table takes the
 * next run of names from the shuffled list; iteration ends once every name has
 * been used in some table.
 *
 * @param batchLengths Passed through to `generate.table` for every table.
 * @param minCols Smallest number of columns to draw for a table (inclusive).
 *                The last table may have fewer if not enough names remain.
 * @param maxCols Largest number of columns to draw for a table (inclusive).
 */
export function* generateRandomTables(batchLengths = [1000, 2000, 3000], minCols = 1, maxCols = 5) {
    // Normalise the bounds to whole numbers with 0 <= lo <= hi. The upper bound is
    // forced to at least 1: if every draw were 0 no names would ever be consumed
    // and the loop below could not terminate.
    const lo = Math.max(0, Math.floor(minCols));
    const hi = Math.max(lo, Math.floor(maxCols), 1);

    let remaining = shuffle(vectorGeneratorNames);
    do {
        // Math.random() is in [0, 1), so scale by (hi - lo + 1) to make `hi` reachable.
        const drawn = lo + Math.floor(Math.random() * (hi - lo + 1));
        const numCols = Math.min(drawn, remaining.length);

        const names = remaining.slice(0, numCols);
        // Build a zero-length vector per name purely to read its type.
        const fields = names.map((name) => new Field(name, vecs[name](0).vector.type));

        yield generate.table(batchLengths, new Schema(fields)).table;

        remaining = remaining.slice(numCols);
    } while (remaining.length > 0);
}
/**
 * Yields a series of tables, each with a single Dictionary-encoded column.
 * One table is produced for every pairing of a value type (each name in
 * `valueVectorGeneratorNames` other than 'dictionary') with a key type (each
 * name in `dictionaryKeyGeneratorNames`), so consuming the whole sequence
 * exercises every such combination.
 *
 * @param batchLengths number[] Number and length of recordbatches to generate
 */
export function* generateDictionaryTables(batchLengths = [100, 200, 300]) {
    // 'dictionary' is skipped as a dictionary value type.
    const valueNames = valueVectorGeneratorNames.filter((name) => name !== 'dictionary');
    for (const valueName of valueNames) {
        // Only the generated vector's type is needed here.
        const valuesType = vecs[valueName](100).vector.type;
        for (const keyName of dictionaryKeyGeneratorNames) {
            const indexType = vecs[keyName](0).vector.type;
            const column = new Field(`dict[${keyName}]`, new Dictionary(valuesType, indexType), true);
            yield generate.table(batchLengths, new Schema([column])).table;
        }
    }
}
/**
 * Returns a randomly permuted copy of `input`; the argument itself is not modified.
 * Walks from the last index down to 1, swapping each slot with a randomly chosen
 * slot at or before it.
 */
function shuffle(input: any[]) {
    const shuffled = input.slice();
    for (let last = shuffled.length - 1; last > 0; last--) {
        // Random index in [0, last]; `| 0` truncates to an integer.
        const pick = (Math.random() * (last + 1)) | 0;
        [shuffled[last], shuffled[pick]] = [shuffled[pick], shuffled[last]];
    }
    return shuffled;
}
|