diff options
Diffstat (limited to 'lib/compression/tests/scripts/make-test-vectors')
-rwxr-xr-x | lib/compression/tests/scripts/make-test-vectors | 185 |
1 files changed, 185 insertions, 0 deletions
#!/usr/bin/python3
"""Generate a few strings with unbalanced distributions to test the
regeneration of the Huffman tree when it gets too deep.

USAGE: make-test-vectors DIR

This will fill up DIR with test files.
"""
import sys
import random
from collections import defaultdict


# Output directory. Filled in from the command line by main(); it stays
# None when this file is imported rather than run.
DIR = None

SIZE = (1 << 17) + (23)  # two and a bit blocks
SIZE_NAME = "128k+"
# Alternative, smaller configuration:
# SIZE = (1 << 16)
# SIZE_NAME = "64"


def squares(n):
    """Return n bytes, each the product of two uniform randoms times 256.

    Multiplying two values drawn from [0, 1) skews the result towards
    zero, so low byte values are far more common than high ones.
    """
    array = []
    for _ in range(n):
        a = random.random()
        b = random.random()
        array.append(int(a * b * 256))
    return bytes(array)


def skewed_choices(n):
    """Return n bytes where value v is drawn with weight v.

    Byte 0 has weight 0 and so never occurs; 255 is the most likely.
    """
    values = list(range(256))
    return bytes(random.choices(values, weights=values, k=n))


def fib_shuffle(n):
    """Return up to n bytes from a shuffled pool with Fibonacci-like counts.

    Symbol i is added to the pool a Fibonacci-growing number of times
    (1, 2, 3, 5, 8, ...) until the pool exceeds a million entries. The
    pool is then shuffled and its first n entries returned.
    """
    array = []
    a, b = 1, 1
    for i in range(100):
        array.extend([i] * a)
        a, b = a + b, a
        if len(array) > 1000000:
            break
    random.shuffle(array)
    return bytes(array[:n])


def exp_shuffle(n):
    """Return up to n bytes from a shuffled pool with exponential counts.

    Symbol i is added to the pool int(1.04 ** i) times (stopping early if
    the pool ever exceeds a million entries). The pool is then shuffled
    and its first n entries returned.
    """
    array = []
    for i in range(256):
        array.extend([i] * int(1.04 ** i))
        if len(array) > 1000000:
            break
    random.shuffle(array)
    return bytes(array[:n])


def and_rand(n):
    """Return n bytes, each the bitwise AND of two uniform random bytes."""
    array = []
    for _ in range(n):
        a = random.randrange(256)
        b = random.randrange(256)
        array.append(a & b)
    return bytes(array)


def betavar(n, a, b):
    """Return n bytes drawn from a beta distribution with parameters a, b.

    The scale factor is just under 256 so that a variate of exactly 1.0
    still maps to 255 rather than overflowing the byte range.
    """
    array = []
    for _ in range(n):
        x = random.betavariate(a, b)
        array.append(int(x * 255.999999999999))
    return bytes(array)


def repeated_alphabet(n):
    """Return the lowercase ASCII alphabet repeated and cut to n bytes."""
    a = b'abcdefghijklmnopqrstuvwxyz'
    na = n // len(a) + 1
    s = a * na
    return s[:n]


def decayed_alphabet(n):
    """Return a repeated alphabet of n bytes with 256 random overwrites.

    Each byte value 0..255 is written once at a random position (later
    writes may land on earlier ones), so every value gets a chance to
    appear while the alphabet still dominates. Returns b'' when there is
    nothing to overwrite (n == 0), instead of failing in randrange().
    """
    s = list(repeated_alphabet(n))
    if not s:
        return b''
    for i in range(256):
        j = random.randrange(n)
        s[j] = i

    return bytes(s)


def _own_source():
    """Return the bytes of this script, used to train the trigram models."""
    with open(__file__, 'rb') as f:
        return f.read()


def trigram_model(n, data=None):
    """Return n bytes from a random walk over the trigrams found in data.

    Every pair of adjacent bytes in data maps to the list of bytes seen
    directly after that pair; the walk repeatedly picks a random
    successor of the current pair. The first 10 generated bytes are
    thrown away.

    data defaults to the bytes of this script, so the default output
    changes whenever this file is edited.

    Raises ValueError if data is shorter than three bytes.
    """
    if data is None:
        data = _own_source()
    lut = defaultdict(list)
    for a, b, c in zip(data, data[1:], data[2:]):
        lut[bytes([a, b])].append(c)
    if not lut:
        raise ValueError("trigram_model needs at least 3 bytes of data")

    keys = list(lut.keys())
    k = random.choice(keys)
    s = []
    for _ in range(n + 10):
        # .get() avoids inserting empty lists into the defaultdict.
        successors = lut.get(k)
        if not successors:
            # Dead end: this pair was never followed by anything in data
            # (e.g. it only occurs as the last two bytes). Restart from a
            # random known pair rather than crashing on an empty choice.
            k = random.choice(keys)
            successors = lut[k]
        c = random.choice(successors)
        s.append(c)
        k = bytes([k[1], c])

    return bytes(s[10:])


def trigram_sum_model(n, data=None):
    """Return n bytes from a walk keyed on the sum of the last two bytes.

    Like trigram_model(), but successors are bucketed by a + b rather
    than by the pair (a, b). Each bucket is seeded with one random byte,
    so the walk can never reach an empty bucket. The first 10 generated
    bytes are thrown away.

    data defaults to the bytes of this script, so the default output
    changes whenever this file is edited.
    """
    if data is None:
        data = _own_source()
    lut = [[random.randrange(256)] for _ in range(512)]
    for a, b, c in zip(data, data[1:], data[2:]):
        lut[a + b].append(c)

    s = []
    start = random.randrange(len(data) - 1)
    a = data[start]
    b = data[start + 1]

    for _ in range(n + 10):
        c = random.choice(lut[a + b])
        s.append(c)
        a = b
        b = c

    return bytes(s[10:])


def _write_vector(filename, data):
    """Write data to the file called filename inside DIR."""
    with open(f"{DIR}/{filename}", "wb") as f:
        f.write(data)


def the_classics():
    """Generate the eight original distributions, tabulate and save them.

    Prints one row per byte value giving its count in each series, then
    writes each series to DIR as "<name>-<SIZE_NAME>".
    """
    # this used to be main()
    # (column label, file name stem, data). The tuple is built in this
    # order so the random stream is consumed in the same order as before.
    series = (
        ("sq", "square_series", squares(SIZE)),
        ("ch", "skewed_choices", skewed_choices(SIZE)),
        ("fs", "fib_shuffle", fib_shuffle(SIZE)),
        ("es", "exp_shuffle", exp_shuffle(SIZE)),
        ("ar", "and_rand", and_rand(SIZE)),
        ("bv1", "beta-variate1", betavar(SIZE, 0.1, 1.5)),
        ("bv2", "beta-variate2", betavar(SIZE, 0.5, 2.0)),
        ("bv3", "beta-variate3", betavar(SIZE, 0.05, 0.05)),
    )

    # One labelled, space-separated column per series so that five-digit
    # counts in neighbouring columns cannot run together.
    print("  n " + " ".join(f"{label:>5}" for label, _, _ in series))
    for i in range(256):
        counts = " ".join(f"{data.count(i):5}" for _, _, data in series)
        print(f"{i:3} {counts}")

    for _, stem, data in series:
        _write_vector(f"{stem}-{SIZE_NAME}", data)


def main():
    """Parse the command line, then generate every test vector into DIR."""
    global DIR

    if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
        print(__doc__)
        # A lone -h/--help is a successful run; any other argument
        # count is a usage error.
        sys.exit(1 if len(sys.argv) != 2 else 0)

    DIR = sys.argv[1]

    # Fixed seed, set before any generator runs, so repeated runs of the
    # same script produce the same files.
    random.seed(1)

    the_classics()
    # NOTE(review): these names join the size suffix with "_" while
    # the_classics() uses "-"; both are kept so file names do not change.
    for data, stem in ((decayed_alphabet(SIZE), "decayed_alphabet"),
                       (trigram_model(SIZE), "trigram"),
                       (trigram_sum_model(SIZE), "trigram_sum"),
                       ):
        _write_vector(f"{stem}_{SIZE_NAME}", data)


if __name__ == '__main__':
    main()