Diffstat (limited to 'lib/compression/tests/scripts')
-rw-r--r--  lib/compression/tests/scripts/README                            19
-rwxr-xr-x  lib/compression/tests/scripts/decode-huffman-header             54
-rw-r--r--  lib/compression/tests/scripts/generate-windows-test-vectors.c  206
-rwxr-xr-x  lib/compression/tests/scripts/make-fuzz-examples                45
-rwxr-xr-x  lib/compression/tests/scripts/make-test-vectors                185
-rwxr-xr-x  lib/compression/tests/scripts/three-byte-hash                   49
6 files changed, 558 insertions, 0 deletions
diff --git a/lib/compression/tests/scripts/README b/lib/compression/tests/scripts/README
new file mode 100644
index 0000000..aff1047
--- /dev/null
+++ b/lib/compression/tests/scripts/README
@@ -0,0 +1,19 @@
+Tools used in the development and testing of the LZ77 + Huffman code.
+
+These might not be of use to anyone ever, but here they are.
+
+make-fuzz-examples
+encodes compressed files for the decompression fuzzer.
+
+decode-huffman-header
+prints the Huffman codes and validates the first header in a compressed file.
+
+three-byte-hash
+checks that a three-byte hash works.
+
+make-test-vectors
+makes files with randomly unbalanced symbol distributions.
+
+generate-windows-test-vectors.c
+if compiled under Cygwin or similar on Windows, this can be used to
+generate and verify test vectors.
diff --git a/lib/compression/tests/scripts/decode-huffman-header b/lib/compression/tests/scripts/decode-huffman-header
new file mode 100755
index 0000000..9c1279b
--- /dev/null
+++ b/lib/compression/tests/scripts/decode-huffman-header
@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+"""Print the codes in the first Huffman tree header in the given file.
+
+USAGE: decode-huffman-header FILE
+
+The number of codes of each length is printed first, with the fraction
+of the code space each length occupies, followed by the deduced codes.
+
+If the code-space total is not 1.0, the header is invalid.
+"""
+
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
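+# The header packs 512 four-bit code lengths (for symbols 0-511) into
+# 256 bytes, with the even symbol in the low nibble of each byte. A
+# length of zero means the symbol is unused. A valid header fills the
+# code space exactly: summing len(bucket) / 2 ** i over all lengths i
+# must give 1.0, which is the "total" printed below.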
+def read_table(data):
+ lengths = [[] for x in range(16)]
+ for i, b in enumerate(data):
+ even = b & 15
+ odd = b >> 4
+ lengths[even].append(i * 2)
+ lengths[odd].append(i * 2 + 1)
+
+ code = 0
+
+ total = 0.0
+ for i, bucket in enumerate(lengths):
+ if bucket and i:
+ portion = 1.0 / (1 << i) * len(bucket)
+ total += portion
+ print(f"length {i:2}: {len(bucket):4} ({portion})")
+ print(f"total {total}")
+
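+    # Assign codes canonically: stepping from shorter to longer
+    # lengths, the running code is doubled at each new length and
+    # incremented for each symbol, so same-length codes are
+    # consecutive binary values.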
+ for i, bucket in enumerate(lengths):
+ if i == 0:
+ continue
+ code <<= 1
+ for c in bucket:
+ print(f'{c:03x} {code:0{i}b}')
+ code += 1
+
+
+def main():
+ fn = sys.argv[1]
+ with open(fn, 'rb') as f:
+ data = f.read(256)
+ read_table(data)
+
+
+main()
diff --git a/lib/compression/tests/scripts/generate-windows-test-vectors.c b/lib/compression/tests/scripts/generate-windows-test-vectors.c
new file mode 100644
index 0000000..28724d2
--- /dev/null
+++ b/lib/compression/tests/scripts/generate-windows-test-vectors.c
@@ -0,0 +1,206 @@
+/*
+ * Generate test vectors for Windows LZ77 Huffman compression.
+ *
+ * Copyright (c) 2022 Douglas Bagnall <dbagnall@samba.org>
+ *
+ * GPLv3+.
+ *
+ * Can be compiled on Windows Server 2012 R2 under Cygwin:
+ *
+ * gcc -o generate-windows-test-vectors \
+ * generate-windows-test-vectors.c \
+ * C:\Windows\SysWOW64\cabinet.dll \
+ * -lcabinet
+ *
+ * There might be better ways.
+ *
+ * See https://learn.microsoft.com/en-us/windows/win32/cmpapi/-compression-portal
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* compressapi.h is in the Windows API. mingw-w64 has a copy. */
+#include <compressapi.h>
+#include <errhandlingapi.h>
+
+struct blob {
+ uint8_t *data;
+ size_t length;
+};
+
+/* Windows size_t is different from Cygwin size_t (though still 64 bit) */
+typedef unsigned long long wsize_t;
+
+
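+/*
+ * COMPRESS_RAW requests the bare Xpress Huffman block format with no
+ * size metadata, which is why decompress() below must be told the
+ * expected decompressed length.
+ */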
+#define compression_flags (COMPRESS_ALGORITHM_XPRESS_HUFF | COMPRESS_RAW)
+
+int32_t compression_level = 0;
+
+static struct blob compress(struct blob input)
+{
+ COMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateCompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateCompressor failed\n");
+ exit(1);
+ }
+
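+        /*
+         * A deliberately generous worst-case guess: Huffman coding can
+         * expand incompressible data a little, but nowhere near 3x.
+         */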
+ output.length = input.length * 3 + 256;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (estimated %zu)\n",
+ output.length);
+ exit(1);
+ }
+
+
+ ok = SetCompressorInformation(handle,
+ COMPRESS_INFORMATION_CLASS_LEVEL,
+ &compression_level,
+ sizeof(compression_level));
+
+ if (! ok) {
+                fprintf(stderr, "SetCompressorInformation failed: %lu\n",
+                        GetLastError());
+ //exit(1);
+ }
+
+ ok = Compress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Compress failed\n");
+ exit(1);
+ }
+ output.data = realloc(output.data, used);
+ if (output.data == NULL) {
+ fprintf(stderr,
+ "failed to shrinkwrap output! (from %zu to %llu)\n",
+ output.length, used);
+ exit(1);
+ }
+ output.length = used;
+ CloseCompressor(handle);
+ return output;
+}
+
+
+static struct blob decompress(struct blob input,
+                              size_t expected_size)
+{
+ DECOMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateDecompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateDecompressor failed\n");
+ exit(1);
+ }
+
+ output.length = expected_size;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (%zu)\n",
+ output.length);
+ exit(1);
+ }
+
+ ok = Decompress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Decompress failed\n");
+ exit(1);
+ }
+ CloseDecompressor(handle);
+ return output;
+}
+
+
+static void __attribute__((noreturn)) usage(int ret)
+{
+ fprintf(stderr,
+ "USAGE: test-win-vectors {c,d} filename [length|level] > DEST\n\n");
+        fprintf(stderr, "c for compression, d for decompression\n");
+ fprintf(stderr, "decompressed length is required for decompression\n");
+ fprintf(stderr, "compression level flag is optional [default 0]\n");
+ exit(ret);
+}
+
+int main(int argc, const char *argv[])
+{
+ FILE *fh;
+ const char *filename;
+ struct stat s;
+ int ret;
+ struct blob input = {0};
+ struct blob output = {0};
+
+ if (argc < 3 || argc > 4) {
+ usage(1);
+ }
+ filename = argv[2];
+
+ fh = fopen(filename, "rb");
+ if (fh == NULL) {
+ fprintf(stderr, "Could not open %s\n", filename);
+ usage(1);
+ }
+
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fprintf(stderr, "Could not stat %s: %d\n", filename, ret);
+ usage(1);
+ }
+ input.length = s.st_size;
+ input.data = malloc(input.length);
+ if (input.data == NULL) {
+                fprintf(stderr, "input too big for memory?! (%zu)\n",
+                        input.length);
+ exit(1);
+ }
+
+        if (fread(input.data, 1, input.length, fh) != input.length) {
+                fprintf(stderr, "Could not read %s\n", filename);
+                exit(1);
+        }
+
+ if (strcmp(argv[1], "c") == 0) {
+ if (argc == 4 && strcmp(argv[3], "0")) {
+ compression_level = 1;
+ }
+ output = compress(input);
+ } else if (strcmp(argv[1], "d") == 0) {
+ size_t decomp_size;
+ if (argc != 4) {
+ fprintf(stderr, "no length given\n");
+ usage(1);
+ }
+                decomp_size = strtoull(argv[3], NULL, 10);
+ output = decompress(input, decomp_size);
+ } else {
+ usage(1);
+ }
+ fwrite(output.data, 1, output.length, stdout);
+ free(output.data);
+ return 0;
+}
diff --git a/lib/compression/tests/scripts/make-fuzz-examples b/lib/compression/tests/scripts/make-fuzz-examples
new file mode 100755
index 0000000..09200fb
--- /dev/null
+++ b/lib/compression/tests/scripts/make-fuzz-examples
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+#
+"""Pack the compressed files created by test_lzx_huffman.c (with
+LZXHUFF_DEBUG_FILES) into the format used by the decompression fuzzer.
+
+That is, the first 3 bytes are the length of the decompressed file,
+and the rest of the file is the compressed data.
+
+USAGE: make-fuzz-examples DIR
+
+where DIR is probably '/tmp'.
+"""
+import os
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+def main():
+ files = set(os.listdir(sys.argv[1]))
+
+ for fn in files:
+ if fn.endswith('-compressed'):
+ fn2 = fn.replace('-compressed', '-decompressed')
+ if fn2 not in files:
+ print(f"skipping {fn}, no {fn2}")
+ continue
+            cfn = os.path.join(sys.argv[1], fn)
+            dfn = os.path.join(sys.argv[1], fn2)
+            wfn = os.path.join(sys.argv[1], fn.replace('-compressed', '.fuzz'))
+
+ size = os.stat(dfn).st_size
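+            # 3-byte little-endian length prefix, as the fuzzer
+            # expects; equivalent to size.to_bytes(3, 'little').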
+ sbytes = bytes([(size & 0xff), (size >> 8) & 0xff, (size >> 16) & 0xff])
+
+ with open(cfn, 'rb') as f:
+ s = f.read()
+
+            with open(wfn, 'wb') as f:
+                f.write(sbytes + s)
+
+
+main()
diff --git a/lib/compression/tests/scripts/make-test-vectors b/lib/compression/tests/scripts/make-test-vectors
new file mode 100755
index 0000000..6f25866
--- /dev/null
+++ b/lib/compression/tests/scripts/make-test-vectors
@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+"""Generate a few strings with unbalanced distributions to test the
+regeneration of the Huffman tree when it gets too deep.
+
+USAGE: make-test-vectors DIR
+
+This will fill up DIR with test files.
+"""
+import sys
+import random
+from collections import defaultdict
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+DIR = sys.argv[1]
+
+SIZE = (1 << 17) + (23) # two and a bit blocks
+SIZE_NAME = "128k+"
+# SIZE = (1 << 16)
+# SIZE_NAME = "64"
+
+
+random.seed(1)
+
+
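+# Each generator below returns n bytes drawn from a differently skewed
+# distribution. Heavily unbalanced frequencies lead to very long
+# Huffman code words; Fibonacci-like frequencies (see fib_shuffle) are
+# the classic worst case for code length, exercising the too-deep-tree
+# handling that this script targets.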
+def squares(n):
+ array = []
+ for i in range(n):
+ a = random.random()
+ b = random.random()
+ array.append(int(a * b * 256))
+ return bytes(array)
+
+
+def skewed_choices(n):
+ b = list(range(256))
+ array = random.choices(b, weights=b, k=n)
+ return bytes(array)
+
+
+def fib_shuffle(n):
+ array = []
+ a, b = 1, 1
+ for i in range(100):
+ array.extend([i] * a)
+ a, b = a + b, a
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def exp_shuffle(n):
+ array = []
+ for i in range(256):
+ array.extend([i] * int(1.04 ** i))
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def and_rand(n):
+ array = []
+ for i in range(n):
+ a = random.randrange(256)
+ b = random.randrange(256)
+ array.append(a & b)
+ return bytes(array)
+
+
+def betavar(n, a, b):
+ array = []
+ for i in range(n):
+ x = random.betavariate(a, b)
+ array.append(int(x * 255.999999999999))
+ return bytes(array)
+
+
+def repeated_alphabet(n):
+ a = b'abcdefghijklmnopqrstuvwxyz'
+ na = n // len(a) + 1
+ s = a * na
+ return s[:n]
+
+
+def decayed_alphabet(n):
+ s = list(repeated_alphabet(n))
+ for i in range(256):
+ j = random.randrange(n)
+ s[j] = i
+
+ return bytes(s)
+
+
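+# The trigram generators below produce text-like data by training a
+# second-order Markov model on the bytes of this very script, then
+# sampling from it.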
+def trigram_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = defaultdict(list)
+ for a, b, c in zip(data, data[1:], data[2:]):
+ k = bytes([a, b])
+ lut[k].append(c)
+
+ k = random.choice(list(lut.keys()))
+ s = []
+ p = k[1]
+ for i in range(n + 10):
+ c = random.choice(lut[k])
+ s.append(c)
+ k = bytes([p, c])
+ p = c
+
+ return bytes(s[10:])
+
+
+def trigram_sum_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = [[random.randrange(256)] for i in range(512)]
+ for a, b, c in zip(data, data[1:], data[2:]):
+ lut[a + b].append(c)
+
+ s = []
+ i = random.randrange(len(data) - 1)
+ a = data[i]
+ b = data[i + 1]
+
+ for i in range(n + 10):
+ x = lut[a + b]
+ c = random.choice(x)
+ s.append(c)
+ a = b
+ b = c
+
+ return bytes(s[10:])
+
+
+def the_classics():
+ # this used to be main()
+ sq = squares(SIZE)
+ ch = skewed_choices(SIZE)
+ fs = fib_shuffle(SIZE)
+ es = exp_shuffle(SIZE)
+ ar = and_rand(SIZE)
+ bv1 = betavar(SIZE, 0.1, 1.5)
+ bv2 = betavar(SIZE, 0.5, 2.0)
+ bv3 = betavar(SIZE, 0.05, 0.05)
+
+    print("  n    sq    ch    fs    es    ar   bv1   bv2   bv3")
+    for i in range(256):
+        print(f"{i:3} {sq.count(i):5} {ch.count(i):5} "
+              f"{fs.count(i):5} {es.count(i):5} "
+              f"{ar.count(i):5} {bv1.count(i):5} "
+              f"{bv2.count(i):5} {bv3.count(i):5}"
+              )
+
+ for series, fn in ((sq, "square_series"),
+ (ch, "skewed_choices"),
+ (fs, "fib_shuffle"),
+ (es, "exp_shuffle"),
+ (ar, "and_rand"),
+ (bv1, "beta-variate1"),
+ (bv2, "beta-variate2"),
+ (bv3, "beta-variate3"),
+ ):
+ with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+def main():
+    the_classics()
+ for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"),
+ (trigram_model(SIZE), "trigram"),
+ (trigram_sum_model(SIZE), "trigram_sum"),
+ ):
+ with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+main()
diff --git a/lib/compression/tests/scripts/three-byte-hash b/lib/compression/tests/scripts/three-byte-hash
new file mode 100755
index 0000000..100d0bc
--- /dev/null
+++ b/lib/compression/tests/scripts/three-byte-hash
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+"""Print statistics about a certain three byte hash.
+
+USAGE: three_byte_hash
+"""
+import sys
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) > 1:
+ print(__doc__)
+ exit(not ('--help' in sys.argv or '-h' in sys.argv))
+
+
+from statistics import mean, pstdev, median
+
+
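+# h() mixes three bytes into a `bits`-wide bucket index. count() runs
+# it over all 2 ** 24 three-byte inputs (optionally restricted to
+# text-like bytes) and reports how evenly the buckets fill; a flat
+# distribution means few collisions when the LZ77 match finder hashes
+# three-byte sequences.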
+def h(a, b, c, bits=12):
+    b ^= 0x2e
+    c ^= 0x55
+ d = ((a + b) << 8) ^ (((c - a) & 0xffff) << 5) ^ (c + b) ^ (0xcab + a)
+ return d & ((1 << bits) - 1)
+
+
+def count(fn, bits, filter=None):
+ counts = [0] * (1 << bits)
+ for i in range(256 ** 3):
+ a, b, c = i & 255, (i >> 8) & 255, i >> 16
+ if filter and not (filter(a) and filter(b) and filter(c)):
+ continue
+
+ h = fn(a, b, c, bits=bits)
+ counts[h] += 1
+
+    print(f" {bits} bits; {len(counts)} buckets, "
+          f"expected {sum(counts) / len(counts)} per bucket")
+ print(f"median {median(counts)}")
+ print(f"mean {mean(counts)}")
+ print(f"min {min(counts)}")
+ print(f"max {max(counts)}")
+ print(f"stddev {pstdev(counts)}")
+
+
+for b in (12, 13, 14):
+ count(h, b)
+
+ print("With ASCII filter")
+ letters = set(range(32, 127))
+ letters |= set(b'\r\n\t\0')
+ count(h, b, filter=letters.__contains__)