Diffstat (limited to 'lib/compression/tests/scripts')
-rw-r--r--  lib/compression/tests/scripts/README                            19
-rwxr-xr-x  lib/compression/tests/scripts/decode-huffman-header             54
-rw-r--r--  lib/compression/tests/scripts/generate-windows-test-vectors.c  206
-rwxr-xr-x  lib/compression/tests/scripts/make-fuzz-examples                45
-rwxr-xr-x  lib/compression/tests/scripts/make-test-vectors                185
-rwxr-xr-x  lib/compression/tests/scripts/three-byte-hash                   49
6 files changed, 558 insertions, 0 deletions
diff --git a/lib/compression/tests/scripts/README b/lib/compression/tests/scripts/README
new file mode 100644
index 0000000..aff1047
--- /dev/null
+++ b/lib/compression/tests/scripts/README
@@ -0,0 +1,19 @@
+Tools used in the development and testing of the LZ77 + Huffman code.
+
+These might not be of use to anyone ever, but here they are.
+
+make-fuzz-examples
+encodes compressed files for the decompression fuzzer.
+
+decode-huffman-header
+prints the Huffman codes and validates the first header in a compressed file.
+
+three-byte-hash
+checks that a three-byte hash works.
+
+make-test-vectors
+makes files with randomly unbalanced symbol distributions.
+
+generate-windows-test-vectors.c
+if compiled under Cygwin or similar on Windows, this can be used to
+generate and verify test vectors.
diff --git a/lib/compression/tests/scripts/decode-huffman-header b/lib/compression/tests/scripts/decode-huffman-header
new file mode 100755
index 0000000..9c1279b
--- /dev/null
+++ b/lib/compression/tests/scripts/decode-huffman-header
@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+"""Print the codes in the first Huffman tree header in the given file.
+
+USAGE: decode-huffman-header FILE
+
+The number of codes of each length is printed first, with the fraction
+of the code space each length occupies, followed by the deduced codes.
+
+If the code-space total is not 1.0, the header is invalid.
+"""
+
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
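+# The header packs 512 four-bit code lengths (for symbols 0-511) into
+# 256 bytes, with the even symbol in the low nibble of each byte. A
+# length of zero means the symbol is unused. A valid header fills the
+# code space exactly: summing len(bucket) / 2 ** i over all lengths i
+# must give 1.0, which is the "total" printed below.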
+def read_table(data):
+ lengths = [[] for x in range(16)]
+ for i, b in enumerate(data):
+ even = b & 15
+ odd = b >> 4
+ lengths[even].append(i * 2)
+ lengths[odd].append(i * 2 + 1)
+
+ code = 0
+
+ total = 0.0
+ for i, bucket in enumerate(lengths):
+ if bucket and i:
+ portion = 1.0 / (1 << i) * len(bucket)
+ total += portion
+ print(f"length {i:2}: {len(bucket):4} ({portion})")
+ print(f"total {total}")
+
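+    # Assign codes canonically: stepping from shorter to longer
+    # lengths, the running code is doubled at each new length and
+    # incremented for each symbol, so same-length codes are
+    # consecutive binary values.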
+ for i, bucket in enumerate(lengths):
+ if i == 0:
+ continue
+ code <<= 1
+ for c in bucket:
+ print(f'{c:03x} {code:0{i}b}')
+ code += 1
+
+
+def main():
+ fn = sys.argv[1]
+ with open(fn, 'rb') as f:
+ data = f.read(256)
+ read_table(data)
+
+
+main()
diff --git a/lib/compression/tests/scripts/generate-windows-test-vectors.c b/lib/compression/tests/scripts/generate-windows-test-vectors.c
new file mode 100644
index 0000000..28724d2
--- /dev/null
+++ b/lib/compression/tests/scripts/generate-windows-test-vectors.c
@@ -0,0 +1,206 @@
+/*
+ * Generate test vectors for Windows LZ77 Huffman compression.
+ *
+ * Copyright (c) 2022 Douglas Bagnall <dbagnall@samba.org>
+ *
+ * GPLv3+.
+ *
+ * Can be compiled on Windows Server 2012 R2 under Cygwin:
+ *
+ * gcc -o generate-windows-test-vectors \
+ * generate-windows-test-vectors.c \
+ * C:\Windows\SysWOW64\cabinet.dll \
+ * -lcabinet
+ *
+ * There might be better ways.
+ *
+ * See https://learn.microsoft.com/en-us/windows/win32/cmpapi/-compression-portal
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* compressapi.h is in the Windows API. mingw-w64 has a copy. */
+#include <compressapi.h>
+#include <errhandlingapi.h>
+
+struct blob {
+ uint8_t *data;
+ size_t length;
+};
+
+/* Windows size_t is different from Cygwin size_t (though still 64 bit) */
+typedef unsigned long long wsize_t;
+
+
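+/*
+ * COMPRESS_RAW requests the bare Xpress Huffman block format with no
+ * size metadata, which is why decompress() below must be told the
+ * expected decompressed length.
+ */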
+#define compression_flags (COMPRESS_ALGORITHM_XPRESS_HUFF | COMPRESS_RAW)
+
+int32_t compression_level = 0;
+
+static struct blob compress(struct blob input)
+{
+ COMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateCompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateCompressor failed\n");
+ exit(1);
+ }
+
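+        /*
+         * A deliberately generous worst-case guess: Huffman coding can
+         * expand incompressible data a little, but nowhere near 3x.
+         */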
+ output.length = input.length * 3 + 256;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (estimated %zu)\n",
+ output.length);
+ exit(1);
+ }
+
+
+ ok = SetCompressorInformation(handle,
+ COMPRESS_INFORMATION_CLASS_LEVEL,
+ &compression_level,
+ sizeof(compression_level));
+
+ if (! ok) {
+                fprintf(stderr, "SetCompressorInformation failed: %lu\n",
+                        GetLastError());
+ //exit(1);
+ }
+
+ ok = Compress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Compress failed\n");
+ exit(1);
+ }
+ output.data = realloc(output.data, used);
+ if (output.data == NULL) {
+ fprintf(stderr,
+ "failed to shrinkwrap output! (from %zu to %llu)\n",
+ output.length, used);
+ exit(1);
+ }
+ output.length = used;
+ CloseCompressor(handle);
+ return output;
+}
+
+
+static struct blob decompress(struct blob input,
+                              size_t expected_size)
+{
+ DECOMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateDecompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateDecompressor failed\n");
+ exit(1);
+ }
+
+ output.length = expected_size;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (%zu)\n",
+ output.length);
+ exit(1);
+ }
+
+ ok = Decompress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Decompress failed\n");
+ exit(1);
+ }
+ CloseDecompressor(handle);
+ return output;
+}
+
+
+static void __attribute__((noreturn)) usage(int ret)
+{
+ fprintf(stderr,
+ "USAGE: test-win-vectors {c,d} filename [length|level] > DEST\n\n");
+        fprintf(stderr, "c for compression, d for decompression\n");
+ fprintf(stderr, "decompressed length is required for decompression\n");
+ fprintf(stderr, "compression level flag is optional [default 0]\n");
+ exit(ret);
+}
+
+int main(int argc, const char *argv[])
+{
+ FILE *fh;
+ const char *filename;
+ struct stat s;
+ int ret;
+ struct blob input = {0};
+ struct blob output = {0};
+
+ if (argc < 3 || argc > 4) {
+ usage(1);
+ }
+ filename = argv[2];
+
+ fh = fopen(filename, "rb");
+ if (fh == NULL) {
+ fprintf(stderr, "Could not open %s\n", filename);
+ usage(1);
+ }
+
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fprintf(stderr, "Could not stat %s: %d\n", filename, ret);
+ usage(1);
+ }
+ input.length = s.st_size;
+ input.data = malloc(input.length);
+ if (input.data == NULL) {
+                fprintf(stderr, "input too big for memory?! (%zu)\n",
+                        input.length);
+ exit(1);
+ }
+
+        if (fread(input.data, 1, input.length, fh) != input.length) {
+                fprintf(stderr, "Could not read %s\n", filename);
+                exit(1);
+        }
+
+ if (strcmp(argv[1], "c") == 0) {
+ if (argc == 4 && strcmp(argv[3], "0")) {
+ compression_level = 1;
+ }
+ output = compress(input);
+ } else if (strcmp(argv[1], "d") == 0) {
+ size_t decomp_size;
+ if (argc != 4) {
+ fprintf(stderr, "no length given\n");
+ usage(1);
+ }
+                decomp_size = strtoull(argv[3], NULL, 10);
+ output = decompress(input, decomp_size);
+ } else {
+ usage(1);
+ }
+ fwrite(output.data, 1, output.length, stdout);
+ free(output.data);
+ return 0;
+}
diff --git a/lib/compression/tests/scripts/make-fuzz-examples b/lib/compression/tests/scripts/make-fuzz-examples
new file mode 100755
index 0000000..09200fb
--- /dev/null
+++ b/lib/compression/tests/scripts/make-fuzz-examples
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+#
+"""Pack the compressed files created by test_lzx_huffman.c (with
+LZXHUFF_DEBUG_FILES) into the format used by the decompression fuzzer.
+
+That is, the first 3 bytes are the length of the decompressed file,
+and the rest of the file is the compressed data.
+
+USAGE: make-fuzz-examples DIR
+
+where DIR is probably '/tmp'.
+"""
+import os
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+def main():
+ files = set(os.listdir(sys.argv[1]))
+
+ for fn in files:
+ if fn.endswith('-compressed'):
+ fn2 = fn.replace('-compressed', '-decompressed')
+ if fn2 not in files:
+ print(f"skipping {fn}, no {fn2}")
+ continue
+            cfn = os.path.join(sys.argv[1], fn)
+            dfn = os.path.join(sys.argv[1], fn2)
+            wfn = os.path.join(sys.argv[1], fn.replace('-compressed', '.fuzz'))
+
+ size = os.stat(dfn).st_size
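+            # 3-byte little-endian length prefix, as the fuzzer
+            # expects; equivalent to size.to_bytes(3, 'little').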
+ sbytes = bytes([(size & 0xff), (size >> 8) & 0xff, (size >> 16) & 0xff])
+
+ with open(cfn, 'rb') as f:
+ s = f.read()
+
+            with open(wfn, 'wb') as f:
+                f.write(sbytes + s)
+
+
+main()
diff --git a/lib/compression/tests/scripts/make-test-vectors b/lib/compression/tests/scripts/make-test-vectors
new file mode 100755
index 0000000..6f25866
--- /dev/null
+++ b/lib/compression/tests/scripts/make-test-vectors
@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+"""Generate a few strings with unbalanced distributions to test the
+regeneration of the Huffman tree when it gets too deep.
+
+USAGE: make-test-vectors DIR
+
+This will fill up DIR with test files.
+"""
+import sys
+import random
+from collections import defaultdict
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+DIR = sys.argv[1]
+
+SIZE = (1 << 17) + (23) # two and a bit blocks
+SIZE_NAME = "128k+"
+# SIZE = (1 << 16)
+# SIZE_NAME = "64"
+
+
+random.seed(1)
+
+
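+# Each generator below returns n bytes drawn from a differently skewed
+# distribution. Heavily unbalanced frequencies lead to very long
+# Huffman code words; Fibonacci-like frequencies (see fib_shuffle) are
+# the classic worst case for code length, exercising the too-deep-tree
+# handling that this script targets.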
+def squares(n):
+ array = []
+ for i in range(n):
+ a = random.random()
+ b = random.random()
+ array.append(int(a * b * 256))
+ return bytes(array)
+
+
+def skewed_choices(n):
+ b = list(range(256))
+ array = random.choices(b, weights=b, k=n)
+ return bytes(array)
+
+
+def fib_shuffle(n):
+ array = []
+ a, b = 1, 1
+ for i in range(100):
+ array.extend([i] * a)
+ a, b = a + b, a
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def exp_shuffle(n):
+ array = []
+ for i in range(256):
+ array.extend([i] * int(1.04 ** i))
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def and_rand(n):
+ array = []
+ for i in range(n):
+ a = random.randrange(256)
+ b = random.randrange(256)
+ array.append(a & b)
+ return bytes(array)
+
+
+def betavar(n, a, b):
+ array = []
+ for i in range(n):
+ x = random.betavariate(a, b)
+ array.append(int(x * 255.999999999999))
+ return bytes(array)
+
+
+def repeated_alphabet(n):
+ a = b'abcdefghijklmnopqrstuvwxyz'
+ na = n // len(a) + 1
+ s = a * na
+ return s[:n]
+
+
+def decayed_alphabet(n):
+ s = list(repeated_alphabet(n))
+ for i in range(256):
+ j = random.randrange(n)
+ s[j] = i
+
+ return bytes(s)
+
+
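+# The trigram generators below produce text-like data by training a
+# second-order Markov model on the bytes of this very script, then
+# sampling from it.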
+def trigram_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = defaultdict(list)
+ for a, b, c in zip(data, data[1:], data[2:]):
+ k = bytes([a, b])
+ lut[k].append(c)
+
+ k = random.choice(list(lut.keys()))
+ s = []
+ p = k[1]
+ for i in range(n + 10):
+ c = random.choice(lut[k])
+ s.append(c)
+ k = bytes([p, c])
+ p = c
+
+ return bytes(s[10:])
+
+
+def trigram_sum_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = [[random.randrange(256)] for i in range(512)]
+ for a, b, c in zip(data, data[1:], data[2:]):
+ lut[a + b].append(c)
+
+ s = []
+ i = random.randrange(len(data) - 1)
+ a = data[i]
+ b = data[i + 1]
+
+ for i in range(n + 10):
+ x = lut[a + b]
+ c = random.choice(x)
+ s.append(c)
+ a = b
+ b = c
+
+ return bytes(s[10:])
+
+
+def the_classics():
+ # this used to be main()
+ sq = squares(SIZE)
+ ch = skewed_choices(SIZE)
+ fs = fib_shuffle(SIZE)
+ es = exp_shuffle(SIZE)
+ ar = and_rand(SIZE)
+ bv1 = betavar(SIZE, 0.1, 1.5)
+ bv2 = betavar(SIZE, 0.5, 2.0)
+ bv3 = betavar(SIZE, 0.05, 0.05)
+
+    print("  n    sq    ch    fs    es    ar   bv1   bv2   bv3")
+    for i in range(256):
+        print(f"{i:3} {sq.count(i):5} {ch.count(i):5} "
+              f"{fs.count(i):5} {es.count(i):5} "
+              f"{ar.count(i):5} {bv1.count(i):5} "
+              f"{bv2.count(i):5} {bv3.count(i):5}"
+              )
+
+ for series, fn in ((sq, "square_series"),
+ (ch, "skewed_choices"),
+ (fs, "fib_shuffle"),
+ (es, "exp_shuffle"),
+ (ar, "and_rand"),
+ (bv1, "beta-variate1"),
+ (bv2, "beta-variate2"),
+ (bv3, "beta-variate3"),
+ ):
+ with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+def main():
+    the_classics()
+ for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"),
+ (trigram_model(SIZE), "trigram"),
+ (trigram_sum_model(SIZE), "trigram_sum"),
+ ):
+ with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+main()
diff --git a/lib/compression/tests/scripts/three-byte-hash b/lib/compression/tests/scripts/three-byte-hash
new file mode 100755
index 0000000..100d0bc
--- /dev/null
+++ b/lib/compression/tests/scripts/three-byte-hash
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+"""Print statistics about a certain three byte hash.
+
+USAGE: three_byte_hash
+"""
+import sys
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) > 1:
+ print(__doc__)
+ exit(not ('--help' in sys.argv or '-h' in sys.argv))
+
+
+from statistics import mean, pstdev, median
+
+
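+# h() mixes three bytes into a `bits`-wide bucket index. count() runs
+# it over all 2 ** 24 three-byte inputs (optionally restricted to
+# text-like bytes) and reports how evenly the buckets fill; a flat
+# distribution means few collisions when the LZ77 match finder hashes
+# three-byte sequences.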
+def h(a, b, c, bits=12):
+    b ^= 0x2e
+    c ^= 0x55
+ d = ((a + b) << 8) ^ (((c - a) & 0xffff) << 5) ^ (c + b) ^ (0xcab + a)
+ return d & ((1 << bits) - 1)
+
+
+def count(fn, bits, filter=None):
+ counts = [0] * (1 << bits)
+ for i in range(256 ** 3):
+ a, b, c = i & 255, (i >> 8) & 255, i >> 16
+ if filter and not (filter(a) and filter(b) and filter(c)):
+ continue
+
+ h = fn(a, b, c, bits=bits)
+ counts[h] += 1
+
+    print(f" {bits} bits; {len(counts)} buckets, "
+          f"expected {sum(counts) / len(counts)} per bucket")
+ print(f"median {median(counts)}")
+ print(f"mean {mean(counts)}")
+ print(f"min {min(counts)}")
+ print(f"max {max(counts)}")
+ print(f"stddev {pstdev(counts)}")
+
+
+for b in (12, 13, 14):
+ count(h, b)
+
+ print("With ASCII filter")
+ letters = set(range(32, 127))
+ letters |= set(b'\r\n\t\0')
+ count(h, b, filter=letters.__contains__)