diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 17:20:00 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 17:20:00 +0000 |
commit | 8daa83a594a2e98f39d764422bfbdbc62c9efd44 (patch) | |
tree | 4099e8021376c7d8c05bdf8503093d80e9c7bad0 /lib/compression/tests/scripts | |
parent | Initial commit. (diff) | |
download | samba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.tar.xz samba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.zip |
Adding upstream version 2:4.20.0+dfsg.upstream/2%4.20.0+dfsg
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/compression/tests/scripts')
-rw-r--r-- | lib/compression/tests/scripts/README | 19 | ||||
-rwxr-xr-x | lib/compression/tests/scripts/decode-huffman-header | 54 | ||||
-rw-r--r-- | lib/compression/tests/scripts/generate-windows-test-vectors.c | 206 | ||||
-rwxr-xr-x | lib/compression/tests/scripts/make-fuzz-examples | 45 | ||||
-rwxr-xr-x | lib/compression/tests/scripts/make-test-vectors | 185 | ||||
-rwxr-xr-x | lib/compression/tests/scripts/three-byte-hash | 49 |
6 files changed, 558 insertions, 0 deletions
diff --git a/lib/compression/tests/scripts/README b/lib/compression/tests/scripts/README new file mode 100644 index 0000000..aff1047 --- /dev/null +++ b/lib/compression/tests/scripts/README @@ -0,0 +1,19 @@ +Tools used in the development and testing the LZ77 + Huffman code. + +These might not be of use to anyone ever, but here they are. + +make-fuzz-examples +encodes compressed files for decompression fuzzer. + +decode-huffman-header +print Huffman codes and validate first header in a compressed file. + +three-byte-hash +check that a three byte hash works. + +make-test-vectors +make files with randomly unbalanced symbol distributions. + +generate-windows-test-vectors.c +if compiled under Cygwin or similar on Windows, this can be used to +generate and verify test vectors. diff --git a/lib/compression/tests/scripts/decode-huffman-header b/lib/compression/tests/scripts/decode-huffman-header new file mode 100755 index 0000000..9c1279b --- /dev/null +++ b/lib/compression/tests/scripts/decode-huffman-header @@ -0,0 +1,54 @@ +#!/usr/bin/python3 +"""Print the codes in the first Huffman tree header in the given file. + +USAGE: decode-huffman-header FILE + +The number of codes of different length is printed first, followed by +the implied total frequency of codes, followed by the deduced codes. + +If the total is not 1.0, the header is invalid. 
+""" + +import sys + + +if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2: + print(__doc__) + exit(len(sys.argv) != 2) + + +def read_table(data): + lengths = [[] for x in range(16)] + for i, b in enumerate(data): + even = b & 15 + odd = b >> 4 + lengths[even].append(i * 2) + lengths[odd].append(i * 2 + 1) + + code = 0 + + total = 0.0 + for i, bucket in enumerate(lengths): + if bucket and i: + portion = 1.0 / (1 << i) * len(bucket) + total += portion + print(f"length {i:2}: {len(bucket):4} ({portion})") + print(f"total {total}") + + for i, bucket in enumerate(lengths): + if i == 0: + continue + code <<= 1 + for c in bucket: + print(f'{c:03x} {code:0{i}b}') + code += 1 + + +def main(): + fn = sys.argv[1] + with open(fn, 'rb') as f: + data = f.read(256) + read_table(data) + + +main() diff --git a/lib/compression/tests/scripts/generate-windows-test-vectors.c b/lib/compression/tests/scripts/generate-windows-test-vectors.c new file mode 100644 index 0000000..28724d2 --- /dev/null +++ b/lib/compression/tests/scripts/generate-windows-test-vectors.c @@ -0,0 +1,206 @@ +/*
+ * Generate test vectors for Windows LZ77 Huffman compression.
+ *
+ * Copyright (c) 2022 Douglas Bagnall <dbagnall@samba.org>
+ *
+ * GPLv3+.
+ *
+ * Can be compiled on Windows 2012r2 under Cygwin
+ *
+ * gcc -o generate-windows-test-vectors \
+ * generate-windows-test-vectors.c \
+ * C:\Windows\SysWOW64\cabinet.dll \
+ * -lcabinet
+ *
+ * There might be better ways.
+ *
+ * See https://learn.microsoft.com/en-us/windows/win32/cmpapi/-compression-portal
+ */
+
+
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* compressapi.h is in the Windows API. mingw-w64 has a copy. */
#include <compressapi.h>
#include <errhandlingapi.h>
+
/* A counted byte buffer; data is heap-allocated by the owner. */
struct blob {
	uint8_t *data;
	size_t length;
};

/* Windows size_t is different than Cygwin size_t (though still 64 bit) */
typedef unsigned long long wsize_t;


/* XPRESS Huffman algorithm in raw mode (no metadata framing). */
#define compression_flags (COMPRESS_ALGORITHM_XPRESS_HUFF | COMPRESS_RAW)

/* 0 or 1; set to 1 in main() when a non-"0" level argument is given. */
int32_t compression_level = 0;
+
+static struct blob compress(struct blob input)
+{
+ COMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateCompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateCompressor failed\n");
+ exit(1);
+ }
+
+ output.length = input.length * 3 + 256;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (estimated %zu)\n",
+ output.length);
+ exit(1);
+ }
+
+
+ ok = SetCompressorInformation(handle,
+ COMPRESS_INFORMATION_CLASS_LEVEL,
+ &compression_level,
+ sizeof(compression_level));
+
+ if (! ok) {
+ fprintf(stderr, "SetCompressorInformation failed: %d\n",
+ GetLastError());
+ //exit(1);
+ }
+
+ ok = Compress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Compress failed\n");
+ exit(1);
+ }
+ output.data = realloc(output.data, used);
+ if (output.data == NULL) {
+ fprintf(stderr,
+ "failed to shrinkwrap output! (from %zu to %llu)\n",
+ output.length, used);
+ exit(1);
+ }
+ output.length = used;
+ CloseCompressor(handle);
+ return output;
+}
+
+
+struct blob decompress(struct blob input,
+ size_t expected_size)
+{
+ DECOMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateDecompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateDecompressor failed\n");
+ exit(1);
+ }
+
+ output.length = expected_size;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (%zu)\n",
+ output.length);
+ exit(1);
+ }
+
+ ok = Decompress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Decompress failed\n");
+ exit(1);
+ }
+ CloseDecompressor(handle);
+ return output;
+}
+
+
/*
 * Print usage to stderr and exit with the given status.
 * (Fixed a stray '<' in the "c for compression" line.)
 */
static void __attribute__((noreturn)) usage(int ret)
{
	fprintf(stderr,
		"USAGE: test-win-vectors {c,d} filename [length|level] > DEST\n\n");
	fprintf(stderr, "c for compression, d for decompression\n");
	fprintf(stderr, "decompressed length is required for decompression\n");
	fprintf(stderr, "compression level flag is optional [default 0]\n");
	exit(ret);
}
+
+int main(int argc, const char *argv[])
+{
+ FILE *fh;
+ const char *filename;
+ struct stat s;
+ int ret;
+ struct blob input = {0};
+ struct blob output = {0};
+
+ if (argc < 3 || argc > 4) {
+ usage(1);
+ }
+ filename = argv[2];
+
+ fh = fopen(filename, "rb");
+ if (fh == NULL) {
+ fprintf(stderr, "Could not open %s\n", filename);
+ usage(1);
+ }
+
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fprintf(stderr, "Could not stat %s: %d\n", filename, ret);
+ usage(1);
+ }
+ input.length = s.st_size;
+ input.data = malloc(input.length);
+ if (input.data == NULL) {
+ fprintf(stderr, "input too big for memory?! (%zu)\n",
+ s.st_size);
+ exit(1);
+ }
+
+ fread(input.data, 1, input.length, fh);
+
+ if (strcmp(argv[1], "c") == 0) {
+ if (argc == 4 && strcmp(argv[3], "0")) {
+ compression_level = 1;
+ }
+ output = compress(input);
+ } else if (strcmp(argv[1], "d") == 0) {
+ size_t decomp_size;
+ if (argc != 4) {
+ fprintf(stderr, "no length given\n");
+ usage(1);
+ }
+ decomp_size = atoi(argv[3]);
+ output = decompress(input, decomp_size);
+ } else {
+ usage(1);
+ }
+ fwrite(output.data, 1, output.length, stdout);
+ free(output.data);
+ return 0;
+}
diff --git a/lib/compression/tests/scripts/make-fuzz-examples b/lib/compression/tests/scripts/make-fuzz-examples new file mode 100755 index 0000000..09200fb --- /dev/null +++ b/lib/compression/tests/scripts/make-fuzz-examples @@ -0,0 +1,45 @@ +#!/usr/bin/python3 +# +"""Pack the compressed files created by test_lzx_huffman.c (with +LZXHUFF_DEBUG_FILES) into the format used by the decompression fuzzer. + +That is, the first 3 bytes are the length of the decompressed file, +and the rest of the file is the compressed data. + +USAGE: make-fuzz-examples DIR + +where DIR is probably '/tmp'. +""" +import os +import sys + + +if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2: + print(__doc__) + exit(len(sys.argv) != 2) + + +def main(): + files = set(os.listdir(sys.argv[1])) + + for fn in files: + if fn.endswith('-compressed'): + fn2 = fn.replace('-compressed', '-decompressed') + if fn2 not in files: + print(f"skipping {fn}, no {fn2}") + continue + cfn = '/tmp/' + fn + dfn = '/tmp/' + fn2 + wfn = '/tmp/' + fn.replace('-compressed', '.fuzz') + + size = os.stat(dfn).st_size + sbytes = bytes([(size & 0xff), (size >> 8) & 0xff, (size >> 16) & 0xff]) + + with open(cfn, 'rb') as f: + s = f.read() + + with open(wfn, 'wb') as f: + s = f.write(sbytes + s) + + +main() diff --git a/lib/compression/tests/scripts/make-test-vectors b/lib/compression/tests/scripts/make-test-vectors new file mode 100755 index 0000000..6f25866 --- /dev/null +++ b/lib/compression/tests/scripts/make-test-vectors @@ -0,0 +1,185 @@ +#!/usr/bin/python3 +"""Generate a few strings with unbalanced distributions to test the +regeneration of the Huffman tree when it gets too deep. + +USAGE: make-test-vectors DIR + +This will fill up DIR with test files. 
+""" +import sys +import random +from collections import defaultdict + + +if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2: + print(__doc__) + exit(len(sys.argv) != 2) + + +DIR = sys.argv[1] + +SIZE = (1 << 17) + (23) # two and a bit blocks +SIZE_NAME = "128k+" +# SIZE = (1 << 16) +# SIZE_NAME = "64" + + +random.seed(1) + + +def squares(n): + array = [] + for i in range(n): + a = random.random() + b = random.random() + array.append(int(a * b * 256)) + return bytes(array) + + +def skewed_choices(n): + b = list(range(256)) + array = random.choices(b, weights=b, k=n) + return bytes(array) + + +def fib_shuffle(n): + array = [] + a, b = 1, 1 + for i in range(100): + array.extend([i] * a) + a, b = a + b, a + if len(array) > 1000000: + break + random.shuffle(array) + return bytes(array[:n]) + + +def exp_shuffle(n): + array = [] + for i in range(256): + array.extend([i] * int(1.04 ** i)) + if len(array) > 1000000: + break + random.shuffle(array) + return bytes(array[:n]) + + +def and_rand(n): + array = [] + for i in range(n): + a = random.randrange(256) + b = random.randrange(256) + array.append(a & b) + return bytes(array) + + +def betavar(n, a, b): + array = [] + for i in range(n): + x = random.betavariate(a, b) + array.append(int(x * 255.999999999999)) + return bytes(array) + + +def repeated_alphabet(n): + a = b'abcdefghijklmnopqrstuvwxyz' + na = n // len(a) + 1 + s = a * na + return s[:n] + + +def decayed_alphabet(n): + s = list(repeated_alphabet(n)) + for i in range(256): + j = random.randrange(n) + s[j] = i + + return bytes(s) + + +def trigram_model(n): + with open(__file__, 'rb') as f: + data = f.read() + lut = defaultdict(list) + for a, b, c in zip(data, data[1:], data[2:]): + k = bytes([a, b]) + lut[k].append(c) + + k = random.choice(list(lut.keys())) + s = [] + p = k[1] + for i in range(n + 10): + c = random.choice(lut[k]) + s.append(c) + k = bytes([p, c]) + p = c + + return bytes(s[10:]) + + +def trigram_sum_model(n): + with open(__file__, 'rb') 
as f: + data = f.read() + lut = [[random.randrange(256)] for i in range(512)] + for a, b, c in zip(data, data[1:], data[2:]): + lut[a + b].append(c) + + s = [] + i = random.randrange(len(data) - 1) + a = data[i] + b = data[i + 1] + + for i in range(n + 10): + x = lut[a + b] + c = random.choice(x) + s.append(c) + a = b + b = c + + return bytes(s[10:]) + + +def the_classics(): + # this used to be main() + sq = squares(SIZE) + ch = skewed_choices(SIZE) + fs = fib_shuffle(SIZE) + es = exp_shuffle(SIZE) + ar = and_rand(SIZE) + bv1 = betavar(SIZE, 0.1, 1.5) + bv2 = betavar(SIZE, 0.5, 2.0) + bv3 = betavar(SIZE, 0.05, 0.05) + + print("n sq ch fs es") + for i in range(256): + print(f"{i:3} {sq.count(i):5} {ch.count(i):5} " + f"{fs.count(i):5} {es.count(i):5}" + f"{ar.count(i):5} {bv1.count(i):5}" + f"{bv2.count(i):5} {bv3.count(i):5}" + ) + + for series, fn in ((sq, "square_series"), + (ch, "skewed_choices"), + (fs, "fib_shuffle"), + (es, "exp_shuffle"), + (ar, "and_rand"), + (bv1, "beta-variate1"), + (bv2, "beta-variate2"), + (bv3, "beta-variate3"), + ): + with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f: + f.write(series) + + +def main(): + if True: + the_classics() + for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"), + (trigram_model(SIZE), "trigram"), + (trigram_sum_model(SIZE), "trigram_sum"), + ): + with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f: + f.write(series) + + +main() diff --git a/lib/compression/tests/scripts/three-byte-hash b/lib/compression/tests/scripts/three-byte-hash new file mode 100755 index 0000000..100d0bc --- /dev/null +++ b/lib/compression/tests/scripts/three-byte-hash @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +"""Print statistics about a certain three byte hash. 
+ +USAGE: three_byte_hash +""" +import sys + +if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) > 1: + print(__doc__) + exit(not ('--help' in sys.argv or '-h' in sys.argv)) + + +from statistics import mean, pstdev, median + + +def h(*args, bits=12): + a = args[0] + b = args[1] ^ 0x2e + c = args[2] ^ 0x55 + d = ((a + b) << 8) ^ (((c - a) & 0xffff) << 5) ^ (c + b) ^ (0xcab + a) + return d & ((1 << bits) - 1) + + +def count(fn, bits, filter=None): + counts = [0] * (1 << bits) + for i in range(256 ** 3): + a, b, c = i & 255, (i >> 8) & 255, i >> 16 + if filter and not (filter(a) and filter(b) and filter(c)): + continue + + h = fn(a, b, c, bits=bits) + counts[h] += 1 + + print(f" {bits} bits; {len(counts)} buckets, " + f"expected {(1<<24) / len(counts)}") + print(f"median {median(counts)}") + print(f"mean {mean(counts)}") + print(f"min {min(counts)}") + print(f"max {max(counts)}") + print(f"stddev {pstdev(counts)}") + + +for b in (12, 13, 14): + count(h, b) + + print("With ASCII filter") + letters = set(range(32, 127)) + letters |= set(b'\r\n\t\0') + count(h, b, filter=letters.__contains__) |