summaryrefslogtreecommitdiffstats
path: root/lib/compression/tests
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 17:20:00 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 17:20:00 +0000
commit8daa83a594a2e98f39d764422bfbdbc62c9efd44 (patch)
tree4099e8021376c7d8c05bdf8503093d80e9c7bad0 /lib/compression/tests
parentInitial commit. (diff)
downloadsamba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.tar.xz
samba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.zip
Adding upstream version 2:4.20.0+dfsg.upstream/2%4.20.0+dfsg
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/compression/tests')
-rw-r--r--lib/compression/tests/scripts/README19
-rwxr-xr-xlib/compression/tests/scripts/decode-huffman-header54
-rw-r--r--lib/compression/tests/scripts/generate-windows-test-vectors.c206
-rwxr-xr-xlib/compression/tests/scripts/make-fuzz-examples45
-rwxr-xr-xlib/compression/tests/scripts/make-test-vectors185
-rwxr-xr-xlib/compression/tests/scripts/three-byte-hash49
-rw-r--r--lib/compression/tests/test_lzx_huffman.c1255
-rw-r--r--lib/compression/tests/test_lzxpress_plain.c1194
8 files changed, 3007 insertions, 0 deletions
diff --git a/lib/compression/tests/scripts/README b/lib/compression/tests/scripts/README
new file mode 100644
index 0000000..aff1047
--- /dev/null
+++ b/lib/compression/tests/scripts/README
@@ -0,0 +1,19 @@
+Tools used in the development and testing of the LZ77 + Huffman code.
+
+These might not be of use to anyone ever, but here they are.
+
+make-fuzz-examples
+encodes compressed files for decompression fuzzer.
+
+decode-huffman-header
+print Huffman codes and validate first header in a compressed file.
+
+three-byte-hash
+check that a three byte hash works.
+
+make-test-vectors
+make files with randomly unbalanced symbol distributions.
+
+generate-windows-test-vectors.c
+if compiled under Cygwin or similar on Windows, this can be used to
+generate and verify test vectors.
diff --git a/lib/compression/tests/scripts/decode-huffman-header b/lib/compression/tests/scripts/decode-huffman-header
new file mode 100755
index 0000000..9c1279b
--- /dev/null
+++ b/lib/compression/tests/scripts/decode-huffman-header
@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+"""Print the codes in the first Huffman tree header in the given file.
+
+USAGE: decode-huffman-header FILE
+
+The number of codes of different length is printed first, followed by
+the implied total frequency of codes, followed by the deduced codes.
+
+If the total is not 1.0, the header is invalid.
+"""
+
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+def read_table(data):
+ lengths = [[] for x in range(16)]
+ for i, b in enumerate(data):
+ even = b & 15
+ odd = b >> 4
+ lengths[even].append(i * 2)
+ lengths[odd].append(i * 2 + 1)
+
+ code = 0
+
+ total = 0.0
+ for i, bucket in enumerate(lengths):
+ if bucket and i:
+ portion = 1.0 / (1 << i) * len(bucket)
+ total += portion
+ print(f"length {i:2}: {len(bucket):4} ({portion})")
+ print(f"total {total}")
+
+ for i, bucket in enumerate(lengths):
+ if i == 0:
+ continue
+ code <<= 1
+ for c in bucket:
+ print(f'{c:03x} {code:0{i}b}')
+ code += 1
+
+
+def main():
+ fn = sys.argv[1]
+ with open(fn, 'rb') as f:
+ data = f.read(256)
+ read_table(data)
+
+
+main()
diff --git a/lib/compression/tests/scripts/generate-windows-test-vectors.c b/lib/compression/tests/scripts/generate-windows-test-vectors.c
new file mode 100644
index 0000000..28724d2
--- /dev/null
+++ b/lib/compression/tests/scripts/generate-windows-test-vectors.c
@@ -0,0 +1,206 @@
+/*
+ * Generate test vectors for Windows LZ77 Huffman compression.
+ *
+ * Copyright (c) 2022 Douglas Bagnall <dbagnall@samba.org>
+ *
+ * GPLv3+.
+ *
+ * Can be compiled on Windows 2012r2 under Cygwin
+ *
+ * gcc -o generate-windows-test-vectors \
+ * generate-windows-test-vectors.c \
+ * C:\Windows\SysWOW64\cabinet.dll \
+ * -lcabinet
+ *
+ * There might be better ways.
+ *
+ * See https://learn.microsoft.com/en-us/windows/win32/cmpapi/-compression-portal
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* compressapi.h is in the Windows API. mingw-w64 has a copy. */
+#include <compressapi.h>
+#include <errhandlingapi.h>
+
+struct blob {
+ uint8_t *data;
+ size_t length;
+};
+
+/* Windows size_t is different from Cygwin size_t (though still 64 bit) */
+typedef unsigned long long wsize_t;
+
+
+#define compression_flags (COMPRESS_ALGORITHM_XPRESS_HUFF | COMPRESS_RAW)
+
+int32_t compression_level = 0;
+
+static struct blob compress(struct blob input)
+{
+ COMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateCompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateCompressor failed\n");
+ exit(1);
+ }
+
+ output.length = input.length * 3 + 256;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (estimated %zu)\n",
+ output.length);
+ exit(1);
+ }
+
+
+ ok = SetCompressorInformation(handle,
+ COMPRESS_INFORMATION_CLASS_LEVEL,
+ &compression_level,
+ sizeof(compression_level));
+
+ if (! ok) {
+ fprintf(stderr, "SetCompressorInformation failed: %d\n",
+ GetLastError());
+ //exit(1);
+ }
+
+ ok = Compress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Compress failed\n");
+ exit(1);
+ }
+ output.data = realloc(output.data, used);
+ if (output.data == NULL) {
+ fprintf(stderr,
+ "failed to shrinkwrap output! (from %zu to %llu)\n",
+ output.length, used);
+ exit(1);
+ }
+ output.length = used;
+ CloseCompressor(handle);
+ return output;
+}
+
+
+struct blob decompress(struct blob input,
+ size_t expected_size)
+{
+ DECOMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateDecompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateDecompressor failed\n");
+ exit(1);
+ }
+
+ output.length = expected_size;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (%zu)\n",
+ output.length);
+ exit(1);
+ }
+
+ ok = Decompress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Decompress failed\n");
+ exit(1);
+ }
+ CloseDecompressor(handle);
+ return output;
+}
+
+
+static void __attribute__((noreturn)) usage(int ret)
+{
+ fprintf(stderr,
+ "USAGE: test-win-vectors {c,d} filename [length|level] > DEST\n\n");
+ fprintf(stderr, "c for< compression, d for decompression\n");
+ fprintf(stderr, "decompressed length is required for decompression\n");
+ fprintf(stderr, "compression level flag is optional [default 0]\n");
+ exit(ret);
+}
+
+int main(int argc, const char *argv[])
+{
+ FILE *fh;
+ const char *filename;
+ struct stat s;
+ int ret;
+ struct blob input = {0};
+ struct blob output = {0};
+
+ if (argc < 3 || argc > 4) {
+ usage(1);
+ }
+ filename = argv[2];
+
+ fh = fopen(filename, "rb");
+ if (fh == NULL) {
+ fprintf(stderr, "Could not open %s\n", filename);
+ usage(1);
+ }
+
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fprintf(stderr, "Could not stat %s: %d\n", filename, ret);
+ usage(1);
+ }
+ input.length = s.st_size;
+ input.data = malloc(input.length);
+ if (input.data == NULL) {
+ fprintf(stderr, "input too big for memory?! (%zu)\n",
+ s.st_size);
+ exit(1);
+ }
+
+ fread(input.data, 1, input.length, fh);
+
+ if (strcmp(argv[1], "c") == 0) {
+ if (argc == 4 && strcmp(argv[3], "0")) {
+ compression_level = 1;
+ }
+ output = compress(input);
+ } else if (strcmp(argv[1], "d") == 0) {
+ size_t decomp_size;
+ if (argc != 4) {
+ fprintf(stderr, "no length given\n");
+ usage(1);
+ }
+ decomp_size = atoi(argv[3]);
+ output = decompress(input, decomp_size);
+ } else {
+ usage(1);
+ }
+ fwrite(output.data, 1, output.length, stdout);
+ free(output.data);
+ return 0;
+}
diff --git a/lib/compression/tests/scripts/make-fuzz-examples b/lib/compression/tests/scripts/make-fuzz-examples
new file mode 100755
index 0000000..09200fb
--- /dev/null
+++ b/lib/compression/tests/scripts/make-fuzz-examples
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+#
+"""Pack the compressed files created by test_lzx_huffman.c (with
+LZXHUFF_DEBUG_FILES) into the format used by the decompression fuzzer.
+
+That is, the first 3 bytes are the length of the decompressed file,
+and the rest of the file is the compressed data.
+
+USAGE: make-fuzz-examples DIR
+
+where DIR is probably '/tmp'.
+"""
+import os
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+def main():
+ files = set(os.listdir(sys.argv[1]))
+
+ for fn in files:
+ if fn.endswith('-compressed'):
+ fn2 = fn.replace('-compressed', '-decompressed')
+ if fn2 not in files:
+ print(f"skipping {fn}, no {fn2}")
+ continue
+ cfn = '/tmp/' + fn
+ dfn = '/tmp/' + fn2
+ wfn = '/tmp/' + fn.replace('-compressed', '.fuzz')
+
+ size = os.stat(dfn).st_size
+ sbytes = bytes([(size & 0xff), (size >> 8) & 0xff, (size >> 16) & 0xff])
+
+ with open(cfn, 'rb') as f:
+ s = f.read()
+
+ with open(wfn, 'wb') as f:
+ s = f.write(sbytes + s)
+
+
+main()
diff --git a/lib/compression/tests/scripts/make-test-vectors b/lib/compression/tests/scripts/make-test-vectors
new file mode 100755
index 0000000..6f25866
--- /dev/null
+++ b/lib/compression/tests/scripts/make-test-vectors
@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+"""Generate a few strings with unbalanced distributions to test the
+regeneration of the Huffman tree when it gets too deep.
+
+USAGE: make-test-vectors DIR
+
+This will fill up DIR with test files.
+"""
+import sys
+import random
+from collections import defaultdict
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+DIR = sys.argv[1]
+
+SIZE = (1 << 17) + (23) # two and a bit blocks
+SIZE_NAME = "128k+"
+# SIZE = (1 << 16)
+# SIZE_NAME = "64"
+
+
+random.seed(1)
+
+
+def squares(n):
+ array = []
+ for i in range(n):
+ a = random.random()
+ b = random.random()
+ array.append(int(a * b * 256))
+ return bytes(array)
+
+
+def skewed_choices(n):
+ b = list(range(256))
+ array = random.choices(b, weights=b, k=n)
+ return bytes(array)
+
+
+def fib_shuffle(n):
+ array = []
+ a, b = 1, 1
+ for i in range(100):
+ array.extend([i] * a)
+ a, b = a + b, a
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def exp_shuffle(n):
+ array = []
+ for i in range(256):
+ array.extend([i] * int(1.04 ** i))
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def and_rand(n):
+ array = []
+ for i in range(n):
+ a = random.randrange(256)
+ b = random.randrange(256)
+ array.append(a & b)
+ return bytes(array)
+
+
+def betavar(n, a, b):
+ array = []
+ for i in range(n):
+ x = random.betavariate(a, b)
+ array.append(int(x * 255.999999999999))
+ return bytes(array)
+
+
+def repeated_alphabet(n):
+ a = b'abcdefghijklmnopqrstuvwxyz'
+ na = n // len(a) + 1
+ s = a * na
+ return s[:n]
+
+
+def decayed_alphabet(n):
+ s = list(repeated_alphabet(n))
+ for i in range(256):
+ j = random.randrange(n)
+ s[j] = i
+
+ return bytes(s)
+
+
+def trigram_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = defaultdict(list)
+ for a, b, c in zip(data, data[1:], data[2:]):
+ k = bytes([a, b])
+ lut[k].append(c)
+
+ k = random.choice(list(lut.keys()))
+ s = []
+ p = k[1]
+ for i in range(n + 10):
+ c = random.choice(lut[k])
+ s.append(c)
+ k = bytes([p, c])
+ p = c
+
+ return bytes(s[10:])
+
+
+def trigram_sum_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = [[random.randrange(256)] for i in range(512)]
+ for a, b, c in zip(data, data[1:], data[2:]):
+ lut[a + b].append(c)
+
+ s = []
+ i = random.randrange(len(data) - 1)
+ a = data[i]
+ b = data[i + 1]
+
+ for i in range(n + 10):
+ x = lut[a + b]
+ c = random.choice(x)
+ s.append(c)
+ a = b
+ b = c
+
+ return bytes(s[10:])
+
+
+def the_classics():
+ # this used to be main()
+ sq = squares(SIZE)
+ ch = skewed_choices(SIZE)
+ fs = fib_shuffle(SIZE)
+ es = exp_shuffle(SIZE)
+ ar = and_rand(SIZE)
+ bv1 = betavar(SIZE, 0.1, 1.5)
+ bv2 = betavar(SIZE, 0.5, 2.0)
+ bv3 = betavar(SIZE, 0.05, 0.05)
+
+ print("n sq ch fs es")
+ for i in range(256):
+ print(f"{i:3} {sq.count(i):5} {ch.count(i):5} "
+ f"{fs.count(i):5} {es.count(i):5}"
+ f"{ar.count(i):5} {bv1.count(i):5}"
+ f"{bv2.count(i):5} {bv3.count(i):5}"
+ )
+
+ for series, fn in ((sq, "square_series"),
+ (ch, "skewed_choices"),
+ (fs, "fib_shuffle"),
+ (es, "exp_shuffle"),
+ (ar, "and_rand"),
+ (bv1, "beta-variate1"),
+ (bv2, "beta-variate2"),
+ (bv3, "beta-variate3"),
+ ):
+ with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+def main():
+ if True:
+ the_classics()
+ for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"),
+ (trigram_model(SIZE), "trigram"),
+ (trigram_sum_model(SIZE), "trigram_sum"),
+ ):
+ with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+main()
diff --git a/lib/compression/tests/scripts/three-byte-hash b/lib/compression/tests/scripts/three-byte-hash
new file mode 100755
index 0000000..100d0bc
--- /dev/null
+++ b/lib/compression/tests/scripts/three-byte-hash
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+"""Print statistics about a certain three byte hash.
+
+USAGE: three-byte-hash
+"""
+import sys
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) > 1:
+ print(__doc__)
+ exit(not ('--help' in sys.argv or '-h' in sys.argv))
+
+
+from statistics import mean, pstdev, median
+
+
+def h(*args, bits=12):
+ a = args[0]
+ b = args[1] ^ 0x2e
+ c = args[2] ^ 0x55
+ d = ((a + b) << 8) ^ (((c - a) & 0xffff) << 5) ^ (c + b) ^ (0xcab + a)
+ return d & ((1 << bits) - 1)
+
+
+def count(fn, bits, filter=None):
+ counts = [0] * (1 << bits)
+ for i in range(256 ** 3):
+ a, b, c = i & 255, (i >> 8) & 255, i >> 16
+ if filter and not (filter(a) and filter(b) and filter(c)):
+ continue
+
+ h = fn(a, b, c, bits=bits)
+ counts[h] += 1
+
+ print(f" {bits} bits; {len(counts)} buckets, "
+ f"expected {(1<<24) / len(counts)}")
+ print(f"median {median(counts)}")
+ print(f"mean {mean(counts)}")
+ print(f"min {min(counts)}")
+ print(f"max {max(counts)}")
+ print(f"stddev {pstdev(counts)}")
+
+
+for b in (12, 13, 14):
+ count(h, b)
+
+ print("With ASCII filter")
+ letters = set(range(32, 127))
+ letters |= set(b'\r\n\t\0')
+ count(h, b, filter=letters.__contains__)
diff --git a/lib/compression/tests/test_lzx_huffman.c b/lib/compression/tests/test_lzx_huffman.c
new file mode 100644
index 0000000..7770535
--- /dev/null
+++ b/lib/compression/tests/test_lzx_huffman.c
@@ -0,0 +1,1255 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *
+ * ** NOTE! The following LGPL license applies to this file.
+ * ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include "replace.h"
+#include <talloc.h>
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/data_blob.h"
+
+/* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZXHUFF_DEBUG_FILES false
+
+/* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
+#define LZXHUFF_DEBUG_VERBOSE false
+
+
+#if LZXHUFF_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+
+#include <time.h>
+
+struct timespec start = {0};
+struct timespec end = {0};
+static void debug_start_timer(void)
+{
+ clock_gettime(CLOCK_MONOTONIC, &start);
+}
+
+static void debug_end_timer(const char *name, size_t len)
+{
+ uint64_t ns;
+ double secs;
+ double rate;
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ ns = end.tv_nsec;
+ ns += end.tv_sec * 1000 * 1000 * 1000;
+ ns -= start.tv_nsec;
+ ns -= start.tv_sec * 1000 * 1000 * 1000;
+ secs = ns / 1e9;
+ rate = len / (secs * 1024 * 1024);
+ debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
+ name, len, secs, rate);
+}
+
+#else
+#define debug_message(...) /* debug_message */
+#define debug_start_timer(...) /* debug_start_timer */
+#define debug_end_timer(...) /* debug_end_timer */
+#endif
+
+
+struct lzx_pair {
+ const char *name;
+ DATA_BLOB compressed;
+ DATA_BLOB decompressed;
+};
+
+struct lzx_file_pair {
+ const char *name;
+ const char *compressed_file;
+ const char *decompressed_file;
+};
+
+
+#define DECOMP_DIR "testdata/compression/decompressed"
+#define COMP_DIR "testdata/compression/compressed-huffman"
+#define MORE_COMP_DIR "testdata/compression/compressed-more-huffman"
+
+
+#define VARRGH(...) __VA_ARGS__
+
+#define BLOB_FROM_ARRAY(...) \
+ { \
+ .data = (uint8_t[]){__VA_ARGS__}, \
+ .length = sizeof((uint8_t[]){__VA_ARGS__}) \
+ }
+
+#define BLOB_FROM_STRING(s) \
+ { \
+ .data = discard_const_p(uint8_t, s), \
+ .length = (sizeof(s) - 1) \
+ }
+
+
+const char * file_names[] = {
+ "27826-8.txt",
+ "5d049b4cb1bd933f5e8ex19",
+ "638e61e96d54279981c3x5",
+ "64k-minus-one-zeros",
+ "64k-plus-one-zeros",
+ "64k-zeros",
+ "96f696a4e5ce56c61a3dx10",
+ "9e0b6a12febf38e98f13",
+ "abc-times-101",
+ "abc-times-105",
+ "abc-times-200",
+ "and_rand",
+ "and_rand-128k+",
+ "b63289ccc7f218c0d56b",
+ "beta-variate1-128k+",
+ "beta-variate2-128k+",
+ "beta-variate3-128k+",
+ "decayed_alphabet_128k+",
+ "decayed_alphabet_64k",
+ "exp_shuffle",
+ "exp_shuffle-128k+",
+ "f00842317dc6d5695b02",
+ "fib_shuffle",
+ "fib_shuffle-128k+",
+ "fuzzing-0fc2d461b56cd8103c91",
+ "fuzzing-17c961778538cc10ab7c",
+ "fuzzing-3591f9dc02bb00a54b60",
+ "fuzzing-3ec3bca27bb9eb40c128",
+ "fuzzing-80b4fa18ff5f8dd04862",
+ "fuzzing-a3115a81d1ac500318f9",
+ "generate-windows-test-vectors.c",
+ "midsummer-nights-dream.txt",
+ "notes-on-the-underground.txt",
+ "pg22009.txt",
+ "repeating",
+ "repeating-exactly-64k",
+ "setup.log",
+ "skewed_choices",
+ "skewed_choices-128k+",
+ /* These ones were deathly slow in fuzzing at one point */
+ "slow-015ddc36a71412ccc50d",
+ "slow-100e9f966a7feb9ca40a",
+ "slow-2a671c3cff4f1574cbab",
+ "slow-33d90a24e70515b14cd0",
+ "slow-49d8c05261e3f412fc72",
+ "slow-50a249d2fe56873e56a0",
+ "slow-63e9f0b52235fb0129fa",
+ "slow-73b7f971d65908ac0095",
+ "slow-8b61e3dd267908544531",
+ "slow-9d1c5a079b0462986f1f",
+ "slow-aa7262a821dabdcf04a6",
+ "slow-b8a91d142b0d2af7f5ca",
+ "slow-c79142457734bbc8d575",
+ "slow-d736544545b90d83fe75",
+ "slow-e3b9bdfaed7d1a606fdb",
+ "slow-f3f1c02a9d006e5e1703",
+ "square_series",
+ "square_series-128k+",
+ "trigram_128k+",
+ "trigram_64k",
+ "trigram_sum_128k+",
+ "trigram_sum_64k",
+ NULL
+};
+
+struct lzx_pair bidirectional_pairs[] = {
+
+ {.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
+ .decompressed = BLOB_FROM_STRING(
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ ),
+ .compressed = BLOB_FROM_ARRAY(
+ /*
+ * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
+ * symbols occur. That means we need 48 0x00 bytes for the
+ * first 96 symbol-nybbles, then some short codes, then zeros
+ * again for the rest of the literals.
+ */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ 0x30, 0x23, /* a_ cb */
+ 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0, /* here end the 0-255 literals (128 bytes) */
+ 0x02, /* 'EOF' symbol 256 (byte 128 low) */
+ 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 bytes */
+ 0,0,0,
+ 0x20, /* codepoint 287 (byte 143 high) */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0, /* 160 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+ /*
+ * So that's the tree.
+ *
+ * length 2 codes for 'c', EOF, 287
+ * length 3 for 'a', 'b'.
+ *
+ * c: 00
+ * EOF: 01
+ * 287: 10
+ * a: 110
+ * b: 111
+ *
+ * thus the literal string "abc" is 110-111-00.
+ *
+ * Now for the lz77 match definitions for EOF and 287.
+ *
+ * Why 287? It encodes the match distance and offset.
+ *
+ * 287 - 256 = 31
+ *
+ * _length = 31 % 16 = 15
+ * _distance = 31 / 16 = 1
+ *
+ * (it's easier to look at the hex, 0x11f:
+ * 1xx means a match; x1x is _distance; xxf is _length)
+ *
+ * _distance 1 means a two bit distance (10 or 11; 2 or 3).
+ * That means the next bit will be the least significant bit
+ * of distance (1 in this case, meaning distance 3).
+ *
+ * if _length is 15, real length is included inline.
+ *
+ * 'EOF' == 256 means _length = 0, _distance = 0.
+ *
+ * _distance 0 means 1, so no further bits needed.
+ * _length 0 means length 3.
+ *
+ * but when used as EOF, this doesn't matter.
+ */
+ 0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
+ /* These remaining bytes are:
+ *
+ * 10101000 11011100 00000000 00000000 11111111
+ * 00100110 00000001
+ *
+ * and we read them as 16 chunks (i.e. flipping odd/even order)
+ *
+ * 110-111-00 10-1-01-000
+ * a b c 287 | EOF
+ * |
+ * this is the 287 distance low bit.
+ *
+ * The last 3 bits are not used. The 287 length is sort of
+ * out of band, coming up soon (because 287 encodes length
+ * 15; most codes do not do this).
+ *
+ * 00000000 00000000
+ *
+ * This padding is there because the first 32 bits are read
+ * at the beginning of decoding. If there were more things to
+ * be encoded, they would be in here.
+ *
+ * 11111111
+ *
+ * This byte is pulled as the length for the 287 match.
+ * Because it is 0xff, we pull a further 2 bytes for the
+ * actual length, i.e. a 16 bit number greater than 270.
+ *
+ * 00000001 00100110
+ *
+ * that is 0x126 = 294 = the match length - 3 (because we're
+ * encoding ["abc", <copy from 3 back, 297 chars>, EOF]).
+ *
+ */
+ )
+ },
+ {.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
+ .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
+ .compressed = BLOB_FROM_ARRAY(
+ /*
+ * In this case there are no matches encoded as there are no
+ * repeated symbols. Including the EOF, there are 27 symbols
+ * all occurring exactly as frequently as each other (once).
+ * From that we would expect the codes to be mostly 5 bits
+ * long, because 27 < 2^5 (32), but greater than 2^4. And
+ * that's what we see.
+ */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ /* 14 non-zero bytes for 26 letters/nibbles */
+ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+ 0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0, /* 80 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ 0x04, /* 0x100 EOF */
+ /* no matches */
+ 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+
+ 0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
+ 0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
+ 0x00, 0x00, 0x00, 0x00)
+ },
+ {0}
+};
+
+
+static void test_lzxpress_huffman_decompress(void **state)
+{
+ size_t i;
+ ssize_t written;
+ uint8_t *dest = NULL;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+ struct lzx_pair p = bidirectional_pairs[i];
+ dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
+
+ debug_message("%s compressed %zu decomp %zu\n", p.name,
+ p.compressed.length,
+ p.decompressed.length);
+
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ assert_int_not_equal(written, -1);
+ assert_int_equal(written, p.decompressed.length);
+
+ assert_memory_equal(dest, p.decompressed.data, p.decompressed.length);
+ talloc_free(dest);
+ }
+}
+
+static void test_lzxpress_huffman_compress(void **state)
+{
+ size_t i;
+ ssize_t written;
+ uint8_t *dest = NULL;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+ struct lzx_pair p = bidirectional_pairs[i];
+ debug_message("%s compressed %zu decomp %zu\n", p.name,
+ p.compressed.length,
+ p.decompressed.length);
+
+ written = lzxpress_huffman_compress_talloc(mem_ctx,
+ p.decompressed.data,
+ p.decompressed.length,
+ &dest);
+
+ assert_int_not_equal(written, -1);
+ assert_int_equal(written, p.compressed.length);
+ assert_memory_equal(dest, p.compressed.data, p.compressed.length);
+ talloc_free(dest);
+ }
+}
+
+
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+ const char *filename)
+{
+ DATA_BLOB b = {0};
+ FILE *fh = fopen(filename, "rb");
+ int ret;
+ struct stat s;
+ size_t len;
+ if (fh == NULL) {
+ debug_message("could not open '%s'\n", filename);
+ return b;
+ }
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fclose(fh);
+ return b;
+ }
+ b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+ if (b.data == NULL) {
+ fclose(fh);
+ return b;
+ }
+ len = fread(b.data, 1, s.st_size, fh);
+ if (ferror(fh) || len != s.st_size) {
+ TALLOC_FREE(b.data);
+ } else {
+ b.length = len;
+ }
+ fclose(fh);
+ return b;
+}
+
+
+
+static void test_lzxpress_huffman_decompress_files(void **state)
+{
+ size_t i;
+ int score = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.compressed.data);
+
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+ debug_start_timer();
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ debug_end_timer("decompress", p.decompressed.length);
+ if (written != -1 &&
+ written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%zu correct\n", score, i);
+ assert_int_equal(score, i);
+}
+
+
+static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
+{
+ /*
+ * This tests the decompression of files that have been compressed on
+ * Windows with the level turned up (to 1, default for MS-XCA is 0).
+ *
+ * The format is identical, but it will have tried harder to find
+ * matches.
+ */
+ size_t i;
+ int score = 0;
+ int found = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", MORE_COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ if (p.compressed.data == NULL) {
+ /*
+ * We don't have all the vectors in the
+ * more-compressed directory, which is OK, we skip
+ * them.
+ */
+ continue;
+ }
+ found++;
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+ debug_start_timer();
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ debug_end_timer("decompress", p.decompressed.length);
+ if (written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%d correct\n", score, found);
+ assert_int_equal(score, found);
+}
+
+
+/*
+ * attempt_round_trip() tests whether a data blob can survive a compression
+ * and decompression cycle. If save_name is not NULL and LZXHUFF_DEBUG_FILES
+ * evals to true, the various stages are saved in files with that name and the
+ * '-original', '-compressed', and '-decompressed' suffixes. If ref_compressed
+ * has data, it'll print a message saying whether the compressed data matches
+ * that.
+ *
+ * Returns the compressed size on a successful round trip, or -1 on any
+ * failure (compression failed, decompression failed, or the result differs
+ * from the original).
+ */
+
+static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
+				  DATA_BLOB original,
+				  const char *save_name,
+				  DATA_BLOB ref_compressed)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+	/* worst-case scratch estimate: a bit over 4/3 growth plus headers */
+	DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
+						original.length * 4 / 3 + 260);
+	DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
+						  original.length);
+	ssize_t comp_written, decomp_written;
+	debug_start_timer();
+	comp_written = lzxpress_huffman_compress_talloc(tmp_ctx,
+							original.data,
+							original.length,
+							&compressed.data);
+	debug_end_timer("compress", original.length);
+	if (comp_written <= 0) {
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	if (ref_compressed.data != NULL) {
+		/*
+		 * This is informational, not an assertion; there are
+		 * ~infinite legitimate ways to compress the data, many as
+		 * good as each other (think of compression as a language, not
+		 * a format).
+		 */
+		debug_message("compressed size %zd vs reference %zu\n",
+			      comp_written, ref_compressed.length);
+
+		/*
+		 * Compare against the reference length, not the scratch
+		 * buffer length (which is always an over-estimate and
+		 * would never match).
+		 */
+		if ((size_t)comp_written == ref_compressed.length &&
+		    memcmp(compressed.data, ref_compressed.data, comp_written) == 0) {
+			debug_message("\033[1;32mbyte identical!\033[0m\n");
+		}
+	}
+	debug_start_timer();
+	decomp_written = lzxpress_huffman_decompress(compressed.data,
+						     comp_written,
+						     decompressed.data,
+						     original.length);
+	debug_end_timer("decompress", original.length);
+	if (save_name != NULL && LZXHUFF_DEBUG_FILES) {
+		char s[300];
+		FILE *fh = NULL;
+
+		snprintf(s, sizeof(s), "%s-original", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+		fh = fopen(s, "w");
+		if (fh != NULL) {
+			fwrite(original.data, 1, original.length, fh);
+			fclose(fh);
+		}
+
+		snprintf(s, sizeof(s), "%s-compressed", save_name);
+		/* comp_written is ssize_t, so %zd rather than %zu */
+		fprintf(stderr, "Saving %zd bytes to %s\n", comp_written, s);
+		fh = fopen(s, "w");
+		if (fh != NULL) {
+			fwrite(compressed.data, 1, comp_written, fh);
+			fclose(fh);
+		}
+		/*
+		 * We save the decompressed file using original.length, not
+		 * the returned size. If these differ, the returned size will
+		 * be -1. By saving the whole buffer we can see at what point
+		 * it went haywire.
+		 */
+		snprintf(s, sizeof(s), "%s-decompressed", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+		fh = fopen(s, "w");
+		if (fh != NULL) {
+			fwrite(decompressed.data, 1, original.length, fh);
+			fclose(fh);
+		}
+	}
+
+	/* check decomp_written < 0 first to avoid signed/unsigned surprises */
+	if (decomp_written < 0 ||
+	    original.length != (size_t)decomp_written ||
+	    memcmp(decompressed.data,
+		   original.data,
+		   original.length) != 0) {
+		debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
+			      decomp_written,
+			      original.length);
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+	talloc_free(tmp_ctx);
+	return comp_written;
+}
+
+
+/*
+ * Round-trip every corpus file through the huffman compressor and
+ * decompressor, comparing the total compressed size against the Windows
+ * reference vectors where those are available.
+ */
+static void test_lzxpress_huffman_round_trip(void **state)
+{
+	size_t i;
+	int score = 0;
+	ssize_t compressed_total = 0;
+	ssize_t reference_total = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		char *debug_files = NULL;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		ssize_t comp_size;
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+		debug_message("-------------------\n");
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzhuff", COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		if (p.compressed.data == NULL) {
+			/*
+			 * A missing reference file is not fatal; we still
+			 * check the round trip, just not the size ratio.
+			 */
+			debug_message(
+				"Could not load %s reference file %s\n",
+				p.name, filename);
+			debug_message("%s decompressed %zu\n", p.name,
+				      p.decompressed.length);
+		} else {
+			debug_message("%s: reference compressed %zu decomp %zu\n",
+				      p.name,
+				      p.compressed.length,
+				      p.decompressed.length);
+		}
+		if (1) {
+			/*
+			 * We're going to save copies in /tmp.
+			 */
+			snprintf(filename, sizeof(filename),
+				 "/tmp/lzxhuffman-%s", p.name);
+			debug_files = filename;
+		}
+
+		comp_size = attempt_round_trip(mem_ctx, p.decompressed,
+					       debug_files,
+					       p.compressed);
+		if (comp_size > 0) {
+			debug_message("\033[1;32mround trip!\033[0m\n");
+			score++;
+			if (p.compressed.length) {
+				compressed_total += comp_size;
+				reference_total += p.compressed.length;
+			}
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%zu correct\n", score, i);
+	/* compressed_total is ssize_t, so %zd, not %zu */
+	print_message("\033[1;34mtotal compressed size: %zd\033[0m\n",
+		      compressed_total);
+	print_message("total reference size: %zd \n", reference_total);
+	print_message("diff: %7zd \n",
+		      reference_total - compressed_total);
+	assert_true(reference_total != 0);
+	print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
+		      2 + (compressed_total >= reference_total),
+		      ((double)compressed_total) / reference_total);
+	/*
+	 * Assert that the compression is *about* as good as Windows. Of course
+	 * it doesn't matter if we do better, but mysteriously getting better
+	 * is usually a sign that something is wrong.
+	 *
+	 * At the time of writing, compressed_total is 2674004, or 10686 more
+	 * than the Windows reference total. That's < 0.5% difference, we're
+	 * asserting at 2%.
+	 */
+	assert_true(labs(compressed_total - reference_total) <
+		    compressed_total / 50);
+
+	assert_int_equal(score, i);
+	talloc_free(mem_ctx);
+}
+
+/*
+ * Bob Jenkins' Small Fast RNG.
+ *
+ * We don't need it to be this good, but we do need it to be reproducible
+ * across platforms, which rand() etc aren't.
+ *
+ * http://burtleburtle.net/bob/rand/smallprng.html
+ */
+
+/* 128 bits of generator state */
+struct jsf_rng {
+	uint32_t a;
+	uint32_t b;
+	uint32_t c;
+	uint32_t d;
+};
+
+/* rotate a 32-bit value left by k bits (k must be in 1..31) */
+#define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
+
+/* advance the generator, returning the next 32-bit pseudo-random value */
+static uint32_t jsf32(struct jsf_rng *x) {
+	uint32_t e = x->a - ROTATE32(x->b, 27);
+	x->a = x->b ^ ROTATE32(x->c, 17);
+	x->b = x->c + x->d;
+	x->c = x->d + e;
+	x->d = e + x->a;
+	return x->d;
+}
+
+/* seed the generator; 20 warm-up rounds scramble the initial state */
+static void jsf32_init(struct jsf_rng *x, uint32_t seed) {
+	size_t i;
+	x->a = 0xf1ea5eed;
+	x->b = x->c = x->d = seed;
+	for (i = 0; i < 20; ++i) {
+		jsf32(x);
+	}
+}
+
+
+/* Round-trip a ~5MB pseudo-text synthesised from COPYING (the GPLv3). */
+static void test_lzxpress_huffman_long_gpl_round_trip(void **state)
+{
+	/*
+	 * We use a kind of model-free Markov model to generate a massively
+	 * extended pastiche of the GPLv3 (chosen because it is right there in
+	 * "COPYING" and won't change often).
+	 *
+	 * The point is to check a round trip of a very long message with
+	 * multiple repetitions on many scales, without having to add a very
+	 * large file.
+	 */
+	size_t i, j, k;
+	uint8_t c;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	if (gpl.data == NULL) {
+		/* the test needs to run in a directory containing COPYING */
+		print_message("could not read COPYING\n");
+		fail();
+	}
+
+	jsf32_init(&rng, 1);
+
+	j = 1;
+	original.data[0] = gpl.data[0];
+	for (i = 1; i < original.length; i++) {
+		size_t m;
+		char p = original.data[i - 1];
+		c = gpl.data[j];
+		original.data[i] = c;
+		j++;
+		/*
+		 * Jump to a random spot and scan up to 30 bytes for the
+		 * bigram we just emitted; if found, continue copying from
+		 * there, creating repeats at many scales.
+		 */
+		m = (j + jsf32(&rng)) % (gpl.length - 50);
+		for (k = m; k < m + 30; k++) {
+			if (p == gpl.data[k] &&
+			    c == gpl.data[k + 1]) {
+				j = k + 2;
+				break;
+			}
+		}
+		if (j == gpl.length) {
+			j = 1;
+		}
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Round-trip 5MB of semi-structured trigram-graph output with bit damage. */
+static void test_lzxpress_huffman_long_random_graph_round_trip(void **state)
+{
+	size_t i;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	/*
+	 * There's a random trigram graph, with each pair of sequential bytes
+	 * pointing to a successor. This would probably fall into a fairly
+	 * simple loop, but we introduce damage into the system, randomly
+	 * flipping about 1 bit in 64.
+	 *
+	 * The result is semi-structured and compressible.
+	 */
+	uint8_t *d = original.data;
+	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+	uint32_t *table32 = (void*)table;
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	jsf32_init(&rng, 1);
+	/* fill the 64k successor table with random bytes, 4 at a time */
+	for (i = 0; i < (65536 / 4); i++) {
+		table32[i] = jsf32(&rng);
+	}
+
+	d[0] = 'a';
+	d[1] = 'b';
+
+	for (i = 2; i < original.length; i++) {
+		uint16_t k = (d[i - 2] << 8) | d[i - 1];
+		/* AND of 3 random words: each bit set with probability 1/8 */
+		uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
+		damage &= (damage >> 16);
+		k ^= damage & 0xffff;
+		d[i] = table[k];
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-graph", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Round-trip a trigram-graph stream whose graph itself keeps mutating. */
+static void test_lzxpress_huffman_chaos_graph_round_trip(void **state)
+{
+	size_t i;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	/*
+	 * There's a random trigram graph, with each pair of sequential bytes
+	 * pointing to a successor. This would probably fall into a fairly
+	 * simple loop, but we keep changing the graph. The result is long
+	 * periods of stability separated by bursts of noise.
+	 */
+	uint8_t *d = original.data;
+	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+	uint32_t *table32 = (void*)table;
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	jsf32_init(&rng, 1);
+	/* fill the 64k successor table with random bytes, 4 at a time */
+	for (i = 0; i < (65536 / 4); i++) {
+		table32[i] = jsf32(&rng);
+	}
+
+	d[0] = 'a';
+	d[1] = 'b';
+
+	for (i = 2; i < original.length; i++) {
+		uint16_t k = (d[i - 2] << 8) | d[i - 1];
+		uint32_t damage = jsf32(&rng);
+		d[i] = table[k];
+		/* with probability 1 in 8, rewrite one table entry */
+		if ((damage >> 29) == 0) {
+			uint16_t index = damage & 0xffff;
+			uint8_t value = (damage >> 16) & 0xff;
+			table[index] = value;
+		}
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/chaos-graph", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Round-trip highly repetitive trigram-graph output with rare damage. */
+static void test_lzxpress_huffman_sparse_random_graph_round_trip(void **state)
+{
+	size_t i;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	/*
+	 * There's a random trigram graph, with each pair of sequential bytes
+	 * pointing to a successor. This will fall into fairly simple loops,
+	 * but we introduce damage into the system, randomly mangling about 1
+	 * byte in 65536.
+	 *
+	 * The result has very long repetitive runs, which should lead to
+	 * oversized blocks.
+	 */
+	uint8_t *d = original.data;
+	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+	uint32_t *table32 = (void*)table;
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	jsf32_init(&rng, 3);
+	/* fill the 64k successor table with random bytes, 4 at a time */
+	for (i = 0; i < (65536 / 4); i++) {
+		table32[i] = jsf32(&rng);
+	}
+
+	d[0] = 'a';
+	d[1] = 'b';
+
+	for (i = 2; i < original.length; i++) {
+		uint16_t k = (d[i - 2] << 8) | d[i - 1];
+		uint32_t damage = jsf32(&rng);
+		/* mangle the lookup with probability 1 in 65536 */
+		if ((damage & 0xffff0000) == 0) {
+			k ^= damage & 0xffff;
+		}
+		d[i] = table[k];
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/sparse-random-graph", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Round-trip 1MB of incompressible noise and bound the size expansion. */
+static void test_lzxpress_huffman_random_noise_round_trip(void **state)
+{
+	size_t i;
+	size_t len = 1024 * 1024;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	/*
+	 * We are filling this up with incompressible noise, but we can assert
+	 * quite tight bounds on how badly it will fail to compress.
+	 *
+	 * Specifically, with randomly distributed codes, the Huffman table
+	 * should come out as roughly even, averaging 8 bit codes. Then there
+	 * will be a 256 byte table every 64k, which is a 1/256 overhead (i.e.
+	 * the compressed length will be 257/256 the original *on average*).
+	 * We assert it is less than 1 in 200 but more than 1 in 300.
+	 */
+	uint32_t *d32 = (uint32_t*)((void*)original.data);
+	struct jsf_rng rng;
+	jsf32_init(&rng, 2);
+
+	/* fill the buffer one 32-bit word at a time */
+	for (i = 0; i < (len / 4); i++) {
+		d32[i] = jsf32(&rng);
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-noise", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size > original.length + original.length / 300);
+	assert_true(comp_size < original.length + original.length / 200);
+	debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
+		      len, comp_size, ((double)comp_size) / len);
+
+	talloc_free(mem_ctx);
+}
+
+
+/*
+ * Round-trip runs of 'a's around and beyond the 64k match-length boundary,
+ * each followed by a short tail of 'b's.
+ */
+static void test_lzxpress_huffman_overlong_matches(void **state)
+{
+	size_t i, j = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
+	DATA_BLOB ref = {0};
+	uint8_t *d = original.data;
+	char filename[300];
+	/*
+	 * We are testing with something like "aaaaaaaaaaaaaaaaaaaaaaabbbbb"
+	 * where typically the number of "a"s is > 65536, and the number of
+	 * "b"s is < 42.
+	 */
+	/* -1 terminates each list; i and j end up holding the list lengths */
+	ssize_t na[] = {65535, 65536, 65537, 65559, 65575, 200000, -1};
+	ssize_t nb[] = {1, 2, 20, 39, 40, 41, 42, -1};
+	int score = 0;
+	ssize_t comp_size;
+
+	for (i = 0; na[i] >= 0; i++) {
+		ssize_t a = na[i];
+		memset(d, 'a', a);
+		for (j = 0; nb[j] >= 0; j++) {
+			ssize_t b = nb[j];
+			memset(d + a, 'b', b);
+			original.length = a + b;
+			snprintf(filename, sizeof(filename),
+				 "/tmp/overlong-%zd-%zd", a, b);
+			comp_size = attempt_round_trip(mem_ctx,
+						       original,
+						       filename, ref);
+			if (comp_size > 0) {
+				score++;
+			}
+		}
+	}
+	debug_message("%d/%zu correct\n", score, i * j);
+	assert_int_equal(score, i * j);
+	talloc_free(mem_ctx);
+}
+
+
+/*
+ * Round-trip three-symbol runs ("aaaabbbbcc") straddling the 64k boundary.
+ */
+static void test_lzxpress_huffman_overlong_matches_abc(void **state)
+{
+	size_t i, j = 0, k = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
+	DATA_BLOB ref = {0};
+	uint8_t *d = original.data;
+	char filename[300];
+	/*
+	 * We are testing with something like "aaaabbbbcc" where typically
+	 * the number of "a"s + "b"s is around 65536, and the number of "c"s
+	 * is < 43.
+	 */
+	/* -1 terminates each list; i, j, k end up holding the list lengths */
+	ssize_t nab[] = {1, 21, 32767, 32768, 32769, -1};
+	ssize_t nc[] = {1, 2, 20, 39, 40, 41, 42, -1};
+	int score = 0;
+	ssize_t comp_size;
+
+	for (i = 0; nab[i] >= 0; i++) {
+		ssize_t a = nab[i];
+		memset(d, 'a', a);
+		for (j = 0; nab[j] >= 0; j++) {
+			ssize_t b = nab[j];
+			memset(d + a, 'b', b);
+			for (k = 0; nc[k] >= 0; k++) {
+				ssize_t c = nc[k];
+				memset(d + a + b, 'c', c);
+				original.length = a + b + c;
+				snprintf(filename, sizeof(filename),
+					 "/tmp/overlong-abc-%zd-%zd-%zd",
+					 a, b, c);
+				comp_size = attempt_round_trip(mem_ctx,
+							       original,
+							       filename, ref);
+				if (comp_size > 0) {
+					score++;
+				}
+			}
+		}
+	}
+	debug_message("%d/%zu correct\n", score, i * j * k);
+	assert_int_equal(score, i * j * k);
+	talloc_free(mem_ctx);
+}
+
+
+/*
+ * Check that a middle block expressible as a single match (yielding a
+ * degenerate one-symbol Huffman tree) is handled correctly.
+ */
+static void test_lzxpress_huffman_extremely_compressible_middle(void **state)
+{
+	size_t len = 192 * 1024;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	/*
+	 * When a middle block (i.e. not the first and not the last of >= 3),
+	 * can be entirely expressed as a match starting in the previous
+	 * block, the Huffman tree would end up with 1 element, which does not
+	 * work for the code construction. It really wants to use both bits.
+	 * So we need to ensure we have some way of dealing with this.
+	 */
+	memset(original.data, 'a', 0x10000 - 1);
+	memset(original.data + 0x10000 - 1, 'b', 0x10000 + 1);
+	memset(original.data + 0x20000, 'a', 0x10000);
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/compressible-middle", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < 1024);
+	debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
+		      len, comp_size, ((double)comp_size) / len);
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Check behaviour at the reputed 64MB maximum match length. */
+static void test_lzxpress_huffman_max_length_limit(void **state)
+{
+	size_t len = 65 * 1024 * 1024;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc_zero(mem_ctx, len);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	/*
+	 * Reputedly Windows has a 64MB limit in the maximum match length it
+	 * will encode. We follow this, and test that here with nearly 65 MB
+	 * of zeros between two letters; this should be encoded in three
+	 * blocks:
+	 *
+	 * 1. 'a', 64M × '\0'
+	 * 2. (1M - 2) × '\0' -- finishing off what would have been the same match
+	 * 3. 'b' EOF
+	 *
+	 * Which we can assert by saying the length is > 768, < 1024.
+	 */
+	original.data[0] = 'a';
+	original.data[len - 1] = 'b';
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/max-length-limit", ref);
+	assert_true(comp_size > 0x300);
+	assert_true(comp_size < 0x400);
+	debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
+		      len, comp_size, ((double)comp_size) / len);
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Probe short all-'a' inputs around the LZ77 activation threshold. */
+static void test_lzxpress_huffman_short_boring_strings(void **state)
+{
+	size_t len = 64 * 1024;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	/* -1 terminates the list of lengths to try */
+	ssize_t lengths[] = {
+		1, 2, 20, 39, 40, 41, 42, 256, 270, 273, 274, 1000, 64000, -1};
+	char filename[300];
+	size_t i;
+	/*
+	 * How do short repetitive strings work? We're poking at the limit
+	 * around which LZ77 compression is turned on.
+	 *
+	 * For this test we don't change the blob memory between runs, just
+	 * the declared length.
+	 */
+	memset(original.data, 'a', len);
+	for (i = 0; lengths[i] >= 0; i++) {
+		original.length = lengths[i];
+		snprintf(filename, sizeof(filename),
+			 "/tmp/short-boring-%zu",
+			 original.length);
+		comp_size = attempt_round_trip(mem_ctx, original, filename, ref);
+		/* the expected sizes below reflect the fixed table overhead */
+		if (original.length < 41) {
+			assert_true(comp_size > 256 + original.length / 8);
+		} else if (original.length < 274) {
+			assert_true(comp_size == 261);
+		} else {
+			assert_true(comp_size == 263);
+		}
+		assert_true(comp_size < 261 + original.length / 8);
+	}
+	/* let's just show we didn't change the original */
+	for (i = 0; i < len; i++) {
+		if (original.data[i] != 'a') {
+			fail_msg("input data[%zu] was changed! (%2x, expected %2x)\n",
+				 i, original.data[i], 'a');
+		}
+	}
+
+	talloc_free(mem_ctx);
+}
+
+
+/* Check that the compressor rejects empty, zero-length, and NULL arguments. */
+static void test_lzxpress_huffman_compress_empty_or_null(void **state)
+{
+	/*
+	 * We expect these to fail with a -1, except the last one, which does
+	 * the real thing.
+	 */
+	ssize_t ret;
+	const uint8_t *input = bidirectional_pairs[0].decompressed.data;
+	size_t ilen = bidirectional_pairs[0].decompressed.length;
+	size_t olen = bidirectional_pairs[0].compressed.length;
+	uint8_t output[olen];
+	struct lzxhuff_compressor_mem cmp_mem;
+
+	/* zero input length and zero output space must both fail */
+	ret = lzxpress_huffman_compress(&cmp_mem, input, 0, output, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, 0);
+	assert_int_equal(ret, -1LL);
+
+	/* NULL input, output, or scratch memory must fail */
+	ret = lzxpress_huffman_compress(&cmp_mem, NULL, ilen, output, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, NULL, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_compress(NULL, input, ilen, output, olen);
+	assert_int_equal(ret, -1LL);
+
+	/* with valid arguments, the known-good vector compresses exactly */
+	ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, olen);
+	assert_int_equal(ret, olen);
+}
+
+
+/* Check that the decompressor rejects empty, zero-length, and NULL arguments. */
+static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
+{
+	/*
+	 * We expect these to fail with a -1, except the last one.
+	 */
+	ssize_t ret;
+	const uint8_t *input = bidirectional_pairs[0].compressed.data;
+	size_t ilen = bidirectional_pairs[0].compressed.length;
+	size_t olen = bidirectional_pairs[0].decompressed.length;
+	uint8_t output[olen];
+
+	/* zero input length and zero output space must both fail */
+	ret = lzxpress_huffman_decompress(input, 0, output, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_decompress(input, ilen, output, 0);
+	assert_int_equal(ret, -1LL);
+
+	/* NULL input or output must fail */
+	ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
+	assert_int_equal(ret, -1LL);
+
+	/* with valid arguments, the known-good vector decompresses exactly */
+	ret = lzxpress_huffman_decompress(input, ilen, output, olen);
+	assert_int_equal(ret, olen);
+}
+
+
+int main(void) {
+	/*
+	 * Each test is registered exactly once.
+	 * (test_lzxpress_huffman_long_random_graph_round_trip was previously
+	 * listed twice; the duplicate only repeated work.)
+	 */
+	const struct CMUnitTest tests[] = {
+		cmocka_unit_test(test_lzxpress_huffman_short_boring_strings),
+		cmocka_unit_test(test_lzxpress_huffman_max_length_limit),
+		cmocka_unit_test(test_lzxpress_huffman_extremely_compressible_middle),
+		cmocka_unit_test(test_lzxpress_huffman_long_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_huffman_chaos_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_huffman_sparse_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_huffman_round_trip),
+		cmocka_unit_test(test_lzxpress_huffman_decompress_files),
+		cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
+		cmocka_unit_test(test_lzxpress_huffman_compress),
+		cmocka_unit_test(test_lzxpress_huffman_decompress),
+		cmocka_unit_test(test_lzxpress_huffman_long_gpl_round_trip),
+		cmocka_unit_test(test_lzxpress_huffman_random_noise_round_trip),
+		cmocka_unit_test(test_lzxpress_huffman_overlong_matches_abc),
+		cmocka_unit_test(test_lzxpress_huffman_overlong_matches),
+		cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
+		cmocka_unit_test(test_lzxpress_huffman_compress_empty_or_null),
+	};
+	if (!isatty(1)) {
+		/* machine-readable output when stdout is not a terminal */
+		cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+	}
+
+	return cmocka_run_group_tests(tests, NULL, NULL);
+}
diff --git a/lib/compression/tests/test_lzxpress_plain.c b/lib/compression/tests/test_lzxpress_plain.c
new file mode 100644
index 0000000..1c14793
--- /dev/null
+++ b/lib/compression/tests/test_lzxpress_plain.c
@@ -0,0 +1,1194 @@
+/*
+ Unix SMB/CIFS implementation.
+ test suite for the compression functions
+
+ Copyright (C) Jelmer Vernooij 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <sys/stat.h>
+#include <cmocka.h>
+#include "includes.h"
+#include "talloc.h"
+#include "lzxpress.h"
+#include "lib/util/base64.h"
+
+
+/* set LZX_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZX_DEBUG_FILES false
+
+/* set LZX_DEBUG_VERBOSE to true to print more. */
+#define LZX_DEBUG_VERBOSE false
+
+
+#if LZX_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+
+#include <time.h>
+
+/* shared timer state; fine because the tests run single-threaded */
+struct timespec start = {0};
+struct timespec end = {0};
+
+/* record the starting time for a later debug_end_timer() report */
+static void debug_start_timer(void)
+{
+	clock_gettime(CLOCK_MONOTONIC, &start);
+}
+
+/* report the time elapsed since debug_start_timer() as an MB/s rate */
+static void debug_end_timer(const char *name, size_t len)
+{
+	uint64_t ns;
+	double secs;
+	double rate;
+	clock_gettime(CLOCK_MONOTONIC, &end);
+	ns = end.tv_nsec;
+	ns += end.tv_sec * 1000 * 1000 * 1000;
+	ns -= start.tv_nsec;
+	ns -= start.tv_sec * 1000 * 1000 * 1000;
+	secs = ns / 1e9;
+	rate = len / (secs * 1024 * 1024);
+	debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
+		      name, len, secs, rate);
+}
+
+#else
+/* no-op stand-ins when LZX_DEBUG_VERBOSE is off */
+#define debug_message(...) /* debug_message */
+#define debug_start_timer(...) /* debug_start_timer */
+#define debug_end_timer(...) /* debug_end_timer */
+#endif
+
+
+/* a named pair of compressed and decompressed blobs held in memory */
+struct lzx_pair {
+	const char *name;
+	DATA_BLOB compressed;
+	DATA_BLOB decompressed;
+};
+
+/* filenames for a compressed/decompressed pair on disk */
+struct lzx_file_pair {
+	const char *name;
+	const char *compressed_file;
+	const char *decompressed_file;
+};
+
+
+/* test vector locations, relative to the test's working directory */
+#define DECOMP_DIR "testdata/compression/decompressed"
+#define COMP_DIR "testdata/compression/compressed-plain"
+#define MORE_COMP_DIR "testdata/compression/compressed-more-plain"
+
+
+/* construct a DATA_BLOB directly from a list of byte values */
+#define BLOB_FROM_ARRAY(...)                             \
+	{                                                \
+		.data = (uint8_t[]){__VA_ARGS__},        \
+		.length = sizeof((uint8_t[]){__VA_ARGS__}) \
+	}
+
+/* construct a DATA_BLOB from a string literal, excluding the trailing NUL */
+#define BLOB_FROM_STRING(s)                                  \
+	{                                                    \
+		.data = discard_const_p(uint8_t, s),         \
+		.length = (sizeof(s) - 1)                    \
+	}
+
+
+/*
+ * The test vector corpus: each name has a ".decomp" original in DECOMP_DIR
+ * and (usually) Windows-compressed counterparts in the compressed
+ * directories. The list is NULL-terminated.
+ */
+const char * file_names[] = {
+	"generate-windows-test-vectors.c",
+	"fib_shuffle-128k+",
+	"fuzzing-0fc2d461b56cd8103c91",
+	"fuzzing-3ec3bca27bb9eb40c128",
+	"fuzzing-a3115a81d1ac500318f9",
+	"fuzzing-3591f9dc02bb00a54b60",
+	"27826-8.txt",
+	"5d049b4cb1bd933f5e8ex19",
+	"638e61e96d54279981c3x5",
+	"64k-minus-one-zeros",
+	"64k-plus-one-zeros",
+	"64k-zeros",
+	"96f696a4e5ce56c61a3dx10",
+	"9e0b6a12febf38e98f13",
+	"abc-times-101",
+	"abc-times-105",
+	"abc-times-200",
+	"b63289ccc7f218c0d56b",
+	"beta-variate1-128k+",
+	"beta-variate3-128k+",
+	"decayed_alphabet_128k+",
+	"decayed_alphabet_64k",
+	"f00842317dc6d5695b02",
+	"fib_shuffle",
+	"midsummer-nights-dream.txt",
+	"notes-on-the-underground.txt",
+	"pg22009.txt",
+	"repeating",
+	"repeating-exactly-64k",
+	"setup.log",
+	"slow-015ddc36a71412ccc50d",
+	"slow-100e9f966a7feb9ca40a",
+	"slow-2a671c3cff4f1574cbab",
+	"slow-33d90a24e70515b14cd0",
+	"slow-49d8c05261e3f412fc72",
+	"slow-50a249d2fe56873e56a0",
+	"slow-63e9f0b52235fb0129fa",
+	"slow-73b7f971d65908ac0095",
+	"slow-8b61e3dd267908544531",
+	"slow-9d1c5a079b0462986f1f",
+	"slow-aa7262a821dabdcf04a6",
+	"slow-b8a91d142b0d2af7f5ca",
+	"slow-c79142457734bbc8d575",
+	"slow-d736544545b90d83fe75",
+	"slow-e3b9bdfaed7d1a606fdb",
+	"slow-f3f1c02a9d006e5e1703",
+	"trigram_128k+",
+	"trigram_64k",
+	"trigram_sum_128k+",
+	"trigram_sum_64k",
+	NULL
+};
+
+
+
+/*
+ * Read an entire file into a talloc'd DATA_BLOB.
+ *
+ * On any failure (open, stat, allocation, or short read) an empty blob
+ * with NULL data is returned; callers check .data against NULL.
+ */
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+				    const char *filename)
+{
+	DATA_BLOB b = {0};
+	FILE *fh = fopen(filename, "rb");
+	int ret;
+	struct stat s;
+	size_t len;
+	if (fh == NULL) {
+		debug_message("could not open '%s'\n", filename);
+		return b;
+	}
+	ret = fstat(fileno(fh), &s);
+	if (ret != 0) {
+		fclose(fh);
+		return b;
+	}
+	b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+	if (b.data == NULL) {
+		fclose(fh);
+		return b;
+	}
+	len = fread(b.data, 1, s.st_size, fh);
+	if (ferror(fh) || len != s.st_size) {
+		/* short read or error: free the buffer, return empty blob */
+		TALLOC_FREE(b.data);
+	} else {
+		b.length = len;
+	}
+	fclose(fh);
+	return b;
+}
+
+
+
+/* Decompress each reference .lzplain file and compare with its original. */
+static void test_lzxpress_plain_decompress_files(void **state)
+{
+	size_t i;
+	int score = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		uint8_t *dest = NULL;
+		ssize_t written;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzplain", COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.compressed.data);
+
+		dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+		debug_start_timer();
+		written = lzxpress_decompress(p.compressed.data,
+					      p.compressed.length,
+					      dest,
+					      p.decompressed.length);
+		debug_end_timer("decompress", p.decompressed.length);
+		/* a round trip passes only if size and content both match */
+		if (written == p.decompressed.length &&
+		    memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+			debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+			score++;
+		} else {
+			debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+				      p.name);
+			debug_message("size %zd vs reference %zu\n",
+				      written, p.decompressed.length);
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%zu correct\n", score, i);
+	assert_int_equal(score, i);
+}
+
+
+/* Decompress the higher-effort Windows reference files where available. */
+static void test_lzxpress_plain_decompress_more_compressed_files(void **state)
+{
+	/*
+	 * This tests the decompression of files that have been compressed on
+	 * Windows with the level turned up (to 1, default for MS-XCA is 0).
+	 *
+	 * The format is identical, but it will have tried harder to find
+	 * matches.
+	 */
+	size_t i;
+	int score = 0;
+	int found = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		uint8_t *dest = NULL;
+		ssize_t written;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzplain", MORE_COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		if (p.compressed.data == NULL) {
+			/*
+			 * We don't have all the vectors in the
+			 * more-compressed directory, which is OK, we skip
+			 * them.
+			 */
+			continue;
+		}
+		found++;
+		dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+		debug_start_timer();
+		written = lzxpress_decompress(p.compressed.data,
+					      p.compressed.length,
+					      dest,
+					      p.decompressed.length);
+		debug_end_timer("decompress", p.decompressed.length);
+		/* a pass requires a non-error result with matching content */
+		if (written != -1 &&
+		    written == p.decompressed.length &&
+		    memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+			debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+			score++;
+		} else {
+			debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+				      p.name);
+			debug_message("size %zd vs reference %zu\n",
+				      written, p.decompressed.length);
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%d correct\n", score, found);
+	assert_int_equal(score, found);
+}
+
+
+/*
+ * attempt_round_trip() tests whether a data blob can survive a compression
+ * and decompression cycle. If save_name is not NULL and LZX_DEBUG_FILES
+ * evals to true, the various stages are saved in files with that name and the
+ * '-original', '-compressed', and '-decompressed' suffixes. If ref_compressed
+ * has data, it'll print a message saying whether the compressed data matches
+ * that.
+ *
+ * Returns the compressed size on a successful round trip, or -1 on any
+ * failure (compression failed, decompression failed, or the result differs
+ * from the original).
+ */
+
+static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
+				  DATA_BLOB original,
+				  const char *save_name,
+				  DATA_BLOB ref_compressed)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+	/* worst-case growth for the plain format: just over 8/7 the input */
+	DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
+						original.length * 8 / 7 + 8);
+	DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
+						  original.length);
+	ssize_t comp_written, decomp_written;
+	debug_start_timer();
+	comp_written = lzxpress_compress(original.data,
+					 original.length,
+					 compressed.data,
+					 compressed.length);
+	debug_end_timer("compress", original.length);
+	if (comp_written <= 0) {
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	if (ref_compressed.data != NULL) {
+		/*
+		 * This is informational, not an assertion; there are
+		 * ~infinite legitimate ways to compress the data, many as
+		 * good as each other (think of compression as a language, not
+		 * a format).
+		 */
+		debug_message("compressed size %zd vs reference %zu\n",
+			      comp_written, ref_compressed.length);
+
+		/*
+		 * Compare against the reference length, not the scratch
+		 * buffer length (which is always an over-estimate and
+		 * would never match).
+		 */
+		if ((size_t)comp_written == ref_compressed.length &&
+		    memcmp(compressed.data, ref_compressed.data, comp_written) == 0) {
+			debug_message("\033[1;32mbyte identical!\033[0m\n");
+		}
+	}
+	debug_start_timer();
+	decomp_written = lzxpress_decompress(compressed.data,
+					     comp_written,
+					     decompressed.data,
+					     decompressed.length);
+	debug_end_timer("decompress", original.length);
+	if (save_name != NULL && LZX_DEBUG_FILES) {
+		char s[300];
+		FILE *fh = NULL;
+
+		snprintf(s, sizeof(s), "%s-original", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+		fh = fopen(s, "w");
+		if (fh != NULL) {
+			fwrite(original.data, 1, original.length, fh);
+			fclose(fh);
+		}
+
+		snprintf(s, sizeof(s), "%s-compressed", save_name);
+		/* comp_written is ssize_t, so %zd rather than %zu */
+		fprintf(stderr, "Saving %zd bytes to %s\n", comp_written, s);
+		fh = fopen(s, "w");
+		if (fh != NULL) {
+			fwrite(compressed.data, 1, comp_written, fh);
+			fclose(fh);
+		}
+		/*
+		 * We save the decompressed file using original.length, not
+		 * the returned size. If these differ, the returned size will
+		 * be -1. By saving the whole buffer we can see at what point
+		 * it went haywire.
+		 */
+		snprintf(s, sizeof(s), "%s-decompressed", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+		fh = fopen(s, "w");
+		if (fh != NULL) {
+			fwrite(decompressed.data, 1, original.length, fh);
+			fclose(fh);
+		}
+	}
+
+	/* check decomp_written < 0 first to avoid signed/unsigned surprises */
+	if (decomp_written < 0 ||
+	    original.length != (size_t)decomp_written ||
+	    memcmp(decompressed.data,
+		   original.data,
+		   original.length) != 0) {
+		debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
+			      decomp_written,
+			      original.length);
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+	talloc_free(tmp_ctx);
+	return comp_written;
+}
+
+
+/*
+ * Round-trip every corpus file through the plain compressor and
+ * decompressor, comparing the total compressed size against the Windows
+ * reference vectors where those are available.
+ */
+static void test_lzxpress_plain_round_trip_files(void **state)
+{
+	size_t i;
+	int score = 0;
+	ssize_t compressed_total = 0;
+	ssize_t reference_total = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		char *debug_files = NULL;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		ssize_t comp_size;
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+		debug_message("-------------------\n");
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzplain", COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		if (p.compressed.data == NULL) {
+			/*
+			 * A missing reference file is not fatal; we still
+			 * check the round trip, just not the size ratio.
+			 */
+			debug_message(
+				"Could not load %s reference file %s\n",
+				p.name, filename);
+			debug_message("%s decompressed %zu\n", p.name,
+				      p.decompressed.length);
+		} else {
+			debug_message("%s: reference compressed %zu decomp %zu\n",
+				      p.name,
+				      p.compressed.length,
+				      p.decompressed.length);
+		}
+		if (1) {
+			/*
+			 * We're going to save copies in /tmp.
+			 */
+			snprintf(filename, sizeof(filename),
+				 "/tmp/lzxplain-%s", p.name);
+			debug_files = filename;
+		}
+
+		comp_size = attempt_round_trip(mem_ctx, p.decompressed,
+					       debug_files,
+					       p.compressed);
+		if (comp_size > 0) {
+			debug_message("\033[1;32mround trip!\033[0m\n");
+			score++;
+			if (p.compressed.length) {
+				compressed_total += comp_size;
+				reference_total += p.compressed.length;
+			}
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%zu correct\n", score, i);
+	/* compressed_total is ssize_t, so %zd, not %zu */
+	print_message("\033[1;34mtotal compressed size: %zd\033[0m\n",
+		      compressed_total);
+	print_message("total reference size: %zd \n", reference_total);
+	print_message("diff: %7zd \n",
+		      reference_total - compressed_total);
+	assert_true(reference_total != 0);
+	print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
+		      2 + (compressed_total >= reference_total),
+		      ((double)compressed_total) / reference_total);
+	/*
+	 * Assert that the compression is better than Windows. Unlike the
+	 * Huffman variant, where things are very even, here we do much better
+	 * than Windows without especially trying.
+	 */
+	assert_true(compressed_total <= reference_total);
+
+	assert_int_equal(score, i);
+	talloc_free(mem_ctx);
+}
+
+
/*
 * Bob Jenkins' small fast PRNG ("jsf32").
 *
 * We don't need this much quality, but we do need results that are
 * reproducible across platforms, which rand() and friends are not.
 *
 * http://burtleburtle.net/bob/rand/smallprng.html
 */

struct jsf_rng {
	uint32_t a;
	uint32_t b;
	uint32_t c;
	uint32_t d;
};

#define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))

/* Advance the generator one step and return the next 32-bit value. */
static uint32_t jsf32(struct jsf_rng *x) {
	uint32_t tmp = x->a - ROTATE32(x->b, 27);
	x->a = x->b ^ ROTATE32(x->c, 17);
	x->b = x->c + x->d;
	x->c = x->d + tmp;
	x->d = tmp + x->a;
	return x->d;
}

/* Seed the state, then discard 20 outputs to mix it thoroughly. */
static void jsf32_init(struct jsf_rng *x, uint32_t seed) {
	size_t round;
	x->a = 0xf1ea5eed;
	x->b = x->c = x->d = seed;
	for (round = 0; round < 20; ++round) {
		jsf32(x);
	}
}
+
+
+static void test_lzxpress_plain_long_gpl_round_trip(void **state)
+{
+ /*
+ * We use a kind of model-free Markov model to generate a massively
+ * extended pastiche of the GPLv3 (chosen because it is right there in
+ * "COPYING" and won't change often).
+ *
+ * The point is to check a round trip of a very long message with
+ * multiple repetitions on many scales, without having to add a very
+ * large file.
+ */
+ size_t i, j, k;
+ uint8_t c;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+
+ jsf32_init(&rng, 1);
+
+ j = 1;
+ original.data[0] = gpl.data[0];
+ for (i = 1; i < original.length; i++) {
+ size_t m;
+ char p = original.data[i - 1];
+ c = gpl.data[j];
+ original.data[i] = c;
+ j++;
+ m = (j + jsf32(&rng)) % (gpl.length - 50);
+ for (k = m; k < m + 30; k++) {
+ if (p == gpl.data[k] &&
+ c == gpl.data[k + 1]) {
+ j = k + 2;
+ break;
+ }
+ }
+ if (j == gpl.length) {
+ j = 1;
+ }
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
static void test_lzxpress_plain_long_random_graph_round_trip(void **state)
{
	size_t i;
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
	DATA_BLOB ref = {0};
	/*
	 * There's a random trigram graph, with each pair of sequential bytes
	 * pointing to a successor. This would probably fall into a fairly
	 * simple loop, but we introduce damage into the system, randomly
	 * flipping about 1 bit in 64.
	 *
	 * The result is semi-structured and compressible.
	 */
	uint8_t *d = original.data;
	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
	/* word-sized view of the table, to fill it 4 bytes at a time */
	uint32_t *table32 = (void*)table;
	ssize_t comp_size;
	struct jsf_rng rng;

	jsf32_init(&rng, 1);
	for (i = 0; i < (65536 / 4); i++) {
		table32[i] = jsf32(&rng);
	}

	d[0] = 'a';
	d[1] = 'b';

	for (i = 2; i < original.length; i++) {
		/* the previous two bytes index the successor table */
		uint16_t k = (d[i - 2] << 8) | d[i - 1];
		/* AND of 3 random words: each bit set with p = 1/8 */
		uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
		/* AND with its own top half: low bits now p = 1/64 */
		damage &= (damage >> 16);
		k ^= damage & 0xffff;
		d[i] = table[k];
	}

	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-graph", ref);
	assert_true(comp_size > 0);
	assert_true(comp_size < original.length);

	talloc_free(mem_ctx);
}
+
+
+static void test_lzxpress_plain_chaos_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This would probably fall into a fairly
+ * simple loop, but we keep changing the graph. The result is long
+ * periods of stability separatd by bursts of noise.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 1);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ uint32_t damage = jsf32(&rng);
+ d[i] = table[k];
+ if ((damage >> 29) == 0) {
+ uint16_t index = damage & 0xffff;
+ uint8_t value = (damage >> 16) & 0xff;
+ table[index] = value;
+ }
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/chaos-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
static void test_lzxpress_plain_sparse_random_graph_round_trip(void **state)
{
	size_t i;
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
	DATA_BLOB ref = {0};
	/*
	 * There's a random trigram graph, with each pair of sequential bytes
	 * pointing to a successor. This will fall into fairly simple loops,
	 * but we introduce damage into the system, randomly mangling about 1
	 * byte in 65536.
	 *
	 * The result has very long repetitive runs, which should lead to
	 * oversized blocks.
	 */
	uint8_t *d = original.data;
	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
	/* word-sized view of the table, to fill it 4 bytes at a time */
	uint32_t *table32 = (void*)table;
	ssize_t comp_size;
	struct jsf_rng rng;

	jsf32_init(&rng, 3);
	for (i = 0; i < (65536 / 4); i++) {
		table32[i] = jsf32(&rng);
	}

	d[0] = 'a';
	d[1] = 'b';

	for (i = 2; i < original.length; i++) {
		uint16_t k = (d[i - 2] << 8) | d[i - 1];
		uint32_t damage = jsf32(&rng);
		/* roughly 1 in 65536: perturb the lookup index */
		if ((damage & 0xffff0000) == 0) {
			k ^= damage & 0xffff;
		}
		d[i] = table[k];
	}

	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/sparse-random-graph", ref);
	assert_true(comp_size > 0);
	assert_true(comp_size < original.length);

	talloc_free(mem_ctx);
}
+
+
/*
 * Compress 10MiB of incompressible random noise and check the
 * expansion stays within tight, predicted bounds.
 */
static void test_lzxpress_plain_random_noise_round_trip(void **state)
{
	size_t i;
	size_t len = 10 * 1024 * 1024;
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
	DATA_BLOB ref = {0};
	ssize_t comp_size;
	/*
	 * We are filling this up with incompressible noise, but we can assert
	 * quite tight bounds on how badly it will fail to compress.
	 *
	 * There is one additional bit for each code, which says whether the
	 * code is a literal byte or a match. If *all* codes are literal
	 * bytes, the length should be 9/8 the original (with rounding
	 * issues regarding the indicator bit blocks).
	 *
	 * If some matches are found the length will be a bit less. We would
	 * expect one 3 byte match per 1 << 24 tries, but we try 8192 times
	 * per position. That means there'll be a match 1/2048 of the time at
	 * best. 255 times out of 256 this will be exactly a 3 byte match,
	 * encoded as two bytes, so we could get a 1 / 2048 saving on top of
	 * the 1/8 cost. There'll be a smattering of longer matches too, and
	 * the potential for complicated maths to account for those, but we'll
	 * skimp on that by allowing for a 1/1500 saving.
	 *
	 * With the hash table, we take a shortcut in the "8192 tries", and
	 * the size of the table makes a difference in how we perform, with 13
	 * bits (8192 slots) naturally being luckier than 12. Ultimately,
	 * either way, the compressed file is still 12.5% bigger than the
	 * original.
	 */
	size_t limit = len * 9 / 8 + 4;

	/* word-sized view of the buffer, to fill it 4 bytes at a time */
	uint32_t *d32 = (uint32_t*)((void*)original.data);
	struct jsf_rng rng;
	jsf32_init(&rng, 2);

	for (i = 0; i < (len / 4); i++) {
		d32[i] = jsf32(&rng);
	}

	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-noise", ref);
	debug_message("original size %zu; compressed size %zd; ratio %.5f\n",
		      len, comp_size, ((double)comp_size) / len);
	debug_message("expected range %zu - %zu\n",
		      limit - limit / 1500, limit);

	assert_true(comp_size > 0);
	assert_true(comp_size < limit);
	assert_true(comp_size >= limit - limit / 1500);
	talloc_free(mem_ctx);
}
+
+
+/* Tests based on [MS-XCA] 3.1 Examples */
+static void test_msft_data1(void **state)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ const char *fixed_data = "abcdefghijklmnopqrstuvwxyz";
+ const uint8_t fixed_out[] = {
+ 0x3f, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+ 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+ 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a };
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+
+static void test_msft_data2(void **state)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ const char *fixed_data =
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabc";
+ const uint8_t fixed_out[] = {
+ 0xff, 0xff, 0xff, 0x1f, 0x61, 0x62, 0x63, 0x17,
+ 0x00, 0x0f, 0xff, 0x26, 0x01};
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
/*
 * Test a basic lzxpress round trip ("this is a test..."), including
 * decompression of an alternative encoding (fixed_out_old_version) —
 * presumably the output of an earlier compressor version; the
 * decompressor must still accept it (TODO: confirm origin against
 * git history).
 */
static void test_lzxpress(void **state)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	const char *fixed_data = "this is a test. and this is a test too";
	const uint8_t fixed_out[] = {
		0xff, 0x21, 0x00, 0x04, 0x74, 0x68, 0x69, 0x73,
		0x20, 0x10, 0x00, 0x61, 0x20, 0x74, 0x65, 0x73,
		0x74, 0x2E, 0x20, 0x61, 0x6E, 0x64, 0x20, 0x9F,
		0x00, 0x04, 0x20, 0x74, 0x6F, 0x6F };

	/* same payload, differently encoded (note trailing zeros) */
	const uint8_t fixed_out_old_version[] = {
		0x00, 0x20, 0x00, 0x04, 0x74, 0x68, 0x69, 0x73,
		0x20, 0x10, 0x00, 0x61, 0x20, 0x74, 0x65, 0x73,
		0x74, 0x2E, 0x20, 0x61, 0x6E, 0x64, 0x20, 0x9F,
		0x00, 0x04, 0x20, 0x74, 0x6F, 0x6F, 0x00, 0x00,
		0x00, 0x00 };

	ssize_t c_size;
	uint8_t *out, *out2, *out3;

	/* poison the buffer so stray bytes are visible */
	out = talloc_size(tmp_ctx, 2048);
	memset(out, 0x42, talloc_get_size(out));

	/* compression must produce the current canonical encoding */
	c_size = lzxpress_compress((const uint8_t *)fixed_data,
				   strlen(fixed_data),
				   out,
				   talloc_get_size(out));

	assert_int_not_equal(c_size, -1);
	assert_int_equal(c_size, sizeof(fixed_out));
	assert_memory_equal(out, fixed_out, c_size);

	/* round trip through the decompressor */
	out2 = talloc_size(tmp_ctx, strlen(fixed_data));
	c_size = lzxpress_decompress(out,
				     sizeof(fixed_out),
				     out2,
				     talloc_get_size(out2));

	assert_int_not_equal(c_size, -1);
	assert_int_equal(c_size, strlen(fixed_data));
	assert_memory_equal(out2, fixed_data, c_size);

	/* the older encoding must also decompress to the same data */
	out3 = talloc_size(tmp_ctx, strlen(fixed_data));
	c_size = lzxpress_decompress(fixed_out_old_version,
				     sizeof(fixed_out_old_version),
				     out3,
				     talloc_get_size(out3));

	assert_int_not_equal(c_size, -1);
	assert_int_equal(c_size, strlen(fixed_data));
	assert_memory_equal(out3, fixed_data, c_size);

	talloc_free(tmp_ctx);
}
+
+static void test_lzxpress2(void **state)
+{
+ /*
+ * Use two matches, separated by a literal, and each with a length
+ * greater than 10, to test the use of nibble_index. Both length values
+ * (less ten) should be stored as adjacent nibbles to form the 0x21
+ * byte.
+ */
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ const char *fixed_data = "aaaaaaaaaaaabaaaaaaaaaaaa";
+ const uint8_t fixed_out[] = {
+ 0xff, 0xff, 0xff, 0x5f, 0x61, 0x07, 0x00, 0x21,
+ 0x62, 0x67, 0x00};
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
static void test_lzxpress3(void **state)
{
	/*
	 * Use a series of 31 literals, followed by a single minimum-length
	 * match (and a terminating literal), to test setting indic_pos when the
	 * 32-bit flags value overflows after a match.
	 */

	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	const char *fixed_data = "abcdefghijklmnopqrstuvwxyz01234abca";
	const uint8_t fixed_out[] = {
		0x01, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
		0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
		0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
		0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31,
		0x32, 0x33, 0x34, 0xf0, 0x00, 0xff, 0xff, 0xff,
		0x7f, 0x61};

	ssize_t c_size;
	uint8_t *out, *out2;

	/* poison the buffer so stray bytes are visible */
	out = talloc_size(tmp_ctx, 2048);
	memset(out, 0x42, talloc_get_size(out));

	/* compression must produce exactly the expected encoding */
	c_size = lzxpress_compress((const uint8_t *)fixed_data,
				   strlen(fixed_data),
				   out,
				   talloc_get_size(out));

	assert_int_not_equal(c_size, -1);
	assert_int_equal(c_size, sizeof(fixed_out));
	assert_memory_equal(out, fixed_out, c_size);

	/* and the encoding must round-trip back to the input */
	out2 = talloc_size(tmp_ctx, strlen(fixed_data));
	c_size = lzxpress_decompress(out,
				     sizeof(fixed_out),
				     out2,
				     talloc_get_size(out2));

	assert_int_not_equal(c_size, -1);
	assert_int_equal(c_size, strlen(fixed_data));
	assert_memory_equal(out2, fixed_data, c_size);

	talloc_free(tmp_ctx);
}
+
+static void test_lzxpress4(void **state)
+{
+ /*
+ * Use a series of 31 literals, followed by a single minimum-length
+ * match, to test that the final set of 32-bit flags is written
+ * correctly when it is empty.
+ */
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ const char *fixed_data = "abcdefghijklmnopqrstuvwxyz01234abc";
+ const uint8_t fixed_out[] = {
+ 0x01, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+ 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+ 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31,
+ 0x32, 0x33, 0x34, 0xf0, 0x00, 0xff, 0xff, 0xff,
+ 0xff};
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+
static void test_lzxpress_many_zeros(void **state)
{
	/*
	 * Repeated values (zero is convenient but not special) will lead to
	 * very long substring searches in compression, which can be very slow
	 * if we're not careful.
	 *
	 * This test makes a very loose assertion about how long it should
	 * take to compress a million zeros.
	 *
	 * Wall clock time *should* be < 0.1 seconds with the fix and around a
	 * minute without it. We try for CLOCK_THREAD_CPUTIME_ID which should
	 * filter out some noise on the machine; the assertion below allows
	 * 3 seconds.
	 */

	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	const size_t N_ZEROS = 1000000;
	const uint8_t *zeros = talloc_zero_size(tmp_ctx, N_ZEROS);
	const ssize_t expected_c_size_max = 120;
	const ssize_t expected_c_size_min = 93;
	ssize_t c_size;
	uint8_t *comp, *decomp;
	static struct timespec t_start, t_end;
	uint64_t elapsed_ns;

	/*
	 * Prefer per-thread CPU time; fall back to a monotonic clock
	 * (CUSTOM_CLOCK_MONOTONIC — presumably a project-level define),
	 * then to the wall clock as a last resort.
	 */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_start) != 0) {
		if (clock_gettime(CUSTOM_CLOCK_MONOTONIC, &t_start) != 0) {
			clock_gettime(CLOCK_REALTIME, &t_start);
		}
	}

	comp = talloc_zero_size(tmp_ctx, 2048);

	c_size = lzxpress_compress(zeros,
				   N_ZEROS,
				   comp,
				   talloc_get_size(comp));
	/*
	 * Because our compression depends on heuristics, we don't insist on
	 * an exact size in this case.
	 */

	assert_true(c_size <= expected_c_size_max);
	assert_true(c_size >= expected_c_size_min);

	decomp = talloc_size(tmp_ctx, N_ZEROS * 2);
	c_size = lzxpress_decompress(comp,
				     c_size,
				     decomp,
				     N_ZEROS * 2);

	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end) != 0) {
		if (clock_gettime(CUSTOM_CLOCK_MONOTONIC, &t_end) != 0) {
			clock_gettime(CLOCK_REALTIME, &t_end);
		}
	}
	elapsed_ns = (
		(t_end.tv_sec - t_start.tv_sec) * 1000U * 1000U * 1000U) +
		(t_end.tv_nsec - t_start.tv_nsec);
	print_message("round-trip time: %"PRIu64" ns\n", elapsed_ns);
	assert_true(elapsed_ns < 3 * 1000U * 1000U * 1000U);
	assert_memory_equal(decomp, zeros, N_ZEROS);

	talloc_free(tmp_ctx);
}
+
+
+static void test_lzxpress_round_trip(void **state)
+{
+ /*
+ * Examples found using via fuzzing.
+ */
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ size_t i;
+ struct b64_pair {
+ const char *uncompressed;
+ const char *compressed;
+ } pairs[] = {
+ { /* this results in a trailing flags block */
+ "AAICAmq/EKdP785YU2Ddh7d4vUtdlQyLeHV09LHpUBw=",
+ "AAAAAAACAgJqvxCnT+/OWFNg3Ye3eL1LXZUMi3h1dPSx6VAc/////w==",
+ },
+ { /* empty string compresses to empty string */
+ "", ""
+ },
+ };
+ const size_t alloc_size = 1000;
+ uint8_t *data = talloc_array(tmp_ctx, uint8_t, alloc_size);
+
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) {
+ ssize_t len;
+ DATA_BLOB uncomp = base64_decode_data_blob_talloc(
+ tmp_ctx,
+ pairs[i].uncompressed);
+ DATA_BLOB comp = base64_decode_data_blob_talloc(
+ tmp_ctx,
+ pairs[i].compressed);
+
+ len = lzxpress_compress(uncomp.data,
+ uncomp.length,
+ data,
+ alloc_size);
+
+ assert_int_not_equal(len, -1);
+ assert_int_equal(len, comp.length);
+
+ assert_memory_equal(comp.data, data, len);
+
+ len = lzxpress_decompress(comp.data,
+ comp.length,
+ data,
+ alloc_size);
+
+ assert_int_not_equal(len, -1);
+ assert_int_equal(len, uncomp.length);
+
+ assert_memory_equal(uncomp.data, data, len);
+ }
+ talloc_free(tmp_ctx);
+}
+
+
+int main(void)
+{
+ const struct CMUnitTest tests[] = {
+ cmocka_unit_test(test_lzxpress_plain_decompress_files),
+ cmocka_unit_test(test_lzxpress_plain_decompress_more_compressed_files),
+ cmocka_unit_test(test_lzxpress_plain_round_trip_files),
+ cmocka_unit_test(test_lzxpress_plain_long_gpl_round_trip),
+ cmocka_unit_test(test_lzxpress_plain_long_random_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_plain_chaos_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_plain_sparse_random_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_plain_long_random_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_plain_random_noise_round_trip),
+ cmocka_unit_test(test_lzxpress),
+ cmocka_unit_test(test_msft_data1),
+ cmocka_unit_test(test_msft_data2),
+ cmocka_unit_test(test_lzxpress2),
+ cmocka_unit_test(test_lzxpress3),
+ cmocka_unit_test(test_lzxpress4),
+ cmocka_unit_test(test_lzxpress_many_zeros),
+ cmocka_unit_test(test_lzxpress_round_trip),
+ };
+ if (!isatty(1)) {
+ cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+ }
+ return cmocka_run_group_tests(tests, NULL, NULL);
+}