1 files changed, 1194 insertions, 0 deletions
diff --git a/lib/compression/tests/test_lzxpress_plain.c b/lib/compression/tests/test_lzxpress_plain.c
new file mode 100644
index 0000000..1c14793
--- /dev/null
+++ b/lib/compression/tests/test_lzxpress_plain.c
@@ -0,0 +1,1194 @@
+/*
+   Unix SMB/CIFS implementation.
+   test suite for the compression functions
+
+   Copyright (C) Jelmer Vernooij 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <sys/stat.h>
+#include <cmocka.h>
+#include "includes.h"
+#include "talloc.h"
+#include "lzxpress.h"
+#include "lib/util/base64.h"
+
+
+/* set LZX_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZX_DEBUG_FILES false
+
+/* set LZX_DEBUG_VERBOSE to true to print more. */
+#define LZX_DEBUG_VERBOSE false
+
+
+#if LZX_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+
+#include <time.h>
+
+struct timespec start = {0};
+struct timespec end = {0};
+static void debug_start_timer(void)
+{
+	clock_gettime(CLOCK_MONOTONIC, &start);
+}
+
+static void debug_end_timer(const char *name, size_t len)
+{
+	uint64_t ns;
+	double secs;
+	double rate;
+	clock_gettime(CLOCK_MONOTONIC, &end);
+	ns = end.tv_nsec;
+	ns += end.tv_sec * 1000 * 1000 * 1000;
+	ns -= start.tv_nsec;
+	ns -= start.tv_sec * 1000 * 1000 * 1000;
+	secs = ns / 1e9;
+	rate = len / (secs * 1024 * 1024);
+	debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
+		      name, len, secs, rate);
+}
+
+#else
+#define debug_message(...) /* debug_message */
+#define debug_start_timer(...) /* debug_start_timer */
+#define debug_end_timer(...) /* debug_end_timer */
+#endif
+
+
+struct lzx_pair {
+	const char *name;
+	DATA_BLOB compressed;
+	DATA_BLOB decompressed;
+};
+
+struct lzx_file_pair {
+	const char *name;
+	const char *compressed_file;
+	const char *decompressed_file;
+};
+
+
+#define DECOMP_DIR "testdata/compression/decompressed"
+#define COMP_DIR "testdata/compression/compressed-plain"
+#define MORE_COMP_DIR "testdata/compression/compressed-more-plain"
+
+
+#define BLOB_FROM_ARRAY(...)                             \
+	{                                                \
+		.data = (uint8_t[]){__VA_ARGS__},          \
+		.length = sizeof((uint8_t[]){__VA_ARGS__}) \
+	}
+
+#define BLOB_FROM_STRING(s)                                      \
+	{                                                            \
+		.data = discard_const_p(uint8_t, s),		     \
+		.length = (sizeof(s) - 1)		     \
+	}
+
+
+const char * file_names[] = {
+	"generate-windows-test-vectors.c",
+	"fib_shuffle-128k+",
+	"fuzzing-0fc2d461b56cd8103c91",
+	"fuzzing-3ec3bca27bb9eb40c128",
+	"fuzzing-a3115a81d1ac500318f9",
+	"fuzzing-3591f9dc02bb00a54b60",
+	"27826-8.txt",
+	"5d049b4cb1bd933f5e8ex19",
+	"638e61e96d54279981c3x5",
+	"64k-minus-one-zeros",
+	"64k-plus-one-zeros",
+	"64k-zeros",
+	"96f696a4e5ce56c61a3dx10",
+	"9e0b6a12febf38e98f13",
+	"abc-times-101",
+	"abc-times-105",
+	"abc-times-200",
+	"b63289ccc7f218c0d56b",
+	"beta-variate1-128k+",
+	"beta-variate3-128k+",
+	"decayed_alphabet_128k+",
+	"decayed_alphabet_64k",
+	"f00842317dc6d5695b02",
+	"fib_shuffle",
+	"midsummer-nights-dream.txt",
+	"notes-on-the-underground.txt",
+	"pg22009.txt",
+	"repeating",
+	"repeating-exactly-64k",
+	"setup.log",
+	"slow-015ddc36a71412ccc50d",
+	"slow-100e9f966a7feb9ca40a",
+	"slow-2a671c3cff4f1574cbab",
+	"slow-33d90a24e70515b14cd0",
+	"slow-49d8c05261e3f412fc72",
+	"slow-50a249d2fe56873e56a0",
+	"slow-63e9f0b52235fb0129fa",
+	"slow-73b7f971d65908ac0095",
+	"slow-8b61e3dd267908544531",
+	"slow-9d1c5a079b0462986f1f",
+	"slow-aa7262a821dabdcf04a6",
+	"slow-b8a91d142b0d2af7f5ca",
+	"slow-c79142457734bbc8d575",
+	"slow-d736544545b90d83fe75",
+	"slow-e3b9bdfaed7d1a606fdb",
+	"slow-f3f1c02a9d006e5e1703",
+	"trigram_128k+",
+	"trigram_64k",
+	"trigram_sum_128k+",
+	"trigram_sum_64k",
+	NULL
+};
+
+
+
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+				    const char *filename)
+{
+	DATA_BLOB b = {0};
+	FILE *fh = fopen(filename, "rb");
+	int ret;
+	struct stat s;
+	size_t len;
+	if (fh == NULL) {
+		debug_message("could not open '%s'\n", filename);
+		return b;
+	}
+	ret = fstat(fileno(fh), &s);
+	if (ret != 0) {
+		fclose(fh);
+		return b;
+	}
+	b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+	if (b.data == NULL) {
+		fclose(fh);
+		return b;
+	}
+	len = fread(b.data, 1, s.st_size, fh);
+	if (ferror(fh) || len != s.st_size) {
+		TALLOC_FREE(b.data);
+	} else {
+		b.length = len;
+	}
+	fclose(fh);
+	return b;
+}
+
+
+
+static void test_lzxpress_plain_decompress_files(void **state)
+{
+	size_t i;
+	int score = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		uint8_t *dest = NULL;
+		ssize_t written;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzplain", COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.compressed.data);
+
+		dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+		debug_start_timer();
+		written = lzxpress_decompress(p.compressed.data,
+					      p.compressed.length,
+					      dest,
+					      p.decompressed.length);
+		debug_end_timer("decompress", p.decompressed.length);
+		if (written == p.decompressed.length &&
+		    memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+			debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+			score++;
+		} else {
+			debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+				      p.name);
+			debug_message("size %zd vs reference %zu\n",
+				      written, p.decompressed.length);
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%zu correct\n", score, i);
+	assert_int_equal(score, i);
+}
+
+
+static void test_lzxpress_plain_decompress_more_compressed_files(void **state)
+{
+	/*
+	 * This tests the decompression of files that have been compressed on
+	 * Windows with the level turned up (to 1, default for MS-XCA is 0).
+	 *
+	 * The format is identical, but it will have tried harder to find
+	 * matches.
+	 */
+	size_t i;
+	int score = 0;
+	int found = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		uint8_t *dest = NULL;
+		ssize_t written;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzplain", MORE_COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		if (p.compressed.data == NULL) {
+			/*
+			 * We don't have all the vectors in the
+			 * more-compressed directory, which is OK, we skip
+			 * them.
+			 */
+			continue;
+		}
+		found++;
+		dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+		debug_start_timer();
+		written = lzxpress_decompress(p.compressed.data,
+					      p.compressed.length,
+					      dest,
+					      p.decompressed.length);
+		debug_end_timer("decompress", p.decompressed.length);
+		if (written != -1 &&
+		    written == p.decompressed.length &&
+		    memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+			debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+			score++;
+		} else {
+			debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+				      p.name);
+			debug_message("size %zd vs reference %zu\n",
+				      written, p.decompressed.length);
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%d correct\n", score, found);
+	assert_int_equal(score, found);
+}
+
+
+/*
+ * attempt_round_trip() tests whether a data blob can survive a compression
+ * and decompression cycle. If save_name is not NULL and LZX_DEBUG_FILES
+ * evals to true, the various stages are saved in files with that name and the
+ * '-original', '-compressed', and '-decompressed' suffixes. If ref_compressed
+ * has data, it'll print a message saying whether the compressed data matches
+ * that.
+ */
+
+static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
+				  DATA_BLOB original,
+				  const char *save_name,
+				  DATA_BLOB ref_compressed)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+	DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
+						original.length * 8 / 7 + 8);
+	DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
+						  original.length);
+	ssize_t comp_written, decomp_written;
+	debug_start_timer();
+	comp_written = lzxpress_compress(original.data,
+					 original.length,
+					 compressed.data,
+					 compressed.length);
+	debug_end_timer("compress", original.length);
+	if (comp_written <= 0) {
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+
+	if (ref_compressed.data != NULL) {
+		/*
+		 * This is informational, not an assertion; there are
+		 * ~infinite legitimate ways to compress the data, many as
+		 * good as each other (think of compression as a language, not
+		 * a format).
+		 */
+		debug_message("compressed size %zd vs reference %zu\n",
+			      comp_written, ref_compressed.length);
+
+		if (comp_written == compressed.length &&
+		    memcmp(compressed.data, ref_compressed.data, comp_written) == 0) {
+			debug_message("\033[1;32mbyte identical!\033[0m\n");
+		}
+	}
+	debug_start_timer();
+	decomp_written = lzxpress_decompress(compressed.data,
+					     comp_written,
+					     decompressed.data,
+					     decompressed.length);
+	debug_end_timer("decompress", original.length);
+	if (save_name != NULL && LZX_DEBUG_FILES) {
+		char s[300];
+		FILE *fh = NULL;
+
+		snprintf(s, sizeof(s), "%s-original", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+		fh = fopen(s, "w");
+		fwrite(original.data, 1, original.length, fh);
+		fclose(fh);
+
+		snprintf(s, sizeof(s), "%s-compressed", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", comp_written, s);
+		fh = fopen(s, "w");
+		fwrite(compressed.data, 1, comp_written, fh);
+		fclose(fh);
+		/*
+		 * We save the decompressed file using original.length, not
+		 * the returned size. If these differ, the returned size will
+		 * be -1. By saving the whole buffer we can see at what point
+		 * it went haywire.
+		 */
+		snprintf(s, sizeof(s), "%s-decompressed", save_name);
+		fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+		fh = fopen(s, "w");
+		fwrite(decompressed.data, 1, original.length, fh);
+		fclose(fh);
+	}
+
+	if (original.length != decomp_written ||
+	    memcmp(decompressed.data,
+		   original.data,
+		   original.length) != 0) {
+		debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
+			      decomp_written,
+			      original.length);
+		talloc_free(tmp_ctx);
+		return -1;
+	}
+	talloc_free(tmp_ctx);
+	return comp_written;
+}
+
+
+static void test_lzxpress_plain_round_trip_files(void **state)
+{
+	size_t i;
+	int score = 0;
+	ssize_t compressed_total = 0;
+	ssize_t reference_total = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		char *debug_files = NULL;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		ssize_t comp_size;
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+		debug_message("-------------------\n");
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzplain", COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		if (p.compressed.data == NULL) {
+			debug_message(
+				"Could not load %s reference file %s\n",
+				p.name, filename);
+			debug_message("%s decompressed %zu\n", p.name,
+				      p.decompressed.length);
+		} else {
+			debug_message("%s: reference compressed %zu decomp %zu\n",
+				      p.name,
+				      p.compressed.length,
+				      p.decompressed.length);
+		}
+		if (1) {
+			/*
+			 * We're going to save copies in /tmp.
+			 */
+			snprintf(filename, sizeof(filename),
+				 "/tmp/lzxplain-%s", p.name);
+			debug_files = filename;
+		}
+
+		comp_size = attempt_round_trip(mem_ctx, p.decompressed,
+					       debug_files,
+					       p.compressed);
+		if (comp_size > 0) {
+			debug_message("\033[1;32mround trip!\033[0m\n");
+			score++;
+			if (p.compressed.length) {
+				compressed_total += comp_size;
+				reference_total += p.compressed.length;
+			}
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%zu correct\n", score, i);
+	print_message("\033[1;34mtotal compressed size: %zu\033[0m\n",
+		      compressed_total);
+	print_message("total reference size:  %zd \n", reference_total);
+	print_message("diff:                  %7zd \n",
+		      reference_total - compressed_total);
+	assert_true(reference_total != 0);
+	print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
+		      2 + (compressed_total >= reference_total),
+		      ((double)compressed_total) / reference_total);
+	/*
+	 * Assert that the compression is better than Windows. Unlike the
+	 * Huffman variant, where things are very even, here we do much better
+	 * than Windows without especially trying.
+	 */
+	assert_true(compressed_total <= reference_total);
+
+	assert_int_equal(score, i);
+	talloc_free(mem_ctx);
+}
+
+
+/*
+ * Bob Jenkins' Small Fast RNG.
+ *
+ * We don't need it to be this good, but we do need it to be reproduceable
+ * across platforms, which rand() etc aren't.
+ *
+ * http://burtleburtle.net/bob/rand/smallprng.html
+ */
+
+struct jsf_rng {
+	uint32_t a;
+	uint32_t b;
+	uint32_t c;
+	uint32_t d;
+};
+
+#define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
+
+static uint32_t jsf32(struct jsf_rng *x) {
+	uint32_t e = x->a - ROTATE32(x->b, 27);
+	x->a = x->b ^ ROTATE32(x->c, 17);
+	x->b = x->c + x->d;
+	x->c = x->d + e;
+	x->d = e + x->a;
+	return x->d;
+}
+
+static void jsf32_init(struct jsf_rng *x, uint32_t seed) {
+	size_t i;
+	x->a = 0xf1ea5eed;
+	x->b = x->c = x->d = seed;
+	for (i = 0; i < 20; ++i) {
+		jsf32(x);
+	}
+}
+
+
+static void test_lzxpress_plain_long_gpl_round_trip(void **state)
+{
+	/*
+	 * We use a kind of model-free Markov model to generate a massively
+	 * extended pastiche of the GPLv3 (chosen because it is right there in
+	 * "COPYING" and won't change often).
+	 *
+	 * The point is to check a round trip of a very long message with
+	 * multiple repetitions on many scales, without having to add a very
+	 * large file.
+	 */
+	size_t i, j, k;
+	uint8_t c;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+
+	jsf32_init(&rng, 1);
+
+	j = 1;
+	original.data[0] = gpl.data[0];
+	for (i = 1; i < original.length; i++) {
+		size_t m;
+		char p = original.data[i - 1];
+		c = gpl.data[j];
+		original.data[i] = c;
+		j++;
+		m = (j + jsf32(&rng)) % (gpl.length - 50);
+		for (k = m; k < m + 30; k++) {
+			if (p == gpl.data[k] &&
+			    c == gpl.data[k + 1]) {
+				j = k + 2;
+				break;
+			}
+		}
+		if (j == gpl.length) {
+			j = 1;
+		}
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_long_random_graph_round_trip(void **state)
+{
+	size_t i;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	/*
+	 * There's a random trigram graph, with each pair of sequential bytes
+	 * pointing to a successor. This would probably fall into a fairly
+	 * simple loop, but we introduce damage into the system, randomly
+	 * flipping about 1 bit in 64.
+	 *
+	 * The result is semi-structured and compressible.
+	 */
+	uint8_t *d = original.data;
+	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+	uint32_t *table32 = (void*)table;
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	jsf32_init(&rng, 1);
+	for (i = 0; i < (65536 / 4); i++) {
+		table32[i] = jsf32(&rng);
+	}
+
+	d[0] = 'a';
+	d[1] = 'b';
+
+	for (i = 2; i < original.length; i++) {
+		uint16_t k = (d[i - 2] << 8) | d[i - 1];
+		uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
+		damage &= (damage >> 16);
+		k ^= damage & 0xffff;
+		d[i] = table[k];
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-graph", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_chaos_graph_round_trip(void **state)
+{
+	size_t i;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	/*
+	 * There's a random trigram graph, with each pair of sequential bytes
+	 * pointing to a successor. This would probably fall into a fairly
+	 * simple loop, but we keep changing the graph. The result is long
+	 * periods of stability separatd by bursts of noise.
+	 */
+	uint8_t *d = original.data;
+	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+	uint32_t *table32 = (void*)table;
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	jsf32_init(&rng, 1);
+	for (i = 0; i < (65536 / 4); i++) {
+		table32[i] = jsf32(&rng);
+	}
+
+	d[0] = 'a';
+	d[1] = 'b';
+
+	for (i = 2; i < original.length; i++) {
+		uint16_t k = (d[i - 2] << 8) | d[i - 1];
+		uint32_t damage = jsf32(&rng);
+		d[i] = table[k];
+		if ((damage >> 29) == 0) {
+			uint16_t index = damage & 0xffff;
+			uint8_t value = (damage >> 16) & 0xff;
+			table[index] = value;
+		}
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/chaos-graph", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_sparse_random_graph_round_trip(void **state)
+{
+	size_t i;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+	DATA_BLOB ref = {0};
+	/*
+	 * There's a random trigram graph, with each pair of sequential bytes
+	 * pointing to a successor. This will fall into a fairly simple loops,
+	 * but we introduce damage into the system, randomly mangling about 1
+	 * byte in 65536.
+	 *
+	 * The result has very long repetitive runs, which should lead to
+	 * oversized blocks.
+	 */
+	uint8_t *d = original.data;
+	uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+	uint32_t *table32 = (void*)table;
+	ssize_t comp_size;
+	struct jsf_rng rng;
+
+	jsf32_init(&rng, 3);
+	for (i = 0; i < (65536 / 4); i++) {
+		table32[i] = jsf32(&rng);
+	}
+
+	d[0] = 'a';
+	d[1] = 'b';
+
+	for (i = 2; i < original.length; i++) {
+		uint16_t k = (d[i - 2] << 8) | d[i - 1];
+		uint32_t damage = jsf32(&rng);
+		if ((damage & 0xffff0000) == 0) {
+			k ^= damage & 0xffff;
+		}
+		d[i] = table[k];
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/sparse-random-graph", ref);
+	assert_true(comp_size > 0);
+	assert_true(comp_size < original.length);
+
+	talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_random_noise_round_trip(void **state)
+{
+	size_t i;
+	size_t len = 10 * 1024 * 1024;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+	DATA_BLOB ref = {0};
+	ssize_t comp_size;
+	/*
+	 * We are filling this up with incompressible noise, but we can assert
+	 * quite tight bounds on how badly it will fail to compress.
+	 *
+	 * There is one additional bit for each code, which says whether the
+	 * code is a literal byte or a match. If *all* codes are literal
+	 * bytes, the length should be 9/8 the original (with rounding
+	 * issues regarding the indicator bit blocks).
+	 *
+	 * If some matches are found the length will be a bit less. We would
+	 * expect one 3 byte match per 1 << 24 tries, but we try 8192 times
+	 * per position. That means there'll a match 1/2048 of the time at
+	 * best. 255 times out of 256 this will be exactly a 3 byte match,
+	 * encoded as two bytes, so we could get a 1 / 2048 saving on top of
+	 * the 1/8 cost. There'll be a smattering of longer matches too, and
+	 * the potential for complicated maths to account for those, but we'll
+	 * skimp on that by allowing for a 1/1500 saving.
+	 *
+	 * With the hash table, we take a shortcut in the "8192 tries", and
+	 * the size of the table makes a difference in how we perform, with 13
+	 * bits (8192 slots) naturally being luckier than 12. Ultimately,
+	 * either way, the compressed file is still 12.5% bigger than the
+	 * original.
+	 */
+	size_t limit = len * 9 / 8 + 4;
+
+	uint32_t *d32 = (uint32_t*)((void*)original.data);
+	struct jsf_rng rng;
+	jsf32_init(&rng, 2);
+
+	for (i = 0; i < (len / 4); i++) {
+		d32[i] = jsf32(&rng);
+	}
+
+	comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-noise", ref);
+	debug_message("original size %zu; compressed size %zd; ratio %.5f\n",
+		      len, comp_size, ((double)comp_size) / len);
+	debug_message("expected range %zu - %zu\n",
+		      limit - limit / 1500, limit);
+
+	assert_true(comp_size > 0);
+	assert_true(comp_size < limit);
+	assert_true(comp_size >= limit - limit / 1500);
+	talloc_free(mem_ctx);
+}
+
+
+/* Tests based on [MS-XCA] 3.1 Examples */
+static void test_msft_data1(void **state)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+	const char *fixed_data = "abcdefghijklmnopqrstuvwxyz";
+	const uint8_t fixed_out[] = {
+		0x3f, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+		0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+		0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+		0x75, 0x76, 0x77, 0x78, 0x79, 0x7a };
+
+	ssize_t c_size;
+	uint8_t *out, *out2;
+
+	out  = talloc_size(tmp_ctx, 2048);
+	memset(out, 0x42, talloc_get_size(out));
+
+	c_size = lzxpress_compress((const uint8_t *)fixed_data,
+				   strlen(fixed_data),
+				   out,
+				   talloc_get_size(out));
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, sizeof(fixed_out));
+	assert_memory_equal(out, fixed_out, c_size);
+	out2  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(out,
+				     sizeof(fixed_out),
+				     out2,
+				     talloc_get_size(out2));
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out2, fixed_data, c_size);
+
+	talloc_free(tmp_ctx);
+}
+
+
+static void test_msft_data2(void **state)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+	const char *fixed_data =
+		"abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		"abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		"abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		"abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		"abcabcabcabcabcabcabcabc";
+	const uint8_t fixed_out[] = {
+		0xff, 0xff, 0xff, 0x1f, 0x61, 0x62, 0x63, 0x17,
+		0x00, 0x0f, 0xff, 0x26, 0x01};
+
+	ssize_t c_size;
+	uint8_t *out, *out2;
+
+	out  = talloc_size(tmp_ctx, 2048);
+	memset(out, 0x42, talloc_get_size(out));
+
+	c_size = lzxpress_compress((const uint8_t *)fixed_data,
+				   strlen(fixed_data),
+				   out,
+				   talloc_get_size(out));
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, sizeof(fixed_out));
+	assert_memory_equal(out, fixed_out, c_size);
+
+	out2  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(out,
+				     sizeof(fixed_out),
+				     out2,
+				     talloc_get_size(out2));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out2, fixed_data, c_size);
+
+	talloc_free(tmp_ctx);
+}
+
+/*
+  test lzxpress
+ */
+static void test_lzxpress(void **state)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	const char *fixed_data = "this is a test. and this is a test too";
+	const uint8_t fixed_out[] = {
+		0xff, 0x21, 0x00, 0x04, 0x74, 0x68, 0x69, 0x73,
+		0x20, 0x10, 0x00, 0x61, 0x20, 0x74, 0x65, 0x73,
+		0x74, 0x2E, 0x20, 0x61, 0x6E, 0x64, 0x20, 0x9F,
+		0x00, 0x04, 0x20, 0x74, 0x6F, 0x6F };
+
+	const uint8_t fixed_out_old_version[] = {
+		0x00, 0x20, 0x00, 0x04, 0x74, 0x68, 0x69, 0x73,
+		0x20, 0x10, 0x00, 0x61, 0x20, 0x74, 0x65, 0x73,
+		0x74, 0x2E, 0x20, 0x61, 0x6E, 0x64, 0x20, 0x9F,
+		0x00, 0x04, 0x20, 0x74, 0x6F, 0x6F, 0x00, 0x00,
+		0x00, 0x00 };
+
+	ssize_t c_size;
+	uint8_t *out, *out2, *out3;
+
+	out  = talloc_size(tmp_ctx, 2048);
+	memset(out, 0x42, talloc_get_size(out));
+
+	c_size = lzxpress_compress((const uint8_t *)fixed_data,
+				   strlen(fixed_data),
+				   out,
+				   talloc_get_size(out));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, sizeof(fixed_out));
+	assert_memory_equal(out, fixed_out, c_size);
+
+	out2  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(out,
+				     sizeof(fixed_out),
+				     out2,
+				     talloc_get_size(out2));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out2, fixed_data, c_size);
+
+	out3  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(fixed_out_old_version,
+				     sizeof(fixed_out_old_version),
+				     out3,
+				     talloc_get_size(out3));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out3, fixed_data, c_size);
+
+	talloc_free(tmp_ctx);
+}
+
+static void test_lzxpress2(void **state)
+{
+	/*
+	 * Use two matches, separated by a literal, and each with a length
+	 * greater than 10, to test the use of nibble_index. Both length values
+	 * (less ten) should be stored as adjacent nibbles to form the 0x21
+	 * byte.
+	 */
+
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	const char *fixed_data = "aaaaaaaaaaaabaaaaaaaaaaaa";
+	const uint8_t fixed_out[] = {
+		0xff, 0xff, 0xff, 0x5f, 0x61, 0x07, 0x00, 0x21,
+		0x62, 0x67, 0x00};
+
+	ssize_t c_size;
+	uint8_t *out, *out2;
+
+	out  = talloc_size(tmp_ctx, 2048);
+	memset(out, 0x42, talloc_get_size(out));
+
+	c_size = lzxpress_compress((const uint8_t *)fixed_data,
+				   strlen(fixed_data),
+				   out,
+				   talloc_get_size(out));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, sizeof(fixed_out));
+	assert_memory_equal(out, fixed_out, c_size);
+
+	out2  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(out,
+				     sizeof(fixed_out),
+				     out2,
+				     talloc_get_size(out2));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out2, fixed_data, c_size);
+
+	talloc_free(tmp_ctx);
+}
+
+static void test_lzxpress3(void **state)
+{
+	/*
+	 * Use a series of 31 literals, followed by a single minimum-length
+	 * match (and a terminating literal), to test setting indic_pos when the
+	 * 32-bit flags value overflows after a match.
+	 */
+
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	const char *fixed_data = "abcdefghijklmnopqrstuvwxyz01234abca";
+	const uint8_t fixed_out[] = {
+		0x01, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+		0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+		0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+		0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31,
+		0x32, 0x33, 0x34, 0xf0, 0x00, 0xff, 0xff, 0xff,
+		0x7f, 0x61};
+
+	ssize_t c_size;
+	uint8_t *out, *out2;
+
+	out  = talloc_size(tmp_ctx, 2048);
+	memset(out, 0x42, talloc_get_size(out));
+
+	c_size = lzxpress_compress((const uint8_t *)fixed_data,
+				   strlen(fixed_data),
+				   out,
+				   talloc_get_size(out));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, sizeof(fixed_out));
+	assert_memory_equal(out, fixed_out, c_size);
+
+	out2  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(out,
+				     sizeof(fixed_out),
+				     out2,
+				     talloc_get_size(out2));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out2, fixed_data, c_size);
+
+	talloc_free(tmp_ctx);
+}
+
+static void test_lzxpress4(void **state)
+{
+	/*
+	 * Use a series of 31 literals, followed by a single minimum-length
+	 * match, to test that the final set of 32-bit flags is written
+	 * correctly when it is empty.
+	 */
+
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	const char *fixed_data = "abcdefghijklmnopqrstuvwxyz01234abc";
+	const uint8_t fixed_out[] = {
+		0x01, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+		0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+		0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+		0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31,
+		0x32, 0x33, 0x34, 0xf0, 0x00, 0xff, 0xff, 0xff,
+		0xff};
+
+	ssize_t c_size;
+	uint8_t *out, *out2;
+
+	out  = talloc_size(tmp_ctx, 2048);
+	memset(out, 0x42, talloc_get_size(out));
+
+	c_size = lzxpress_compress((const uint8_t *)fixed_data,
+				   strlen(fixed_data),
+				   out,
+				   talloc_get_size(out));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, sizeof(fixed_out));
+	assert_memory_equal(out, fixed_out, c_size);
+
+	out2  = talloc_size(tmp_ctx, strlen(fixed_data));
+	c_size = lzxpress_decompress(out,
+				     sizeof(fixed_out),
+				     out2,
+				     talloc_get_size(out2));
+
+	assert_int_not_equal(c_size, -1);
+	assert_int_equal(c_size, strlen(fixed_data));
+	assert_memory_equal(out2, fixed_data, c_size);
+
+	talloc_free(tmp_ctx);
+}
+
+
+static void test_lzxpress_many_zeros(void **state)
+{
+	/*
+	 * Repeated values (zero is convenient but not special) will lead to
+	 * very long substring searches in compression, which can be very slow
+	 * if we're not careful.
+	 *
+	 * This test makes a very loose assertion about how long it should
+	 * take to compress a million zeros.
+	 *
+	 * Wall clock time *should* be < 0.1 seconds with the fix and around a
+	 * minute without it. We try for CLOCK_THREAD_CPUTIME_ID which should
+	 * filter out some noise on the machine, and set the threshold at 5
+	 * seconds.
+	 */
+
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	const size_t N_ZEROS = 1000000;
+	const uint8_t *zeros = talloc_zero_size(tmp_ctx, N_ZEROS);
+	const ssize_t expected_c_size_max = 120;
+	const ssize_t expected_c_size_min = 93;
+	ssize_t c_size;
+	uint8_t *comp, *decomp;
+	static struct timespec t_start, t_end;
+	uint64_t elapsed_ns;
+
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_start) != 0) {
+		if (clock_gettime(CUSTOM_CLOCK_MONOTONIC, &t_start) != 0) {
+			clock_gettime(CLOCK_REALTIME, &t_start);
+		}
+	}
+
+	comp = talloc_zero_size(tmp_ctx, 2048);
+
+	c_size = lzxpress_compress(zeros,
+				   N_ZEROS,
+				   comp,
+				   talloc_get_size(comp));
+	/*
+	 * Because our compression depends on heuristics, we don't insist on
+	 * an exact size in this case.
+	 */
+
+	assert_true(c_size <= expected_c_size_max);
+	assert_true(c_size >= expected_c_size_min);
+
+	decomp = talloc_size(tmp_ctx, N_ZEROS * 2);
+	c_size = lzxpress_decompress(comp,
+				     c_size,
+				     decomp,
+				     N_ZEROS * 2);
+
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end) != 0) {
+		if (clock_gettime(CUSTOM_CLOCK_MONOTONIC, &t_end) != 0) {
+			clock_gettime(CLOCK_REALTIME, &t_end);
+		}
+	}
+	elapsed_ns = (
+		(t_end.tv_sec - t_start.tv_sec) * 1000U * 1000U * 1000U) +
+		(t_end.tv_nsec - t_start.tv_nsec);
+	print_message("round-trip time: %"PRIu64" ns\n", elapsed_ns);
+	assert_true(elapsed_ns < 3 * 1000U * 1000U * 1000U);
+	assert_memory_equal(decomp, zeros, N_ZEROS);
+
+	talloc_free(tmp_ctx);
+}
+
+
+static void test_lzxpress_round_trip(void **state)
+{
+	/*
+	 * Examples found using via fuzzing.
+	 */
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	size_t i;
+	struct b64_pair {
+		const char *uncompressed;
+		const char *compressed;
+	} pairs[] = {
+		{   /* this results in a trailing flags block */
+			"AAICAmq/EKdP785YU2Ddh7d4vUtdlQyLeHV09LHpUBw=",
+			"AAAAAAACAgJqvxCnT+/OWFNg3Ye3eL1LXZUMi3h1dPSx6VAc/////w==",
+		},
+		{    /* empty string compresses to empty string */
+			"",  ""
+		},
+	};
+	const size_t alloc_size = 1000;
+	uint8_t *data = talloc_array(tmp_ctx, uint8_t, alloc_size);
+
+	for (i = 0; i < ARRAY_SIZE(pairs); i++) {
+		ssize_t len;
+		DATA_BLOB uncomp = base64_decode_data_blob_talloc(
+			tmp_ctx,
+			pairs[i].uncompressed);
+		DATA_BLOB comp = base64_decode_data_blob_talloc(
+			tmp_ctx,
+			pairs[i].compressed);
+
+		len = lzxpress_compress(uncomp.data,
+					uncomp.length,
+					data,
+					alloc_size);
+
+		assert_int_not_equal(len, -1);
+		assert_int_equal(len, comp.length);
+
+		assert_memory_equal(comp.data, data, len);
+
+		len = lzxpress_decompress(comp.data,
+					  comp.length,
+					  data,
+					  alloc_size);
+
+		assert_int_not_equal(len, -1);
+		assert_int_equal(len, uncomp.length);
+
+		assert_memory_equal(uncomp.data, data, len);
+	}
+	talloc_free(tmp_ctx);
+}
+
+
+int main(void)
+{
+	const struct CMUnitTest tests[] = {
+		cmocka_unit_test(test_lzxpress_plain_decompress_files),
+		cmocka_unit_test(test_lzxpress_plain_decompress_more_compressed_files),
+		cmocka_unit_test(test_lzxpress_plain_round_trip_files),
+		cmocka_unit_test(test_lzxpress_plain_long_gpl_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_long_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_chaos_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_sparse_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_long_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_random_noise_round_trip),
+		cmocka_unit_test(test_lzxpress),
+		cmocka_unit_test(test_msft_data1),
+		cmocka_unit_test(test_msft_data2),
+		cmocka_unit_test(test_lzxpress2),
+		cmocka_unit_test(test_lzxpress3),
+		cmocka_unit_test(test_lzxpress4),
+		cmocka_unit_test(test_lzxpress_many_zeros),
+		cmocka_unit_test(test_lzxpress_round_trip),
+	};
+	if (!isatty(1)) {
+		cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+	}
+	return cmocka_run_group_tests(tests, NULL, NULL);
+}