summaryrefslogtreecommitdiffstats
path: root/lib/compression
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 17:20:00 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 17:20:00 +0000
commit8daa83a594a2e98f39d764422bfbdbc62c9efd44 (patch)
tree4099e8021376c7d8c05bdf8503093d80e9c7bad0 /lib/compression
parentInitial commit. (diff)
downloadsamba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.tar.xz
samba-8daa83a594a2e98f39d764422bfbdbc62c9efd44.zip
Adding upstream version 2:4.20.0+dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/compression')
-rw-r--r--lib/compression/lzxpress.c507
-rw-r--r--lib/compression/lzxpress.h50
-rw-r--r--lib/compression/lzxpress_huffman.c2045
-rw-r--r--lib/compression/lzxpress_huffman.h95
-rw-r--r--lib/compression/pycompression.c304
-rw-r--r--lib/compression/tests/scripts/README19
-rwxr-xr-xlib/compression/tests/scripts/decode-huffman-header54
-rw-r--r--lib/compression/tests/scripts/generate-windows-test-vectors.c206
-rwxr-xr-xlib/compression/tests/scripts/make-fuzz-examples45
-rwxr-xr-xlib/compression/tests/scripts/make-test-vectors185
-rwxr-xr-xlib/compression/tests/scripts/three-byte-hash49
-rw-r--r--lib/compression/tests/test_lzx_huffman.c1255
-rw-r--r--lib/compression/tests/test_lzxpress_plain.c1194
-rw-r--r--lib/compression/wscript_build25
14 files changed, 6033 insertions, 0 deletions
diff --git a/lib/compression/lzxpress.c b/lib/compression/lzxpress.c
new file mode 100644
index 0000000..5e5e5ba
--- /dev/null
+++ b/lib/compression/lzxpress.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (C) Matthieu Suiche 2008
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "replace.h"
+#include "lzxpress.h"
+#include "../lib/util/byteorder.h"
+
+
+#define __CHECK_BYTES(__size, __index, __needed) do { \
+ if (unlikely(__index >= __size)) { \
+ return -1; \
+ } else { \
+ uint32_t __avail = __size - __index; \
+ if (unlikely(__needed > __avail)) { \
+ return -1; \
+ } \
+ } \
+} while(0)
+
+
+/*
+ * LZX_PLAIN_COMP_HASH_BITS determines how big the hash table for finding
+ * matches will be.
+ *
+ * The window in which we look for matches is 8192 bytes. That means with
+ * random data a value of 13 is getting close to no collisions, while a 12
+ * will miss about half the possible matches. With compressible data there
+ * will generally be fewer and less diverse entries, so collisions are rarer.
+ *
+ * In the testsuite, both 12 and 13 give better compression than Windows, but
+ * 12 is faster. 11 does not save time and costs accuracy. Thus we prefer 12.
+ */
+#define LZX_PLAIN_COMP_HASH_BITS 12
+/*
+ * LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the
+ * circular hash table for a match, before we give up. A bigger number will
+ * generally lead to better but slower compression, but a stupidly big number
+ * will just be worse.
+ */
+#define LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS 5
+#define HASH_MASK ((1 << LZX_PLAIN_COMP_HASH_BITS) - 1)
+
+static inline uint16_t three_byte_hash(const uint8_t *bytes)
+{
+ uint16_t a = bytes[0];
+ uint16_t b = bytes[1] ^ 0x2e;
+ uint16_t c = bytes[2] ^ 0x55;
+ uint16_t ca = c - a;
+ uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a);
+ return d & HASH_MASK;
+}
+
+
/*
 * store_match() records the offset at which a three-byte hash was seen, so
 * that later occurrences can find it through lookup_match(). It probes up
 * to LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS adjacent slots for a free one
 * (empty slots hold 0xffffffff, which always compares >= offset); if all
 * are taken, the entry furthest behind the current offset -- the least
 * useful as a future match -- is evicted.
 */
static inline void store_match(uint32_t *hash_table,
			       uint16_t h,
			       uint32_t offset)
{
	int i;
	uint32_t o = hash_table[h];
	uint16_t h2;
	uint16_t worst_h;
	int worst_score;

	if (o >= offset) {
		/* there is nothing there yet */
		hash_table[h] = offset;
		return;
	}
	for (i = 1; i < LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS; i++) {
		h2 = (h + i) & HASH_MASK;
		if (hash_table[h2] >= offset) {
			hash_table[h2] = offset;
			return;
		}
	}
	/*
	 * There are no slots, but we really want to store this, so we'll kick
	 * out the one with the longest distance.
	 */
	worst_h = h;
	worst_score = offset - o;
	for (i = 1; i < LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS; i++) {
		int score;
		h2 = (h + i) & HASH_MASK;
		o = hash_table[h2];
		score = offset - o;
		if (score > worst_score) {
			worst_score = score;
			worst_h = h2;
		}
	}
	hash_table[worst_h] = offset;
}
+
+
/*
 * A match candidate: `there` points at the earlier occurrence of the data
 * in the uncompressed buffer (NULL when no match was found) and `length`
 * is the number of matching bytes.
 */
struct match {
	const uint8_t *there;
	uint32_t length;
};
+
+
/*
 * lookup_match() probes up to LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS hash
 * slots for earlier occurrences of the bytes at data + offset, and returns
 * the longest match of more than 2 bytes found within the 8192-byte
 * window. best.there is NULL when no usable match exists.
 */
static inline struct match lookup_match(uint32_t *hash_table,
					uint16_t h,
					const uint8_t *data,
					uint32_t offset,
					size_t max_len)
{
	int i;
	uint32_t o;
	uint16_t h2;
	size_t len;
	const uint8_t *there = NULL;
	const uint8_t *here = data + offset;
	struct match best = {0};

	for (i = 0; i < LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS; i++) {
		h2 = (h + i) & HASH_MASK;
		o = hash_table[h2];
		if (o >= offset) {
			/*
			 * Either this is 0xffffffff, or something is really
			 * wrong.
			 *
			 * In setting this, we would never have stepped over
			 * an 0xffffffff, so we won't now.
			 */
			break;
		}
		if (offset - o > 8192) {
			/* Too far away to use */
			continue;
		}
		there = data + o;
		/*
		 * When we already have a long match, we can try to avoid
		 * measuring out another long, but shorter match.
		 */
		if (best.length > 1000 &&
		    there[best.length - 1] != best.there[best.length - 1]) {
			continue;
		}

		for (len = 0;
		     len < max_len && here[len] == there[len];
		     len++) {
			/* counting */
		}
		if (len > 2) {
			if (len > best.length) {
				best.length = len;
				best.there = there;
			}
		}
	}
	return best;
}
+
/*
 * Rolling state for the compressed output stream.
 */
struct write_context {
	uint8_t *compressed;		/* the output buffer */
	uint32_t compressed_pos;	/* next write position */
	uint32_t max_compressed_size;	/* capacity of the output buffer */
	uint32_t indic;			/* pending literal/match indicator bits */
	uint32_t indic_bit;		/* how many bits are held in indic */
	uint32_t indic_pos;		/* reserved slot for the next indicator word */
	uint32_t nibble_index;		/* position of a half-used length byte, or 0 */
};
+
+
+#define CHECK_INPUT_BYTES(__needed) \
+ __CHECK_BYTES(uncompressed_size, uncompressed_pos, __needed)
+#define CHECK_OUTPUT_BYTES(__needed) \
+ __CHECK_BYTES(wc->max_compressed_size, wc->compressed_pos, __needed)
+
+
/*
 * Append one literal/match indicator bit. Once 32 bits have accumulated
 * they are flushed into the uint32 slot previously reserved at indic_pos,
 * and a fresh slot is reserved at the current output position.
 *
 * Returns the (possibly new) indicator position, or -1 if there is no
 * room to reserve the next slot.
 */
static inline ssize_t push_indicator_bit(struct write_context *wc, uint32_t bit)
{
	wc->indic = (wc->indic << 1) | bit;
	wc->indic_bit += 1;

	if (wc->indic_bit == 32) {
		/*
		 * The write at indic_pos goes into a slot that was bounds
		 * checked when it was reserved; the check below is for the
		 * *next* slot.
		 */
		PUSH_LE_U32(wc->compressed, wc->indic_pos, wc->indic);
		wc->indic_bit = 0;
		CHECK_OUTPUT_BYTES(sizeof(uint32_t));
		wc->indic_pos = wc->compressed_pos;
		wc->compressed_pos += sizeof(uint32_t);
	}
	return wc->indic_pos;
}
+
+
/*
 * Write one match in the [MS-XCA] 2.3 variable-length form: always a
 * 16-bit (offset << 3 | len) word, extended by a shared nibble, then a
 * byte, then a 16- or 32-bit field as the length grows. match.length must
 * be at least 3, and the distance at most 8192.
 *
 * Returns the indicator position (>= 0) on success, or -1 if the match is
 * too distant or the output buffer is full.
 */
static ssize_t encode_match(struct write_context *wc,
			    struct match match,
			    const uint8_t *here)
{
	uint32_t match_len = match.length - 3;
	uint32_t best_offset = here - match.there - 1;
	uint16_t metadata;

	if (best_offset > 8191) {
		return -1;
	}

	CHECK_OUTPUT_BYTES(sizeof(uint16_t));
	metadata = (uint16_t)((best_offset << 3) | MIN(match_len, 7));
	PUSH_LE_U16(wc->compressed, wc->compressed_pos, metadata);
	wc->compressed_pos += sizeof(uint16_t);

	if (match_len >= 7) {
		match_len -= 7;

		if (wc->nibble_index == 0) {
			/* start a new length byte; its high nibble is left
			 * for a future match to fill in */
			wc->nibble_index = wc->compressed_pos;

			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			wc->compressed[wc->nibble_index] = MIN(match_len, 15);
			wc->compressed_pos += sizeof(uint8_t);
		} else {
			/* reuse the high nibble of an earlier length byte */
			wc->compressed[wc->nibble_index] |= MIN(match_len, 15) << 4;
			wc->nibble_index = 0;
		}

		if (match_len >= 15) {
			match_len -= 15;

			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			wc->compressed[wc->compressed_pos] = MIN(match_len, 255);
			wc->compressed_pos += sizeof(uint8_t);

			if (match_len >= 255) {
				/* Additional match_len */

				match_len += 7 + 15;

				if (match_len < (1 << 16)) {
					CHECK_OUTPUT_BYTES(sizeof(uint16_t));
					PUSH_LE_U16(wc->compressed, wc->compressed_pos,
						    match_len);
					wc->compressed_pos += sizeof(uint16_t);
				} else {
					/* a zero 16-bit field flags that the
					 * real length follows as 32 bits */
					CHECK_OUTPUT_BYTES(sizeof(uint16_t) +
							   sizeof(uint32_t));
					PUSH_LE_U16(wc->compressed,
						    wc->compressed_pos, 0);
					wc->compressed_pos += sizeof(uint16_t);

					PUSH_LE_U32(wc->compressed,
						    wc->compressed_pos,
						    match_len);
					wc->compressed_pos += sizeof(uint32_t);
				}
			}
		}
	}
	return push_indicator_bit(wc, 1);
}
+
+#undef CHECK_OUTPUT_BYTES
+#define CHECK_OUTPUT_BYTES(__needed) \
+ __CHECK_BYTES(wc.max_compressed_size, wc.compressed_pos, __needed)
+
+
/*
 * lzxpress_compress() compresses `uncompressed` into `compressed` and
 * returns the number of bytes written, 0 for empty input, or -1 if the
 * output buffer is too small.
 */
ssize_t lzxpress_compress(const uint8_t *uncompressed,
			  uint32_t uncompressed_size,
			  uint8_t *compressed,
			  uint32_t max_compressed_size)
{
	/*
	 * This is the algorithm in [MS-XCA] 2.3 "Plain LZ77 Compression".
	 *
	 * It avoids Huffman encoding by including literal bytes inline when a
	 * match is not found. Every so often it includes a uint32 bit map
	 * flagging which positions contain matches and which contain
	 * literals. The encoding of matches is of variable size, depending on
	 * the match length; they are always at least 16 bits long, and can
	 * implicitly use unused half-bytes from earlier in the stream.
	 */
	ssize_t ret;
	uint32_t uncompressed_pos;
	struct write_context wc = {
		.indic = 0,
		.indic_pos = 0,
		.indic_bit = 0,
		.nibble_index = 0,
		.compressed = compressed,
		.compressed_pos = 0,
		.max_compressed_size = max_compressed_size
	};
	/* 0xff fill makes every slot read as the 0xffffffff "empty" marker */
	uint32_t hash_table[1 << LZX_PLAIN_COMP_HASH_BITS];
	memset(hash_table, 0xff, sizeof(hash_table));

	if (!uncompressed_size) {
		return 0;
	}

	uncompressed_pos = 0;
	/* reserve the first indicator-word slot */
	CHECK_OUTPUT_BYTES(sizeof(uint32_t));
	PUSH_LE_U32(wc.compressed, wc.compressed_pos, 0);
	wc.compressed_pos += sizeof(uint32_t);

	while ((uncompressed_pos < uncompressed_size) &&
	       (wc.compressed_pos < wc.max_compressed_size)) {

		/* maximum len we can encode into metadata */
		const uint32_t max_len = MIN(0xFFFF + 3,
					     uncompressed_size - uncompressed_pos);
		const uint8_t *here = uncompressed + uncompressed_pos;
		uint16_t h;
		struct match match = {0};

		if (max_len >= 3) {
			h = three_byte_hash(here);
			match = lookup_match(hash_table,
					     h,
					     uncompressed,
					     uncompressed_pos,
					     max_len);

			store_match(hash_table, h, uncompressed_pos);
		} else {
			match.there = NULL;
			match.length = 0;
		}

		if (match.there == NULL) {
			/*
			 * This is going to be a literal byte, which we flag
			 * by setting a bit in an indicator field somewhere
			 * earlier in the stream.
			 */
			CHECK_INPUT_BYTES(sizeof(uint8_t));
			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			wc.compressed[wc.compressed_pos++] = *here;
			uncompressed_pos++;

			ret = push_indicator_bit(&wc, 0);
			if (ret < 0) {
				return ret;
			}
		} else {
			ret = encode_match(&wc, match, here);
			if (ret < 0) {
				return ret;
			}
			uncompressed_pos += match.length;
		}
	}

	/* flush the final, possibly partial, indicator word padded with 1s */
	if (wc.indic_bit != 0) {
		wc.indic <<= 32 - wc.indic_bit;
	}
	wc.indic |= UINT32_MAX >> wc.indic_bit;
	PUSH_LE_U32(wc.compressed, wc.indic_pos, wc.indic);

	return wc.compressed_pos;
}
+
/*
 * lzxpress_decompress() expands `input` into `output` and returns the
 * number of bytes produced, or -1 on malformed input or insufficient
 * output space. All reads and writes are bounds checked via the
 * CHECK_*_BYTES macros, which return -1 on overrun.
 */
ssize_t lzxpress_decompress(const uint8_t *input,
			    uint32_t input_size,
			    uint8_t *output,
			    uint32_t max_output_size)
{
	/*
	 * This is the algorithm in [MS-XCA] 2.4 "Plain LZ77 Decompression
	 * Algorithm Details".
	 */
	uint32_t output_index, input_index;
	uint32_t indicator, indicator_bit;
	uint32_t nibble_index;

	if (input_size == 0) {
		return 0;
	}

	output_index = 0;
	input_index = 0;
	indicator = 0;
	indicator_bit = 0;
	nibble_index = 0;

#undef CHECK_INPUT_BYTES
#define CHECK_INPUT_BYTES(__needed) \
	__CHECK_BYTES(input_size, input_index, __needed)
#undef CHECK_OUTPUT_BYTES
#define CHECK_OUTPUT_BYTES(__needed) \
	__CHECK_BYTES(max_output_size, output_index, __needed)

	do {
		if (indicator_bit == 0) {
			CHECK_INPUT_BYTES(sizeof(uint32_t));
			indicator = PULL_LE_U32(input, input_index);
			input_index += sizeof(uint32_t);
			if (input_index == input_size) {
				/*
				 * The compressor left room for indicator
				 * flags for data that doesn't exist.
				 */
				break;
			}
			indicator_bit = 32;
		}
		indicator_bit--;

		/*
		 * check whether the bit specified by indicator_bit is set or not
		 * set in indicator. For example, if indicator_bit has value 4
		 * check whether the 4th bit of the value in indicator is set
		 */
		if (((indicator >> indicator_bit) & 1) == 0) {
			/* a literal byte, copied through unchanged */
			CHECK_INPUT_BYTES(sizeof(uint8_t));
			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			output[output_index] = input[input_index];
			input_index += sizeof(uint8_t);
			output_index += sizeof(uint8_t);
		} else {
			uint32_t length;
			uint32_t offset;

			CHECK_INPUT_BYTES(sizeof(uint16_t));
			length = PULL_LE_U16(input, input_index);
			input_index += sizeof(uint16_t);
			offset = (length >> 3) + 1;
			length &= 7;

			if (length == 7) {
				/* the length continues in a shared nibble */
				if (nibble_index == 0) {
					CHECK_INPUT_BYTES(sizeof(uint8_t));
					nibble_index = input_index;
					length = input[input_index] & 0xf;
					input_index += sizeof(uint8_t);
				} else {
					length = input[nibble_index] >> 4;
					nibble_index = 0;
				}

				if (length == 15) {
					CHECK_INPUT_BYTES(sizeof(uint8_t));
					length = input[input_index];
					input_index += sizeof(uint8_t);
					if (length == 255) {
						CHECK_INPUT_BYTES(sizeof(uint16_t));
						length = PULL_LE_U16(input, input_index);
						input_index += sizeof(uint16_t);
						if (length == 0) {
							CHECK_INPUT_BYTES(sizeof(uint32_t));
							length = PULL_LE_U32(input, input_index);
							input_index += sizeof(uint32_t);
						}

						/* the long forms must encode at
						 * least what the short forms
						 * could have */
						if (length < (15 + 7)) {
							return -1;
						}
						length -= (15 + 7);
					}
					length += 15;
				}
				length += 7;
			}
			length += 3;

			if (length == 0) {
				/* 32-bit wrap-around of a huge length */
				return -1;
			}

			for (; length > 0; --length) {
				/* a match may not reach before the start */
				if (offset > output_index) {
					return -1;
				}
				CHECK_OUTPUT_BYTES(sizeof(uint8_t));
				output[output_index] = output[output_index - offset];
				output_index += sizeof(uint8_t);
			}
		}
	} while ((output_index < max_output_size) && (input_index < (input_size)));

	return output_index;
}
diff --git a/lib/compression/lzxpress.h b/lib/compression/lzxpress.h
new file mode 100644
index 0000000..df0ee59
--- /dev/null
+++ b/lib/compression/lzxpress.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) Matthieu Suiche 2008
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _LZXPRESS_H
+#define _LZXPRESS_H
+
+#define XPRESS_BLOCK_SIZE 0x10000
+
+ssize_t lzxpress_compress(const uint8_t *uncompressed,
+ uint32_t uncompressed_size,
+ uint8_t *compressed,
+ uint32_t max_compressed_size);
+
+ssize_t lzxpress_decompress(const uint8_t *input,
+ uint32_t input_size,
+ uint8_t *output,
+ uint32_t max_output_size);
+
+#endif /* _LZXPRESS_H */
diff --git a/lib/compression/lzxpress_huffman.c b/lib/compression/lzxpress_huffman.c
new file mode 100644
index 0000000..e14419c
--- /dev/null
+++ b/lib/compression/lzxpress_huffman.c
@@ -0,0 +1,2045 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ * and Joseph Sutton <josephsutton@catalyst.net.nz>
+ *
+ * ** NOTE! The following LGPL license applies to this file.
+ * ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/debug.h"
+#include "lib/util/byteorder.h"
+#include "lib/util/bytearray.h"
+
+/*
+ * DEBUG_NO_LZ77_MATCHES toggles the encoding of matches as matches. If it is
+ * false the potential match is written as a series of literals, which is a
+ * valid but usually inefficient encoding. This is useful for isolating a
+ * problem to either the LZ77 or the Huffman stage.
+ */
+#ifndef DEBUG_NO_LZ77_MATCHES
+#define DEBUG_NO_LZ77_MATCHES false
+#endif
+
+/*
+ * DEBUG_HUFFMAN_TREE forces the drawing of ascii art huffman trees during
+ * compression and decompression.
+ *
+ * These trees will also be drawn at DEBUG level 10, but that doesn't work
+ * with cmocka tests.
+ */
+#ifndef DEBUG_HUFFMAN_TREE
+#define DEBUG_HUFFMAN_TREE false
+#endif
+
+#if DEBUG_HUFFMAN_TREE
+#define DBG(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DBG(...) DBG_INFO(__VA_ARGS__)
+#endif
+
+
+#define LZXPRESS_ERROR -1LL
+
+/*
+ * We won't encode a match length longer than MAX_MATCH_LENGTH.
+ *
+ * Reports are that Windows has a limit at 64M.
+ */
+#define MAX_MATCH_LENGTH (64 * 1024 * 1024)
+
+
/*
 * Cursor state for a Huffman-coded bitstream. NOTE(review): the code
 * that consumes this struct is outside this hunk; `bits`/`remaining_bits`
 * appear to form a bit accumulator and `table` the symbol decoding
 * table -- confirm against the decompression routines.
 */
struct bitstream {
	const uint8_t *bytes;		/* the encoded input */
	size_t byte_pos;		/* next byte to read */
	size_t byte_size;		/* total input length */
	uint32_t bits;
	int remaining_bits;
	uint16_t *table;
};
+
+
+#if ! defined __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+/*
+ * bitlen_nonzero_16() returns the bit number of the most significant bit, or
+ * put another way, the integer log base 2. Log(0) is undefined; the argument
+ * has to be non-zero!
+ * 1 -> 0
+ * 2,3 -> 1
+ * 4-7 -> 2
+ * 1024 -> 10, etc
+ *
+ * Probably this is handled by a compiler intrinsic function that maps to a
+ * dedicated machine instruction.
+ */
+
/*
 * Integer log base 2 of a non-zero 16-bit value: the index of the highest
 * set bit (1 -> 0; 2,3 -> 1; 1024 -> 10). The argument must not be zero.
 */
static inline int bitlen_nonzero_16(uint16_t x)
{
#if __has_builtin(__builtin_clz)

	/* bits in an unsigned int, minus one, minus the leading zeros */
	return (int)(sizeof(unsigned int) * CHAR_BIT) - 1 -
	       __builtin_clz((unsigned int)x);

#else

	int bit = -1;
	for (; x != 0; x >>= 1) {
		bit++;
	}
	return bit;

#endif
}
+
+
/*
 * Progress state for the LZ77+Huffman compressor: the input being
 * consumed and the output being produced. NOTE(review): prev_block_pos
 * presumably marks the start of the previous block -- the code using it
 * is outside this hunk; confirm there.
 */
struct lzxhuff_compressor_context {
	const uint8_t *input_bytes;	/* the uncompressed data */
	size_t input_size;
	size_t input_pos;		/* how far we have consumed */
	size_t prev_block_pos;
	uint8_t *output;
	size_t available_size;		/* capacity of output */
	size_t output_pos;
};
+
/*
 * Sort comparator: order nodes by ascending symbol count.
 * (Assumes counts are far below INT_MAX so the subtraction cannot
 * overflow -- TODO confirm against the count field's bounds.)
 */
static int compare_huffman_node_count(struct huffman_node *a,
				      struct huffman_node *b)
{
	return a->count - b->count;
}
+
/*
 * Sort comparator for the canonical code ordering: shallower nodes
 * first, with ties broken by ascending symbol value.
 */
static int compare_huffman_node_depth(struct huffman_node *a,
				      struct huffman_node *b)
{
	int c = a->depth - b->depth;
	if (c != 0) {
		return c;
	}
	return (int)a->symbol - (int)b->symbol;
}
+
+
+#define HASH_MASK ((1 << LZX_HUFF_COMP_HASH_BITS) - 1)
+
/*
 * Hash the next three input bytes to an LZX_HUFF_COMP_HASH_BITS-wide
 * slot in the match candidate table.
 */
static inline uint16_t three_byte_hash(const uint8_t *bytes)
{
	/*
	 * MS-XCA says "three byte hash", but does not specify it.
	 *
	 * This one is just cobbled together, but has quite good distribution
	 * in the 12-14 bit forms, which is what we care about most.
	 * e.g: 13 bit: median 2048, min 2022, max 2074, stddev 6.0
	 */
	uint16_t a = bytes[0];
	uint16_t b = bytes[1] ^ 0x2e;
	uint16_t c = bytes[2] ^ 0x55;
	uint16_t ca = c - a;
	uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a);
	return d & HASH_MASK;
}
+
+
/*
 * Pack a match into its 9-bit symbol: bit 8 set to mark a match, the low
 * four bits the capped length (len - 3, at most 15), and bits 4-7 the
 * bit-length of the offset. Requires len >= 3 and offset >= 1 (a zero
 * offset would hit bitlen_nonzero_16()'s undefined case).
 */
static inline uint16_t encode_match(size_t len, size_t offset)
{
	uint16_t code = 256;
	code |= MIN(len - 3, 15);
	code |= bitlen_nonzero_16(offset) << 4;
	return code;
}
+
+/*
+ * debug_huffman_tree() uses debug_huffman_tree_print() to draw the Huffman
+ * tree in ascii art.
+ *
+ * Note that the Huffman tree is probably not the same as that implied by the
+ * canonical Huffman encoding that is finally used. That tree would be the
+ * same shape, but with the left and right toggled to sort the branches by
+ * length, after which the symbols for each length sorted by value.
+ */
+
/*
 * Recursively draw one node of the Huffman tree. trail[] holds, for each
 * level above us, either the node count (still on the left branch) or -1
 * (we have taken the right branch), from which the connector art and the
 * 0/1 code digits are derived.
 */
static void debug_huffman_tree_print(struct huffman_node *node,
				     int *trail, int depth)
{
	if (node->left == NULL) {
		/* time to print a row */
		int j;
		bool branched = false;
		int row[17];
		char c[100];
		int s = node->symbol;
		char code[17];
		if (depth > 15) {
			fprintf(stderr,
				" \033[1;31m Max depth exceeded! (%d)\033[0m "
				" symbol %#3x claimed depth %d count %d\n",
				depth, node->symbol, node->depth, node->count);
			return;
		}
		/* classify each level: counts before the branch point,
		 * continuation lines after it */
		for (j = depth - 1; j >= 0; j--) {
			if (branched) {
				if (trail[j] == -1) {
					row[j] = -3;
				} else {
					row[j] = -2;
				}
			} else if (trail[j] == -1) {
				row[j] = -1;
				branched = true;
			} else {
				row[j] = trail[j];
			}
		}
		for (j = 0; j < depth; j++) {
			switch (row[j]) {
			case -3:
				code[j] = '1';
				fprintf(stderr, "        ");
				break;
			case -2:
				code[j] = '0';
				fprintf(stderr, "      │ ");
				break;
			case -1:
				code[j] = '1';
				fprintf(stderr, "      ╰─");
				break;
			default:
				code[j] = '0';
				fprintf(stderr, "%5d─┬─", row[j]);
				break;
			}
		}
		code[depth] = 0;
		if (s < 32) {
			snprintf(c, sizeof(c),
				 "\033[1;32m%02x\033[0m \033[1;33m%c%c%c\033[0m",
				 s,
				 0xE2, 0x90, 0x80 + s); /* utf-8 for symbol */
		} else if (s < 127) {
			snprintf(c, sizeof(c),
				 "\033[1;32m%2x\033[0m '\033[10;32m%c\033[0m'",
				 s, s);
		} else if (s < 256) {
			snprintf(c, sizeof(c), "\033[1;32m%2x\033[0m", s);
		} else {
			/* a match symbol: decode its length and distance bits */
			uint16_t len = (s & 15) + 3;
			uint16_t dbits = ((s >> 4) & 15) + 1;
			snprintf(c, sizeof(c),
				 " \033[0;33mlen:%2d%s, "
				 "dist:%d-%d \033[0m \033[1;32m%3x\033[0m%s",
				 len,
				 len == 18 ? "+" : "",
				 1 << (dbits - 1),
				 (1 << dbits) - 1,
				 s,
				 s == 256 ? " \033[1;31mEOF\033[0m" : "");

		}

		fprintf(stderr, "──%5d %s \033[2;37m%s\033[0m\n",
			node->count, c, code);
		return;
	}
	trail[depth] = node->count;
	debug_huffman_tree_print(node->left, trail, depth + 1);
	trail[depth] = -1;
	debug_huffman_tree_print(node->right, trail, depth + 1);
}
+
+
+/*
+ * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree()
+ * will print a tree looking something like this:
+ *
+ * 7─┬─── 3 len:18+, dist:1-1 10f 0
+ * ╰─ 4─┬─ 2─┬─── 1 61 'a' 100
+ * │ ╰─── 1 62 'b' 101
+ * ╰─ 2─┬─── 1 63 'c' 110
+ * ╰─── 1 len: 3, dist:1-1 100 EOF 111
+ *
+ * This is based off a Huffman root node, and the tree may not be the same as
+ * the canonical tree.
+ */
static void debug_huffman_tree(struct huffman_node *root)
{
	/*
	 * trail[] needs no initialization: each level's entry is written
	 * before deeper recursion reads it, and depth 0 reads nothing.
	 */
	int trail[17];
	debug_huffman_tree_print(root, trail, 0);
}
+
+
+/*
+ * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree_from_table()
+ * will print something like this based on a decoding symbol table.
+ *
+ * Tree from decoding table 9 nodes → 5 codes
+ * 10000─┬─── 5000 len:18+, dist:1-1 10f 0
+ * ╰─ 5000─┬─ 2500─┬─── 1250 61 'a' 100
+ * │ ╰─── 1250 62 'b' 101
+ * ╰─ 2500─┬─── 1250 63 'c' 110
+ * ╰─── 1250 len: 3, dist:1-1 100 EOF 111
+ *
+ * This is the canonical form of the Huffman tree where the actual counts
+ * aren't known (we use "10000" to help indicate relative frequencies).
+ */
/*
 * Rebuild a printable tree from a decoding table: 0xffff entries are
 * internal nodes whose children sit at (index << 1) + 1 and + 2; other
 * entries are leaves carrying a 9-bit symbol.
 */
static void debug_huffman_tree_from_table(uint16_t *table)
{
	int trail[17];
	struct huffman_node nodes[1024] = {{0}};
	uint16_t codes[1024];
	size_t n = 1;
	size_t i = 0;
	codes[0] = 0;
	/* an arbitrary scale, halved at each level, so relative
	 * frequencies show up in the drawing */
	nodes[0].count = 10000;

	while (i < n) {
		uint16_t index = codes[i];
		struct huffman_node *node = &nodes[i];
		if (table[index] == 0xffff) {
			/* internal node */
			index <<= 1;
			/* left */
			index++;
			codes[n] = index;
			node->left = nodes + n;
			nodes[n].count = node->count >> 1;
			n++;
			/*right*/
			index++;
			codes[n] = index;
			node->right = nodes + n;
			nodes[n].count = node->count >> 1;
			n++;
		} else {
			/* leaf node */
			node->symbol = table[index] & 511;
		}
		i++;
	}

	fprintf(stderr,
		"\033[1;34m Tree from decoding table\033[0m "
		"%zu nodes → %zu codes\n",
		n, (n + 1) / 2);
	debug_huffman_tree_print(nodes, trail, 0);
}
+
+
/*
 * Recursively record each leaf's depth in the tree. Returns false as
 * soon as any path would exceed 15 levels, which the final encoding
 * cannot represent.
 */
static bool depth_walk(struct huffman_node *n, uint32_t depth)
{
	bool ok;
	if (n->left == NULL) {
		/* this is a leaf, record the depth */
		n->depth = depth;
		return true;
	}
	if (depth > 14) {
		return false;
	}
	ok = (depth_walk(n->left, depth + 1) &&
	      depth_walk(n->right, depth + 1));

	return ok;
}
+
+
/*
 * Record every leaf's depth under root; false means the tree is deeper
 * than 15 and must be rebuilt flatter (see the requantization loop in
 * generate_huffman_codes()).
 */
static bool check_and_record_depths(struct huffman_node *root)
{
	return depth_walk(root, 0);
}
+
+
/*
 * Assign canonical code values to the leaves, which must already be
 * sorted by depth then symbol: each code is the previous code plus one,
 * left-shifted whenever the depth increases. Returns false if the depths
 * do not describe a complete binary tree.
 */
static bool encode_values(struct huffman_node *leaves,
			  size_t n_leaves,
			  uint16_t symbol_values[512])
{
	size_t i;
	/*
	 * See, we have a leading 1 in our internal code representation, which
	 * indicates the code length.
	 */
	uint32_t code = 1;
	uint32_t code_len = 0;
	memset(symbol_values, 0, sizeof(uint16_t) * 512);
	for (i = 0; i < n_leaves; i++) {
		code <<= leaves[i].depth - code_len;
		code_len = leaves[i].depth;

		symbol_values[leaves[i].symbol] = code;
		code++;
	}
	/*
	 * The last code should be 11111... with code_len + 1 ones. The final
	 * code++ will wrap this round to 1000... with code_len + 1 zeroes.
	 */

	if (code != 2 << code_len) {
		return false;
	}
	return true;
}
+
+
/*
 * Build the canonical Huffman code for the symbol counts in leaf_nodes.
 *
 * leaf_nodes holds 512 entries with .count and .symbol filled in; it is
 * compacted and reordered in place. internal_nodes is auxiliary space of
 * the same size. On success the code for each used symbol is written to
 * symbol_values[] and the number of used symbols is returned; on failure
 * LZXPRESS_ERROR is returned.
 */
static int generate_huffman_codes(struct huffman_node *leaf_nodes,
				  struct huffman_node *internal_nodes,
				  uint16_t symbol_values[512])
{
	size_t head_leaf = 0;
	size_t head_branch = 0;
	size_t tail_branch = 0;
	struct huffman_node *huffman_root = NULL;
	size_t i, j;
	size_t n_leaves = 0;

	/*
	 * Before we sort the nodes, we can eliminate the unused ones.
	 */
	for (i = 0; i < 512; i++) {
		if (leaf_nodes[i].count) {
			leaf_nodes[n_leaves] = leaf_nodes[i];
			n_leaves++;
		}
	}
	if (n_leaves == 0) {
		return LZXPRESS_ERROR;
	}
	if (n_leaves == 1) {
		/*
		 * There is *almost* no way this should happen, and it would
		 * ruin the tree (because the shortest possible codes are 1
		 * bit long, and there are two of them).
		 *
		 * The only way to get here is in an internal block in a
		 * 3-or-more block message (i.e. > 128k), which consists
		 * entirely of a match starting in the previous block (if it
		 * was the end block, it would have the EOF symbol).
		 *
		 * What we do is add a dummy symbol which is this one XOR 256.
		 * It won't be used in the stream but will balance the tree.
		 */
		leaf_nodes[1] = leaf_nodes[0];
		leaf_nodes[1].symbol ^= 0x100;
		n_leaves = 2;
	}

	/* note, in sort we're using internal_nodes as auxiliary space */
	stable_sort(leaf_nodes,
		    internal_nodes,
		    n_leaves,
		    sizeof(struct huffman_node),
		    (samba_compare_fn_t)compare_huffman_node_count);

	/*
	 * This outer loop is for re-quantizing the counts if the tree is too
	 * tall (>15), which we need to do because the final encoding can't
	 * express a tree that deep.
	 *
	 * In theory, this should be a 'while (true)' loop, but we chicken
	 * out with 10 iterations, just in case.
	 *
	 * In practice it will almost always resolve in the first round; if
	 * not then, in the second or third. Remember we'll be looking at 64k
	 * or less, so the rarest we can have is 1 in 64k; each round of
	 * quantization effectively doubles its frequency to 1 in 32k, 1 in
	 * 16k, etc, until we're treating the rare symbol as actually quite
	 * common.
	 */
	for (j = 0; j < 10; j++) {
		bool less_than_15_bits;
		while (true) {
			struct huffman_node *a = NULL;
			struct huffman_node *b = NULL;
			size_t leaf_len = n_leaves - head_leaf;
			size_t internal_len = tail_branch - head_branch;

			if (leaf_len + internal_len == 1) {
				/*
				 * We have the complete tree. The root will be
				 * an internal node unless there is just one
				 * symbol, which is already impossible.
				 */
				if (unlikely(leaf_len == 1)) {
					return LZXPRESS_ERROR;
				} else {
					huffman_root = \
						&internal_nodes[head_branch];
				}
				break;
			}
			/*
			 * We know here we have at least two nodes, and we
			 * want to select the two lowest scoring ones. Those
			 * have to be either a) the head of each queue, or b)
			 * the first two nodes of either queue.
			 *
			 * The complicating factors are: a) we need to check
			 * the length of each queue, and b) in the case of
			 * ties, we prefer to pair leaves with leaves.
			 *
			 * Note a complication we don't have: the leaf node
			 * queue never grows, and the subtree queue starts
			 * empty and cannot grow beyond n - 1. It feeds on
			 * itself. We don't need to think about overflow.
			 */
			if (leaf_len == 0) {
				/* two from subtrees */
				a = &internal_nodes[head_branch];
				b = &internal_nodes[head_branch + 1];
				head_branch += 2;
			} else if (internal_len == 0) {
				/* two from nodes */
				a = &leaf_nodes[head_leaf];
				b = &leaf_nodes[head_leaf + 1];
				head_leaf += 2;
			} else if (leaf_len == 1 && internal_len == 1) {
				/* one of each */
				a = &leaf_nodes[head_leaf];
				b = &internal_nodes[head_branch];
				head_branch++;
				head_leaf++;
			} else {
				/*
				 * Take the lowest head, twice, checking for
				 * length after taking the first one.
				 */
				if (leaf_nodes[head_leaf].count >
				    internal_nodes[head_branch].count) {
					a = &internal_nodes[head_branch];
					head_branch++;
					if (internal_len == 1) {
						b = &leaf_nodes[head_leaf];
						head_leaf++;
						goto done;
					}
				} else {
					a = &leaf_nodes[head_leaf];
					head_leaf++;
					if (leaf_len == 1) {
						b = &internal_nodes[head_branch];
						head_branch++;
						goto done;
					}
				}
				/* the other node */
				if (leaf_nodes[head_leaf].count >
				    internal_nodes[head_branch].count) {
					b = &internal_nodes[head_branch];
					head_branch++;
				} else {
					b = &leaf_nodes[head_leaf];
					head_leaf++;
				}
			}
		done:
			/*
			 * Now we add a new node to the subtrees list that
			 * combines the score of node_a and node_b, and points
			 * to them as children.
			 */
			internal_nodes[tail_branch].count = a->count + b->count;
			internal_nodes[tail_branch].left = a;
			internal_nodes[tail_branch].right = b;
			tail_branch++;
			if (tail_branch == n_leaves) {
				/*
				 * We're not getting here, no way, never ever.
				 * Unless we made a terrible mistake.
				 *
				 * That is, in a binary tree with n leaves,
				 * there are ALWAYS n-1 internal nodes.
				 */
				return LZXPRESS_ERROR;
			}
		}
		if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
			debug_huffman_tree(huffman_root);
		}
		/*
		 * We have a tree, and need to turn it into a lookup table,
		 * and see if it is shallow enough (<= 15).
		 */
		less_than_15_bits = check_and_record_depths(huffman_root);
		if (less_than_15_bits) {
			/*
			 * Now the leaf nodes know how deep they are, and we
			 * no longer need the internal nodes.
			 *
			 * We need to sort the nodes of equal depth, so that
			 * they are sorted by depth first, and symbol value
			 * second. The internal_nodes can again be auxiliary
			 * memory.
			 */
			stable_sort(
				leaf_nodes,
				internal_nodes,
				n_leaves,
				sizeof(struct huffman_node),
				(samba_compare_fn_t)compare_huffman_node_depth);

			encode_values(leaf_nodes, n_leaves, symbol_values);

			return n_leaves;
		}

		/*
		 * requantize by halving and rounding up, so that small counts
		 * become relatively bigger. This will lead to a flatter tree.
		 */
		for (i = 0; i < n_leaves; i++) {
			leaf_nodes[i].count >>= 1;
			leaf_nodes[i].count += 1;
		}
		head_leaf = 0;
		head_branch = 0;
		tail_branch = 0;
	}
	return LZXPRESS_ERROR;
}
+
/*
 * LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the
 * circular hash table for a match, before we give up. A bigger number will
 * generally lead to better but slower compression, but a stupidly big number
 * will just be worse.
 *
 * Both store_match() and lookup_match() below probe this many adjacent
 * slots, so the two sides of the hash table protocol stay in agreement.
 *
 * If you're fiddling with this, consider also fiddling with
 * LZX_HUFF_COMP_HASH_BITS.
 */
#define LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS 5
+
+static inline void store_match(uint16_t *hash_table,
+ uint16_t h,
+ uint16_t offset)
+{
+ int i;
+ uint16_t o = hash_table[h];
+ uint16_t h2;
+ uint16_t worst_h;
+ int worst_score;
+
+ if (o == 0xffff) {
+ /* there is nothing there yet */
+ hash_table[h] = offset;
+ return;
+ }
+ for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
+ h2 = (h + i) & HASH_MASK;
+ if (hash_table[h2] == 0xffff) {
+ hash_table[h2] = offset;
+ return;
+ }
+ }
+ /*
+ * There are no slots, but we really want to store this, so we'll kick
+ * out the one with the longest distance.
+ */
+ worst_h = h;
+ worst_score = offset - o;
+ for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
+ int score;
+ h2 = (h + i) & HASH_MASK;
+ o = hash_table[h2];
+ score = offset - o;
+ if (score > worst_score) {
+ worst_score = score;
+ worst_h = h2;
+ }
+ }
+ hash_table[worst_h] = offset;
+}
+
+
/*
 * Yes, struct match looks a lot like a DATA_BLOB.
 */
struct match {
	const uint8_t *there;   /* start of the earlier occurrence; NULL means no match */
	size_t length;          /* number of matching bytes */
};
+
+
+static inline struct match lookup_match(uint16_t *hash_table,
+ uint16_t h,
+ const uint8_t *data,
+ const uint8_t *here,
+ size_t max_len)
+{
+ int i;
+ uint16_t o = hash_table[h];
+ uint16_t h2;
+ size_t len;
+ const uint8_t *there = NULL;
+ struct match best = {0};
+
+ for (i = 0; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
+ h2 = (h + i) & HASH_MASK;
+ o = hash_table[h2];
+ if (o == 0xffff) {
+ /*
+ * in setting this, we would never have stepped over
+ * an 0xffff, so we won't now.
+ */
+ break;
+ }
+ there = data + o;
+ if (here - there > 65534 || there > here) {
+ continue;
+ }
+
+ /*
+ * When we already have a long match, we can try to avoid
+ * measuring out another long, but shorter match.
+ */
+ if (best.length > 1000 &&
+ there[best.length - 1] != best.there[best.length - 1]) {
+ continue;
+ }
+
+ for (len = 0;
+ len < max_len && here[len] == there[len];
+ len++) {
+ /* counting */
+ }
+ if (len > 2) {
+ /*
+ * As a tiebreaker, we prefer the closer match which
+ * is likely to encode smaller (and certainly no worse).
+ */
+ if (len > best.length ||
+ (len == best.length && there > best.there)) {
+ best.length = len;
+ best.there = there;
+ }
+ }
+ }
+ return best;
+}
+
+
+
/*
 * lz77_encode_block()
 *
 * Run the LZ77 phase over one block (up to 64k) of input. Literals and
 * matches are written into cmp_mem->intermediate as uint16_t values,
 * while the frequency of each of the 512 possible Huffman symbols is
 * counted in leaf_nodes. Afterwards the Huffman codes for the block
 * are generated into cmp_mem->symbol_values.
 *
 * Returns the number of uint16_t values used in the intermediate
 * representation, or a negative value on error.
 */
static ssize_t lz77_encode_block(struct lzxhuff_compressor_context *cmp_ctx,
				 struct lzxhuff_compressor_mem *cmp_mem,
				 uint16_t *hash_table,
				 uint16_t *prev_hash_table)
{
	uint16_t *intermediate = cmp_mem->intermediate;
	struct huffman_node *leaf_nodes = cmp_mem->leaf_nodes;
	uint16_t *symbol_values = cmp_mem->symbol_values;
	size_t i, j, intermediate_len;
	const uint8_t *data = cmp_ctx->input_bytes + cmp_ctx->input_pos;
	const uint8_t *prev_block = NULL;
	size_t remaining_size = cmp_ctx->input_size - cmp_ctx->input_pos;
	size_t block_end = MIN(65536, remaining_size);
	struct match match;
	int n_symbols;

	if (cmp_ctx->input_size < cmp_ctx->input_pos) {
		return LZXPRESS_ERROR;
	}

	if (cmp_ctx->prev_block_pos != cmp_ctx->input_pos) {
		prev_block = cmp_ctx->input_bytes + cmp_ctx->prev_block_pos;
	} else if (prev_hash_table != NULL) {
		/* we've got confused! hash and block should go together */
		return LZXPRESS_ERROR;
	}

	/*
	 * leaf_nodes is used to count the symbols seen, for later Huffman
	 * encoding.
	 */
	for (i = 0; i < 512; i++) {
		leaf_nodes[i] = (struct huffman_node) {
			.symbol = i
		};
	}

	j = 0;

	if (remaining_size < 41 || DEBUG_NO_LZ77_MATCHES) {
		/*
		 * There is no point doing a hash table and looking for
		 * matches in this tiny block (remembering we are committed to
		 * using 32 bits, so there's a good chance we wouldn't even
		 * save a byte). The threshold of 41 matches Windows.
		 * If remaining_size < 3, we *can't* do the hash.
		 */
		i = 0;
	} else {
		/*
		 * We use 0xffff as the unset value for table, because it is
		 * not a valid match offset (and 0x0 is).
		 */
		memset(hash_table, 0xff, sizeof(cmp_mem->hash_table1));

		for (i = 0; i <= block_end - 3; i++) {
			uint16_t code;
			const uint8_t *here = data + i;
			uint16_t h = three_byte_hash(here);
			size_t max_len = MIN(remaining_size - i, MAX_MATCH_LENGTH);
			match = lookup_match(hash_table,
					     h,
					     data,
					     here,
					     max_len);

			if (match.there == NULL && prev_hash_table != NULL) {
				/*
				 * If this is not the first block,
				 * backreferences can look into the previous
				 * block (but only as far as 65535 bytes, so
				 * the end of this block cannot see the start
				 * of the last one).
				 */
				match = lookup_match(prev_hash_table,
						     h,
						     prev_block,
						     here,
						     remaining_size - i);
			}

			store_match(hash_table, h, i);

			if (match.there == NULL) {
				/* add a literal and move on. */
				uint8_t c = data[i];
				leaf_nodes[c].count++;
				intermediate[j] = c;
				j++;
				continue;
			}

			/*
			 * a real match. Shorter matches use three words of
			 * intermediate encoding (0xffff marker, length - 3,
			 * distance); longer ones need four (0xfffe marker,
			 * two words of length - 3, then distance).
			 */
			if (match.length <= 65538) {
				intermediate[j] = 0xffff;
				intermediate[j + 1] = match.length - 3;
				intermediate[j + 2] = here - match.there;
				j += 3;
			} else {
				size_t m = match.length - 3;
				intermediate[j] = 0xfffe;
				intermediate[j + 1] = m & 0xffff;
				intermediate[j + 2] = m >> 16;
				intermediate[j + 3] = here - match.there;
				j += 4;
			}
			code = encode_match(match.length, here - match.there);
			leaf_nodes[code].count++;
			i += match.length - 1; /* `- 1` for the loop i++ */
			/*
			 * A match can take us past the intended block length,
			 * extending the block. We don't need to do anything
			 * special for this case -- the loops will naturally
			 * do the right thing.
			 */
		}
	}

	/*
	 * There might be some bytes at the end.
	 */
	for (; i < block_end; i++) {
		leaf_nodes[data[i]].count++;
		intermediate[j] = data[i];
		j++;
	}

	if (i == remaining_size) {
		/* add a trailing EOF marker (256) */
		intermediate[j] = 0xffff;
		intermediate[j + 1] = 0;
		intermediate[j + 2] = 1;
		j += 3;
		leaf_nodes[256].count++;
	}

	intermediate_len = j;

	cmp_ctx->prev_block_pos = cmp_ctx->input_pos;
	cmp_ctx->input_pos += i;

	/* fill in the symbols table */
	n_symbols = generate_huffman_codes(leaf_nodes,
					   cmp_mem->internal_nodes,
					   symbol_values);
	if (n_symbols < 0) {
		return n_symbols;
	}

	return intermediate_len;
}
+
+
+
+static ssize_t write_huffman_table(uint16_t symbol_values[512],
+ uint8_t *output,
+ size_t available_size)
+{
+ size_t i;
+
+ if (available_size < 256) {
+ return LZXPRESS_ERROR;
+ }
+
+ for (i = 0; i < 256; i++) {
+ uint8_t b = 0;
+ uint16_t even = symbol_values[i * 2];
+ uint16_t odd = symbol_values[i * 2 + 1];
+ if (even != 0) {
+ b = bitlen_nonzero_16(even);
+ }
+ if (odd != 0) {
+ b |= bitlen_nonzero_16(odd) << 4;
+ }
+ output[i] = b;
+ }
+ return i;
+}
+
+
/*
 * State for the output bitstream writer. Huffman codes accumulate in
 * `bits` and are flushed 16 at a time into the position tracked by
 * next_code, while raw length bytes go to `head`; see the comment on
 * write_bits() for how the three offsets rotate.
 */
struct write_context {
	uint8_t *dest;            /* output buffer */
	size_t dest_len;          /* total bytes available in dest */
	size_t head;              /* where lengths go */
	size_t next_code;         /* where symbol stream goes */
	size_t pending_next_code; /* will be next_code */
	unsigned bit_len;         /* number of valid bits held in `bits` */
	uint32_t bits;            /* bit accumulator */
};
+
/*
 * Write out 16 bits, little-endian, for write_huffman_codes()
 *
 * As you'll notice, there's a bit to do.
 *
 * We are collecting up bits in a uint32_t, then when there are 16 of them we
 * write out a word into the stream, using a trio of offsets (wc->next_code,
 * wc->pending_next_code, and wc->head) which dance around ensuring that the
 * bitstream and the interspersed lengths are in the right places relative to
 * each other.
 *
 * Returns false if the output buffer would overflow (or on an internal
 * inconsistency), true otherwise.
 */

static inline bool write_bits(struct write_context *wc,
			      uint16_t code, uint16_t length)
{
	wc->bits <<= length;
	wc->bits |= code;
	wc->bit_len += length;
	if (wc->bit_len > 16) {
		/* at least one full 16-bit word is ready to flush */
		uint32_t w = wc->bits >> (wc->bit_len - 16);
		wc->bit_len -= 16;
		if (wc->next_code + 2 > wc->dest_len ||
		    unlikely(wc->bit_len > 16)) {
			/* out of space (the bit_len test is defensive) */
			return false;
		}
		wc->dest[wc->next_code] = w & 0xff;
		wc->dest[wc->next_code + 1] = (w >> 8) & 0xff;
		/* rotate the three offsets; lengths claim the next slot */
		wc->next_code = wc->pending_next_code;
		wc->pending_next_code = wc->head;
		wc->head += 2;
	}
	return true;
}
+
+
/*
 * write_code()
 *
 * Write one Huffman code into the bitstream. The stored symbol value
 * has a sentinel top bit above the real code bits; bitlen_nonzero_16()
 * finds it and it is masked off before writing.
 *
 * A zero value means the symbol has no code, which is an error.
 *
 * Fix: test for zero *before* calling bitlen_nonzero_16(), whose name
 * indicates its result is only defined for non-zero input (clz-style
 * helpers are undefined on 0). For all valid codes the behaviour is
 * unchanged.
 */
static inline bool write_code(struct write_context *wc, uint16_t code)
{
	int code_bit_len;
	if (unlikely(code == 0)) {
		return false;
	}
	code_bit_len = bitlen_nonzero_16(code);
	code &= (1 << code_bit_len) - 1;
	return write_bits(wc, code, code_bit_len);
}
+
+static inline bool write_byte(struct write_context *wc, uint8_t byte)
+{
+ if (wc->head + 1 > wc->dest_len) {
+ return false;
+ }
+ wc->dest[wc->head] = byte;
+ wc->head++;
+ return true;
+}
+
+
+static inline bool write_long_len(struct write_context *wc, size_t len)
+{
+ if (len < 65535) {
+ if (wc->head + 3 > wc->dest_len) {
+ return false;
+ }
+ wc->dest[wc->head] = 255;
+ wc->dest[wc->head + 1] = len & 255;
+ wc->dest[wc->head + 2] = len >> 8;
+ wc->head += 3;
+ } else {
+ if (wc->head + 7 > wc->dest_len) {
+ return false;
+ }
+ wc->dest[wc->head] = 255;
+ wc->dest[wc->head + 1] = 0;
+ wc->dest[wc->head + 2] = 0;
+ wc->dest[wc->head + 3] = len & 255;
+ wc->dest[wc->head + 4] = (len >> 8) & 255;
+ wc->dest[wc->head + 5] = (len >> 16) & 255;
+ wc->dest[wc->head + 6] = (len >> 24) & 255;
+ wc->head += 7;
+ }
+ return true;
+}
+
/*
 * write_compressed_bytes()
 *
 * Transcribe the intermediate representation produced by
 * lz77_encode_block() into the output as Huffman codes, interleaved
 * with raw length bytes and extra distance bits.
 *
 * Returns the number of bytes used in dest, or negative on error.
 */
static ssize_t write_compressed_bytes(uint16_t symbol_values[512],
				      uint16_t *intermediate,
				      size_t intermediate_len,
				      uint8_t *dest,
				      size_t dest_len)
{
	bool ok;
	size_t i;
	size_t end;
	struct write_context wc = {
		.head = 4,
		.pending_next_code = 2,
		.dest = dest,
		.dest_len = dest_len
	};
	for (i = 0; i < intermediate_len; i++) {
		uint16_t c = intermediate[i];
		size_t len;
		uint16_t distance;
		uint16_t code_len = 0;
		uint16_t code_dist = 0;
		if (c < 256) {
			/* a literal byte */
			ok = write_code(&wc, symbol_values[c]);
			if (!ok) {
				return LZXPRESS_ERROR;
			}
			continue;
		}

		if (c == 0xfffe) {
			/* a long match: two words of length, one of distance */
			if (i > intermediate_len - 4) {
				return LZXPRESS_ERROR;
			}

			len = intermediate[i + 1];
			len |= (uint32_t)intermediate[i + 2] << 16;
			distance = intermediate[i + 3];
			i += 3;
		} else if (c == 0xffff) {
			/* a short match: one word each of length, distance */
			if (i > intermediate_len - 3) {
				return LZXPRESS_ERROR;
			}
			len = intermediate[i + 1];
			distance = intermediate[i + 2];
			i += 2;
		} else {
			return LZXPRESS_ERROR;
		}
		if (unlikely(distance == 0)) {
			return LZXPRESS_ERROR;
		}
		/* len has already had 3 subtracted */
		if (len >= 15) {
			/*
			 * We are going to need to write extra length
			 * bytes into the stream, but we don't do it
			 * now, we do it after the code has been
			 * written (and before the distance bits).
			 */
			code_len = 15;
		} else {
			code_len = len;
		}
		code_dist = bitlen_nonzero_16(distance);
		c = 256 | (code_dist << 4) | code_len;
		if (c > 511) {
			return LZXPRESS_ERROR;
		}

		ok = write_code(&wc, symbol_values[c]);
		if (!ok) {
			return LZXPRESS_ERROR;
		}

		if (code_len == 15) {
			if (len >= 270) {
				ok = write_long_len(&wc, len);
			} else {
				ok = write_byte(&wc, len - 15);
			}
			if (! ok) {
				return LZXPRESS_ERROR;
			}
		}
		if (code_dist != 0) {
			/* the distance's low bits, below the implicit top bit */
			uint16_t dist_bits = distance - (1 << code_dist);
			ok = write_bits(&wc, dist_bits, code_dist);
			if (!ok) {
				return LZXPRESS_ERROR;
			}
		}
	}
	/*
	 * There are some intricacies around flushing the bits and returning
	 * the length.
	 *
	 * If the returned length is not exactly right and there is another
	 * block, that block will read its huffman table from the wrong place,
	 * and have all the symbol codes out by a multiple of 4.
	 */
	end = wc.head;
	if (wc.bit_len == 0) {
		end -= 2;
	}
	ok = write_bits(&wc, 0, 16 - wc.bit_len);
	if (!ok) {
		return LZXPRESS_ERROR;
	}
	for (i = 0; i < 2; i++) {
		/*
		 * Flush out the bits with zeroes. It doesn't matter if we do
		 * a round too many, as we have buffer space, and have already
		 * determined the returned length (end).
		 */
		ok = write_bits(&wc, 0, 16);
		if (!ok) {
			return LZXPRESS_ERROR;
		}
	}
	return end;
}
+
+
/*
 * lzx_huffman_compress_block()
 *
 * Compress one block: run the LZ77 phase (which also counts symbols
 * and generates the Huffman codes), write the 256 byte Huffman table,
 * then write the encoded stream. The hash tables alternate between
 * blocks so that each block can look up matches in its predecessor.
 *
 * Returns the number of bytes written for the encoded stream (not
 * counting the 256 byte table, although cmp_ctx->output_pos advances
 * over both), or negative on error.
 */
static ssize_t lzx_huffman_compress_block(struct lzxhuff_compressor_context *cmp_ctx,
					  struct lzxhuff_compressor_mem *cmp_mem,
					  size_t block_no)
{
	ssize_t intermediate_size;
	uint16_t *hash_table = NULL;
	uint16_t *back_window_hash_table = NULL;
	ssize_t bytes_written;

	if (cmp_ctx->available_size - cmp_ctx->output_pos < 260) {
		/* huffman block + 4 bytes */
		return LZXPRESS_ERROR;
	}

	/*
	 * For LZ77 compression, we keep a hash table for the previous block,
	 * via alternation after the first block.
	 *
	 * LZ77 writes into the intermediate buffer in the cmp_mem context.
	 */
	if (block_no == 0) {
		hash_table = cmp_mem->hash_table1;
		back_window_hash_table = NULL;
	} else if (block_no & 1) {
		hash_table = cmp_mem->hash_table2;
		back_window_hash_table = cmp_mem->hash_table1;
	} else {
		hash_table = cmp_mem->hash_table1;
		back_window_hash_table = cmp_mem->hash_table2;
	}

	intermediate_size = lz77_encode_block(cmp_ctx,
					      cmp_mem,
					      hash_table,
					      back_window_hash_table);

	if (intermediate_size < 0) {
		return intermediate_size;
	}

	/*
	 * Write the 256 byte Huffman table, based on the counts gained in
	 * LZ77 phase.
	 */
	bytes_written = write_huffman_table(
		cmp_mem->symbol_values,
		cmp_ctx->output + cmp_ctx->output_pos,
		cmp_ctx->available_size - cmp_ctx->output_pos);

	if (bytes_written != 256) {
		return LZXPRESS_ERROR;
	}
	cmp_ctx->output_pos += 256;

	/*
	 * Write the compressed bytes using the LZ77 matches and Huffman codes
	 * worked out in the previous steps.
	 */
	bytes_written = write_compressed_bytes(
		cmp_mem->symbol_values,
		cmp_mem->intermediate,
		intermediate_size,
		cmp_ctx->output + cmp_ctx->output_pos,
		cmp_ctx->available_size - cmp_ctx->output_pos);

	if (bytes_written < 0) {
		return bytes_written;
	}

	cmp_ctx->output_pos += bytes_written;
	return bytes_written;
}
+
/*
 * lzxpress_huffman_max_compressed_size()
 *
 * Return an upper bound on the compressed size for a given input
 * size, so callers can pre-allocate an output buffer.
 */
size_t lzxpress_huffman_max_compressed_size(size_t input_size)
{
	/*
	 * In the worst case the output is about the size of the input,
	 * plus a 256 byte Huffman table header for each 64k block. An
	 * eighth of the input plus a constant is ample headroom while
	 * staying within the right order of magnitude.
	 */
	size_t overhead = input_size / 8 + 270;
	return input_size + overhead;
}
+
/*
 * lzxpress_huffman_compress_talloc()
 *
 * This is the convenience function that allocates the compressor context and
 * output memory for you. The return value is the number of bytes written to
 * the location indicated by the output pointer.
 *
 * The maximum input_size is effectively around 227MB due to the need to guess
 * an upper bound on the output size that hits an internal limitation in
 * talloc.
 *
 * @param mem_ctx TALLOC_CTX parent for the compressed buffer.
 * @param input_bytes memory to be compressed.
 * @param input_size length of the input buffer.
 * @param output destination pointer for the compressed data.
 *
 * @return the number of bytes written or -1 on error.
 */

ssize_t lzxpress_huffman_compress_talloc(TALLOC_CTX *mem_ctx,
					 const uint8_t *input_bytes,
					 size_t input_size,
					 uint8_t **output)
{
	struct lzxhuff_compressor_mem *cmp = NULL;
	/* over-allocate; the buffer is shrunk again below */
	size_t alloc_size = lzxpress_huffman_max_compressed_size(input_size);

	ssize_t output_size;

	*output = talloc_array(mem_ctx, uint8_t, alloc_size);
	if (*output == NULL) {
		return LZXPRESS_ERROR;
	}

	cmp = talloc(mem_ctx, struct lzxhuff_compressor_mem);
	if (cmp == NULL) {
		/* on any failure, *output is freed and left NULL */
		TALLOC_FREE(*output);
		return LZXPRESS_ERROR;
	}

	output_size = lzxpress_huffman_compress(cmp,
						input_bytes,
						input_size,
						*output,
						alloc_size);

	talloc_free(cmp);

	if (output_size < 0) {
		TALLOC_FREE(*output);
		return LZXPRESS_ERROR;
	}

	/* shrink the over-allocation down to the bytes actually used */
	*output = talloc_realloc(mem_ctx, *output, uint8_t, output_size);
	if (*output == NULL) {
		return LZXPRESS_ERROR;
	}

	return output_size;
}
+
+/*
+ * lzxpress_huffman_compress()
+ *
+ * This is the inconvenience function, slightly faster and fiddlier than
+ * lzxpress_huffman_compress_talloc().
+ *
+ * To use this, you need to have allocated (but not initialised) a `struct
+ * lzxhuff_compressor_mem`, and an output buffer. If the buffer is not big
+ * enough (per `output_size`), you'll get a negative return value, otherwise
+ * the number of bytes actually consumed, which will always be at least 260.
+ *
+ * The `struct lzxhuff_compressor_mem` is reusable -- it is basically a
+ * collection of uninitialised memory buffers. The total size is less than
+ * 150k, so stack allocation is plausible.
+ *
+ * input_size and available_size are limited to the minimum of UINT32_MAX and
+ * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
+ *
+ * @param cmp_mem a struct lzxhuff_compressor_mem.
+ * @param input_bytes memory to be compressed.
+ * @param input_size length of the input buffer.
+ * @param output destination for the compressed data.
+ * @param available_size allocated output bytes.
+ *
+ * @return the number of bytes written or -1 on error.
+ */
+ssize_t lzxpress_huffman_compress(struct lzxhuff_compressor_mem *cmp_mem,
+ const uint8_t *input_bytes,
+ size_t input_size,
+ uint8_t *output,
+ size_t available_size)
+{
+ size_t i = 0;
+ struct lzxhuff_compressor_context cmp_ctx = {
+ .input_bytes = input_bytes,
+ .input_size = input_size,
+ .input_pos = 0,
+ .prev_block_pos = 0,
+ .output = output,
+ .available_size = available_size,
+ .output_pos = 0
+ };
+
+ if (input_size == 0) {
+ /*
+ * We can't deal with this for a number of reasons (e.g. it
+ * breaks the Huffman tree), and the output will be infinitely
+ * bigger than the input. The caller needs to go and think
+ * about what they're trying to do here.
+ */
+ return LZXPRESS_ERROR;
+ }
+
+ if (input_size > SSIZE_MAX ||
+ input_size > UINT32_MAX ||
+ available_size > SSIZE_MAX ||
+ available_size > UINT32_MAX ||
+ available_size == 0) {
+ /*
+ * We use negative ssize_t to return errors, which is limiting
+ * on 32 bit machines; otherwise we adhere to Microsoft's 4GB
+ * limit.
+ *
+ * lzxpress_huffman_compress_talloc() will not get this far,
+ * having already have failed on talloc's 256 MB limit.
+ */
+ return LZXPRESS_ERROR;
+ }
+
+ if (cmp_mem == NULL ||
+ output == NULL ||
+ input_bytes == NULL) {
+ return LZXPRESS_ERROR;
+ }
+
+ while (cmp_ctx.input_pos < cmp_ctx.input_size) {
+ ssize_t ret;
+ ret = lzx_huffman_compress_block(&cmp_ctx,
+ cmp_mem,
+ i);
+ if (ret < 0) {
+ return ret;
+ }
+ i++;
+ }
+
+ return cmp_ctx.output_pos;
+}
+
static void debug_tree_codes(struct bitstream *input)
{
	/*
	 * Walk the implicit decode tree (input->table) breadth-first,
	 * printing each resolved symbol together with its code as a
	 * string of '0's and '1's, and counting the internal (0xffff)
	 * nodes. Debug-only: note the queue is 65536 four-byte entries
	 * (256KiB) on the stack.
	 */
	size_t head = 0;
	size_t tail = 2;
	size_t ffff_count = 0;
	struct q {
		uint16_t tree_code;
		uint16_t code_code;
	};
	struct q queue[65536];
	char bits[17];
	uint16_t *t = input->table;
	queue[0].tree_code = 1;
	queue[0].code_code = 2;
	queue[1].tree_code = 2;
	queue[1].code_code = 3;
	while (head < tail) {
		struct q q = queue[head];
		uint16_t x = t[q.tree_code];
		if (x != 0xffff) {
			/* a leaf: render the code bits as text */
			int k;
			uint16_t j = q.code_code;
			size_t offset = bitlen_nonzero_16(j) - 1;
			if (unlikely(j == 0)) {
				DBG("BROKEN code is 0!\n");
				return;
			}

			for (k = 0; k <= offset; k++) {
				bool b = (j >> (offset - k)) & 1;
				bits[k] = b ? '1' : '0';
			}
			bits[k] = 0;
			DBG("%03x %s\n", x & 511, bits);
			head++;
			continue;
		}
		/* an internal node: queue both children */
		ffff_count++;
		queue[tail].tree_code = q.tree_code * 2 + 1;
		queue[tail].code_code = q.code_code * 2;
		tail++;
		queue[tail].tree_code = q.tree_code * 2 + 1 + 1;
		queue[tail].code_code = q.code_code * 2 + 1;
		tail++;
		head++;
	}
	DBG("0xffff count: %zu\n", ffff_count);
}
+
/**
 * Three-way comparison of two uint16_t values, used (via
 * samba_compare_fn_t) to sort the symbol table when building the
 * canonical Huffman decode tree.
 */
static int compare_uint16(const uint16_t *a, const uint16_t *b)
{
	if (*a == *b) {
		return 0;
	}
	return (*a < *b) ? -1 : 1;
}
+
+
static bool fill_decomp_table(struct bitstream *input)
{
	/*
	 * There are 512 symbols, each encoded in 4 bits, which indicates
	 * their depth in the Huffman tree. The even numbers get the lower
	 * nibble of each byte, so that the byte hex values look backwards
	 * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
	 * order of appearance, per depth.
	 *
	 * For example, if the first two bytes were:
	 *
	 * 0x23 0x53
	 *
	 * the first four codes have the lengths 3, 2, 3, 5.
	 * Let's call them A, B, C, D.
	 *
	 * Suppose there is no other codeword with length 1 (which is
	 * necessarily true in this example) or 2, but there might be others
	 * of length 3 or 4. Then we can say this about the codes:
	 *
	 *        _ --*--_
	 *      /          \
	 *     0            1
	 *    / \         /   \
	 *   0   1       0     1
	 *   B  |\      / \    |\
	 *      0 1    0   1   0 1
	 *      A C   |\  /|   | |\
	 *
	 *   pos bits  code
	 *   A    3    010
	 *   B    2    00
	 *   C    3    011
	 *   D    5    1????
	 *
	 * B has the shortest code, so takes the leftmost branch, 00. That
	 * ends the branch -- nothing else can start with 00. There are no
	 * more 2s, so we look at the 3s, starting as far left as possible. So
	 * A takes 010 and C takes 011. That means everything else has to
	 * start with 1xx. We don't know how many codewords of length 3 or 4
	 * there are; if there are none, D would end up with 10000, the
	 * leftmost available code of length 5. If the compressor is any good,
	 * there should be no unused leaf nodes left dangling at the end.
	 *
	 * (this is "Canonical Huffman Coding").
	 *
	 *
	 * But what symbols do these codes actually stand for?
	 * --------------------------------------------------
	 *
	 * Good question. The first 256 codes stand for the corresponding
	 * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
	 * which have a distance and a length, encoded in a strange way that
	 * isn't entirely the purview of this function.
	 *
	 * What does the value 0 mean?
	 * ---------------------------
	 *
	 * The code does not occur. For example, if the next byte in the
	 * example above was 0x07, that would give the byte 0x04 a 7-long
	 * code, and no code to the 0x05 byte, which means there is no way
	 * we are going to see a 5 in the decoded stream.
	 *
	 * Isn't LZ77 + Huffman what zip/gzip/zlib do?
	 * -------------------------------------------
	 *
	 * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
	 */
	uint16_t symbols[512];
	uint16_t sort_mem[512];
	size_t i, n_symbols;
	ssize_t code;
	uint16_t len = 0, prev_len;
	const uint8_t *table_bytes = input->bytes + input->byte_pos;

	if (input->byte_pos + 260 > input->byte_size) {
		return false;
	}

	/* unpack the nibbles into (depth << 9 | symbol) values */
	n_symbols = 0;
	for (i = 0; i < 256; i++) {
		uint16_t even = table_bytes[i] & 15;
		uint16_t odd = table_bytes[i] >> 4;
		if (even != 0) {
			symbols[n_symbols] = (even << 9) + i * 2;
			n_symbols++;
		}
		if (odd != 0) {
			symbols[n_symbols] = (odd << 9) + i * 2 + 1;
			n_symbols++;
		}
	}
	input->byte_pos += 256;
	if (n_symbols == 0) {
		return false;
	}

	/* sort by depth first, then symbol (both packed into one value) */
	stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
		    (samba_compare_fn_t)compare_uint16);

	/*
	 * we're using an implicit binary tree, as you'd see in a heap.
	 * table[0] = unused
	 * table[1] = '0'
	 * table[2] = '1'
	 * table[3] = '00'     <-- '00' and '01' are children of '0'
	 * table[4] = '01'     <-- '0' is [0], children are [0 * 2 + {1,2}]
	 * table[5] = '10'
	 * table[6] = '11'
	 * table[7] = '000'
	 * table[8] = '001'
	 * table[9] = '010'
	 * table[10]= '011'
	 * table[11]= '100
	 *'
	 * table[1 << n - 1] = '0' * n
	 * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
	 * table[1 << n - 2] = '1' * (n - 1)
	 *
	 * table[i]->left =  table[i*2 + 1]
	 * table[i]->right = table[i*2 + 2]
	 * table[0xffff] = unused (16 '0's, max len is 15)
	 *
	 * therefore e.g. table[70] = table[64 - 1 + 7]
	 *                          = table[1 << 6 - 1 + 7]
	 *                          = '000111' (binary 7, widened to 6 bits)
	 *
	 *   and if '000111' is a code,
	 *   '00011', '0001', '000', '00', '0' are unavailable prefixes.
	 *       34      16      7     3    1  are their indices
	 *   and (i - 1) >> 1 is the path back from 70 through these.
	 *
	 * the lookup is
	 *
	 * 1 start with i = 0
	 * 2 extract a symbol bit (i = (i << 1) + bit + 1)
	 * 3 is table[i] == 0xffff?
	 * 4 yes -- goto 2
	 * 4 table[i] & 511 is the symbol, stop
	 *
	 * and the construction (here) is sort of the reverse.
	 *
	 * Most of this table is free space that can never be reached, and
	 * most of the activity is at the beginning (since all codes start
	 * there, and by design the shortest codes are the most common).
	 */
	for (i = 0; i < 32; i++) {
		/* prefill the table head */
		input->table[i] = 0xffff;
	}
	/*
	 * `code` is a heap-style table index, not the raw code value:
	 * descending one tree level (appending a '0' bit) is index*2 + 1,
	 * which is why the depth-increase loop below shifts AND adds one.
	 */
	code = -1;
	prev_len = 0;
	for (i = 0; i < n_symbols; i++) {
		uint16_t s = symbols[i];
		uint16_t prefix;
		len = (s >> 9) & 15;
		s &= 511;
		code++;
		while (len != prev_len) {
			code <<= 1;
			code++;
			prev_len++;
		}

		if (code >= 65535) {
			return false;
		}
		input->table[code] = s;
		/* mark every ancestor as an internal (0xffff) node */
		for(prefix = (code - 1) >> 1;
		    prefix > 31;
		    prefix = (prefix - 1) >> 1) {
			input->table[prefix] = 0xffff;
		}
	}
	if (CHECK_DEBUGLVL(10)) {
		debug_tree_codes(input);
	}

	/*
	 * check that the last code encodes 11111..., with right number of
	 * ones, pointing to the right symbol -- otherwise we have a dangling
	 * uninitialised symbol.
	 */
	if (code != (1 << (len + 1)) - 2) {
		return false;
	}
	return true;
}
+
+
/*
 * CHECK_READ_{32,16,8}: bounds-checked little-endian reads from the
 * bitstream's underlying byte buffer, advancing input->byte_pos.
 *
 * NOTE: on overrun these `return LZXPRESS_ERROR` from the *enclosing*
 * function, so they can only be used inside functions that return
 * ssize_t and have a `struct bitstream *input` in scope.
 */
#define CHECK_READ_32(dest)					  \
	do {							  \
		if (input->byte_pos + 4 > input->byte_size) {	  \
			return LZXPRESS_ERROR;			  \
		}						  \
		dest = PULL_LE_U32(input->bytes, input->byte_pos); \
		input->byte_pos += 4;				  \
	} while (0)

#define CHECK_READ_16(dest)					  \
	do {							  \
		if (input->byte_pos + 2 > input->byte_size) {	  \
			return LZXPRESS_ERROR;			  \
		}						  \
		dest = PULL_LE_U16(input->bytes, input->byte_pos); \
		input->byte_pos += 2;				  \
	} while (0)

#define CHECK_READ_8(dest)					\
	do {							\
		if (input->byte_pos >= input->byte_size) {	\
			return LZXPRESS_ERROR;			\
		}						\
		dest = PULL_LE_U8(input->bytes, input->byte_pos); \
		input->byte_pos++;				\
	} while(0)
+
+
/*
 * pull_bits()
 *
 * Top up the bitstream accumulator with 16 more bits, or 8 if only a
 * single byte of input remains. Returns 0 on success, or
 * LZXPRESS_ERROR if the input is already exhausted.
 */
static inline ssize_t pull_bits(struct bitstream *input)
{
	if (input->byte_pos + 1 < input->byte_size) {
		uint16_t tmp;
		CHECK_READ_16(tmp);
		input->remaining_bits += 16;
		input->bits <<= 16;
		input->bits |= tmp;
	} else if (input->byte_pos < input->byte_size) {
		/* a single trailing byte */
		uint8_t tmp;
		CHECK_READ_8(tmp);
		input->remaining_bits += 8;
		input->bits <<= 8;
		input->bits |= tmp;
	} else {
		return LZXPRESS_ERROR;
	}
	return 0;
}
+
+
+/*
+ * Decompress a block. The actual decompressed size is returned (or -1 on
+ * error). The putative block length is 64k (or shorter, if the message ends
+ * first), but a match can run over the end, extending the block. That's why
+ * we need the overall output size as well as the block size. A match encoded
+ * in this block can point back to previous blocks, but not before the
+ * beginning of the message, so we also need the previously decoded size.
+ *
+ * The compressed block will have 256 bytes for the Huffman table, and at
+ * least 4 bytes of (possibly padded) encoded values.
+ */
+static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
+ uint8_t *output,
+ size_t block_size,
+ size_t output_size,
+ size_t previous_size)
+{
+ size_t output_pos = 0;
+ uint16_t symbol;
+ size_t index;
+ uint16_t distance_bits_wanted = 0;
+ size_t distance = 0;
+ size_t length = 0;
+ bool ok;
+ uint32_t tmp;
+ bool seen_eof_marker = false;
+
+ ok = fill_decomp_table(input);
+ if (! ok) {
+ return LZXPRESS_ERROR;
+ }
+ if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
+ debug_huffman_tree_from_table(input->table);
+ }
+ /*
+ * Always read 32 bits at the start, even if we don't need them.
+ */
+ CHECK_READ_16(tmp);
+ CHECK_READ_16(input->bits);
+ input->bits |= tmp << 16;
+ input->remaining_bits = 32;
+
+ /*
+ * This loop iterates over individual *bits*. These are read from
+ * little-endian 16 bit words, most significant bit first.
+ *
+ * At points in the bitstream, the following are possible:
+ *
+ * # the source word is empty and needs to be refilled from the input
+ * stream.
+ * # an incomplete codeword is being extended.
+ * # a codeword is resolved, either as a literal or a match.
+ * # a literal is written.
+ * # a match is collecting distance bits.
+ * # the output stream is copied, as specified by a match.
+ * # input bytes are read for match lengths.
+ *
+ * Note that we *don't* specifically check for the EOF marker (symbol
+ * 256) in this loop, because the precondition for stopping for the
+ * EOF marker is that the output buffer is full (otherwise, you
+ * wouldn't know which 256 is EOF, rather than an actual symbol), and
+ * we *always* want to stop when the buffer is full. So we work out if
+ * there is an EOF in another loop after we stop writing.
+ */
+
+ index = 0;
+ while (output_pos < block_size) {
+ uint16_t b;
+ if (input->remaining_bits == 16) {
+ ssize_t ret = pull_bits(input);
+ if (ret) {
+ return ret;
+ }
+ }
+ input->remaining_bits--;
+
+ b = (input->bits >> input->remaining_bits) & 1;
+ if (length == 0) {
+ /* not in a match; pulling a codeword */
+ index <<= 1;
+ index += b + 1;
+ if (input->table[index] == 0xffff) {
+ /* incomplete codeword, the common case */
+ continue;
+ }
+ /* found the symbol, reset the code string */
+ symbol = input->table[index] & 511;
+ index = 0;
+ if (symbol < 256) {
+ /* a literal, the easy case */
+ output[output_pos] = symbol;
+ output_pos++;
+ continue;
+ }
+
+ /* the beginning of a match */
+ distance_bits_wanted = (symbol >> 4) & 15;
+ distance = 1 << distance_bits_wanted;
+ length = symbol & 15;
+ if (length == 15) {
+ CHECK_READ_8(tmp);
+ length += tmp;
+ if (length == 255 + 15) {
+ /*
+ * note, we discard (don't add) the
+ * length so far.
+ */
+ CHECK_READ_16(length);
+ if (length == 0) {
+ CHECK_READ_32(length);
+ }
+ }
+ }
+ length += 3;
+ } else {
+ /* we are pulling extra distance bits */
+ distance_bits_wanted--;
+ distance |= b << distance_bits_wanted;
+ }
+
+ if (distance_bits_wanted == 0) {
+ /*
+ * We have a complete match, and it is time to do the
+ * copy (byte by byte, because the ranges can overlap,
+ * and we might need to copy bytes we just copied in).
+ *
+ * It is possible that this match will extend beyond
+ * the end of the expected block. That's fine, so long
+ * as it doesn't extend past the total output size.
+ */
+ size_t i;
+ size_t end = output_pos + length;
+ uint8_t *here = output + output_pos;
+ uint8_t *there = here - distance;
+ if (end > output_size ||
+ previous_size + output_pos < distance ||
+ unlikely(end < output_pos || there > here)) {
+ return LZXPRESS_ERROR;
+ }
+ for (i = 0; i < length; i++) {
+ here[i] = there[i];
+ }
+ output_pos += length;
+ distance = 0;
+ length = 0;
+ }
+ }
+
+ if (length != 0 || index != 0) {
+ /* it seems like we've hit an early end, mid-code */
+ return LZXPRESS_ERROR;
+ }
+
+ if (input->byte_pos + 256 < input->byte_size) {
+ /*
+ * This block is over, but it clearly isn't the last block, so
+ * we don't want to look for the EOF.
+ */
+ return output_pos;
+ }
+ /*
+ * We won't write any more, but we try to read some more to make sure
+ * we're finishing in a good place. That means we want to see a 256
+ * symbol and then some number of zeroes, possibly zero, but as many
+ * as 32.
+ *
+ * In this we are perhaps a bit stricter than Windows, which
+ * apparently does not insist on the EOF marker, nor on a lack of
+ * trailing bytes.
+ */
+ while (true) {
+ uint16_t b;
+ if (input->remaining_bits == 16) {
+ ssize_t ret;
+ if (input->byte_pos == input->byte_size) {
+ /* FIN */
+ break;
+ }
+ ret = pull_bits(input);
+ if (ret) {
+ return ret;
+ }
+ }
+ input->remaining_bits--;
+ b = (input->bits >> input->remaining_bits) & 1;
+ if (seen_eof_marker) {
+ /*
+			 * we have read the EOF symbol. Now we just want to
+ * see zeroes.
+ */
+ if (b != 0) {
+ return LZXPRESS_ERROR;
+ }
+ continue;
+ }
+
+ /* we're pulling in a symbol, which had better be 256 */
+ index <<= 1;
+ index += b + 1;
+ if (input->table[index] == 0xffff) {
+ continue;
+ }
+
+ symbol = input->table[index] & 511;
+ if (symbol != 256) {
+ return LZXPRESS_ERROR;
+ }
+ seen_eof_marker = true;
+ continue;
+ }
+
+ if (! seen_eof_marker) {
+ return LZXPRESS_ERROR;
+ }
+
+ return output_pos;
+}
+
+static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
+ uint8_t *output,
+ size_t output_size)
+{
+ size_t output_pos = 0;
+
+ if (input->byte_size < 260) {
+ return LZXPRESS_ERROR;
+ }
+
+ while (input->byte_pos < input->byte_size) {
+ ssize_t block_output_pos;
+ ssize_t block_output_size;
+ size_t remaining_output_size = output_size - output_pos;
+
+ block_output_size = MIN(65536, remaining_output_size);
+
+ block_output_pos = lzx_huffman_decompress_block(
+ input,
+ output + output_pos,
+ block_output_size,
+ remaining_output_size,
+ output_pos);
+
+ if (block_output_pos < block_output_size) {
+ return LZXPRESS_ERROR;
+ }
+ output_pos += block_output_pos;
+ if (output_pos > output_size) {
+ /* not expecting to get here. */
+ return LZXPRESS_ERROR;
+ }
+ }
+
+ if (input->byte_pos != input->byte_size) {
+ return LZXPRESS_ERROR;
+ }
+
+ return output_pos;
+}
+
+
+/*
+ * lzxpress_huffman_decompress()
+ *
+ * output_size must be the expected length of the decompressed data.
+ * input_size and output_size are limited to the minimum of UINT32_MAX and
+ * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
+ *
+ * @param input_bytes memory to be decompressed.
+ * @param input_size length of the compressed buffer.
+ * @param output destination for the decompressed data.
+ * @param output_size exact expected length of the decompressed data.
+ *
+ * @return the number of bytes written or -1 on error.
+ */
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
+ size_t input_size,
+ uint8_t *output,
+ size_t output_size)
+{
+ uint16_t table[65536];
+ struct bitstream input = {
+ .bytes = input_bytes,
+ .byte_size = input_size,
+ .byte_pos = 0,
+ .bits = 0,
+ .remaining_bits = 0,
+ .table = table
+ };
+
+ if (input_size > SSIZE_MAX ||
+ input_size > UINT32_MAX ||
+ output_size > SSIZE_MAX ||
+ output_size > UINT32_MAX ||
+ input_size == 0 ||
+ output_size == 0 ||
+ input_bytes == NULL ||
+ output == NULL) {
+ /*
+ * We use negative ssize_t to return errors, which is limiting
+ * on 32 bit machines, and the 4GB limit exists on Windows.
+ */
+ return LZXPRESS_ERROR;
+ }
+
+ return lzxpress_huffman_decompress_internal(&input,
+ output,
+ output_size);
+}
+
+
+/**
+ * lzxpress_huffman_decompress_talloc()
+ *
+ * The caller must provide the exact size of the expected output.
+ *
+ * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
+ * output_size is limited to 256MB due to a limit in talloc. This effectively
+ * limits input_size too, as non-crafted compressed data will not exceed the
+ * decompressed size by very much.
+ *
+ * @param mem_ctx TALLOC_CTX parent for the decompressed buffer.
+ * @param input_bytes memory to be decompressed.
+ * @param input_size length of the compressed buffer.
+ * @param output_size expected decompressed size.
+ *
+ * @return a talloc'ed buffer exactly output_size in length, or NULL.
+ */
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+ const uint8_t *input_bytes,
+ size_t input_size,
+ size_t output_size)
+{
+ ssize_t result;
+ uint8_t *output = NULL;
+ struct bitstream input = {
+ .bytes = input_bytes,
+ .byte_size = input_size
+ };
+
+ output = talloc_array(mem_ctx, uint8_t, output_size);
+ if (output == NULL) {
+ return NULL;
+ }
+
+ input.table = talloc_array(mem_ctx, uint16_t, 65536);
+ if (input.table == NULL) {
+ talloc_free(output);
+ return NULL;
+ }
+ result = lzxpress_huffman_decompress_internal(&input,
+ output,
+ output_size);
+ talloc_free(input.table);
+
+ if (result != output_size) {
+ talloc_free(output);
+ return NULL;
+ }
+ return output;
+}
diff --git a/lib/compression/lzxpress_huffman.h b/lib/compression/lzxpress_huffman.h
new file mode 100644
index 0000000..41cca5c
--- /dev/null
+++ b/lib/compression/lzxpress_huffman.h
@@ -0,0 +1,95 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * ** NOTE! The following LGPL license applies to this file.
+ * ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HAVE_LZXPRESS_HUFFMAN_H
+#define HAVE_LZXPRESS_HUFFMAN_H
+
+
+struct huffman_node {
+ struct huffman_node *left;
+ struct huffman_node *right;
+ uint32_t count;
+ uint16_t symbol;
+ int8_t depth;
+};
+
+
+/*
+ * LZX_HUFF_COMP_HASH_BITS is how big to make the hash tables
+ * (12 means 4096, etc).
+ *
+ * A larger number (up to 16) will be faster on long messages (fewer
+ * collisions), but probably slower on short ones (more prep).
+ */
+#define LZX_HUFF_COMP_HASH_BITS 14
+
+
+/*
+ * This struct just coalesces all the memory you need for LZ77 + Huffman
+ * compression together in one bundle.
+ *
+ * There are a few different things you want, you usually want them all, so
+ * this makes it easy to allocate them all at once.
+ */
+
+struct lzxhuff_compressor_mem {
+ struct huffman_node leaf_nodes[512];
+ struct huffman_node internal_nodes[512];
+ uint16_t symbol_values[512];
+ uint16_t intermediate[65536 + 6];
+ uint16_t hash_table1[1 << LZX_HUFF_COMP_HASH_BITS];
+ uint16_t hash_table2[1 << LZX_HUFF_COMP_HASH_BITS];
+};
+
+
+ssize_t lzxpress_huffman_compress(struct lzxhuff_compressor_mem *cmp,
+ const uint8_t *input_bytes,
+ size_t input_size,
+ uint8_t *output,
+ size_t available_size);
+
+
+ssize_t lzxpress_huffman_compress_talloc(TALLOC_CTX *mem_ctx,
+ const uint8_t *input_bytes,
+ size_t input_size,
+ uint8_t **output);
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input,
+ size_t input_size,
+ uint8_t *output,
+ size_t max_output_size);
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+ const uint8_t *input_bytes,
+ size_t input_size,
+ size_t output_size);
+
+/*
+ * lzxpress_huffman_max_compressed_size()
+ *
+ * Return the most bytes the compression can take, to allow
+ * pre-allocation.
+ */
+size_t lzxpress_huffman_max_compressed_size(size_t input_size);
+
+
+#endif /* HAVE_LZXPRESS_HUFFMAN_H */
diff --git a/lib/compression/pycompression.c b/lib/compression/pycompression.c
new file mode 100644
index 0000000..5c3fd82
--- /dev/null
+++ b/lib/compression/pycompression.c
@@ -0,0 +1,304 @@
+/*
+ Samba Unix SMB/CIFS implementation.
+
+ Python bindings for compression functions.
+
+ Copyright (C) Petr Viktorin 2015
+ Copyright (C) Douglas Bagnall 2022
+
+ ** NOTE! The following LGPL license applies to the talloc
+ ** library. This does NOT imply that all of Samba is released
+ ** under the LGPL
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include <talloc.h>
+#include "lib/replace/system/python.h"
+#include "lzxpress.h"
+#include "lzxpress_huffman.h"
+
+/* CompressionError is filled out in module init */
+static PyObject *CompressionError = NULL;
+
+static PyObject *plain_compress(PyObject *mod, PyObject *args)
+{
+ uint8_t *src = NULL;
+ Py_ssize_t src_len;
+ char *dest = NULL;
+ Py_ssize_t dest_len;
+ PyObject *dest_obj = NULL;
+ size_t alloc_len;
+ int ret;
+
+ if (!PyArg_ParseTuple(args, "s#", &src, &src_len)) {
+ return NULL;
+ }
+
+ /*
+ * 9/8 + 4 is the worst case growth, but we add room.
+ *
+ * alloc_len can't overflow as src_len is ssize_t while alloc_len is
+ * size_t.
+ */
+ alloc_len = src_len + src_len / 8 + 500;
+
+ dest_obj = PyBytes_FromStringAndSize(NULL, alloc_len);
+ if (dest_obj == NULL) {
+ return NULL;
+ }
+ dest = PyBytes_AS_STRING(dest_obj);
+
+ dest_len = lzxpress_compress(src,
+ src_len,
+ (uint8_t *)dest,
+ alloc_len);
+ if (dest_len < 0) {
+ PyErr_SetString(CompressionError, "unable to compress data");
+ Py_DECREF(dest_obj);
+ return NULL;
+ }
+
+ ret = _PyBytes_Resize(&dest_obj, dest_len);
+ if (ret != 0) {
+ /*
+ * Don't try to free dest_obj, as we're in deep MemoryError
+ * territory here.
+ */
+ return NULL;
+ }
+ return dest_obj;
+}
+
+
+static PyObject *plain_decompress(PyObject *mod, PyObject *args)
+{
+	uint8_t *src = NULL;
+	Py_ssize_t src_len;
+	char *dest = NULL;
+	Py_ssize_t dest_len;
+	PyObject *dest_obj = NULL;
+	Py_ssize_t alloc_len = 0;
+	Py_ssize_t given_len = 0;
+	int ret;
+
+	if (!PyArg_ParseTuple(args, "s#|n", &src, &src_len, &given_len)) {
+		return NULL;
+	}
+	if (given_len != 0) {
+		/*
+		 * With plain decompression, we don't *need* the exact output
+		 * size (as we do with LZ77+Huffman), but it certainly helps
+		 * when guessing the size.
+		 */
+		alloc_len = given_len;
+	} else if (src_len > UINT32_MAX) {
+		/*
+		 * The underlying decompress function will reject this, but by
+		 * checking here we can give a better message and be clearer
+		 * about overflow risks.
+		 *
+		 * Note, the limit is actually the smallest of UINT32_MAX and
+		 * SSIZE_MAX, but src_len is ssize_t so it already can't
+		 * exceed that.
+		 */
+		return PyErr_Format(CompressionError,
+			     "The maximum size for compressed data is 4GB "
+			     "cannot decompress %zu bytes.", src_len);
+	} else {
+		/*
+		 * The data can expand massively (though not beyond the
+		 * 4GB limit) so we guess a big number for small inputs
+		 * (we expect small inputs), and a relatively conservative
+		 * number for big inputs.
+		 */
+		if (src_len <= 3333333) {
+			alloc_len = 10000000;
+		} else if (src_len > UINT32_MAX / 3) {
+			alloc_len = UINT32_MAX;
+		} else {
+			alloc_len = src_len * 3;
+		}
+	}
+
+	dest_obj = PyBytes_FromStringAndSize(NULL, alloc_len);
+	if (dest_obj == NULL) {
+		return NULL;
+	}
+	dest = PyBytes_AS_STRING(dest_obj);
+
+	dest_len = lzxpress_decompress(src,
+				       src_len,
+				       (uint8_t *)dest,
+				       alloc_len);
+	if (dest_len < 0) {
+		if (alloc_len == given_len) {
+			PyErr_Format(CompressionError,
+				     "unable to decompress data into a buffer "
+				     "of %zd bytes.", alloc_len);
+		} else {
+			PyErr_Format(CompressionError,
+				     "unable to decompress data into a buffer "
+				     "of %zd bytes. If you know the length, "
+				     "supply it as the second argument.",
+				     alloc_len);
+		}
+		Py_DECREF(dest_obj);
+		return NULL;
+	}
+
+	ret = _PyBytes_Resize(&dest_obj, dest_len);
+	if (ret != 0) {
+		/*
+		 * Don't try to free dest_obj, as we're in deep MemoryError
+		 * territory here.
+		 */
+		return NULL;
+	}
+	return dest_obj;
+}
+
+
+
+static PyObject *huffman_compress(PyObject *mod, PyObject *args)
+{
+ uint8_t *src = NULL;
+ Py_ssize_t src_len;
+ char *dest = NULL;
+ Py_ssize_t dest_len;
+ PyObject *dest_obj = NULL;
+ size_t alloc_len;
+ int ret;
+ struct lzxhuff_compressor_mem cmp_mem;
+
+ if (!PyArg_ParseTuple(args, "s#", &src, &src_len)) {
+ return NULL;
+ }
+ /*
+ * worst case is roughly 256 per 64k or less.
+ *
+ * alloc_len won't overflow as src_len is ssize_t while alloc_len is
+ * size_t.
+ */
+ alloc_len = src_len + src_len / 8 + 500;
+
+ dest_obj = PyBytes_FromStringAndSize(NULL, alloc_len);
+ if (dest_obj == NULL) {
+ return NULL;
+ }
+ dest = PyBytes_AS_STRING(dest_obj);
+
+ dest_len = lzxpress_huffman_compress(&cmp_mem,
+ src,
+ src_len,
+ (uint8_t *)dest,
+ alloc_len);
+ if (dest_len < 0) {
+ PyErr_SetString(CompressionError, "unable to compress data");
+ Py_DECREF(dest_obj);
+ return NULL;
+ }
+
+ ret = _PyBytes_Resize(&dest_obj, dest_len);
+ if (ret != 0) {
+ return NULL;
+ }
+ return dest_obj;
+}
+
+
+static PyObject *huffman_decompress(PyObject *mod, PyObject *args)
+{
+	uint8_t *src = NULL;
+	Py_ssize_t src_len;
+	char *dest = NULL;
+	Py_ssize_t dest_len;
+	PyObject *dest_obj = NULL;
+	Py_ssize_t given_len = 0;
+	/*
+	 * Here it is always necessary to supply the exact length.
+	 */
+
+	if (!PyArg_ParseTuple(args, "s#n", &src, &src_len, &given_len)) {
+		return NULL;
+	}
+
+	dest_obj = PyBytes_FromStringAndSize(NULL, given_len);
+	if (dest_obj == NULL) {
+		return NULL;
+	}
+	dest = PyBytes_AS_STRING(dest_obj);
+
+	dest_len = lzxpress_huffman_decompress(src,
+					       src_len,
+					       (uint8_t *)dest,
+					       given_len);
+	if (dest_len != given_len) {
+		PyErr_Format(CompressionError,
+			     "unable to decompress data into %zd bytes.",
+			     given_len);
+		Py_DECREF(dest_obj);
+		return NULL;
+	}
+	/* no resize here */
+	return dest_obj;
+}
+
+
+static PyMethodDef mod_methods[] = {
+	{ "plain_compress", (PyCFunction)plain_compress, METH_VARARGS,
+		"compress bytes using lzxpress plain compression"},
+	{ "plain_decompress", (PyCFunction)plain_decompress, METH_VARARGS,
+		"decompress lzxpress plain compressed bytes"},
+	{ "huffman_compress", (PyCFunction)huffman_compress, METH_VARARGS,
+		"compress bytes using lzxpress LZ77+Huffman compression"},
+	{ "huffman_decompress", (PyCFunction)huffman_decompress, METH_VARARGS,
+		"decompress lzxpress LZ77+Huffman compressed bytes"},
+	{0}
+};
+
+
+#define MODULE_DOC PyDoc_STR("LZXpress compression/decompression bindings")
+
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ .m_name = "compression",
+ .m_doc = MODULE_DOC,
+ .m_size = -1,
+ .m_methods = mod_methods,
+};
+
+
+static PyObject *module_init(void)
+{
+ PyObject *m = PyModule_Create(&moduledef);
+ if (m == NULL) {
+ return NULL;
+ }
+
+ CompressionError = PyErr_NewException(
+ "compression.CompressionError",
+ PyExc_Exception,
+ NULL);
+ PyModule_AddObject(m, "CompressionError", CompressionError);
+
+ return m;
+}
+
+PyMODINIT_FUNC PyInit_compression(void);
+PyMODINIT_FUNC PyInit_compression(void)
+{
+ return module_init();
+}
diff --git a/lib/compression/tests/scripts/README b/lib/compression/tests/scripts/README
new file mode 100644
index 0000000..aff1047
--- /dev/null
+++ b/lib/compression/tests/scripts/README
@@ -0,0 +1,19 @@
+Tools used in the development and testing of the LZ77 + Huffman code.
+
+These might not be of use to anyone ever, but here they are.
+
+make-fuzz-examples
+encodes compressed files for decompression fuzzer.
+
+decode-huffman-header
+print Huffman codes and validate first header in a compressed file.
+
+three-byte-hash
+check that a three byte hash works.
+
+make-test-vectors
+make files with randomly unbalanced symbol distributions.
+
+generate-windows-test-vectors.c
+if compiled under Cygwin or similar on Windows, this can be used to
+generate and verify test vectors.
diff --git a/lib/compression/tests/scripts/decode-huffman-header b/lib/compression/tests/scripts/decode-huffman-header
new file mode 100755
index 0000000..9c1279b
--- /dev/null
+++ b/lib/compression/tests/scripts/decode-huffman-header
@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+"""Print the codes in the first Huffman tree header in the given file.
+
+USAGE: decode-huffman-header FILE
+
+The number of codes of different length is printed first, followed by
+the implied total frequency of codes, followed by the deduced codes.
+
+If the total is not 1.0, the header is invalid.
+"""
+
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+def read_table(data):
+ lengths = [[] for x in range(16)]
+ for i, b in enumerate(data):
+ even = b & 15
+ odd = b >> 4
+ lengths[even].append(i * 2)
+ lengths[odd].append(i * 2 + 1)
+
+ code = 0
+
+ total = 0.0
+ for i, bucket in enumerate(lengths):
+ if bucket and i:
+ portion = 1.0 / (1 << i) * len(bucket)
+ total += portion
+ print(f"length {i:2}: {len(bucket):4} ({portion})")
+ print(f"total {total}")
+
+ for i, bucket in enumerate(lengths):
+ if i == 0:
+ continue
+ code <<= 1
+ for c in bucket:
+ print(f'{c:03x} {code:0{i}b}')
+ code += 1
+
+
+def main():
+ fn = sys.argv[1]
+ with open(fn, 'rb') as f:
+ data = f.read(256)
+ read_table(data)
+
+
+main()
diff --git a/lib/compression/tests/scripts/generate-windows-test-vectors.c b/lib/compression/tests/scripts/generate-windows-test-vectors.c
new file mode 100644
index 0000000..28724d2
--- /dev/null
+++ b/lib/compression/tests/scripts/generate-windows-test-vectors.c
@@ -0,0 +1,206 @@
+/*
+ * Generate test vectors for Windows LZ77 Huffman compression.
+ *
+ * Copyright (c) 2022 Douglas Bagnall <dbagnall@samba.org>
+ *
+ * GPLv3+.
+ *
+ * Can be compiled on Windows 2012r2 under Cygwin
+ *
+ * gcc -o generate-windows-test-vectors \
+ * generate-windows-test-vectors.c \
+ * C:\Windows\SysWOW64\cabinet.dll \
+ * -lcabinet
+ *
+ * There might be better ways.
+ *
+ * See https://learn.microsoft.com/en-us/windows/win32/cmpapi/-compression-portal
+ */
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* compressapi.h is in the Windows API. mingw-w64 has a copy. */
+#include <compressapi.h>
+#include <errhandlingapi.h>
+
+struct blob {
+ uint8_t *data;
+ size_t length;
+};
+
+/* Windows size_t is different than Cygwin size_t (though still 64 bit) */
+typedef unsigned long long wsize_t;
+
+
+#define compression_flags (COMPRESS_ALGORITHM_XPRESS_HUFF | COMPRESS_RAW)
+
+int32_t compression_level = 0;
+
+static struct blob compress(struct blob input)
+{
+ COMPRESSOR_HANDLE handle;
+ struct blob output;
+ bool ok;
+ wsize_t used;
+
+ ok = CreateCompressor(compression_flags, NULL, &handle);
+
+ if (! ok) {
+ fprintf(stderr, "CreateCompressor failed\n");
+ exit(1);
+ }
+
+ output.length = input.length * 3 + 256;
+ output.data = malloc(output.length);
+ if (output.data == NULL) {
+ fprintf(stderr, "output allocation failed (estimated %zu)\n",
+ output.length);
+ exit(1);
+ }
+
+
+ ok = SetCompressorInformation(handle,
+ COMPRESS_INFORMATION_CLASS_LEVEL,
+ &compression_level,
+ sizeof(compression_level));
+
+ if (! ok) {
+ fprintf(stderr, "SetCompressorInformation failed: %d\n",
+ GetLastError());
+ //exit(1);
+ }
+
+ ok = Compress(handle,
+ input.data,
+ input.length,
+ output.data,
+ output.length,
+ &used);
+ if (! ok) {
+ fprintf(stderr, "Compress failed\n");
+ exit(1);
+ }
+ output.data = realloc(output.data, used);
+ if (output.data == NULL) {
+ fprintf(stderr,
+ "failed to shrinkwrap output! (from %zu to %llu)\n",
+ output.length, used);
+ exit(1);
+ }
+ output.length = used;
+ CloseCompressor(handle);
+ return output;
+}
+
+
+static struct blob decompress(struct blob input,
+			      size_t expected_size)
+{
+	DECOMPRESSOR_HANDLE handle;
+	struct blob output;
+	bool ok;
+	wsize_t used;
+
+	ok = CreateDecompressor(compression_flags, NULL, &handle);
+
+	if (! ok) {
+		fprintf(stderr, "CreateDecompressor failed\n");
+		exit(1);
+	}
+
+	output.length = expected_size;
+	output.data = malloc(output.length);
+	if (output.data == NULL) {
+		fprintf(stderr, "output allocation failed (%zu)\n",
+			output.length);
+		exit(1);
+	}
+
+	ok = Decompress(handle,
+			input.data,
+			input.length,
+			output.data,
+			output.length,
+			&used);
+	if (! ok) {
+		fprintf(stderr, "Decompress failed\n");
+		exit(1);
+	}
+	CloseDecompressor(handle);
+	return output;
+}
+
+
+static void __attribute__((noreturn)) usage(int ret)
+{
+	fprintf(stderr,
+		"USAGE: test-win-vectors {c,d} filename [length|level] > DEST\n\n");
+	fprintf(stderr, "c for compression, d for decompression\n");
+	fprintf(stderr, "decompressed length is required for decompression\n");
+	fprintf(stderr, "compression level flag is optional [default 0]\n");
+	exit(ret);
+}
+
+int main(int argc, const char *argv[])
+{
+	FILE *fh;
+	const char *filename;
+	struct stat s;
+	int ret;
+	struct blob input = {0};
+	struct blob output = {0};
+
+	if (argc < 3 || argc > 4) {
+		usage(1);
+	}
+	filename = argv[2];
+
+	fh = fopen(filename, "rb");
+	if (fh == NULL) {
+		fprintf(stderr, "Could not open %s\n", filename);
+		usage(1);
+	}
+
+	ret = fstat(fileno(fh), &s);
+	if (ret != 0) {
+		fprintf(stderr, "Could not stat %s: %d\n", filename, ret);
+		usage(1);
+	}
+	input.length = s.st_size;
+	input.data = malloc(input.length);
+	if (input.data == NULL) {
+		fprintf(stderr, "input too big for memory?! (%zu)\n",
+			(size_t)s.st_size);
+		exit(1);
+	}
+
+	fread(input.data, 1, input.length, fh);
+
+	if (strcmp(argv[1], "c") == 0) {
+		if (argc == 4 && strcmp(argv[3], "0")) {
+			compression_level = 1;
+		}
+		output = compress(input);
+	} else if (strcmp(argv[1], "d") == 0) {
+		size_t decomp_size;
+		if (argc != 4) {
+			fprintf(stderr, "no length given\n");
+			usage(1);
+		}
+		decomp_size = atoi(argv[3]);
+		output = decompress(input, decomp_size);
+	} else {
+		usage(1);
+	}
+	fwrite(output.data, 1, output.length, stdout);
+	free(output.data);
+	return 0;
+}
diff --git a/lib/compression/tests/scripts/make-fuzz-examples b/lib/compression/tests/scripts/make-fuzz-examples
new file mode 100755
index 0000000..09200fb
--- /dev/null
+++ b/lib/compression/tests/scripts/make-fuzz-examples
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+#
+"""Pack the compressed files created by test_lzx_huffman.c (with
+LZXHUFF_DEBUG_FILES) into the format used by the decompression fuzzer.
+
+That is, the first 3 bytes are the length of the decompressed file,
+and the rest of the file is the compressed data.
+
+USAGE: make-fuzz-examples DIR
+
+where DIR is probably '/tmp'.
+"""
+import os
+import sys
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+def main():
+ files = set(os.listdir(sys.argv[1]))
+
+ for fn in files:
+ if fn.endswith('-compressed'):
+ fn2 = fn.replace('-compressed', '-decompressed')
+ if fn2 not in files:
+ print(f"skipping {fn}, no {fn2}")
+ continue
+ cfn = '/tmp/' + fn
+ dfn = '/tmp/' + fn2
+ wfn = '/tmp/' + fn.replace('-compressed', '.fuzz')
+
+ size = os.stat(dfn).st_size
+ sbytes = bytes([(size & 0xff), (size >> 8) & 0xff, (size >> 16) & 0xff])
+
+ with open(cfn, 'rb') as f:
+ s = f.read()
+
+ with open(wfn, 'wb') as f:
+ s = f.write(sbytes + s)
+
+
+main()
diff --git a/lib/compression/tests/scripts/make-test-vectors b/lib/compression/tests/scripts/make-test-vectors
new file mode 100755
index 0000000..6f25866
--- /dev/null
+++ b/lib/compression/tests/scripts/make-test-vectors
@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+"""Generate a few strings with unbalanced distributions to test the
+regeneration of the Huffman tree when it gets too deep.
+
+USAGE: make-test-vectors DIR
+
+This will fill up DIR with test files.
+"""
+import sys
+import random
+from collections import defaultdict
+
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
+ print(__doc__)
+ exit(len(sys.argv) != 2)
+
+
+DIR = sys.argv[1]
+
+SIZE = (1 << 17) + (23) # two and a bit blocks
+SIZE_NAME = "128k+"
+# SIZE = (1 << 16)
+# SIZE_NAME = "64"
+
+
+random.seed(1)
+
+
+def squares(n):
+ array = []
+ for i in range(n):
+ a = random.random()
+ b = random.random()
+ array.append(int(a * b * 256))
+ return bytes(array)
+
+
+def skewed_choices(n):
+ b = list(range(256))
+ array = random.choices(b, weights=b, k=n)
+ return bytes(array)
+
+
+def fib_shuffle(n):
+ array = []
+ a, b = 1, 1
+ for i in range(100):
+ array.extend([i] * a)
+ a, b = a + b, a
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def exp_shuffle(n):
+ array = []
+ for i in range(256):
+ array.extend([i] * int(1.04 ** i))
+ if len(array) > 1000000:
+ break
+ random.shuffle(array)
+ return bytes(array[:n])
+
+
+def and_rand(n):
+ array = []
+ for i in range(n):
+ a = random.randrange(256)
+ b = random.randrange(256)
+ array.append(a & b)
+ return bytes(array)
+
+
+def betavar(n, a, b):
+ array = []
+ for i in range(n):
+ x = random.betavariate(a, b)
+ array.append(int(x * 255.999999999999))
+ return bytes(array)
+
+
+def repeated_alphabet(n):
+ a = b'abcdefghijklmnopqrstuvwxyz'
+ na = n // len(a) + 1
+ s = a * na
+ return s[:n]
+
+
+def decayed_alphabet(n):
+ s = list(repeated_alphabet(n))
+ for i in range(256):
+ j = random.randrange(n)
+ s[j] = i
+
+ return bytes(s)
+
+
+def trigram_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = defaultdict(list)
+ for a, b, c in zip(data, data[1:], data[2:]):
+ k = bytes([a, b])
+ lut[k].append(c)
+
+ k = random.choice(list(lut.keys()))
+ s = []
+ p = k[1]
+ for i in range(n + 10):
+ c = random.choice(lut[k])
+ s.append(c)
+ k = bytes([p, c])
+ p = c
+
+ return bytes(s[10:])
+
+
+def trigram_sum_model(n):
+ with open(__file__, 'rb') as f:
+ data = f.read()
+ lut = [[random.randrange(256)] for i in range(512)]
+ for a, b, c in zip(data, data[1:], data[2:]):
+ lut[a + b].append(c)
+
+ s = []
+ i = random.randrange(len(data) - 1)
+ a = data[i]
+ b = data[i + 1]
+
+ for i in range(n + 10):
+ x = lut[a + b]
+ c = random.choice(x)
+ s.append(c)
+ a = b
+ b = c
+
+ return bytes(s[10:])
+
+
+def the_classics():
+ # this used to be main()
+ sq = squares(SIZE)
+ ch = skewed_choices(SIZE)
+ fs = fib_shuffle(SIZE)
+ es = exp_shuffle(SIZE)
+ ar = and_rand(SIZE)
+ bv1 = betavar(SIZE, 0.1, 1.5)
+ bv2 = betavar(SIZE, 0.5, 2.0)
+ bv3 = betavar(SIZE, 0.05, 0.05)
+
+ print("n sq ch fs es")
+ for i in range(256):
+ print(f"{i:3} {sq.count(i):5} {ch.count(i):5} "
+ f"{fs.count(i):5} {es.count(i):5}"
+ f"{ar.count(i):5} {bv1.count(i):5}"
+ f"{bv2.count(i):5} {bv3.count(i):5}"
+ )
+
+ for series, fn in ((sq, "square_series"),
+ (ch, "skewed_choices"),
+ (fs, "fib_shuffle"),
+ (es, "exp_shuffle"),
+ (ar, "and_rand"),
+ (bv1, "beta-variate1"),
+ (bv2, "beta-variate2"),
+ (bv3, "beta-variate3"),
+ ):
+ with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+def main():
+ if True:
+ the_classics()
+ for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"),
+ (trigram_model(SIZE), "trigram"),
+ (trigram_sum_model(SIZE), "trigram_sum"),
+ ):
+ with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
+ f.write(series)
+
+
+main()
diff --git a/lib/compression/tests/scripts/three-byte-hash b/lib/compression/tests/scripts/three-byte-hash
new file mode 100755
index 0000000..100d0bc
--- /dev/null
+++ b/lib/compression/tests/scripts/three-byte-hash
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+"""Print statistics about a certain three byte hash.
+
+USAGE: three_byte_hash
+"""
+import sys
+
+if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) > 1:
+ print(__doc__)
+ exit(not ('--help' in sys.argv or '-h' in sys.argv))
+
+
+from statistics import mean, pstdev, median
+
+
+def h(*args, bits=12):
+ a = args[0]
+ b = args[1] ^ 0x2e
+ c = args[2] ^ 0x55
+ d = ((a + b) << 8) ^ (((c - a) & 0xffff) << 5) ^ (c + b) ^ (0xcab + a)
+ return d & ((1 << bits) - 1)
+
+
+def count(fn, bits, filter=None):
+ counts = [0] * (1 << bits)
+ for i in range(256 ** 3):
+ a, b, c = i & 255, (i >> 8) & 255, i >> 16
+ if filter and not (filter(a) and filter(b) and filter(c)):
+ continue
+
+ h = fn(a, b, c, bits=bits)
+ counts[h] += 1
+
+ print(f" {bits} bits; {len(counts)} buckets, "
+ f"expected {(1<<24) / len(counts)}")
+ print(f"median {median(counts)}")
+ print(f"mean {mean(counts)}")
+ print(f"min {min(counts)}")
+ print(f"max {max(counts)}")
+ print(f"stddev {pstdev(counts)}")
+
+
+for b in (12, 13, 14):
+ count(h, b)
+
+ print("With ASCII filter")
+ letters = set(range(32, 127))
+ letters |= set(b'\r\n\t\0')
+ count(h, b, filter=letters.__contains__)
diff --git a/lib/compression/tests/test_lzx_huffman.c b/lib/compression/tests/test_lzx_huffman.c
new file mode 100644
index 0000000..7770535
--- /dev/null
+++ b/lib/compression/tests/test_lzx_huffman.c
@@ -0,0 +1,1255 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *
+ * ** NOTE! The following LGPL license applies to this file.
+ * ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include "replace.h"
+#include <talloc.h>
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/data_blob.h"
+
+/* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZXHUFF_DEBUG_FILES false
+
+/* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
+#define LZXHUFF_DEBUG_VERBOSE false
+
+
+#if LZXHUFF_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+
+#include <time.h>
+
+struct timespec start = {0};
+struct timespec end = {0};
+/* Record the monotonic clock so debug_end_timer() can report a rate. */
+static void debug_start_timer(void)
+{
+ clock_gettime(CLOCK_MONOTONIC, &start);
+}
+
+/*
+ * Report the elapsed time since debug_start_timer() for processing
+ * `len` bytes, as seconds and MB/s (only built if LZXHUFF_DEBUG_VERBOSE).
+ */
+static void debug_end_timer(const char *name, size_t len)
+{
+ uint64_t ns;
+ double secs;
+ double rate;
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ ns = end.tv_nsec;
+ ns += end.tv_sec * 1000 * 1000 * 1000;
+ ns -= start.tv_nsec;
+ ns -= start.tv_sec * 1000 * 1000 * 1000;
+ secs = ns / 1e9;
+ rate = len / (secs * 1024 * 1024);
+ debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
+ name, len, secs, rate);
+}
+
+#else
+#define debug_message(...) /* debug_message */
+#define debug_start_timer(...) /* debug_start_timer */
+#define debug_end_timer(...) /* debug_end_timer */
+#endif
+
+
+/* A named test vector holding both forms of the same data in memory. */
+struct lzx_pair {
+ const char *name;
+ DATA_BLOB compressed;
+ DATA_BLOB decompressed;
+};
+
+/* Like lzx_pair, but naming on-disk files instead of in-memory blobs. */
+struct lzx_file_pair {
+ const char *name;
+ const char *compressed_file;
+ const char *decompressed_file;
+};
+
+
+#define DECOMP_DIR "testdata/compression/decompressed"
+#define COMP_DIR "testdata/compression/compressed-huffman"
+#define MORE_COMP_DIR "testdata/compression/compressed-more-huffman"
+
+
+#define VARRGH(...) __VA_ARGS__
+
+#define BLOB_FROM_ARRAY(...) \
+ { \
+ .data = (uint8_t[]){__VA_ARGS__}, \
+ .length = sizeof((uint8_t[]){__VA_ARGS__}) \
+ }
+
+#define BLOB_FROM_STRING(s) \
+ { \
+ .data = discard_const_p(uint8_t, s), \
+ .length = (sizeof(s) - 1) \
+ }
+
+
+/*
+ * Base names of the on-disk test vectors; each is expected as
+ * "<name>.decomp" under DECOMP_DIR and "<name>.lzhuff" under COMP_DIR
+ * (and optionally MORE_COMP_DIR). NULL-terminated.
+ */
+const char * file_names[] = {
+ "27826-8.txt",
+ "5d049b4cb1bd933f5e8ex19",
+ "638e61e96d54279981c3x5",
+ "64k-minus-one-zeros",
+ "64k-plus-one-zeros",
+ "64k-zeros",
+ "96f696a4e5ce56c61a3dx10",
+ "9e0b6a12febf38e98f13",
+ "abc-times-101",
+ "abc-times-105",
+ "abc-times-200",
+ "and_rand",
+ "and_rand-128k+",
+ "b63289ccc7f218c0d56b",
+ "beta-variate1-128k+",
+ "beta-variate2-128k+",
+ "beta-variate3-128k+",
+ "decayed_alphabet_128k+",
+ "decayed_alphabet_64k",
+ "exp_shuffle",
+ "exp_shuffle-128k+",
+ "f00842317dc6d5695b02",
+ "fib_shuffle",
+ "fib_shuffle-128k+",
+ "fuzzing-0fc2d461b56cd8103c91",
+ "fuzzing-17c961778538cc10ab7c",
+ "fuzzing-3591f9dc02bb00a54b60",
+ "fuzzing-3ec3bca27bb9eb40c128",
+ "fuzzing-80b4fa18ff5f8dd04862",
+ "fuzzing-a3115a81d1ac500318f9",
+ "generate-windows-test-vectors.c",
+ "midsummer-nights-dream.txt",
+ "notes-on-the-underground.txt",
+ "pg22009.txt",
+ "repeating",
+ "repeating-exactly-64k",
+ "setup.log",
+ "skewed_choices",
+ "skewed_choices-128k+",
+ /* These ones were deathly slow in fuzzing at one point */
+ "slow-015ddc36a71412ccc50d",
+ "slow-100e9f966a7feb9ca40a",
+ "slow-2a671c3cff4f1574cbab",
+ "slow-33d90a24e70515b14cd0",
+ "slow-49d8c05261e3f412fc72",
+ "slow-50a249d2fe56873e56a0",
+ "slow-63e9f0b52235fb0129fa",
+ "slow-73b7f971d65908ac0095",
+ "slow-8b61e3dd267908544531",
+ "slow-9d1c5a079b0462986f1f",
+ "slow-aa7262a821dabdcf04a6",
+ "slow-b8a91d142b0d2af7f5ca",
+ "slow-c79142457734bbc8d575",
+ "slow-d736544545b90d83fe75",
+ "slow-e3b9bdfaed7d1a606fdb",
+ "slow-f3f1c02a9d006e5e1703",
+ "square_series",
+ "square_series-128k+",
+ "trigram_128k+",
+ "trigram_64k",
+ "trigram_sum_128k+",
+ "trigram_sum_64k",
+ NULL
+};
+
+/*
+ * Hand-constructed vectors with known-good compressed and decompressed
+ * forms (from [MS-XCA] 3.2), checked bit-for-bit in both directions.
+ * Terminated by a zeroed entry.
+ */
+struct lzx_pair bidirectional_pairs[] = {
+
+ {.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
+ .decompressed = BLOB_FROM_STRING(
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ ),
+ .compressed = BLOB_FROM_ARRAY(
+ /*
+ * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
+ * symbols occur. That means we need 48 0x00 bytes for the
+ * first 96 symbol-nybbles, then some short codes, then zeros
+ * again for the rest of the literals.
+ */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ 0x30, 0x23, /* a_ cb */
+ 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0, /* here end the 0-255 literals (128 bytes) */
+ 0x02, /* 'EOF' symbol 256 (byte 128 low) */
+ 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 bytes */
+ 0,0,0,
+ 0x20, /* codepoint 287 (byte 143 high) */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0, /* 160 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+ /*
+ * So that's the tree.
+ *
+ * length 2 codes for 'c', EOF, 287
+ * length 3 for 'a', 'b'.
+ *
+ * c: 00
+ * EOF: 01
+ * 287: 10
+ * a: 110
+ * b: 111
+ *
+ * thus the literal string "abc" is 110-111-00.
+ *
+ * Now for the lz77 match definitions for EOF and 287.
+ *
+ * Why 287? It encodes the match distance and offset.
+ *
+ * 287 - 256 = 31
+ *
+ * _length = 31 % 16 = 15
+ * _distance = 31 / 16 = 1
+ *
+ * (it's easier to look at the hex, 0x11f:
+ * 1xx means a match; x1x is _distance; xxf is _length)
+ *
+ * _distance 1 means a two bit distance (10 or 11; 2 or 3).
+ * That means the next bit will be the least significant bit
+ * of distance (1 in this case, meaning distance 3).
+ *
+ * if _length is 15, real length is included inline.
+ *
+ * 'EOF' == 256 means _length = 0, _distance = 0.
+ *
+ * _distance 0 means 1, so no further bits needed.
+ * _length 0 means length 3.
+ *
+ * but when used as EOF, this doesn't matter.
+ */
+ 0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
+ /* These remaining bytes are:
+ *
+ * 10101000 11011100 00000000 00000000 11111111
+ * 00100110 00000001
+ *
+ * and we read them as 16 chunks (i.e. flipping odd/even order)
+ *
+ * 110-111-00 10-1-01-000
+ * a b c 287 | EOF
+ * |
+ * this is the 287 distance low bit.
+ *
+ * The last 3 bits are not used. The 287 length is sort of
+ * out of band, coming up soon (because 287 encodes length
+ * 15; most codes do not do this).
+ *
+ * 00000000 00000000
+ *
+ * This padding is there because the first 32 bits are read
+ * at the beginning of decoding. If there were more things to
+ * be encoded, they would be in here.
+ *
+ * 11111111
+ *
+ * This byte is pulled as the length for the 287 match.
+ * Because it is 0xff, we pull a further 2 bytes for the
+ * actual length, i.e. a 16 bit number greater than 270.
+ *
+ * 00000001 00100110
+ *
+ * that is 0x126 = 294 = the match length - 3 (because we're
+ * encoding ["abc", <copy from 3 back, 297 chars>, EOF]).
+ *
+ */
+ )
+ },
+ {.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
+ .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
+ .compressed = BLOB_FROM_ARRAY(
+ /*
+ * In this case there are no matches encoded as there are no
+ * repeated symbols. Including the EOF, there are 27 symbols
+ * all occurring exactly as frequently as each other (once).
+ * From that we would expect the codes to be mostly 5 bits
+ * long, because 27 < 2^5 (32), but greater than 2^4. And
+ * that's what we see.
+ */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ /* 14 non-zero bytes for 26 letters/nibbles */
+ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+ 0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0, /* 80 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ 0x04, /* 0x100 EOF */
+ /* no matches */
+ 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+
+ 0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
+ 0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
+ 0x00, 0x00, 0x00, 0x00)
+ },
+ {0}
+};
+
+
+/*
+ * Check that each hand-crafted vector in bidirectional_pairs
+ * decompresses to exactly the expected plaintext.
+ */
+static void test_lzxpress_huffman_decompress(void **state)
+{
+ size_t i;
+ ssize_t written;
+ uint8_t *dest = NULL;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+ struct lzx_pair p = bidirectional_pairs[i];
+ dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
+
+ debug_message("%s compressed %zu decomp %zu\n", p.name,
+ p.compressed.length,
+ p.decompressed.length);
+
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ assert_int_not_equal(written, -1);
+ assert_int_equal(written, p.decompressed.length);
+
+ assert_memory_equal(dest, p.decompressed.data, p.decompressed.length);
+ talloc_free(dest);
+ }
+ /* fix: mem_ctx was previously leaked */
+ talloc_free(mem_ctx);
+}
+
+/*
+ * Check that compressing each bidirectional_pairs plaintext yields
+ * exactly the hand-crafted reference bytes (these small vectors have a
+ * deterministic, known-good encoding).
+ */
+static void test_lzxpress_huffman_compress(void **state)
+{
+ size_t i;
+ ssize_t written;
+ uint8_t *dest = NULL;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+ struct lzx_pair p = bidirectional_pairs[i];
+ debug_message("%s compressed %zu decomp %zu\n", p.name,
+ p.compressed.length,
+ p.decompressed.length);
+
+ written = lzxpress_huffman_compress_talloc(mem_ctx,
+ p.decompressed.data,
+ p.decompressed.length,
+ &dest);
+
+ assert_int_not_equal(written, -1);
+ assert_int_equal(written, p.compressed.length);
+ assert_memory_equal(dest, p.compressed.data, p.compressed.length);
+ talloc_free(dest);
+ }
+ /* fix: mem_ctx was previously leaked */
+ talloc_free(mem_ctx);
+}
+
+
+/*
+ * Read a whole file into a talloc'd DATA_BLOB.
+ *
+ * Returns a zeroed blob (data == NULL, length == 0) on any failure:
+ * missing file, fstat error, allocation failure, or short/failed read.
+ * Callers distinguish success by checking .data.
+ */
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+ const char *filename)
+{
+ DATA_BLOB b = {0};
+ FILE *fh = fopen(filename, "rb");
+ int ret;
+ struct stat s;
+ size_t len;
+ if (fh == NULL) {
+ debug_message("could not open '%s'\n", filename);
+ return b;
+ }
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fclose(fh);
+ return b;
+ }
+ b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+ if (b.data == NULL) {
+ fclose(fh);
+ return b;
+ }
+ len = fread(b.data, 1, s.st_size, fh);
+ /* a short read (len != st_size) is treated the same as an error */
+ if (ferror(fh) || len != s.st_size) {
+ TALLOC_FREE(b.data);
+ } else {
+ b.length = len;
+ }
+ fclose(fh);
+ return b;
+}
+
+
+
+/*
+ * Decompress every on-disk vector pair (DECOMP_DIR/COMP_DIR) and check
+ * the output matches the known plaintext byte-for-byte. All files must
+ * exist; the test fails unless every file round-trips.
+ */
+static void test_lzxpress_huffman_decompress_files(void **state)
+{
+ size_t i;
+ int score = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.compressed.data);
+
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+ debug_start_timer();
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ debug_end_timer("decompress", p.decompressed.length);
+ if (written != -1 &&
+ written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%zu correct\n", score, i);
+ /* fix: mem_ctx was previously leaked; free before the verdict */
+ talloc_free(mem_ctx);
+ assert_int_equal(score, i);
+}
+
+
+/*
+ * This tests the decompression of files that have been compressed on
+ * Windows with the level turned up (to 1, default for MS-XCA is 0).
+ *
+ * The format is identical, but it will have tried harder to find
+ * matches. Vectors missing from MORE_COMP_DIR are skipped, so the
+ * score is compared against the number actually found.
+ */
+static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
+{
+ size_t i;
+ int score = 0;
+ int found = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", MORE_COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ if (p.compressed.data == NULL) {
+ /*
+ * We don't have all the vectors in the
+ * more-compressed directory, which is OK, we skip
+ * them.
+ */
+ talloc_free(tmp_ctx);
+ continue;
+ }
+ found++;
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+ debug_start_timer();
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ debug_end_timer("decompress", p.decompressed.length);
+ if (written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%d correct\n", score, found);
+ /* fix: mem_ctx was previously leaked; free before the verdict */
+ talloc_free(mem_ctx);
+ assert_int_equal(score, found);
+}
+
+
+/*
+ * attempt_round_trip() tests whether a data blob can survive a compression
+ * and decompression cycle. If save_name is not NULL and LZXHUFF_DEBUG_FILES
+ * evals to true, the various stages are saved in files with that name and the
+ * '-original', '-compressed', and '-decompressed' suffixes. If ref_compressed
+ * has data, it'll print a message saying whether the compressed data matches
+ * that.
+ */
+
+/*
+ * Compress `original`, decompress the result, and verify the round
+ * trip reproduces the input exactly.
+ *
+ * mem_ctx: talloc parent for the temporary working context.
+ * original: data to compress.
+ * save_name: if not NULL and LZXHUFF_DEBUG_FILES, the stages are saved
+ * as "<save_name>-original", "-compressed", "-decompressed".
+ * ref_compressed: if it has data, we report (informationally) whether
+ * our compressed bytes are identical to this reference.
+ *
+ * Returns the compressed size, or -1 on failure.
+ */
+static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
+ DATA_BLOB original,
+ const char *save_name,
+ DATA_BLOB ref_compressed)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ /* generous worst-case bound: 4/3 of the input plus header room */
+ DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
+ original.length * 4 / 3 + 260);
+ DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
+ original.length);
+ ssize_t comp_written, decomp_written;
+ debug_start_timer();
+ comp_written = lzxpress_huffman_compress_talloc(tmp_ctx,
+ original.data,
+ original.length,
+ &compressed.data);
+ debug_end_timer("compress", original.length);
+ if (comp_written <= 0) {
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ if (ref_compressed.data != NULL) {
+ /*
+ * This is informational, not an assertion; there are
+ * ~infinite legitimate ways to compress the data, many as
+ * good as each other (think of compression as a language, not
+ * a format).
+ */
+ debug_message("compressed size %zd vs reference %zu\n",
+ comp_written, ref_compressed.length);
+
+ if (comp_written == compressed.length &&
+ memcmp(compressed.data, ref_compressed.data, comp_written) == 0) {
+ debug_message("\033[1;32mbyte identical!\033[0m\n");
+ }
+ }
+ debug_start_timer();
+ decomp_written = lzxpress_huffman_decompress(compressed.data,
+ comp_written,
+ decompressed.data,
+ original.length);
+ debug_end_timer("decompress", original.length);
+ if (save_name != NULL && LZXHUFF_DEBUG_FILES) {
+ char s[300];
+ FILE *fh = NULL;
+
+ snprintf(s, sizeof(s), "%s-original", save_name);
+ fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+ fh = fopen(s, "w");
+ if (fh != NULL) { /* fix: fopen result was unchecked */
+ fwrite(original.data, 1, original.length, fh);
+ fclose(fh);
+ }
+
+ snprintf(s, sizeof(s), "%s-compressed", save_name);
+ /* fix: %zd, as comp_written is ssize_t, not size_t */
+ fprintf(stderr, "Saving %zd bytes to %s\n", comp_written, s);
+ fh = fopen(s, "w");
+ if (fh != NULL) {
+ fwrite(compressed.data, 1, comp_written, fh);
+ fclose(fh);
+ }
+ /*
+ * We save the decompressed file using original.length, not
+ * the returned size. If these differ, the returned size will
+ * be -1. By saving the whole buffer we can see at what point
+ * it went haywire.
+ */
+ snprintf(s, sizeof(s), "%s-decompressed", save_name);
+ fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+ fh = fopen(s, "w");
+ if (fh != NULL) {
+ fwrite(decompressed.data, 1, original.length, fh);
+ fclose(fh);
+ }
+ }
+
+ if (original.length != decomp_written ||
+ memcmp(decompressed.data,
+ original.data,
+ original.length) != 0) {
+ debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
+ decomp_written,
+ original.length);
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+ talloc_free(tmp_ctx);
+ return comp_written;
+}
+
+
+/*
+ * Round-trip every on-disk vector and compare our total compressed
+ * size against the Windows reference total; we assert we are within
+ * 2% of the reference (much better OR worse is suspicious).
+ */
+static void test_lzxpress_huffman_round_trip(void **state)
+{
+ size_t i;
+ int score = 0;
+ ssize_t compressed_total = 0;
+ ssize_t reference_total = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ char *debug_files = NULL;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ ssize_t comp_size;
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+ debug_message("-------------------\n");
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ if (p.compressed.data == NULL) {
+ debug_message(
+ "Could not load %s reference file %s\n",
+ p.name, filename);
+ debug_message("%s decompressed %zu\n", p.name,
+ p.decompressed.length);
+ } else {
+ debug_message("%s: reference compressed %zu decomp %zu\n",
+ p.name,
+ p.compressed.length,
+ p.decompressed.length);
+ }
+ if (1) {
+ /*
+ * We're going to save copies in /tmp (this is a
+ * deliberate always-on toggle; LZXHUFF_DEBUG_FILES
+ * still gates the actual writing).
+ */
+ snprintf(filename, sizeof(filename),
+ "/tmp/lzxhuffman-%s", p.name);
+ debug_files = filename;
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, p.decompressed,
+ debug_files,
+ p.compressed);
+ if (comp_size > 0) {
+ debug_message("\033[1;32mround trip!\033[0m\n");
+ score++;
+ if (p.compressed.length) {
+ compressed_total += comp_size;
+ reference_total += p.compressed.length;
+ }
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%zu correct\n", score, i);
+ /* fix: %zd, as compressed_total is ssize_t, not size_t */
+ print_message("\033[1;34mtotal compressed size: %zd\033[0m\n",
+ compressed_total);
+ print_message("total reference size: %zd \n", reference_total);
+ print_message("diff: %7zd \n",
+ reference_total - compressed_total);
+ assert_true(reference_total != 0);
+ print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
+ 2 + (compressed_total >= reference_total),
+ ((double)compressed_total) / reference_total);
+ /*
+ * Assert that the compression is *about* as good as Windows. Of course
+ * it doesn't matter if we do better, but mysteriously getting better
+ * is usually a sign that something is wrong.
+ *
+ * At the time of writing, compressed_total is 2674004, or 10686 more
+ * than the Windows reference total. That's < 0.5% difference, we're
+ * asserting at 2%.
+ */
+ assert_true(labs(compressed_total - reference_total) <
+ compressed_total / 50);
+
+ assert_int_equal(score, i);
+ talloc_free(mem_ctx);
+}
+
+/*
+ * Bob Jenkins' Small Fast RNG.
+ *
+ * We don't need it to be this good, but we do need it to be reproduceable
+ * across platforms, which rand() etc aren't.
+ *
+ * http://burtleburtle.net/bob/rand/smallprng.html
+ */
+
+/* 128 bits of state for Bob Jenkins' small fast RNG (see comment above). */
+struct jsf_rng {
+ uint32_t a;
+ uint32_t b;
+ uint32_t c;
+ uint32_t d;
+};
+
+/* 32-bit rotate left; k must be 1..31 (k == 0 or 32 would be UB). */
+#define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
+
+/* Advance the JSF RNG state and return the next 32 pseudo-random bits. */
+static uint32_t jsf32(struct jsf_rng *x) {
+ uint32_t e = x->a - ROTATE32(x->b, 27);
+ x->a = x->b ^ ROTATE32(x->c, 17);
+ x->b = x->c + x->d;
+ x->c = x->d + e;
+ x->d = e + x->a;
+ return x->d;
+}
+
+/*
+ * Seed the JSF RNG. The 20 warm-up rounds mix the seed through the
+ * whole state, as in Jenkins' reference implementation.
+ */
+static void jsf32_init(struct jsf_rng *x, uint32_t seed) {
+ size_t i;
+ x->a = 0xf1ea5eed;
+ x->b = x->c = x->d = seed;
+ for (i = 0; i < 20; ++i) {
+ jsf32(x);
+ }
+}
+
+
+static void test_lzxpress_huffman_long_gpl_round_trip(void **state)
+{
+ /*
+ * We use a kind of model-free Markov model to generate a massively
+ * extended pastiche of the GPLv3 (chosen because it is right there in
+ * "COPYING" and won't change often).
+ *
+ * The point is to check a round trip of a very long message with
+ * multiple repetitions on many scales, without having to add a very
+ * large file.
+ */
+ size_t i, j, k;
+ uint8_t c;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ if (gpl.data == NULL) {
+ print_message("could not read COPYING\n");
+ fail();
+ }
+
+ jsf32_init(&rng, 1);
+
+ j = 1;
+ original.data[0] = gpl.data[0];
+ for (i = 1; i < original.length; i++) {
+ size_t m;
+ char p = original.data[i - 1];
+ c = gpl.data[j];
+ original.data[i] = c;
+ j++;
+ /*
+ * Jump to a random spot (clear of the end) and scan up to 30
+ * bytes for the bigram (p, c); on a hit, continue copying the
+ * source from just after it. This is what splices repeated
+ * passages together at many scales.
+ */
+ m = (j + jsf32(&rng)) % (gpl.length - 50);
+ for (k = m; k < m + 30; k++) {
+ if (p == gpl.data[k] &&
+ c == gpl.data[k + 1]) {
+ j = k + 2;
+ break;
+ }
+ }
+ if (j == gpl.length) {
+ j = 1;
+ }
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_long_random_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This would probably fall into a fairly
+ * simple loop, but we introduce damage into the system, randomly
+ * flipping about 1 bit in 64.
+ *
+ * The result is semi-structured and compressible.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ /* filling the byte table a uint32_t at a time; talloc alignment
+ * makes the cast safe in practice */
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 1);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ /* AND of three random words: each bit set with p = 1/8,
+ * further halved below, giving ~1 flipped bit in 64 */
+ uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
+ damage &= (damage >> 16);
+ k ^= damage & 0xffff;
+ d[i] = table[k];
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_chaos_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This would probably fall into a fairly
+ * simple loop, but we keep changing the graph. The result is long
+ * periods of stability separatd by bursts of noise.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 1);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ uint32_t damage = jsf32(&rng);
+ d[i] = table[k];
+ /* with probability 1/8 (top 3 bits zero), rewrite one random
+ * table entry -- mutating the graph rather than the output */
+ if ((damage >> 29) == 0) {
+ uint16_t index = damage & 0xffff;
+ uint8_t value = (damage >> 16) & 0xff;
+ table[index] = value;
+ }
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/chaos-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_sparse_random_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This will fall into a fairly simple loops,
+ * but we introduce damage into the system, randomly mangling about 1
+ * byte in 65536.
+ *
+ * The result has very long repetitive runs, which should lead to
+ * oversized blocks.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 3);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ uint32_t damage = jsf32(&rng);
+ /* ~1 in 65536: perturb the lookup key, not the table */
+ if ((damage & 0xffff0000) == 0) {
+ k ^= damage & 0xffff;
+ }
+ d[i] = table[k];
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/sparse-random-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_random_noise_round_trip(void **state)
+{
+ size_t i;
+ size_t len = 1024 * 1024;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ /*
+ * We are filling this up with incompressible noise, but we can assert
+ * quite tight bounds on how badly it will fail to compress.
+ *
+ * Specifically, with randomly distributed codes, the Huffman table
+ * should come out as roughly even, averaging 8 bit codes. Then there
+ * will be a 256 byte table every 64k, which is a 1/256 overhead (i.e.
+ * the compressed length will be 257/256 the original *on average*).
+ * We assert it is less than 1 in 200 but more than 1 in 300.
+ */
+ uint32_t *d32 = (uint32_t*)((void*)original.data);
+ struct jsf_rng rng;
+ jsf32_init(&rng, 2);
+
+ for (i = 0; i < (len / 4); i++) {
+ d32[i] = jsf32(&rng);
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-noise", ref);
+ assert_true(comp_size > 0);
+ /* noise must *expand*, within the bounds explained above */
+ assert_true(comp_size > original.length + original.length / 300);
+ assert_true(comp_size < original.length + original.length / 200);
+ debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
+ len, comp_size, ((double)comp_size) / len);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_overlong_matches(void **state)
+{
+ size_t i, j = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
+ DATA_BLOB ref = {0};
+ uint8_t *d = original.data;
+ char filename[300];
+ /*
+ * We are testing with something like "aaaaaaaaaaaaaaaaaaaaaaabbbbb"
+ * where typically the number of "a"s is > 65536, and the number of
+ * "b"s is < 42.
+ */
+ ssize_t na[] = {65535, 65536, 65537, 65559, 65575, 200000, -1};
+ ssize_t nb[] = {1, 2, 20, 39, 40, 41, 42, -1};
+ int score = 0;
+ ssize_t comp_size;
+
+ for (i = 0; na[i] >= 0; i++) {
+ ssize_t a = na[i];
+ memset(d, 'a', a);
+ for (j = 0; nb[j] >= 0; j++) {
+ ssize_t b = nb[j];
+ memset(d + a, 'b', b);
+ original.length = a + b;
+ snprintf(filename, sizeof(filename),
+ "/tmp/overlong-%zd-%zd", a, b);
+ comp_size = attempt_round_trip(mem_ctx,
+ original,
+ filename, ref);
+ if (comp_size > 0) {
+ score++;
+ }
+ }
+ }
+ /* after the loops i and j are the array lengths, so i * j is the
+ * number of combinations attempted */
+ debug_message("%d/%zu correct\n", score, i * j);
+ assert_int_equal(score, i * j);
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_overlong_matches_abc(void **state)
+{
+ size_t i, j = 0, k = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
+ DATA_BLOB ref = {0};
+ uint8_t *d = original.data;
+ char filename[300];
+ /*
+ * We are testing with something like "aaaabbbbcc" where typically
+ * the number of "a"s + "b"s is around 65536, and the number of "c"s
+ * is < 43.
+ */
+ ssize_t nab[] = {1, 21, 32767, 32768, 32769, -1};
+ ssize_t nc[] = {1, 2, 20, 39, 40, 41, 42, -1};
+ int score = 0;
+ ssize_t comp_size;
+
+ for (i = 0; nab[i] >= 0; i++) {
+ ssize_t a = nab[i];
+ memset(d, 'a', a);
+ for (j = 0; nab[j] >= 0; j++) {
+ ssize_t b = nab[j];
+ memset(d + a, 'b', b);
+ for (k = 0; nc[k] >= 0; k++) {
+ ssize_t c = nc[k];
+ memset(d + a + b, 'c', c);
+ original.length = a + b + c;
+ snprintf(filename, sizeof(filename),
+ "/tmp/overlong-abc-%zd-%zd-%zd",
+ a, b, c);
+ comp_size = attempt_round_trip(mem_ctx,
+ original,
+ filename, ref);
+ if (comp_size > 0) {
+ score++;
+ }
+ }
+ }
+ }
+ /* i, j, k end as the array lengths; their product is the number of
+ * combinations attempted */
+ debug_message("%d/%zu correct\n", score, i * j * k);
+ assert_int_equal(score, i * j * k);
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_extremely_compressible_middle(void **state)
+{
+ size_t len = 192 * 1024;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ /*
+ * When a middle block (i.e. not the first and not the last of >= 3),
+ * can be entirely expressed as a match starting in the previous
+ * block, the Huffman tree would end up with 1 element, which does not
+ * work for the code construction. It really wants to use both bits.
+ * So we need to ensure we have some way of dealing with this.
+ */
+ /* three 64k blocks: a-run, b-run, a-run, with boundaries offset by
+ * one byte so the middle block is a single match */
+ memset(original.data, 'a', 0x10000 - 1);
+ memset(original.data + 0x10000 - 1, 'b', 0x10000 + 1);
+ memset(original.data + 0x20000, 'a', 0x10000);
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/compressible-middle", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < 1024);
+ debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
+ len, comp_size, ((double)comp_size) / len);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_max_length_limit(void **state)
+{
+ size_t len = 65 * 1024 * 1024;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc_zero(mem_ctx, len);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ /*
+ * Reputedly Windows has a 64MB limit in the maximum match length it
+ * will encode. We follow this, and test that here with nearly 65 MB
+ * of zeros between two letters; this should be encoded in three
+ * blocks:
+ *
+ * 1. 'a', 64M × '\0'
+ * 2. (1M - 2) × '\0' -- finishing off what would have been the same match
+ * 3. 'b' EOF
+ *
+ * Which we can assert by saying the length is > 768, < 1024.
+ */
+ original.data[0] = 'a';
+ original.data[len - 1] = 'b';
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/max-length-limit", ref);
+ assert_true(comp_size > 0x300);
+ assert_true(comp_size < 0x400);
+ debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
+ len, comp_size, ((double)comp_size) / len);
+
+ talloc_free(mem_ctx);
+}
+
+
+/*
+ * Round-trip runs of 'a' at a series of short lengths, asserting exact or
+ * bounded compressed sizes, then verify the input buffer was not mutated.
+ */
+static void test_lzxpress_huffman_short_boring_strings(void **state)
+{
+ size_t len = 64 * 1024;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ ssize_t lengths[] = {
+ 1, 2, 20, 39, 40, 41, 42, 256, 270, 273, 274, 1000, 64000, -1};
+ char filename[300];
+ size_t i;
+ /*
+ * How do short repetitive strings work? We're poking at the limit
+ * around which LZ77 compression is turned on.
+ *
+ * For this test we don't change the blob memory between runs, just
+ * the declared length.
+ */
+ memset(original.data, 'a', len);
+ for (i = 0; lengths[i] >= 0; i++) {
+ original.length = lengths[i];
+ snprintf(filename, sizeof(filename),
+ "/tmp/short-boring-%zu",
+ original.length);
+ comp_size = attempt_round_trip(mem_ctx, original, filename, ref);
+ if (original.length < 41) {
+ assert_true(comp_size > 256 + original.length / 8);
+ } else if (original.length < 274) {
+ assert_true(comp_size == 261);
+ } else {
+ assert_true(comp_size == 263);
+ }
+ assert_true(comp_size < 261 + original.length / 8);
+ }
+ /* let's just show we didn't change the original */
+ for (i = 0; i < len; i++) {
+ if (original.data[i] != 'a') {
+ fail_msg("input data[%zu] was changed! (%2x, expected %2x)\n",
+ i, original.data[i], 'a');
+ }
+ }
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_huffman_compress_empty_or_null(void **state)
+{
+ /*
+ * We expect these to fail with a -1, except the last one, which does
+ * the real thing.
+ */
+ ssize_t ret;
+ const uint8_t *input = bidirectional_pairs[0].decompressed.data;
+ size_t ilen = bidirectional_pairs[0].decompressed.length;
+ size_t olen = bidirectional_pairs[0].compressed.length;
+ uint8_t output[olen];
+ struct lzxhuff_compressor_mem cmp_mem;
+
+ /* zero input or output length must be rejected */
+ ret = lzxpress_huffman_compress(&cmp_mem, input, 0, output, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, 0);
+ assert_int_equal(ret, -1LL);
+
+ /* NULL input, output, or scratch memory must be rejected */
+ ret = lzxpress_huffman_compress(&cmp_mem, NULL, ilen, output, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, NULL, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_compress(NULL, input, ilen, output, olen);
+ assert_int_equal(ret, -1LL);
+
+ /* valid call: must produce exactly the known compressed length */
+ ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, olen);
+ assert_int_equal(ret, olen);
+}
+
+
+static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
+{
+ /*
+ * We expect these to fail with a -1, except the last one.
+ */
+ ssize_t ret;
+ const uint8_t *input = bidirectional_pairs[0].compressed.data;
+ size_t ilen = bidirectional_pairs[0].compressed.length;
+ size_t olen = bidirectional_pairs[0].decompressed.length;
+ uint8_t output[olen];
+
+ /* zero input or output length must be rejected */
+ ret = lzxpress_huffman_decompress(input, 0, output, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_decompress(input, ilen, output, 0);
+ assert_int_equal(ret, -1LL);
+
+ /* NULL input or output buffer must be rejected */
+ ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
+ assert_int_equal(ret, -1LL);
+
+ /* valid call: must produce exactly the known decompressed length */
+ ret = lzxpress_huffman_decompress(input, ilen, output, olen);
+ assert_int_equal(ret, olen);
+}
+
+
+/*
+ * Register and run the Huffman test suite. When stdout is not a tty
+ * (e.g. under a test harness) switch to machine-readable subunit output.
+ */
+int main(void) {
+ const struct CMUnitTest tests[] = {
+ cmocka_unit_test(test_lzxpress_huffman_short_boring_strings),
+ cmocka_unit_test(test_lzxpress_huffman_max_length_limit),
+ cmocka_unit_test(test_lzxpress_huffman_extremely_compressible_middle),
+ cmocka_unit_test(test_lzxpress_huffman_long_random_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_huffman_chaos_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_huffman_sparse_random_graph_round_trip),
+ cmocka_unit_test(test_lzxpress_huffman_round_trip),
+ cmocka_unit_test(test_lzxpress_huffman_decompress_files),
+ cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
+ cmocka_unit_test(test_lzxpress_huffman_compress),
+ cmocka_unit_test(test_lzxpress_huffman_decompress),
+ cmocka_unit_test(test_lzxpress_huffman_long_gpl_round_trip),
+ /*
+ * test_lzxpress_huffman_long_random_graph_round_trip was listed
+ * a second time here; the duplicate registration has been removed.
+ */
+ cmocka_unit_test(test_lzxpress_huffman_random_noise_round_trip),
+ cmocka_unit_test(test_lzxpress_huffman_overlong_matches_abc),
+ cmocka_unit_test(test_lzxpress_huffman_overlong_matches),
+ cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
+ cmocka_unit_test(test_lzxpress_huffman_compress_empty_or_null),
+ };
+ if (!isatty(1)) {
+ cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+ }
+
+ return cmocka_run_group_tests(tests, NULL, NULL);
+}
diff --git a/lib/compression/tests/test_lzxpress_plain.c b/lib/compression/tests/test_lzxpress_plain.c
new file mode 100644
index 0000000..1c14793
--- /dev/null
+++ b/lib/compression/tests/test_lzxpress_plain.c
@@ -0,0 +1,1194 @@
+/*
+ Unix SMB/CIFS implementation.
+ test suite for the compression functions
+
+ Copyright (C) Jelmer Vernooij 2007
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <sys/stat.h>
+#include <cmocka.h>
+#include "includes.h"
+#include "talloc.h"
+#include "lzxpress.h"
+#include "lib/util/base64.h"
+
+
+/* set LZX_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZX_DEBUG_FILES false
+
+/* set LZX_DEBUG_VERBOSE to true to print more. */
+#define LZX_DEBUG_VERBOSE false
+
+
+/*
+ * In verbose builds debug_message() really prints and the timer helpers
+ * report throughput; otherwise all three compile away to nothing.
+ */
+#if LZX_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+
+#include <time.h>
+
+struct timespec start = {0};
+struct timespec end = {0};
+static void debug_start_timer(void)
+{
+ clock_gettime(CLOCK_MONOTONIC, &start);
+}
+
+/* Print bytes-per-second throughput for `len` bytes since debug_start_timer() */
+static void debug_end_timer(const char *name, size_t len)
+{
+ uint64_t ns;
+ double secs;
+ double rate;
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ /* elapsed nanoseconds = end - start */
+ ns = end.tv_nsec;
+ ns += end.tv_sec * 1000 * 1000 * 1000;
+ ns -= start.tv_nsec;
+ ns -= start.tv_sec * 1000 * 1000 * 1000;
+ secs = ns / 1e9;
+ rate = len / (secs * 1024 * 1024);
+ debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
+ name, len, secs, rate);
+}
+
+#else
+#define debug_message(...) /* debug_message */
+#define debug_start_timer(...) /* debug_start_timer */
+#define debug_end_timer(...) /* debug_end_timer */
+#endif
+
+
+/* A named compressed/decompressed vector pair held in memory. */
+struct lzx_pair {
+ const char *name;
+ DATA_BLOB compressed;
+ DATA_BLOB decompressed;
+};
+
+/* A named vector pair referenced by on-disk file paths. */
+struct lzx_file_pair {
+ const char *name;
+ const char *compressed_file;
+ const char *decompressed_file;
+};
+
+
+/* Directories holding the reference test vectors. */
+#define DECOMP_DIR "testdata/compression/decompressed"
+#define COMP_DIR "testdata/compression/compressed-plain"
+#define MORE_COMP_DIR "testdata/compression/compressed-more-plain"
+
+
+/* Build a DATA_BLOB initializer from a byte list; sizeof a compound
+ * literal of the same bytes gives the length. */
+#define BLOB_FROM_ARRAY(...) \
+ { \
+ .data = (uint8_t[]){__VA_ARGS__}, \
+ .length = sizeof((uint8_t[]){__VA_ARGS__}) \
+ }
+
+/* Build a DATA_BLOB initializer from a string literal, excluding the
+ * trailing NUL (sizeof(s) - 1). */
+#define BLOB_FROM_STRING(s) \
+ { \
+ .data = discard_const_p(uint8_t, s), \
+ .length = (sizeof(s) - 1) \
+ }
+
+
+/*
+ * NULL-terminated list of test-vector base names. Each name is expanded to
+ * "<DECOMP_DIR>/<name>.decomp" and "<COMP_DIR>/<name>.lzplain" (and, where
+ * present, a MORE_COMP_DIR variant) by the tests below.
+ */
+const char * file_names[] = {
+ "generate-windows-test-vectors.c",
+ "fib_shuffle-128k+",
+ "fuzzing-0fc2d461b56cd8103c91",
+ "fuzzing-3ec3bca27bb9eb40c128",
+ "fuzzing-a3115a81d1ac500318f9",
+ "fuzzing-3591f9dc02bb00a54b60",
+ "27826-8.txt",
+ "5d049b4cb1bd933f5e8ex19",
+ "638e61e96d54279981c3x5",
+ "64k-minus-one-zeros",
+ "64k-plus-one-zeros",
+ "64k-zeros",
+ "96f696a4e5ce56c61a3dx10",
+ "9e0b6a12febf38e98f13",
+ "abc-times-101",
+ "abc-times-105",
+ "abc-times-200",
+ "b63289ccc7f218c0d56b",
+ "beta-variate1-128k+",
+ "beta-variate3-128k+",
+ "decayed_alphabet_128k+",
+ "decayed_alphabet_64k",
+ "f00842317dc6d5695b02",
+ "fib_shuffle",
+ "midsummer-nights-dream.txt",
+ "notes-on-the-underground.txt",
+ "pg22009.txt",
+ "repeating",
+ "repeating-exactly-64k",
+ "setup.log",
+ "slow-015ddc36a71412ccc50d",
+ "slow-100e9f966a7feb9ca40a",
+ "slow-2a671c3cff4f1574cbab",
+ "slow-33d90a24e70515b14cd0",
+ "slow-49d8c05261e3f412fc72",
+ "slow-50a249d2fe56873e56a0",
+ "slow-63e9f0b52235fb0129fa",
+ "slow-73b7f971d65908ac0095",
+ "slow-8b61e3dd267908544531",
+ "slow-9d1c5a079b0462986f1f",
+ "slow-aa7262a821dabdcf04a6",
+ "slow-b8a91d142b0d2af7f5ca",
+ "slow-c79142457734bbc8d575",
+ "slow-d736544545b90d83fe75",
+ "slow-e3b9bdfaed7d1a606fdb",
+ "slow-f3f1c02a9d006e5e1703",
+ "trigram_128k+",
+ "trigram_64k",
+ "trigram_sum_128k+",
+ "trigram_sum_64k",
+ NULL
+};
+
+
+
+/*
+ * Read an entire file into a talloc'ed DATA_BLOB.
+ *
+ * On any failure (missing file, fstat error, allocation failure, short or
+ * erroneous read) the returned blob has NULL data and zero length; callers
+ * detect errors by checking b.data.
+ */
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+ const char *filename)
+{
+ DATA_BLOB b = {0};
+ FILE *fh = fopen(filename, "rb");
+ int ret;
+ struct stat s;
+ size_t len;
+ if (fh == NULL) {
+ debug_message("could not open '%s'\n", filename);
+ return b;
+ }
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fclose(fh);
+ return b;
+ }
+ b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+ if (b.data == NULL) {
+ fclose(fh);
+ return b;
+ }
+ len = fread(b.data, 1, s.st_size, fh);
+ /* cast avoids a signed/unsigned comparison (off_t vs size_t);
+ * st_size is non-negative after a successful fstat(). */
+ if (ferror(fh) || len != (size_t)s.st_size) {
+ TALLOC_FREE(b.data);
+ } else {
+ b.length = len;
+ }
+ fclose(fh);
+ return b;
+}
+
+
+
+/*
+ * Decompress every reference .lzplain vector and compare the result with
+ * the known plaintext. All vectors must round out byte-identical.
+ */
+static void test_lzxpress_plain_decompress_files(void **state)
+{
+ size_t i;
+ int score = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzplain", COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.compressed.data);
+
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+ debug_start_timer();
+ written = lzxpress_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ debug_end_timer("decompress", p.decompressed.length);
+ if (written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%zu correct\n", score, i);
+ assert_int_equal(score, i);
+ /* previously leaked; release the context like the sibling tests do */
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_decompress_more_compressed_files(void **state)
+{
+ /*
+ * This tests the decompression of files that have been compressed on
+ * Windows with the level turned up (to 1, default for MS-XCA is 0).
+ *
+ * The format is identical, but it will have tried harder to find
+ * matches.
+ */
+ size_t i;
+ int score = 0;
+ int found = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzplain", MORE_COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ if (p.compressed.data == NULL) {
+ /*
+ * We don't have all the vectors in the
+ * more-compressed directory, which is OK, we skip
+ * them.
+ */
+ continue;
+ }
+ found++;
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+ debug_start_timer();
+ written = lzxpress_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ debug_end_timer("decompress", p.decompressed.length);
+ if (written != -1 &&
+ written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%d correct\n", score, found);
+ assert_int_equal(score, found);
+ /* previously leaked; release the context like the sibling tests do */
+ talloc_free(mem_ctx);
+}
+
+
+/*
+ * attempt_round_trip() tests whether a data blob can survive a compression
+ * and decompression cycle. If save_name is not NULL and LZX_DEBUG_FILES
+ * evaluates to true, the various stages are saved in files with that name
+ * and the '-original', '-compressed', and '-decompressed' suffixes. If
+ * ref_compressed has data, it'll print a message saying whether the
+ * compressed data matches that.
+ *
+ * Returns the compressed size on success, or -1 if compression failed or
+ * the decompressed output does not match the original.
+ */
+
+static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
+ DATA_BLOB original,
+ const char *save_name,
+ DATA_BLOB ref_compressed)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
+ original.length * 8 / 7 + 8);
+ DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
+ original.length);
+ ssize_t comp_written, decomp_written;
+ debug_start_timer();
+ comp_written = lzxpress_compress(original.data,
+ original.length,
+ compressed.data,
+ compressed.length);
+ debug_end_timer("compress", original.length);
+ if (comp_written <= 0) {
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
+ if (ref_compressed.data != NULL) {
+ /*
+ * This is informational, not an assertion; there are
+ * ~infinite legitimate ways to compress the data, many as
+ * good as each other (think of compression as a language, not
+ * a format).
+ */
+ debug_message("compressed size %zd vs reference %zu\n",
+ comp_written, ref_compressed.length);
+
+ if (comp_written == compressed.length &&
+ memcmp(compressed.data, ref_compressed.data, comp_written) == 0) {
+ debug_message("\033[1;32mbyte identical!\033[0m\n");
+ }
+ }
+ debug_start_timer();
+ decomp_written = lzxpress_decompress(compressed.data,
+ comp_written,
+ decompressed.data,
+ decompressed.length);
+ debug_end_timer("decompress", original.length);
+ if (save_name != NULL && LZX_DEBUG_FILES) {
+ char s[300];
+ FILE *fh = NULL;
+
+ snprintf(s, sizeof(s), "%s-original", save_name);
+ fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+ fh = fopen(s, "w");
+ fwrite(original.data, 1, original.length, fh);
+ fclose(fh);
+
+ snprintf(s, sizeof(s), "%s-compressed", save_name);
+ /* comp_written is ssize_t, so %zd, not %zu */
+ fprintf(stderr, "Saving %zd bytes to %s\n", comp_written, s);
+ fh = fopen(s, "w");
+ fwrite(compressed.data, 1, comp_written, fh);
+ fclose(fh);
+ /*
+ * We save the decompressed file using original.length, not
+ * the returned size. If these differ, the returned size will
+ * be -1. By saving the whole buffer we can see at what point
+ * it went haywire.
+ */
+ snprintf(s, sizeof(s), "%s-decompressed", save_name);
+ fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
+ fh = fopen(s, "w");
+ fwrite(decompressed.data, 1, original.length, fh);
+ fclose(fh);
+ }
+
+ if (original.length != decomp_written ||
+ memcmp(decompressed.data,
+ original.data,
+ original.length) != 0) {
+ debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
+ decomp_written,
+ original.length);
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+ talloc_free(tmp_ctx);
+ return comp_written;
+}
+
+
+/*
+ * Round-trip every test vector and compare our total compressed size
+ * against the Windows reference totals; we assert we do at least as well.
+ */
+static void test_lzxpress_plain_round_trip_files(void **state)
+{
+ size_t i;
+ int score = 0;
+ ssize_t compressed_total = 0;
+ ssize_t reference_total = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ char *debug_files = NULL;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ ssize_t comp_size;
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+ debug_message("-------------------\n");
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzplain", COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ if (p.compressed.data == NULL) {
+ debug_message(
+ "Could not load %s reference file %s\n",
+ p.name, filename);
+ debug_message("%s decompressed %zu\n", p.name,
+ p.decompressed.length);
+ } else {
+ debug_message("%s: reference compressed %zu decomp %zu\n",
+ p.name,
+ p.compressed.length,
+ p.decompressed.length);
+ }
+ if (1) {
+ /*
+ * We're going to save copies in /tmp.
+ */
+ snprintf(filename, sizeof(filename),
+ "/tmp/lzxplain-%s", p.name);
+ debug_files = filename;
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, p.decompressed,
+ debug_files,
+ p.compressed);
+ if (comp_size > 0) {
+ debug_message("\033[1;32mround trip!\033[0m\n");
+ score++;
+ if (p.compressed.length) {
+ compressed_total += comp_size;
+ reference_total += p.compressed.length;
+ }
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%zu correct\n", score, i);
+ /* compressed_total is ssize_t, so %zd, not %zu */
+ print_message("\033[1;34mtotal compressed size: %zd\033[0m\n",
+ compressed_total);
+ print_message("total reference size: %zd \n", reference_total);
+ print_message("diff: %7zd \n",
+ reference_total - compressed_total);
+ assert_true(reference_total != 0);
+ print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
+ 2 + (compressed_total >= reference_total),
+ ((double)compressed_total) / reference_total);
+ /*
+ * Assert that the compression is better than Windows. Unlike the
+ * Huffman variant, where things are very even, here we do much better
+ * than Windows without especially trying.
+ */
+ assert_true(compressed_total <= reference_total);
+
+ assert_int_equal(score, i);
+ talloc_free(mem_ctx);
+}
+
+
+/*
+ * Bob Jenkins' Small Fast RNG.
+ *
+ * We don't need it to be this good, but we do need it to be reproducible
+ * across platforms, which rand() etc aren't.
+ *
+ * http://burtleburtle.net/bob/rand/smallprng.html
+ */
+
+/* 128-bit state for Bob Jenkins' small fast RNG (see comment above). */
+struct jsf_rng {
+ uint32_t a;
+ uint32_t b;
+ uint32_t c;
+ uint32_t d;
+};
+
+/* Rotate a 32-bit value left by k bits; used here with k of 17 and 27
+ * (k must stay in 1..31 to avoid an undefined full-width shift). */
+#define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
+
+/* Advance the generator state and return the next 32 pseudo-random bits. */
+static uint32_t jsf32(struct jsf_rng *x) {
+ uint32_t e = x->a - ROTATE32(x->b, 27);
+ x->a = x->b ^ ROTATE32(x->c, 17);
+ x->b = x->c + x->d;
+ x->c = x->d + e;
+ x->d = e + x->a;
+ return x->d;
+}
+
+/* Seed the generator, then discard 20 rounds to mix the state. */
+static void jsf32_init(struct jsf_rng *x, uint32_t seed) {
+ size_t i;
+ x->a = 0xf1ea5eed;
+ x->b = x->c = x->d = seed;
+ for (i = 0; i < 20; ++i) {
+ jsf32(x);
+ }
+}
+
+
+static void test_lzxpress_plain_long_gpl_round_trip(void **state)
+{
+ /*
+ * We use a kind of model-free Markov model to generate a massively
+ * extended pastiche of the GPLv3 (chosen because it is right there in
+ * "COPYING" and won't change often).
+ *
+ * The point is to check a round trip of a very long message with
+ * multiple repetitions on many scales, without having to add a very
+ * large file.
+ */
+ size_t i, j, k;
+ uint8_t c;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ /*
+ * datablob_from_file() returns NULL data on failure; fail loudly
+ * rather than dereferencing gpl.data[0] below (matches the
+ * assert_non_null style used by the file-based tests).
+ */
+ assert_non_null(gpl.data);
+
+ jsf32_init(&rng, 1);
+
+ j = 1;
+ original.data[0] = gpl.data[0];
+ for (i = 1; i < original.length; i++) {
+ size_t m;
+ char p = original.data[i - 1];
+ c = gpl.data[j];
+ original.data[i] = c;
+ j++;
+ /* jump to a pseudo-random spot and look for a matching bigram */
+ m = (j + jsf32(&rng)) % (gpl.length - 50);
+ for (k = m; k < m + 30; k++) {
+ if (p == gpl.data[k] &&
+ c == gpl.data[k + 1]) {
+ j = k + 2;
+ break;
+ }
+ }
+ if (j == gpl.length) {
+ j = 1;
+ }
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_long_random_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This would probably fall into a fairly
+ * simple loop, but we introduce damage into the system, randomly
+ * flipping about 1 bit in 64.
+ *
+ * The result is semi-structured and compressible.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ /* fill the 64k byte table a word at a time via a uint32_t view;
+ * NOTE(review): relies on talloc_array giving 4-byte alignment */
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 1);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ /* look up the successor of the previous two bytes */
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ /* AND of three draws: each damage bit set with probability 1/8 */
+ uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
+ damage &= (damage >> 16);
+ k ^= damage & 0xffff;
+ d[i] = table[k];
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_chaos_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This would probably fall into a fairly
+ * simple loop, but we keep changing the graph. The result is long
+ * periods of stability separated by bursts of noise.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 1);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ uint32_t damage = jsf32(&rng);
+ d[i] = table[k];
+ /* with probability 1/8, rewrite one random table entry */
+ if ((damage >> 29) == 0) {
+ uint16_t index = damage & 0xffff;
+ uint8_t value = (damage >> 16) & 0xff;
+ table[index] = value;
+ }
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/chaos-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_sparse_random_graph_round_trip(void **state)
+{
+ size_t i;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
+ DATA_BLOB ref = {0};
+ /*
+ * There's a random trigram graph, with each pair of sequential bytes
+ * pointing to a successor. This will fall into fairly simple loops,
+ * but we introduce damage into the system, randomly mangling about 1
+ * byte in 65536.
+ *
+ * The result has very long repetitive runs, which should lead to
+ * oversized blocks.
+ */
+ uint8_t *d = original.data;
+ uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
+ uint32_t *table32 = (void*)table;
+ ssize_t comp_size;
+ struct jsf_rng rng;
+
+ jsf32_init(&rng, 3);
+ for (i = 0; i < (65536 / 4); i++) {
+ table32[i] = jsf32(&rng);
+ }
+
+ d[0] = 'a';
+ d[1] = 'b';
+
+ for (i = 2; i < original.length; i++) {
+ uint16_t k = (d[i - 2] << 8) | d[i - 1];
+ uint32_t damage = jsf32(&rng);
+ /* ~1 in 65536 draws perturbs the lookup key */
+ if ((damage & 0xffff0000) == 0) {
+ k ^= damage & 0xffff;
+ }
+ d[i] = table[k];
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/sparse-random-graph", ref);
+ assert_true(comp_size > 0);
+ assert_true(comp_size < original.length);
+
+ talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_plain_random_noise_round_trip(void **state)
+{
+ size_t i;
+ size_t len = 10 * 1024 * 1024;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
+ DATA_BLOB ref = {0};
+ ssize_t comp_size;
+ /*
+ * We are filling this up with incompressible noise, but we can assert
+ * quite tight bounds on how badly it will fail to compress.
+ *
+ * There is one additional bit for each code, which says whether the
+ * code is a literal byte or a match. If *all* codes are literal
+ * bytes, the length should be 9/8 the original (with rounding
+ * issues regarding the indicator bit blocks).
+ *
+ * If some matches are found the length will be a bit less. We would
+ * expect one 3 byte match per 1 << 24 tries, but we try 8192 times
+ * per position. That means there'll be a match 1/2048 of the time at
+ * best. 255 times out of 256 this will be exactly a 3 byte match,
+ * encoded as two bytes, so we could get a 1 / 2048 saving on top of
+ * the 1/8 cost. There'll be a smattering of longer matches too, and
+ * the potential for complicated maths to account for those, but we'll
+ * skimp on that by allowing for a 1/1500 saving.
+ *
+ * With the hash table, we take a shortcut in the "8192 tries", and
+ * the size of the table makes a difference in how we perform, with 13
+ * bits (8192 slots) naturally being luckier than 12. Ultimately,
+ * either way, the compressed file is still 12.5% bigger than the
+ * original.
+ */
+ size_t limit = len * 9 / 8 + 4;
+
+ /* fill the buffer a 32-bit word at a time */
+ uint32_t *d32 = (uint32_t*)((void*)original.data);
+ struct jsf_rng rng;
+ jsf32_init(&rng, 2);
+
+ for (i = 0; i < (len / 4); i++) {
+ d32[i] = jsf32(&rng);
+ }
+
+ comp_size = attempt_round_trip(mem_ctx, original, "/tmp/random-noise", ref);
+ debug_message("original size %zu; compressed size %zd; ratio %.5f\n",
+ len, comp_size, ((double)comp_size) / len);
+ debug_message("expected range %zu - %zu\n",
+ limit - limit / 1500, limit);
+
+ assert_true(comp_size > 0);
+ assert_true(comp_size < limit);
+ assert_true(comp_size >= limit - limit / 1500);
+ talloc_free(mem_ctx);
+}
+
+
+/* Tests based on [MS-XCA] 3.1 Examples */
+
+/*
+ * First [MS-XCA] worked example: the all-literal alphabet string must
+ * compress to the exact byte sequence given in the spec, and decompress
+ * back to the original.
+ */
+static void test_msft_data1(void **state)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ const char *fixed_data = "abcdefghijklmnopqrstuvwxyz";
+ const uint8_t fixed_out[] = {
+ 0x3f, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+ 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+ 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a };
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+
+/*
+ * Second [MS-XCA] worked example: a highly repetitive "abc" run must
+ * compress to the spec's 13-byte match-heavy encoding and round-trip.
+ */
+static void test_msft_data2(void **state)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+ const char *fixed_data =
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabc";
+ const uint8_t fixed_out[] = {
+ 0xff, 0xff, 0xff, 0x1f, 0x61, 0x62, 0x63, 0x17,
+ 0x00, 0x0f, 0xff, 0x26, 0x01};
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+/*
+ test lzxpress: compress a known sentence to a fixed byte sequence,
+ round-trip it, and also check we can decompress output produced by an
+ older version of the compressor (different but valid encoding).
+ */
+static void test_lzxpress(void **state)
+{
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ const char *fixed_data = "this is a test. and this is a test too";
+ const uint8_t fixed_out[] = {
+ 0xff, 0x21, 0x00, 0x04, 0x74, 0x68, 0x69, 0x73,
+ 0x20, 0x10, 0x00, 0x61, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x2E, 0x20, 0x61, 0x6E, 0x64, 0x20, 0x9F,
+ 0x00, 0x04, 0x20, 0x74, 0x6F, 0x6F };
+
+ const uint8_t fixed_out_old_version[] = {
+ 0x00, 0x20, 0x00, 0x04, 0x74, 0x68, 0x69, 0x73,
+ 0x20, 0x10, 0x00, 0x61, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x2E, 0x20, 0x61, 0x6E, 0x64, 0x20, 0x9F,
+ 0x00, 0x04, 0x20, 0x74, 0x6F, 0x6F, 0x00, 0x00,
+ 0x00, 0x00 };
+
+ ssize_t c_size;
+ uint8_t *out, *out2, *out3;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ /* backward compatibility: the old-format encoding must still decode */
+ out3 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(fixed_out_old_version,
+ sizeof(fixed_out_old_version),
+ out3,
+ talloc_get_size(out3));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out3, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+static void test_lzxpress2(void **state)
+{
+ /*
+ * Use two matches, separated by a literal, and each with a length
+ * greater than 10, to test the use of nibble_index. Both length values
+ * (less ten) should be stored as adjacent nibbles to form the 0x21
+ * byte.
+ */
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ const char *fixed_data = "aaaaaaaaaaaabaaaaaaaaaaaa";
+ const uint8_t fixed_out[] = {
+ 0xff, 0xff, 0xff, 0x5f, 0x61, 0x07, 0x00, 0x21,
+ 0x62, 0x67, 0x00};
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ /* and the fixed encoding must decompress to the original string */
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+static void test_lzxpress3(void **state)
+{
+ /*
+ * Use a series of 31 literals, followed by a single minimum-length
+ * match (and a terminating literal), to test setting indic_pos when the
+ * 32-bit flags value overflows after a match.
+ */
+
+ TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+ const char *fixed_data = "abcdefghijklmnopqrstuvwxyz01234abca";
+ const uint8_t fixed_out[] = {
+ 0x01, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+ 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+ 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31,
+ 0x32, 0x33, 0x34, 0xf0, 0x00, 0xff, 0xff, 0xff,
+ 0x7f, 0x61};
+
+ ssize_t c_size;
+ uint8_t *out, *out2;
+
+ out = talloc_size(tmp_ctx, 2048);
+ memset(out, 0x42, talloc_get_size(out));
+
+ c_size = lzxpress_compress((const uint8_t *)fixed_data,
+ strlen(fixed_data),
+ out,
+ talloc_get_size(out));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, sizeof(fixed_out));
+ assert_memory_equal(out, fixed_out, c_size);
+
+ /* and the fixed encoding must decompress to the original string */
+ out2 = talloc_size(tmp_ctx, strlen(fixed_data));
+ c_size = lzxpress_decompress(out,
+ sizeof(fixed_out),
+ out2,
+ talloc_get_size(out2));
+
+ assert_int_not_equal(c_size, -1);
+ assert_int_equal(c_size, strlen(fixed_data));
+ assert_memory_equal(out2, fixed_data, c_size);
+
+ talloc_free(tmp_ctx);
+}
+
+static void test_lzxpress4(void **state)
+{
+	/*
+	 * 31 literals followed by one minimum-length match, ending the input
+	 * exactly on a flags-word boundary. This checks that the final,
+	 * empty 32-bit flags block is still written out correctly.
+	 */
+
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	const char *plain = "abcdefghijklmnopqrstuvwxyz01234abc";
+	const uint8_t expected[] = {
+		0x01, 0x00, 0x00, 0x00, 0x61, 0x62, 0x63, 0x64,
+		0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
+		0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+		0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31,
+		0x32, 0x33, 0x34, 0xf0, 0x00, 0xff, 0xff, 0xff,
+		0xff};
+
+	ssize_t len;
+	uint8_t *compressed, *decompressed;
+
+	/* Over-sized, poisoned output buffer so stray writes are visible. */
+	compressed = talloc_size(mem_ctx, 2048);
+	memset(compressed, 0x42, talloc_get_size(compressed));
+
+	len = lzxpress_compress((const uint8_t *)plain,
+				strlen(plain),
+				compressed,
+				talloc_get_size(compressed));
+
+	assert_int_not_equal(len, -1);
+	assert_int_equal(len, sizeof(expected));
+	assert_memory_equal(compressed, expected, len);
+
+	/* Round-trip: decompressing must reproduce the original bytes. */
+	decompressed = talloc_size(mem_ctx, strlen(plain));
+	len = lzxpress_decompress(compressed,
+				  sizeof(expected),
+				  decompressed,
+				  talloc_get_size(decompressed));
+
+	assert_int_not_equal(len, -1);
+	assert_int_equal(len, strlen(plain));
+	assert_memory_equal(decompressed, plain, len);
+
+	talloc_free(mem_ctx);
+}
+
+
+static void test_lzxpress_many_zeros(void **state)
+{
+	/*
+	 * Repeated values (zero is convenient but not special) will lead to
+	 * very long substring searches in compression, which can be very slow
+	 * if we're not careful.
+	 *
+	 * This test makes a very loose assertion about how long it should
+	 * take to compress a million zeros.
+	 *
+	 * Wall clock time *should* be < 0.1 seconds with the fix and around a
+	 * minute without it. We try for CLOCK_THREAD_CPUTIME_ID which should
+	 * filter out some noise on the machine, and set the threshold at 3
+	 * seconds (see the assertion on elapsed_ns below).
+	 */
+
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	const size_t N_ZEROS = 1000000;
+	const uint8_t *zeros = talloc_zero_size(tmp_ctx, N_ZEROS);
+	/*
+	 * The compressor uses heuristics, so we accept a range of output
+	 * sizes rather than one exact value.
+	 */
+	const ssize_t expected_c_size_max = 120;
+	const ssize_t expected_c_size_min = 93;
+	ssize_t c_size;
+	uint8_t *comp, *decomp;
+	static struct timespec t_start, t_end;
+	uint64_t elapsed_ns;
+
+	/* Prefer the per-thread CPU clock; fall back to coarser clocks. */
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_start) != 0) {
+		if (clock_gettime(CUSTOM_CLOCK_MONOTONIC, &t_start) != 0) {
+			clock_gettime(CLOCK_REALTIME, &t_start);
+		}
+	}
+
+	comp = talloc_zero_size(tmp_ctx, 2048);
+
+	c_size = lzxpress_compress(zeros,
+				   N_ZEROS,
+				   comp,
+				   talloc_get_size(comp));
+	/*
+	 * Because our compression depends on heuristics, we don't insist on
+	 * an exact size in this case.
+	 */
+
+	assert_true(c_size <= expected_c_size_max);
+	assert_true(c_size >= expected_c_size_min);
+
+	decomp = talloc_size(tmp_ctx, N_ZEROS * 2);
+	c_size = lzxpress_decompress(comp,
+				     c_size,
+				     decomp,
+				     N_ZEROS * 2);
+
+	/* Same clock-selection fallback as used for the start time. */
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t_end) != 0) {
+		if (clock_gettime(CUSTOM_CLOCK_MONOTONIC, &t_end) != 0) {
+			clock_gettime(CLOCK_REALTIME, &t_end);
+		}
+	}
+	elapsed_ns = (
+		(t_end.tv_sec - t_start.tv_sec) * 1000U * 1000U * 1000U) +
+		(t_end.tv_nsec - t_start.tv_nsec);
+	print_message("round-trip time: %"PRIu64" ns\n", elapsed_ns);
+	assert_true(elapsed_ns < 3 * 1000U * 1000U * 1000U);
+	assert_memory_equal(decomp, zeros, N_ZEROS);
+
+	talloc_free(tmp_ctx);
+}
+
+
+static void test_lzxpress_round_trip(void **state)
+{
+	/*
+	 * Examples found using via fuzzing.
+	 */
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	size_t i;
+	struct b64_pair {
+		const char *uncompressed;
+		const char *compressed;
+	} pairs[] = {
+		{ /* this results in a trailing flags block */
+			"AAICAmq/EKdP785YU2Ddh7d4vUtdlQyLeHV09LHpUBw=",
+			"AAAAAAACAgJqvxCnT+/OWFNg3Ye3eL1LXZUMi3h1dPSx6VAc/////w==",
+		},
+		{ /* empty string compresses to empty string */
+			"", ""
+		},
+	};
+	const size_t alloc_size = 1000;
+	uint8_t *buf = talloc_array(mem_ctx, uint8_t, alloc_size);
+
+	for (i = 0; i < ARRAY_SIZE(pairs); i++) {
+		ssize_t n;
+		DATA_BLOB plain = base64_decode_data_blob_talloc(
+			mem_ctx,
+			pairs[i].uncompressed);
+		DATA_BLOB packed = base64_decode_data_blob_talloc(
+			mem_ctx,
+			pairs[i].compressed);
+
+		/* Compression must reproduce the known-good packed bytes. */
+		n = lzxpress_compress(plain.data,
+				      plain.length,
+				      buf,
+				      alloc_size);
+
+		assert_int_not_equal(n, -1);
+		assert_int_equal(n, packed.length);
+
+		assert_memory_equal(packed.data, buf, n);
+
+		/* And decompression must restore the original plaintext. */
+		n = lzxpress_decompress(packed.data,
+					packed.length,
+					buf,
+					alloc_size);
+
+		assert_int_not_equal(n, -1);
+		assert_int_equal(n, plain.length);
+
+		assert_memory_equal(plain.data, buf, n);
+	}
+	talloc_free(mem_ctx);
+}
+
+
+int main(void)
+{
+	/*
+	 * Register every test in this file with cmocka.
+	 *
+	 * NOTE: the original list registered
+	 * test_lzxpress_plain_long_random_graph_round_trip twice (an
+	 * apparent copy/paste slip); the duplicate entry has been removed,
+	 * leaving each test to run exactly once.
+	 */
+	const struct CMUnitTest tests[] = {
+		cmocka_unit_test(test_lzxpress_plain_decompress_files),
+		cmocka_unit_test(test_lzxpress_plain_decompress_more_compressed_files),
+		cmocka_unit_test(test_lzxpress_plain_round_trip_files),
+		cmocka_unit_test(test_lzxpress_plain_long_gpl_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_long_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_chaos_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_sparse_random_graph_round_trip),
+		cmocka_unit_test(test_lzxpress_plain_random_noise_round_trip),
+		cmocka_unit_test(test_lzxpress),
+		cmocka_unit_test(test_msft_data1),
+		cmocka_unit_test(test_msft_data2),
+		cmocka_unit_test(test_lzxpress2),
+		cmocka_unit_test(test_lzxpress3),
+		cmocka_unit_test(test_lzxpress4),
+		cmocka_unit_test(test_lzxpress_many_zeros),
+		cmocka_unit_test(test_lzxpress_round_trip),
+	};
+	/* Emit subunit output when stdout is not a terminal (CI harness). */
+	if (!isatty(1)) {
+		cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+	}
+	return cmocka_run_group_tests(tests, NULL, NULL);
+}
diff --git a/lib/compression/wscript_build b/lib/compression/wscript_build
new file mode 100644
index 0000000..61fe4a9
--- /dev/null
+++ b/lib/compression/wscript_build
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+# Build rules for the LZXpress compression library (plain and Huffman
+# variants), its cmocka-based test binaries, and the Python bindings.
+# `bld` is the waf build context provided by the Samba build system.
+
+# Core compression code, shared by the test binaries and Python module.
+bld.SAMBA_SUBSYSTEM('LZXPRESS',
+                    deps='replace talloc stable_sort samba-debug',
+                    source='lzxpress.c lzxpress_huffman.c'
+                    )
+
+# Unit tests for the LZ77+Huffman variant; run under `make test`.
+bld.SAMBA_BINARY('test_lzx_huffman',
+                 source='tests/test_lzx_huffman.c',
+                 deps=('cmocka replace LZXPRESS'
+                       ' samba-util'),
+                 local_include=False,
+                 for_selftest=True)
+
+# Unit tests for the plain (non-Huffman) LZXpress variant.
+bld.SAMBA_BINARY('test_lzxpress_plain',
+                 source='tests/test_lzxpress_plain.c',
+                 deps=('cmocka replace LZXPRESS'
+                       ' samba-util'),
+                 local_include=False,
+                 for_selftest=True)
+
+# Python bindings, installed as samba.compression.
+bld.SAMBA_PYTHON('pycompression',
+                 'pycompression.c',
+                 deps='LZXPRESS',
+                 realname='samba/compression.so')