/*
 * Copyright (C) Matthieu Suiche 2008
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the author nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "replace.h"
#include "lzxpress.h"
#include "../lib/util/byteorder.h"

#define __CHECK_BYTES(__size, __index, __needed) do { \
	if (unlikely(__index >= __size)) { \
		return -1; \
	} else { \
		uint32_t __avail = __size - __index; \
		if (unlikely(__needed > __avail)) { \
			return -1; \
		} \
	} \
} while(0)

/*
 * LZX_PLAIN_COMP_HASH_BITS determines how big the hash table for finding
 * matches will be.
 *
 * The window in which we look for matches is 8192 bytes. That means with
 * random data a value of 13 is getting close to no collisions, while 12
 * will miss about half the possible matches. With compressible data there
 * will generally be fewer and less diverse entries, so collisions are
 * rarer.
 *
 * In the testsuite, both 12 and 13 give better compression than Windows,
 * but 12 is faster. 11 does not save time and costs accuracy. Thus we
 * prefer 12.
 */
#define LZX_PLAIN_COMP_HASH_BITS 12

/*
 * LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the
 * circular hash table for a match, before we give up. A bigger number will
 * generally lead to better but slower compression, but a stupidly big number
 * will just be worse.
 */
#define LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS 5

#define HASH_MASK ((1 << LZX_PLAIN_COMP_HASH_BITS) - 1)

static inline uint16_t three_byte_hash(const uint8_t *bytes)
{
	uint16_t a = bytes[0];
	uint16_t b = bytes[1] ^ 0x2e;
	uint16_t c = bytes[2] ^ 0x55;
	uint16_t ca = c - a;
	uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a);
	return d & HASH_MASK;
}

static inline void store_match(uint32_t *hash_table,
			       uint16_t h,
			       uint32_t offset)
{
	int i;
	uint32_t o = hash_table[h];
	uint16_t h2;
	uint16_t worst_h;
	int worst_score;

	if (o >= offset) {
		/* there is nothing there yet */
		hash_table[h] = offset;
		return;
	}
	for (i = 1; i < LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS; i++) {
		h2 = (h + i) & HASH_MASK;
		if (hash_table[h2] >= offset) {
			hash_table[h2] = offset;
			return;
		}
	}
	/*
	 * There are no free slots, but we really want to store this, so
	 * we'll kick out the entry with the longest distance.
	 */
	worst_h = h;
	worst_score = offset - o;
	for (i = 1; i < LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS; i++) {
		int score;
		h2 = (h + i) & HASH_MASK;
		o = hash_table[h2];
		score = offset - o;
		if (score > worst_score) {
			worst_score = score;
			worst_h = h2;
		}
	}
	hash_table[worst_h] = offset;
}
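/*
 * A worked illustration of the eviction above, with hypothetical values:
 * suppose the five probed slots hold offsets 100, 350, 900, 2000 and 4000
 * when offset 8000 arrives with the same hash. The slot holding 100 has
 * the highest score (8000 - 100 = 7900) and is overwritten: that match is
 * the furthest behind the current position and so the first that will
 * fall out of the 8192-byte window.
 */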
struct match {
	const uint8_t *there;
	uint32_t length;
};

static inline struct match lookup_match(uint32_t *hash_table,
					uint16_t h,
					const uint8_t *data,
					uint32_t offset,
					size_t max_len)
{
	int i;
	uint32_t o;
	uint16_t h2;
	size_t len;
	const uint8_t *there = NULL;
	const uint8_t *here = data + offset;
	struct match best = {0};

	for (i = 0; i < LZX_PLAIN_COMP_HASH_SEARCH_ATTEMPTS; i++) {
		h2 = (h + i) & HASH_MASK;
		o = hash_table[h2];
		if (o >= offset) {
			/*
			 * Either this is 0xffffffff, or something is really
			 * wrong.
			 *
			 * In setting this, we would never have stepped over
			 * an 0xffffffff, so we won't now.
			 */
			break;
		}
		if (offset - o > 8192) {
			/* Too far away to use */
			continue;
		}
		there = data + o;
		/*
		 * When we already have a long match, we can try to avoid
		 * measuring out another long, but shorter match.
		 */
		if (best.length > 1000 &&
		    there[best.length - 1] != best.there[best.length - 1]) {
			continue;
		}

		for (len = 0;
		     len < max_len && here[len] == there[len];
		     len++) {
			/* counting */
		}

		if (len > 2) {
			if (len > best.length) {
				best.length = len;
				best.there = there;
			}
		}
	}
	return best;
}

struct write_context {
	uint8_t *compressed;
	uint32_t compressed_pos;
	uint32_t max_compressed_size;
	uint32_t indic;
	uint32_t indic_bit;
	uint32_t indic_pos;
	uint32_t nibble_index;
};

#define CHECK_INPUT_BYTES(__needed) \
	__CHECK_BYTES(uncompressed_size, uncompressed_pos, __needed)
#define CHECK_OUTPUT_BYTES(__needed) \
	__CHECK_BYTES(wc->max_compressed_size, wc->compressed_pos, __needed)

static inline ssize_t push_indicator_bit(struct write_context *wc,
					 uint32_t bit)
{
	wc->indic = (wc->indic << 1) | bit;
	wc->indic_bit += 1;

	if (wc->indic_bit == 32) {
		PUSH_LE_U32(wc->compressed, wc->indic_pos, wc->indic);
		wc->indic_bit = 0;
		CHECK_OUTPUT_BYTES(sizeof(uint32_t));
		wc->indic_pos = wc->compressed_pos;
		wc->compressed_pos += sizeof(uint32_t);
	}
	return wc->indic_pos;
}

static ssize_t encode_match(struct write_context *wc,
			    struct match match,
			    const uint8_t *here)
{
	uint32_t match_len = match.length - 3;
	uint32_t best_offset = here - match.there - 1;
	uint16_t metadata;

	if (best_offset > 8191) {
		return -1;
	}

	CHECK_OUTPUT_BYTES(sizeof(uint16_t));
	metadata = (uint16_t)((best_offset << 3) | MIN(match_len, 7));
	PUSH_LE_U16(wc->compressed, wc->compressed_pos, metadata);
	wc->compressed_pos += sizeof(uint16_t);

	if (match_len >= 7) {
		match_len -= 7;

		if (wc->nibble_index == 0) {
			wc->nibble_index = wc->compressed_pos;

			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			wc->compressed[wc->nibble_index] = MIN(match_len, 15);
			wc->compressed_pos += sizeof(uint8_t);
		} else {
			wc->compressed[wc->nibble_index] |=
				MIN(match_len, 15) << 4;
			wc->nibble_index = 0;
		}

		if (match_len >= 15) {
			match_len -= 15;

			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			wc->compressed[wc->compressed_pos] = MIN(match_len,
								 255);
			wc->compressed_pos += sizeof(uint8_t);

			if (match_len >= 255) {
				/* Additional match_len */

				match_len += 7 + 15;

				if (match_len < (1 << 16)) {
					CHECK_OUTPUT_BYTES(sizeof(uint16_t));
					PUSH_LE_U16(wc->compressed,
						    wc->compressed_pos,
						    match_len);
					wc->compressed_pos += sizeof(uint16_t);
				} else {
					CHECK_OUTPUT_BYTES(
						sizeof(uint16_t) +
						sizeof(uint32_t));
					PUSH_LE_U16(wc->compressed,
						    wc->compressed_pos, 0);
					wc->compressed_pos += sizeof(uint16_t);

					PUSH_LE_U32(wc->compressed,
						    wc->compressed_pos,
						    match_len);
					wc->compressed_pos += sizeof(uint32_t);
				}
			}
		}
	}

	return push_indicator_bit(wc, 1);
}
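/*
 * A worked example of the encoding above (values chosen for illustration):
 * a match of length 10 at distance 100 gives best_offset = 99 and
 * match_len = 10 - 3 = 7. The 16-bit metadata is (99 << 3) | MIN(7, 7)
 * = 0x031f, stored little-endian as 1f 03. Because match_len >= 7, the
 * remainder (0) also goes into a spare nibble. The decompressor reverses
 * this: a length field of 7 triggers a nibble read (0), then 7 and 3 are
 * added back to recover 10, and the offset is (0x031f >> 3) + 1 = 100.
 */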
#undef CHECK_OUTPUT_BYTES
#define CHECK_OUTPUT_BYTES(__needed) \
	__CHECK_BYTES(wc.max_compressed_size, wc.compressed_pos, __needed)

ssize_t lzxpress_compress(const uint8_t *uncompressed,
			  uint32_t uncompressed_size,
			  uint8_t *compressed,
			  uint32_t max_compressed_size)
{
	/*
	 * This is the algorithm in [MS-XCA] 2.3 "Plain LZ77 Compression".
	 *
	 * It avoids Huffman encoding by including literal bytes inline when
	 * a match is not found. Every so often it includes a uint32 bit map
	 * flagging which positions contain matches and which contain
	 * literals. The encoding of matches is of variable size, depending
	 * on the match length; they are always at least 16 bits long, and
	 * can implicitly use unused half-bytes from earlier in the stream.
	 */
	ssize_t ret;
	uint32_t uncompressed_pos;
	struct write_context wc = {
		.indic = 0,
		.indic_pos = 0,
		.indic_bit = 0,
		.nibble_index = 0,
		.compressed = compressed,
		.compressed_pos = 0,
		.max_compressed_size = max_compressed_size
	};
	uint32_t hash_table[1 << LZX_PLAIN_COMP_HASH_BITS];
	memset(hash_table, 0xff, sizeof(hash_table));

	if (!uncompressed_size) {
		return 0;
	}

	uncompressed_pos = 0;
	CHECK_OUTPUT_BYTES(sizeof(uint32_t));
	PUSH_LE_U32(wc.compressed, wc.compressed_pos, 0);
	wc.compressed_pos += sizeof(uint32_t);

	while ((uncompressed_pos < uncompressed_size) &&
	       (wc.compressed_pos < wc.max_compressed_size)) {
		/* maximum len we can encode into metadata */
		const uint32_t max_len =
			MIN(0xFFFF + 3,
			    uncompressed_size - uncompressed_pos);
		const uint8_t *here = uncompressed + uncompressed_pos;
		uint16_t h;
		struct match match = {0};

		if (max_len >= 3) {
			h = three_byte_hash(here);
			match = lookup_match(hash_table,
					     h,
					     uncompressed,
					     uncompressed_pos,
					     max_len);

			store_match(hash_table, h, uncompressed_pos);
		} else {
			match.there = NULL;
			match.length = 0;
		}

		if (match.there == NULL) {
			/*
			 * This is going to be a literal byte, which we flag
			 * by setting a bit in an indicator field somewhere
			 * earlier in the stream.
			 */
			CHECK_INPUT_BYTES(sizeof(uint8_t));
			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			wc.compressed[wc.compressed_pos++] = *here;
			uncompressed_pos++;

			ret = push_indicator_bit(&wc, 0);
			if (ret < 0) {
				return ret;
			}
		} else {
			ret = encode_match(&wc, match, here);
			if (ret < 0) {
				return ret;
			}
			uncompressed_pos += match.length;
		}
	}

	if (wc.indic_bit != 0) {
		wc.indic <<= 32 - wc.indic_bit;
	}
	wc.indic |= UINT32_MAX >> wc.indic_bit;
	PUSH_LE_U32(wc.compressed, wc.indic_pos, wc.indic);

	return wc.compressed_pos;
}
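/*
 * An illustrative flush example: if only three indicator bits 1, 0, 1 have
 * been pushed when the input runs out, wc.indic holds 0b101. The shift by
 * 32 - 3 = 29 moves those bits to the top of the word, and
 * UINT32_MAX >> 3 fills the unused low 29 bits with ones. The decompressor
 * never consumes the padding bits, because its loop stops once the input
 * is exhausted.
 */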
ssize_t lzxpress_decompress(const uint8_t *input,
			    uint32_t input_size,
			    uint8_t *output,
			    uint32_t max_output_size)
{
	/*
	 * This is the algorithm in [MS-XCA] 2.4 "Plain LZ77 Decompression
	 * Algorithm Details".
	 */
	uint32_t output_index, input_index;
	uint32_t indicator, indicator_bit;
	uint32_t nibble_index;

	if (input_size == 0) {
		return 0;
	}

	output_index = 0;
	input_index = 0;
	indicator = 0;
	indicator_bit = 0;
	nibble_index = 0;

#undef CHECK_INPUT_BYTES
#define CHECK_INPUT_BYTES(__needed) \
	__CHECK_BYTES(input_size, input_index, __needed)
#undef CHECK_OUTPUT_BYTES
#define CHECK_OUTPUT_BYTES(__needed) \
	__CHECK_BYTES(max_output_size, output_index, __needed)

	do {
		if (indicator_bit == 0) {
			CHECK_INPUT_BYTES(sizeof(uint32_t));
			indicator = PULL_LE_U32(input, input_index);
			input_index += sizeof(uint32_t);
			if (input_index == input_size) {
				/*
				 * The compressor left room for indicator
				 * flags for data that doesn't exist.
				 */
				break;
			}
			indicator_bit = 32;
		}
		indicator_bit--;

		/*
		 * Check whether the bit specified by indicator_bit is set
		 * in indicator. For example, if indicator_bit has value 4,
		 * check whether the 4th bit of the value in indicator is
		 * set.
		 */
		if (((indicator >> indicator_bit) & 1) == 0) {
			/* literal byte: copied through unchanged */
			CHECK_INPUT_BYTES(sizeof(uint8_t));
			CHECK_OUTPUT_BYTES(sizeof(uint8_t));
			output[output_index] = input[input_index];
			input_index += sizeof(uint8_t);
			output_index += sizeof(uint8_t);
		} else {
			uint32_t length;
			uint32_t offset;

			CHECK_INPUT_BYTES(sizeof(uint16_t));
			length = PULL_LE_U16(input, input_index);
			input_index += sizeof(uint16_t);
			offset = (length >> 3) + 1;
			length &= 7;

			if (length == 7) {
				if (nibble_index == 0) {
					CHECK_INPUT_BYTES(sizeof(uint8_t));
					nibble_index = input_index;
					length = input[input_index] & 0xf;
					input_index += sizeof(uint8_t);
				} else {
					length = input[nibble_index] >> 4;
					nibble_index = 0;
				}

				if (length == 15) {
					CHECK_INPUT_BYTES(sizeof(uint8_t));
					length = input[input_index];
					input_index += sizeof(uint8_t);
					if (length == 255) {
						CHECK_INPUT_BYTES(
							sizeof(uint16_t));
						length = PULL_LE_U16(
							input, input_index);
						input_index +=
							sizeof(uint16_t);
						if (length == 0) {
							CHECK_INPUT_BYTES(
								sizeof(uint32_t));
							length = PULL_LE_U32(
								input,
								input_index);
							input_index +=
								sizeof(uint32_t);
						}

						if (length < (15 + 7)) {
							return -1;
						}
						length -= (15 + 7);
					}
					length += 15;
				}
				length += 7;
			}
			length += 3;

			if (length == 0) {
				return -1;
			}

			for (; length > 0; --length) {
				if (offset > output_index) {
					return -1;
				}
				CHECK_OUTPUT_BYTES(sizeof(uint8_t));
				output[output_index] =
					output[output_index - offset];
				output_index += sizeof(uint8_t);
			}
		}
	} while ((output_index < max_output_size) &&
		 (input_index < input_size));

	return output_index;
}
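/*
 * A minimal round-trip sketch, not part of the library and kept out of
 * normal builds (LZXPRESS_EXAMPLE is a hypothetical guard, not an existing
 * build flag): compress a buffer, decompress it, and check that the
 * original data comes back. The output buffer sizes are generous guesses
 * for this small input, not derived bounds.
 */
#ifdef LZXPRESS_EXAMPLE
#include <stdio.h>
#include <string.h>

int main(void)
{
	static const uint8_t msg[] = "spam spam spam spam spam lovely spam";
	uint8_t comp[256];
	uint8_t out[256];
	ssize_t clen, dlen;

	/* compress the repetitive message; negative return means failure */
	clen = lzxpress_compress(msg, sizeof(msg), comp, sizeof(comp));
	if (clen < 0) {
		return 1;
	}

	/* decompress and verify we recover exactly the original bytes */
	dlen = lzxpress_decompress(comp, (uint32_t)clen, out, sizeof(out));
	if (dlen != sizeof(msg) || memcmp(out, msg, (size_t)dlen) != 0) {
		return 1;
	}

	printf("%zu bytes -> %zd bytes\n", sizeof(msg), clen);
	return 0;
}
#endif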