summaryrefslogtreecommitdiffstats
path: root/src/isa-l/igzip/generate_custom_hufftables.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/isa-l/igzip/generate_custom_hufftables.c
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/isa-l/igzip/generate_custom_hufftables.c')
-rw-r--r--src/isa-l/igzip/generate_custom_hufftables.c480
1 files changed, 480 insertions, 0 deletions
diff --git a/src/isa-l/igzip/generate_custom_hufftables.c b/src/isa-l/igzip/generate_custom_hufftables.c
new file mode 100644
index 000000000..449f70983
--- /dev/null
+++ b/src/isa-l/igzip/generate_custom_hufftables.c
@@ -0,0 +1,480 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/* This program can be used to generate custom a custom huffman encoding to get
+ * better data compression. This is most useful when the type of data being
+ * compressed is well known.
+ *
+ * To use generate_custom_hufftables, pass a sequence of files to the program
+ * that together form an accurate representation of the data that is being
+ * compressed. Generate_custom_hufftables will then produce the file
+ * hufftables_c.c, which should be moved to replace its counterpart in the igzip
+ * source folder. After recompiling the Isa-l library, the igzip compression
+ * functions will use the new hufftables.
+ *
+ * Generate_custom_hufftables should be compiled with the same compile time
+ * parameters as the igzip source code. Generating custom hufftables with
+ * different compile time parameters may cause igzip to produce invalid output
+ * for the reasons described below. The default parameters used by
+ * generate_custom_hufftables are the same as the default parameters used by
+ * igzip.
+ *
+ * *WARNING* generate custom hufftables must be compiled with a IGZIP_HIST_SIZE
+ * that is at least as large as the IGZIP_HIST_SIZE used by igzip. By default
+ * IGZIP_HIST_SIZE is 32K, the maximum usable IGZIP_HIST_SIZE is 32K. The reason
+ * for this is to generate better compression. Igzip cannot produce look back
+ * distances with sizes larger than the IGZIP_HIST_SIZE igzip was compiled with,
+ * so look back distances with sizes larger than IGZIP_HIST_SIZE are not
+ * assigned a huffman code. The definition of LONGER_HUFFTABLES must be
+ * consistent as well since that definition changes the size of the structures
+ * printed by this tool.
+ *
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+#include "igzip_lib.h"
+
+#include "huff_codes.h"
+#include "huffman.h"
+
+/*These max code lengths are limited by how the data is stored in
+ * hufftables.asm. The deflate standard max is 15.*/
+
+#define MAX_HEADER_SIZE ISAL_DEF_MAX_HDR_SIZE
+
+#define GZIP_HEADER_SIZE 10
+#define GZIP_TRAILER_SIZE 8
+#define ZLIB_HEADER_SIZE 2
+#define ZLIB_TRAILER_SIZE 4
+
+/**
+ * @brief Prints a table of uint8_t elements to a file.
+ * @param outfile: the file the table is printed to.
+ * @param table: the table to be printed.
+ * @param length: number of elements to be printed.
+ * @param header: header to append in front of the table.
+ * @param footer: footer to append at the end of the table.
+ * @param begin_line: string printed at beginning of new line
+ */
+void fprint_uint8_table(FILE * outfile, uint8_t * table, uint64_t length, char *header,
+ char *footer, char *begin_line)
+{
+ int i;
+ fprintf(outfile, "%s", header);
+ for (i = 0; i < length - 1; i++) {
+ if ((i & 7) == 0)
+ fprintf(outfile, "\n%s", begin_line);
+ else
+ fprintf(outfile, " ");
+ fprintf(outfile, "0x%02x,", table[i]);
+ }
+
+ if ((i & 7) == 0)
+ fprintf(outfile, "\n%s", begin_line);
+ else
+ fprintf(outfile, " ");
+ fprintf(outfile, "0x%02x", table[i]);
+ fprintf(outfile, "%s", footer);
+
+}
+
+/**
+ * @brief Prints a table of uint16_t elements to a file.
+ * @param outfile: the file the table is printed to.
+ * @param table: the table to be printed.
+ * @param length: number of elements to be printed.
+ * @param header: header to append in front of the table.
+ * @param footer: footer to append at the end of the table.
+ * @param begin_line: string printed at beginning of new line
+ */
+void fprint_uint16_table(FILE * outfile, uint16_t * table, uint64_t length, char *header,
+ char *footer, char *begin_line)
+{
+ int i;
+ fprintf(outfile, "%s", header);
+ for (i = 0; i < length - 1; i++) {
+ if ((i & 7) == 0)
+ fprintf(outfile, "\n%s", begin_line);
+ else
+ fprintf(outfile, " ");
+ fprintf(outfile, "0x%04x,", table[i]);
+ }
+
+ if ((i & 7) == 0)
+ fprintf(outfile, "\n%s", begin_line);
+ else
+ fprintf(outfile, " ");
+ fprintf(outfile, "0x%04x", table[i]);
+ fprintf(outfile, "%s", footer);
+
+}
+
+/**
+ * @brief Prints a table of uint32_t elements to a file.
+ * @param outfile: the file the table is printed to.
+ * @param table: the table to be printed.
+ * @param length: number of elements to be printed.
+ * @param header: header to append in front of the table.
+ * @param footer: footer to append at the end of the table.
+ * @param begin_line: string printed at beginning of new line
+ */
+void fprint_uint32_table(FILE * outfile, uint32_t * table, uint64_t length, char *header,
+ char *footer, char *begin_line)
+{
+ int i;
+ fprintf(outfile, "%s", header);
+ for (i = 0; i < length - 1; i++) {
+ if ((i & 3) == 0)
+ fprintf(outfile, "\n%s", begin_line);
+ else
+ fprintf(outfile, " ");
+ fprintf(outfile, "0x%08x,", table[i]);
+ }
+
+ if ((i & 3) == 0)
+ fprintf(outfile, "%s", begin_line);
+ else
+ fprintf(outfile, " ");
+ fprintf(outfile, "0x%08x", table[i]);
+ fprintf(outfile, "%s", footer);
+
+}
+
+void fprint_hufftables(FILE * output_file, char *hufftables_name,
+ struct isal_hufftables *hufftables)
+{
+ fprintf(output_file, "struct isal_hufftables %s = {\n\n", hufftables_name);
+
+ fprint_uint8_table(output_file, hufftables->deflate_hdr,
+ hufftables->deflate_hdr_count +
+ (hufftables->deflate_hdr_extra_bits + 7) / 8,
+ "\t.deflate_hdr = {", "},\n\n", "\t\t");
+
+ fprintf(output_file, "\t.deflate_hdr_count = %d,\n", hufftables->deflate_hdr_count);
+ fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n",
+ hufftables->deflate_hdr_extra_bits);
+
+ fprint_uint32_table(output_file, hufftables->dist_table, IGZIP_DIST_TABLE_SIZE,
+ "\t.dist_table = {", "},\n\n", "\t\t");
+
+ fprint_uint32_table(output_file, hufftables->len_table, IGZIP_LEN_TABLE_SIZE,
+ "\t.len_table = {", "},\n\n", "\t\t");
+
+ fprint_uint16_table(output_file, hufftables->lit_table, IGZIP_LIT_TABLE_SIZE,
+ "\t.lit_table = {", "},\n\n", "\t\t");
+ fprint_uint8_table(output_file, hufftables->lit_table_sizes, IGZIP_LIT_TABLE_SIZE,
+ "\t.lit_table_sizes = {", "},\n\n", "\t\t");
+
+ fprint_uint16_table(output_file, hufftables->dcodes,
+ ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET,
+ "\t.dcodes = {", "},\n\n", "\t\t");
+ fprint_uint8_table(output_file, hufftables->dcodes_sizes,
+ ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET,
+ "\t.dcodes_sizes = {", "}\n", "\t\t");
+ fprintf(output_file, "};\n");
+}
+
+void fprint_header(FILE * output_file)
+{
+
+ fprintf(output_file, "#include <stdint.h>\n");
+ fprintf(output_file, "#include <igzip_lib.h>\n\n");
+
+ fprintf(output_file, "#if IGZIP_HIST_SIZE > %d\n"
+ "# error \"Invalid history size for the custom hufftable\"\n"
+ "#endif\n", IGZIP_HIST_SIZE);
+
+#ifdef LONGER_HUFFTABLE
+ fprintf(output_file, "#ifndef LONGER_HUFFTABLE\n"
+ "# error \"Custom hufftable requires LONGER_HUFFTABLE to be defined \"\n"
+ "#endif\n");
+#else
+ fprintf(output_file, "#ifdef LONGER_HUFFTABLE\n"
+ "# error \"Custom hufftable requires LONGER_HUFFTABLE to not be defined \"\n"
+ "#endif\n");
+#endif
+ fprintf(output_file, "\n");
+
+ fprintf(output_file, "const uint8_t gzip_hdr[] = {\n"
+ "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n" "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n");
+
+ fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE);
+ fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE);
+
+ fprintf(output_file, "const uint8_t zlib_hdr[] = { 0x78, 0x01 };\n\n");
+ fprintf(output_file, "const uint32_t zlib_hdr_bytes = %d;\n", ZLIB_HEADER_SIZE);
+ fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE);
+}
+
+static uint32_t convert_dist_to_dist_sym(uint32_t dist)
+{
+ assert(dist <= 32768 && dist > 0);
+ if (dist <= 32768) {
+ uint32_t msb = dist > 4 ? bsr(dist - 1) - 2 : 0;
+ return (msb * 2) + ((dist - 1) >> msb);
+ } else {
+ return ~0;
+ }
+}
+
+/**
+ * @brief Returns the deflate symbol value for a repeat length.
+ */
+static uint32_t convert_length_to_len_sym(uint32_t length)
+{
+ assert(length > 2 && length < 259);
+
+ /* Based on tables on page 11 in RFC 1951 */
+ if (length < 11)
+ return 257 + length - 3;
+ else if (length < 19)
+ return 261 + (length - 3) / 2;
+ else if (length < 35)
+ return 265 + (length - 3) / 4;
+ else if (length < 67)
+ return 269 + (length - 3) / 8;
+ else if (length < 131)
+ return 273 + (length - 3) / 16;
+ else if (length < 258)
+ return 277 + (length - 3) / 32;
+ else
+ return 285;
+}
+
+void isal_update_histogram_dict(uint8_t * start_stream, int dict_length, int length,
+ struct isal_huff_histogram *histogram)
+{
+ uint32_t literal = 0, hash;
+ uint16_t seen, *last_seen = histogram->hash_table;
+ uint8_t *current, *end_stream, *next_hash, *end, *end_dict;
+ uint32_t match_length;
+ uint32_t dist;
+ uint64_t *lit_len_histogram = histogram->lit_len_histogram;
+ uint64_t *dist_histogram = histogram->dist_histogram;
+
+ if (length <= 0)
+ return;
+
+ end_stream = start_stream + dict_length + length;
+ end_dict = start_stream + dict_length;
+
+ memset(last_seen, 0, sizeof(histogram->hash_table)); /* Initialize last_seen to be 0. */
+
+ for (current = start_stream; current < end_dict - 4; current++) {
+ literal = load_u32(current);
+ hash = compute_hash(literal) & LVL0_HASH_MASK;
+ last_seen[hash] = (current - start_stream) & 0xFFFF;
+ }
+
+ for (current = start_stream + dict_length; current < end_stream - 3; current++) {
+ literal = load_u32(current);
+ hash = compute_hash(literal) & LVL0_HASH_MASK;
+ seen = last_seen[hash];
+ last_seen[hash] = (current - start_stream) & 0xFFFF;
+ dist = (current - start_stream - seen) & 0xFFFF;
+ if (dist - 1 < D - 1) {
+ assert(start_stream <= current - dist);
+ match_length =
+ compare258(current - dist, current, end_stream - current);
+ if (match_length >= SHORTEST_MATCH) {
+ next_hash = current;
+#ifdef ISAL_LIMIT_HASH_UPDATE
+ end = next_hash + 3;
+#else
+ end = next_hash + match_length;
+#endif
+ if (end > end_stream - 3)
+ end = end_stream - 3;
+ next_hash++;
+ for (; next_hash < end; next_hash++) {
+ literal = load_u32(next_hash);
+ hash = compute_hash(literal) & LVL0_HASH_MASK;
+ last_seen[hash] = (next_hash - start_stream) & 0xFFFF;
+ }
+
+ dist_histogram[convert_dist_to_dist_sym(dist)] += 1;
+ lit_len_histogram[convert_length_to_len_sym(match_length)] +=
+ 1;
+ current += match_length - 1;
+ continue;
+ }
+ }
+ lit_len_histogram[literal & 0xFF] += 1;
+ }
+
+ for (; current < end_stream; current++)
+ lit_len_histogram[*current] += 1;
+
+ lit_len_histogram[256] += 1;
+ return;
+}
+
+int main(int argc, char *argv[])
+{
+ long int file_length;
+ int argi = 1;
+ uint8_t *stream = NULL;
+ struct isal_hufftables hufftables;
+ struct isal_huff_histogram histogram;
+ struct isal_zstream tmp_stream;
+ FILE *file = NULL;
+ FILE *dict_file = NULL;
+ FILE *hist_file = NULL;
+ long int dict_file_length = 0;
+ long int hist_file_length = 0;
+ uint8_t *dict_stream = NULL;
+
+ if (argc == 1) {
+ printf("Error, no input file.\n");
+ return 1;
+ }
+
+ if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') {
+ dict_file = fopen(argv[2], "r");
+
+ fseek(dict_file, 0, SEEK_END);
+ dict_file_length = ftell(dict_file);
+ fseek(dict_file, 0, SEEK_SET);
+ dict_file_length -= ftell(dict_file);
+ dict_stream = malloc(dict_file_length);
+ if (dict_stream == NULL) {
+ printf("Failed to allocate memory to read in dictionary file\n");
+ fclose(dict_file);
+ return 1;
+ }
+ if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) {
+ printf("Error occurred when reading dictionary file");
+ fclose(dict_file);
+ free(dict_stream);
+ return 1;
+ }
+ isal_update_histogram(dict_stream, dict_file_length, &histogram);
+
+ printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]);
+ argi += 2;
+ fclose(dict_file);
+ free(dict_stream);
+ }
+
+ if ((argc > argi + 1) && argv[argi][0] == '-' && argv[argi][1] == 'h') {
+ hist_file = fopen(argv[argi + 1], "r+");
+ fseek(hist_file, 0, SEEK_END);
+ hist_file_length = ftell(hist_file);
+ fseek(hist_file, 0, SEEK_SET);
+ hist_file_length -= ftell(hist_file);
+ if (hist_file_length > sizeof(histogram)) {
+ printf("Histogram file too long\n");
+ return 1;
+ }
+ if (fread(&histogram, 1, hist_file_length, hist_file) != hist_file_length) {
+ printf("Error occurred when reading history file");
+ fclose(hist_file);
+ return 1;
+ }
+ fseek(hist_file, 0, SEEK_SET);
+
+ printf("Read %ld bytes of history file %s\n", hist_file_length,
+ argv[argi + 1]);
+ argi += 2;
+ } else
+ memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */
+
+ while (argi < argc) {
+ printf("Processing %s\n", argv[argi]);
+ file = fopen(argv[argi], "r");
+ if (file == NULL) {
+ printf("Error opening file\n");
+ return 1;
+ }
+ fseek(file, 0, SEEK_END);
+ file_length = ftell(file);
+ fseek(file, 0, SEEK_SET);
+ file_length -= ftell(file);
+ stream = malloc(file_length + dict_file_length);
+ if (stream == NULL) {
+ printf("Failed to allocate memory to read in file\n");
+ fclose(file);
+ return 1;
+ }
+ if (dict_file_length > 0)
+ memcpy(stream, dict_stream, dict_file_length);
+
+ if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) {
+ printf("Error occurred when reading file");
+ fclose(file);
+ free(stream);
+ return 1;
+ }
+
+ /* Create a histogram of frequency of symbols found in stream to
+ * generate the huffman tree.*/
+ if (0 == dict_file_length)
+ isal_update_histogram(stream, file_length, &histogram);
+ else
+ isal_update_histogram_dict(stream, dict_file_length, file_length,
+ &histogram);
+
+ fclose(file);
+ free(stream);
+ argi++;
+ }
+
+ isal_create_hufftables(&hufftables, &histogram);
+
+ file = fopen("hufftables_c.c", "w");
+ if (file == NULL) {
+ printf("Error creating file hufftables_c.c\n");
+ return 1;
+ }
+
+ fprint_header(file);
+
+ fprintf(file, "\n");
+
+ fprint_hufftables(file, "hufftables_default", &hufftables);
+
+ fprintf(file, "\n");
+
+ isal_deflate_stateless_init(&tmp_stream);
+ isal_deflate_set_hufftables(&tmp_stream, NULL, IGZIP_HUFFTABLE_STATIC);
+ fprint_hufftables(file, "hufftables_static", tmp_stream.hufftables);
+
+ fclose(file);
+
+ if (hist_file) {
+ int len = fwrite(&histogram, 1, sizeof(histogram), hist_file);
+ printf("wrote %d bytes of histogram file\n", len);
+ fclose(hist_file);
+ }
+ return 0;
+}