diff options
Diffstat (limited to 'doc')
33 files changed, 4889 insertions, 0 deletions
diff --git a/doc/examples/00_README.txt b/doc/examples/00_README.txt new file mode 100644 index 0000000..120e1eb --- /dev/null +++ b/doc/examples/00_README.txt @@ -0,0 +1,31 @@ + +liblzma example programs +======================== + +Introduction + + The examples are written so that the same comments aren't + repeated (much) in later files. + + On POSIX systems, the examples should build by just typing "make". + + The examples that use stdin or stdout don't set stdin and stdout + to binary mode. On systems where it matters (e.g. Windows) it is + possible that the examples won't work without modification. + + +List of examples + + 01_compress_easy.c Multi-call compression using + a compression preset + + 02_decompress.c Multi-call decompression + + 03_compress_custom.c Like 01_compress_easy.c but using + a custom filter chain + (x86 BCJ + LZMA2) + + 04_compress_easy_mt.c Multi-threaded multi-call + compression using a compression + preset + diff --git a/doc/examples/01_compress_easy.c b/doc/examples/01_compress_easy.c new file mode 100644 index 0000000..ec32a37 --- /dev/null +++ b/doc/examples/01_compress_easy.c @@ -0,0 +1,297 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file 01_compress_easy.c +/// \brief Compress from stdin to stdout in multi-call mode +/// +/// Usage: ./01_compress_easy PRESET < INFILE > OUTFILE +/// +/// Example: ./01_compress_easy 6 < foo > foo.xz +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <lzma.h> + + +static void +show_usage_and_exit(const char *argv0) +{ + fprintf(stderr, "Usage: %s PRESET < INFILE > OUTFILE\n" + "PRESET is a number 0-9 and can optionally be " + "followed by `e' to indicate extreme preset\n", + argv0); + exit(EXIT_FAILURE); +} + + +static uint32_t +get_preset(int argc, char **argv) +{ + // One argument whose first char must be 0-9. + if (argc != 2 || argv[1][0] < '0' || argv[1][0] > '9') + show_usage_and_exit(argv[0]); + + // Calculate the preste level 0-9. + uint32_t preset = argv[1][0] - '0'; + + // If there is a second char, it must be 'e'. It will set + // the LZMA_PRESET_EXTREME flag. + if (argv[1][1] != '\0') { + if (argv[1][1] != 'e' || argv[1][2] != '\0') + show_usage_and_exit(argv[0]); + + preset |= LZMA_PRESET_EXTREME; + } + + return preset; +} + + +static bool +init_encoder(lzma_stream *strm, uint32_t preset) +{ + // Initialize the encoder using a preset. Set the integrity to check + // to CRC64, which is the default in the xz command line tool. If + // the .xz file needs to be decompressed with XZ Embedded, use + // LZMA_CHECK_CRC32 instead. + lzma_ret ret = lzma_easy_encoder(strm, preset, LZMA_CHECK_CRC64); + + // Return successfully if the initialization went fine. + if (ret == LZMA_OK) + return true; + + // Something went wrong. The possible errors are documented in + // lzma/container.h (src/liblzma/api/lzma/container.h in the source + // package or e.g. /usr/include/lzma/container.h depending on the + // install prefix). + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Specified preset is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + default: + // This is most likely LZMA_PROG_ERROR indicating a bug in + // this program or in liblzma. It is inconvenient to have a + // separate error message for errors that should be impossible + // to occur, but knowing the error code is important for + // debugging. That's why it is good to print the error code + // at least when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Error initializing the encoder: %s (error code %u)\n", + msg, ret); + return false; +} + + +static bool +compress(lzma_stream *strm, FILE *infile, FILE *outfile) +{ + // This will be LZMA_RUN until the end of the input file is reached. + // This tells lzma_code() when there will be no more input. + lzma_action action = LZMA_RUN; + + // Buffers to temporarily hold uncompressed input + // and compressed output. + uint8_t inbuf[BUFSIZ]; + uint8_t outbuf[BUFSIZ]; + + // Initialize the input and output pointers. Initializing next_in + // and avail_in isn't really necessary when we are going to encode + // just one file since LZMA_STREAM_INIT takes care of initializing + // those already. But it doesn't hurt much and it will be needed + // if encoding more than one file like we will in 02_decompress.c. + // + // While we don't care about strm->total_in or strm->total_out in this + // example, it is worth noting that initializing the encoder will + // always reset total_in and total_out to zero. But the encoder + // initialization doesn't touch next_in, avail_in, next_out, or + // avail_out. + strm->next_in = NULL; + strm->avail_in = 0; + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + + // Loop until the file has been successfully compressed or until + // an error occurs. + while (true) { + // Fill the input buffer if it is empty. + if (strm->avail_in == 0 && !feof(infile)) { + strm->next_in = inbuf; + strm->avail_in = fread(inbuf, 1, sizeof(inbuf), + infile); + + if (ferror(infile)) { + fprintf(stderr, "Read error: %s\n", + strerror(errno)); + return false; + } + + // Once the end of the input file has been reached, + // we need to tell lzma_code() that no more input + // will be coming and that it should finish the + // encoding. + if (feof(infile)) + action = LZMA_FINISH; + } + + // Tell liblzma do the actual encoding. + // + // This reads up to strm->avail_in bytes of input starting + // from strm->next_in. avail_in will be decremented and + // next_in incremented by an equal amount to match the + // number of input bytes consumed. + // + // Up to strm->avail_out bytes of compressed output will be + // written starting from strm->next_out. avail_out and next_out + // will be incremented by an equal amount to match the number + // of output bytes written. + // + // The encoder has to do internal buffering, which means that + // it may take quite a bit of input before the same data is + // available in compressed form in the output buffer. + lzma_ret ret = lzma_code(strm, action); + + // If the output buffer is full or if the compression finished + // successfully, write the data from the output buffer to + // the output file. + if (strm->avail_out == 0 || ret == LZMA_STREAM_END) { + // When lzma_code() has returned LZMA_STREAM_END, + // the output buffer is likely to be only partially + // full. Calculate how much new data there is to + // be written to the output file. + size_t write_size = sizeof(outbuf) - strm->avail_out; + + if (fwrite(outbuf, 1, write_size, outfile) + != write_size) { + fprintf(stderr, "Write error: %s\n", + strerror(errno)); + return false; + } + + // Reset next_out and avail_out. + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + } + + // Normally the return value of lzma_code() will be LZMA_OK + // until everything has been encoded. + if (ret != LZMA_OK) { + // Once everything has been encoded successfully, the + // return value of lzma_code() will be LZMA_STREAM_END. + // + // It is important to check for LZMA_STREAM_END. Do not + // assume that getting ret != LZMA_OK would mean that + // everything has gone well. + if (ret == LZMA_STREAM_END) + return true; + + // It's not LZMA_OK nor LZMA_STREAM_END, + // so it must be an error code. See lzma/base.h + // (src/liblzma/api/lzma/base.h in the source package + // or e.g. /usr/include/lzma/base.h depending on the + // install prefix) for the list and documentation of + // possible values. Most values listen in lzma_ret + // enumeration aren't possible in this example. + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_DATA_ERROR: + // This error is returned if the compressed + // or uncompressed size get near 8 EiB + // (2^63 bytes) because that's where the .xz + // file format size limits currently are. + // That is, the possibility of this error + // is mostly theoretical unless you are doing + // something very unusual. + // + // Note that strm->total_in and strm->total_out + // have nothing to do with this error. Changing + // those variables won't increase or decrease + // the chance of getting this error. + msg = "File size limits exceeded"; + break; + + default: + // This is most likely LZMA_PROG_ERROR, but + // if this program is buggy (or liblzma has + // a bug), it may be e.g. LZMA_BUF_ERROR or + // LZMA_OPTIONS_ERROR too. + // + // It is inconvenient to have a separate + // error message for errors that should be + // impossible to occur, but knowing the error + // code is important for debugging. That's why + // it is good to print the error code at least + // when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Encoder error: %s (error code %u)\n", + msg, ret); + return false; + } + } +} + + +extern int +main(int argc, char **argv) +{ + // Get the preset number from the command line. + uint32_t preset = get_preset(argc, argv); + + // Initialize a lzma_stream structure. When it is allocated on stack, + // it is simplest to use LZMA_STREAM_INIT macro like below. When it + // is allocated on heap, using memset(strmptr, 0, sizeof(*strmptr)) + // works (as long as NULL pointers are represented with zero bits + // as they are on practically all computers today). + lzma_stream strm = LZMA_STREAM_INIT; + + // Initialize the encoder. If it succeeds, compress from + // stdin to stdout. + bool success = init_encoder(&strm, preset); + if (success) + success = compress(&strm, stdin, stdout); + + // Free the memory allocated for the encoder. If we were encoding + // multiple files, this would only need to be done after the last + // file. See 02_decompress.c for handling of multiple files. + // + // It is OK to call lzma_end() multiple times or when it hasn't been + // actually used except initialized with LZMA_STREAM_INIT. + lzma_end(&strm); + + // Close stdout to catch possible write errors that can occur + // when pending data is flushed from the stdio buffers. + if (fclose(stdout)) { + fprintf(stderr, "Write error: %s\n", strerror(errno)); + success = false; + } + + return success ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/doc/examples/02_decompress.c b/doc/examples/02_decompress.c new file mode 100644 index 0000000..98339be --- /dev/null +++ b/doc/examples/02_decompress.c @@ -0,0 +1,287 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file 02_decompress.c +/// \brief Decompress .xz files to stdout +/// +/// Usage: ./02_decompress INPUT_FILES... > OUTFILE +/// +/// Example: ./02_decompress foo.xz bar.xz > foobar +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <lzma.h> + + +static bool +init_decoder(lzma_stream *strm) +{ + // Initialize a .xz decoder. The decoder supports a memory usage limit + // and a set of flags. + // + // The memory usage of the decompressor depends on the settings used + // to compress a .xz file. It can vary from less than a megabyte to + // a few gigabytes, but in practice (at least for now) it rarely + // exceeds 65 MiB because that's how much memory is required to + // decompress files created with "xz -9". Settings requiring more + // memory take extra effort to use and don't (at least for now) + // provide significantly better compression in most cases. + // + // Memory usage limit is useful if it is important that the + // decompressor won't consume gigabytes of memory. The need + // for limiting depends on the application. In this example, + // no memory usage limiting is used. This is done by setting + // the limit to UINT64_MAX. + // + // The .xz format allows concatenating compressed files as is: + // + // echo foo | xz > foobar.xz + // echo bar | xz >> foobar.xz + // + // When decompressing normal standalone .xz files, LZMA_CONCATENATED + // should always be used to support decompression of concatenated + // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop + // after the first .xz stream. This can be useful when .xz data has + // been embedded inside another file format. + // + // Flags other than LZMA_CONCATENATED are supported too, and can + // be combined with bitwise-or. See lzma/container.h + // (src/liblzma/api/lzma/container.h in the source package or e.g. + // /usr/include/lzma/container.h depending on the install prefix) + // for details. + lzma_ret ret = lzma_stream_decoder( + strm, UINT64_MAX, LZMA_CONCATENATED); + + // Return successfully if the initialization went fine. + if (ret == LZMA_OK) + return true; + + // Something went wrong. The possible errors are documented in + // lzma/container.h (src/liblzma/api/lzma/container.h in the source + // package or e.g. /usr/include/lzma/container.h depending on the + // install prefix). + // + // Note that LZMA_MEMLIMIT_ERROR is never possible here. If you + // specify a very tiny limit, the error will be delayed until + // the first headers have been parsed by a call to lzma_code(). + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Unsupported decompressor flags"; + break; + + default: + // This is most likely LZMA_PROG_ERROR indicating a bug in + // this program or in liblzma. It is inconvenient to have a + // separate error message for errors that should be impossible + // to occur, but knowing the error code is important for + // debugging. That's why it is good to print the error code + // at least when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Error initializing the decoder: %s (error code %u)\n", + msg, ret); + return false; +} + + +static bool +decompress(lzma_stream *strm, const char *inname, FILE *infile, FILE *outfile) +{ + // When LZMA_CONCATENATED flag was used when initializing the decoder, + // we need to tell lzma_code() when there will be no more input. + // This is done by setting action to LZMA_FINISH instead of LZMA_RUN + // in the same way as it is done when encoding. + // + // When LZMA_CONCATENATED isn't used, there is no need to use + // LZMA_FINISH to tell when all the input has been read, but it + // is still OK to use it if you want. When LZMA_CONCATENATED isn't + // used, the decoder will stop after the first .xz stream. In that + // case some unused data may be left in strm->next_in. + lzma_action action = LZMA_RUN; + + uint8_t inbuf[BUFSIZ]; + uint8_t outbuf[BUFSIZ]; + + strm->next_in = NULL; + strm->avail_in = 0; + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + + while (true) { + if (strm->avail_in == 0 && !feof(infile)) { + strm->next_in = inbuf; + strm->avail_in = fread(inbuf, 1, sizeof(inbuf), + infile); + + if (ferror(infile)) { + fprintf(stderr, "%s: Read error: %s\n", + inname, strerror(errno)); + return false; + } + + // Once the end of the input file has been reached, + // we need to tell lzma_code() that no more input + // will be coming. As said before, this isn't required + // if the LZMA_CONCATENATED flag isn't used when + // initializing the decoder. + if (feof(infile)) + action = LZMA_FINISH; + } + + lzma_ret ret = lzma_code(strm, action); + + if (strm->avail_out == 0 || ret == LZMA_STREAM_END) { + size_t write_size = sizeof(outbuf) - strm->avail_out; + + if (fwrite(outbuf, 1, write_size, outfile) + != write_size) { + fprintf(stderr, "Write error: %s\n", + strerror(errno)); + return false; + } + + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + } + + if (ret != LZMA_OK) { + // Once everything has been decoded successfully, the + // return value of lzma_code() will be LZMA_STREAM_END. + // + // It is important to check for LZMA_STREAM_END. Do not + // assume that getting ret != LZMA_OK would mean that + // everything has gone well or that when you aren't + // getting more output it must have successfully + // decoded everything. + if (ret == LZMA_STREAM_END) + return true; + + // It's not LZMA_OK nor LZMA_STREAM_END, + // so it must be an error code. See lzma/base.h + // (src/liblzma/api/lzma/base.h in the source package + // or e.g. /usr/include/lzma/base.h depending on the + // install prefix) for the list and documentation of + // possible values. Many values listen in lzma_ret + // enumeration aren't possible in this example, but + // can be made possible by enabling memory usage limit + // or adding flags to the decoder initialization. + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_FORMAT_ERROR: + // .xz magic bytes weren't found. + msg = "The input is not in the .xz format"; + break; + + case LZMA_OPTIONS_ERROR: + // For example, the headers specify a filter + // that isn't supported by this liblzma + // version (or it hasn't been enabled when + // building liblzma, but no-one sane does + // that unless building liblzma for an + // embedded system). Upgrading to a newer + // liblzma might help. + // + // Note that it is unlikely that the file has + // accidentally became corrupt if you get this + // error. The integrity of the .xz headers is + // always verified with a CRC32, so + // unintentionally corrupt files can be + // distinguished from unsupported files. + msg = "Unsupported compression options"; + break; + + case LZMA_DATA_ERROR: + msg = "Compressed file is corrupt"; + break; + + case LZMA_BUF_ERROR: + // Typically this error means that a valid + // file has got truncated, but it might also + // be a damaged part in the file that makes + // the decoder think the file is truncated. + // If you prefer, you can use the same error + // message for this as for LZMA_DATA_ERROR. + msg = "Compressed file is truncated or " + "otherwise corrupt"; + break; + + default: + // This is most likely LZMA_PROG_ERROR. + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "%s: Decoder error: " + "%s (error code %u)\n", + inname, msg, ret); + return false; + } + } +} + + +extern int +main(int argc, char **argv) +{ + if (argc <= 1) { + fprintf(stderr, "Usage: %s FILES...\n", argv[0]); + return EXIT_FAILURE; + } + + lzma_stream strm = LZMA_STREAM_INIT; + + bool success = true; + + // Try to decompress all files. + for (int i = 1; i < argc; ++i) { + if (!init_decoder(&strm)) { + // Decoder initialization failed. There's no point + // to retry it so we need to exit. + success = false; + break; + } + + FILE *infile = fopen(argv[i], "rb"); + + if (infile == NULL) { + fprintf(stderr, "%s: Error opening the " + "input file: %s\n", + argv[i], strerror(errno)); + success = false; + } else { + success &= decompress(&strm, argv[i], infile, stdout); + fclose(infile); + } + } + + // Free the memory allocated for the decoder. This only needs to be + // done after the last file. + lzma_end(&strm); + + if (fclose(stdout)) { + fprintf(stderr, "Write error: %s\n", strerror(errno)); + success = false; + } + + return success ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/doc/examples/03_compress_custom.c b/doc/examples/03_compress_custom.c new file mode 100644 index 0000000..40c85e3 --- /dev/null +++ b/doc/examples/03_compress_custom.c @@ -0,0 +1,193 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file 03_compress_custom.c +/// \brief Compress in multi-call mode using x86 BCJ and LZMA2 +/// +/// Usage: ./03_compress_custom < INFILE > OUTFILE +/// +/// Example: ./03_compress_custom < foo > foo.xz +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <lzma.h> + + +static bool +init_encoder(lzma_stream *strm) +{ + // Use the default preset (6) for LZMA2. + // + // The lzma_options_lzma structure and the lzma_lzma_preset() function + // are declared in lzma/lzma12.h (src/liblzma/api/lzma/lzma12.h in the + // source package or e.g. /usr/include/lzma/lzma12.h depending on + // the install prefix). + lzma_options_lzma opt_lzma2; + if (lzma_lzma_preset(&opt_lzma2, LZMA_PRESET_DEFAULT)) { + // It should never fail because the default preset + // (and presets 0-9 optionally with LZMA_PRESET_EXTREME) + // are supported by all stable liblzma versions. + // + // (The encoder initialization later in this function may + // still fail due to unsupported preset *if* the features + // required by the preset have been disabled at build time, + // but no-one does such things except on embedded systems.) + fprintf(stderr, "Unsupported preset, possibly a bug\n"); + return false; + } + + // Now we could customize the LZMA2 options if we wanted. For example, + // we could set the the dictionary size (opt_lzma2.dict_size) to + // something else than the default (8 MiB) of the default preset. + // See lzma/lzma12.h for details of all LZMA2 options. + // + // The x86 BCJ filter will try to modify the x86 instruction stream so + // that LZMA2 can compress it better. The x86 BCJ filter doesn't need + // any options so it will be set to NULL below. + // + // Construct the filter chain. The uncompressed data goes first to + // the first filter in the array, in this case the x86 BCJ filter. + // The array is always terminated by setting .id = LZMA_VLI_UNKNOWN. + // + // See lzma/filter.h for more information about the lzma_filter + // structure. + lzma_filter filters[] = { + { .id = LZMA_FILTER_X86, .options = NULL }, + { .id = LZMA_FILTER_LZMA2, .options = &opt_lzma2 }, + { .id = LZMA_VLI_UNKNOWN, .options = NULL }, + }; + + // Initialize the encoder using the custom filter chain. + lzma_ret ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + + if (ret == LZMA_OK) + return true; + + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + // We are no longer using a plain preset so this error + // message has been edited accordingly compared to + // 01_compress_easy.c. + msg = "Specified filter chain is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + default: + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Error initializing the encoder: %s (error code %u)\n", + msg, ret); + return false; +} + + +// This function is identical to the one in 01_compress_easy.c. +static bool +compress(lzma_stream *strm, FILE *infile, FILE *outfile) +{ + lzma_action action = LZMA_RUN; + + uint8_t inbuf[BUFSIZ]; + uint8_t outbuf[BUFSIZ]; + + strm->next_in = NULL; + strm->avail_in = 0; + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + + while (true) { + if (strm->avail_in == 0 && !feof(infile)) { + strm->next_in = inbuf; + strm->avail_in = fread(inbuf, 1, sizeof(inbuf), + infile); + + if (ferror(infile)) { + fprintf(stderr, "Read error: %s\n", + strerror(errno)); + return false; + } + + if (feof(infile)) + action = LZMA_FINISH; + } + + lzma_ret ret = lzma_code(strm, action); + + if (strm->avail_out == 0 || ret == LZMA_STREAM_END) { + size_t write_size = sizeof(outbuf) - strm->avail_out; + + if (fwrite(outbuf, 1, write_size, outfile) + != write_size) { + fprintf(stderr, "Write error: %s\n", + strerror(errno)); + return false; + } + + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + } + + if (ret != LZMA_OK) { + if (ret == LZMA_STREAM_END) + return true; + + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_DATA_ERROR: + msg = "File size limits exceeded"; + break; + + default: + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Encoder error: %s (error code %u)\n", + msg, ret); + return false; + } + } +} + + +extern int +main(void) +{ + lzma_stream strm = LZMA_STREAM_INIT; + + bool success = init_encoder(&strm); + if (success) + success = compress(&strm, stdin, stdout); + + lzma_end(&strm); + + if (fclose(stdout)) { + fprintf(stderr, "Write error: %s\n", strerror(errno)); + success = false; + } + + return success ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/doc/examples/04_compress_easy_mt.c b/doc/examples/04_compress_easy_mt.c new file mode 100644 index 0000000..efe5697 --- /dev/null +++ b/doc/examples/04_compress_easy_mt.c @@ -0,0 +1,206 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file 04_compress_easy_mt.c +/// \brief Compress in multi-call mode using LZMA2 in multi-threaded mode +/// +/// Usage: ./04_compress_easy_mt < INFILE > OUTFILE +/// +/// Example: ./04_compress_easy_mt < foo > foo.xz +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <lzma.h> + + +static bool +init_encoder(lzma_stream *strm) +{ + // The threaded encoder takes the options as pointer to + // a lzma_mt structure. + lzma_mt mt = { + // No flags are needed. + .flags = 0, + + // Let liblzma determine a sane block size. + .block_size = 0, + + // Use no timeout for lzma_code() calls by setting timeout + // to zero. That is, sometimes lzma_code() might block for + // a long time (from several seconds to even minutes). + // If this is not OK, for example due to progress indicator + // needing updates, specify a timeout in milliseconds here. + // See the documentation of lzma_mt in lzma/container.h for + // information how to choose a reasonable timeout. + .timeout = 0, + + // Use the default preset (6) for LZMA2. + // To use a preset, filters must be set to NULL. + .preset = LZMA_PRESET_DEFAULT, + .filters = NULL, + + // Use CRC64 for integrity checking. See also + // 01_compress_easy.c about choosing the integrity check. + .check = LZMA_CHECK_CRC64, + }; + + // Detect how many threads the CPU supports. + mt.threads = lzma_cputhreads(); + + // If the number of CPU cores/threads cannot be detected, + // use one thread. Note that this isn't the same as the normal + // single-threaded mode as this will still split the data into + // blocks and use more RAM than the normal single-threaded mode. + // You may want to consider using lzma_easy_encoder() or + // lzma_stream_encoder() instead of lzma_stream_encoder_mt() if + // lzma_cputhreads() returns 0 or 1. + if (mt.threads == 0) + mt.threads = 1; + + // If the number of CPU cores/threads exceeds threads_max, + // limit the number of threads to keep memory usage lower. + // The number 8 is arbitrarily chosen and may be too low or + // high depending on the compression preset and the computer + // being used. + // + // FIXME: A better way could be to check the amount of RAM + // (or available RAM) and use lzma_stream_encoder_mt_memusage() + // to determine if the number of threads should be reduced. + const uint32_t threads_max = 8; + if (mt.threads > threads_max) + mt.threads = threads_max; + + // Initialize the threaded encoder. + lzma_ret ret = lzma_stream_encoder_mt(strm, &mt); + + if (ret == LZMA_OK) + return true; + + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + // We are no longer using a plain preset so this error + // message has been edited accordingly compared to + // 01_compress_easy.c. + msg = "Specified filter chain is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + default: + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Error initializing the encoder: %s (error code %u)\n", + msg, ret); + return false; +} + + +// This function is identical to the one in 01_compress_easy.c. +static bool +compress(lzma_stream *strm, FILE *infile, FILE *outfile) +{ + lzma_action action = LZMA_RUN; + + uint8_t inbuf[BUFSIZ]; + uint8_t outbuf[BUFSIZ]; + + strm->next_in = NULL; + strm->avail_in = 0; + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + + while (true) { + if (strm->avail_in == 0 && !feof(infile)) { + strm->next_in = inbuf; + strm->avail_in = fread(inbuf, 1, sizeof(inbuf), + infile); + + if (ferror(infile)) { + fprintf(stderr, "Read error: %s\n", + strerror(errno)); + return false; + } + + if (feof(infile)) + action = LZMA_FINISH; + } + + lzma_ret ret = lzma_code(strm, action); + + if (strm->avail_out == 0 || ret == LZMA_STREAM_END) { + size_t write_size = sizeof(outbuf) - strm->avail_out; + + if (fwrite(outbuf, 1, write_size, outfile) + != write_size) { + fprintf(stderr, "Write error: %s\n", + strerror(errno)); + return false; + } + + strm->next_out = outbuf; + strm->avail_out = sizeof(outbuf); + } + + if (ret != LZMA_OK) { + if (ret == LZMA_STREAM_END) + return true; + + const char *msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_DATA_ERROR: + msg = "File size limits exceeded"; + break; + + default: + msg = "Unknown error, possibly a bug"; + break; + } + + fprintf(stderr, "Encoder error: %s (error code %u)\n", + msg, ret); + return false; + } + } +} + + +extern int +main(void) +{ + lzma_stream strm = LZMA_STREAM_INIT; + + bool success = init_encoder(&strm); + if (success) + success = compress(&strm, stdin, stdout); + + lzma_end(&strm); + + if (fclose(stdout)) { + fprintf(stderr, "Write error: %s\n", strerror(errno)); + success = false; + } + + return success ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/doc/examples/Makefile b/doc/examples/Makefile new file mode 100644 index 0000000..e8839d8 --- /dev/null +++ b/doc/examples/Makefile @@ -0,0 +1,25 @@ +# +# Author: Lasse Collin +# +# This file has been put into the public domain. +# You can do whatever you want with this file. +# + +CC = c99 +CFLAGS = -g +LDFLAGS = -llzma + +PROGS = \ + 01_compress_easy \ + 02_decompress \ + 03_compress_custom \ + 04_compress_easy_mt \ + 11_file_info + +all: $(PROGS) + +.c: + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +clean: + -rm -f $(PROGS) diff --git a/doc/examples_old/xz_pipe_comp.c b/doc/examples_old/xz_pipe_comp.c new file mode 100644 index 0000000..9f9224b --- /dev/null +++ b/doc/examples_old/xz_pipe_comp.c @@ -0,0 +1,127 @@ +/* + * xz_pipe_comp.c + * A simple example of pipe-only xz compressor implementation. + * version: 2010-07-12 - by Daniel Mealha Cabrita + * Not copyrighted -- provided to the public domain. + * + * Compiling: + * Link with liblzma. GCC example: + * $ gcc -llzma xz_pipe_comp.c -o xz_pipe_comp + * + * Usage example: + * $ cat some_file | ./xz_pipe_comp > some_file.xz + */ + +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <stdbool.h> +#include <lzma.h> + + +/* COMPRESSION SETTINGS */ + +/* analogous to xz CLI options: -0 to -9 */ +#define COMPRESSION_LEVEL 6 + +/* boolean setting, analogous to xz CLI option: -e */ +#define COMPRESSION_EXTREME true + +/* see: /usr/include/lzma/check.h LZMA_CHECK_* */ +#define INTEGRITY_CHECK LZMA_CHECK_CRC64 + + +/* read/write buffer sizes */ +#define IN_BUF_MAX 4096 +#define OUT_BUF_MAX 4096 + +/* error codes */ +#define RET_OK 0 +#define RET_ERROR_INIT 1 +#define RET_ERROR_INPUT 2 +#define RET_ERROR_OUTPUT 3 +#define RET_ERROR_COMPRESSION 4 + + +/* note: in_file and out_file must be open already */ +int xz_compress (FILE *in_file, FILE *out_file) +{ + uint32_t preset = COMPRESSION_LEVEL | (COMPRESSION_EXTREME ? LZMA_PRESET_EXTREME : 0); + lzma_check check = INTEGRITY_CHECK; + lzma_stream strm = LZMA_STREAM_INIT; /* alloc and init lzma_stream struct */ + uint8_t in_buf [IN_BUF_MAX]; + uint8_t out_buf [OUT_BUF_MAX]; + size_t in_len; /* length of useful data in in_buf */ + size_t out_len; /* length of useful data in out_buf */ + bool in_finished = false; + bool out_finished = false; + lzma_action action; + lzma_ret ret_xz; + int ret; + + ret = RET_OK; + + /* initialize xz encoder */ + ret_xz = lzma_easy_encoder (&strm, preset, check); + if (ret_xz != LZMA_OK) { + fprintf (stderr, "lzma_easy_encoder error: %d\n", (int) ret_xz); + return RET_ERROR_INIT; + } + + while ((! in_finished) && (! out_finished)) { + /* read incoming data */ + in_len = fread (in_buf, 1, IN_BUF_MAX, in_file); + + if (feof (in_file)) { + in_finished = true; + } + if (ferror (in_file)) { + in_finished = true; + ret = RET_ERROR_INPUT; + } + + strm.next_in = in_buf; + strm.avail_in = in_len; + + /* if no more data from in_buf, flushes the + internal xz buffers and closes the xz data + with LZMA_FINISH */ + action = in_finished ? LZMA_FINISH : LZMA_RUN; + + /* loop until there's no pending compressed output */ + do { + /* out_buf is clean at this point */ + strm.next_out = out_buf; + strm.avail_out = OUT_BUF_MAX; + + /* compress data */ + ret_xz = lzma_code (&strm, action); + + if ((ret_xz != LZMA_OK) && (ret_xz != LZMA_STREAM_END)) { + fprintf (stderr, "lzma_code error: %d\n", (int) ret_xz); + out_finished = true; + ret = RET_ERROR_COMPRESSION; + } else { + /* write compressed data */ + out_len = OUT_BUF_MAX - strm.avail_out; + fwrite (out_buf, 1, out_len, out_file); + if (ferror (out_file)) { + out_finished = true; + ret = RET_ERROR_OUTPUT; + } + } + } while (strm.avail_out == 0); + } + + lzma_end (&strm); + return ret; +} + +int main () +{ + int ret; + + ret = xz_compress (stdin, stdout); + return ret; +} + diff --git a/doc/examples_old/xz_pipe_decomp.c b/doc/examples_old/xz_pipe_decomp.c new file mode 100644 index 0000000..fb5ad89 --- /dev/null +++ b/doc/examples_old/xz_pipe_decomp.c @@ -0,0 +1,123 @@ +/* + * xz_pipe_decomp.c + * A simple example of pipe-only xz decompressor implementation. + * version: 2012-06-14 - by Daniel Mealha Cabrita + * Not copyrighted -- provided to the public domain. + * + * Compiling: + * Link with liblzma. GCC example: + * $ gcc -llzma xz_pipe_decomp.c -o xz_pipe_decomp + * + * Usage example: + * $ cat some_file.xz | ./xz_pipe_decomp > some_file + */ + +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <stdbool.h> +#include <lzma.h> + + +/* read/write buffer sizes */ +#define IN_BUF_MAX 4096 +#define OUT_BUF_MAX 4096 + +/* error codes */ +#define RET_OK 0 +#define RET_ERROR_INIT 1 +#define RET_ERROR_INPUT 2 +#define RET_ERROR_OUTPUT 3 +#define RET_ERROR_DECOMPRESSION 4 + + +/* note: in_file and out_file must be open already */ +int xz_decompress (FILE *in_file, FILE *out_file) +{ + lzma_stream strm = LZMA_STREAM_INIT; /* alloc and init lzma_stream struct */ + const uint32_t flags = LZMA_TELL_UNSUPPORTED_CHECK | LZMA_CONCATENATED; + const uint64_t memory_limit = UINT64_MAX; /* no memory limit */ + uint8_t in_buf [IN_BUF_MAX]; + uint8_t out_buf [OUT_BUF_MAX]; + size_t in_len; /* length of useful data in in_buf */ + size_t out_len; /* length of useful data in out_buf */ + bool in_finished = false; + bool out_finished = false; + lzma_action action; + lzma_ret ret_xz; + int ret; + + ret = RET_OK; + + /* initialize xz decoder */ + ret_xz = lzma_stream_decoder (&strm, memory_limit, flags); + if (ret_xz != LZMA_OK) { + fprintf (stderr, "lzma_stream_decoder error: %d\n", (int) ret_xz); + return RET_ERROR_INIT; + } + + while ((! in_finished) && (! out_finished)) { + /* read incoming data */ + in_len = fread (in_buf, 1, IN_BUF_MAX, in_file); + + if (feof (in_file)) { + in_finished = true; + } + if (ferror (in_file)) { + in_finished = true; + ret = RET_ERROR_INPUT; + } + + strm.next_in = in_buf; + strm.avail_in = in_len; + + /* if no more data from in_buf, flushes the + internal xz buffers and closes the decompressed data + with LZMA_FINISH */ + action = in_finished ? LZMA_FINISH : LZMA_RUN; + + /* loop until there's no pending decompressed output */ + do { + /* out_buf is clean at this point */ + strm.next_out = out_buf; + strm.avail_out = OUT_BUF_MAX; + + /* decompress data */ + ret_xz = lzma_code (&strm, action); + + if ((ret_xz != LZMA_OK) && (ret_xz != LZMA_STREAM_END)) { + fprintf (stderr, "lzma_code error: %d\n", (int) ret_xz); + out_finished = true; + ret = RET_ERROR_DECOMPRESSION; + } else { + /* write decompressed data */ + out_len = OUT_BUF_MAX - strm.avail_out; + fwrite (out_buf, 1, out_len, out_file); + if (ferror (out_file)) { + out_finished = true; + ret = RET_ERROR_OUTPUT; + } + } + } while (strm.avail_out == 0); + } + + /* Bug fix (2012-06-14): If no errors were detected, check + that the last lzma_code() call returned LZMA_STREAM_END. + If not, the file is probably truncated. */ + if ((ret == RET_OK) && (ret_xz != LZMA_STREAM_END)) { + fprintf (stderr, "Input truncated or corrupt\n"); + ret = RET_ERROR_DECOMPRESSION; + } + + lzma_end (&strm); + return ret; +} + +int main () +{ + int ret; + + ret = xz_decompress (stdin, stdout); + return ret; +} + diff --git a/doc/faq.txt b/doc/faq.txt new file mode 100644 index 0000000..3f9068b --- /dev/null +++ b/doc/faq.txt @@ -0,0 +1,244 @@ + +XZ Utils FAQ +============ + +Q: What do the letters XZ mean? + +A: Nothing. They are just two letters, which come from the file format + suffix .xz. The .xz suffix was selected, because it seemed to be + pretty much unused. It has no deeper meaning. + + +Q: What are LZMA and LZMA2? + +A: LZMA stands for Lempel-Ziv-Markov chain-Algorithm. It is the name + of the compression algorithm designed by Igor Pavlov for 7-Zip. + LZMA is based on LZ77 and range encoding. + + LZMA2 is an updated version of the original LZMA to fix a couple of + practical issues. In context of XZ Utils, LZMA is called LZMA1 to + emphasize that LZMA is not the same thing as LZMA2. LZMA2 is the + primary compression algorithm in the .xz file format. + + +Q: There are many LZMA related projects. How does XZ Utils relate to them? + +A: 7-Zip and LZMA SDK are the original projects. LZMA SDK is roughly + a subset of the 7-Zip source tree. + + p7zip is 7-Zip's command-line tools ported to POSIX-like systems. + + LZMA Utils provide a gzip-like lzma tool for POSIX-like systems. + LZMA Utils are based on LZMA SDK. XZ Utils are the successor to + LZMA Utils. + + There are several other projects using LZMA. Most are more or less + based on LZMA SDK. See <https://7-zip.org/links.html>. + + +Q: Why is liblzma named liblzma if its primary file format is .xz? + Shouldn't it be e.g. libxz? + +A: When the designing of the .xz format began, the idea was to replace + the .lzma format and use the same .lzma suffix. It would have been + quite OK to reuse the suffix when there were very few .lzma files + around. However, the old .lzma format became popular before the + new format was finished. The new format was renamed to .xz but the + name of liblzma wasn't changed. + + +Q: Do XZ Utils support the .7z format? + +A: No. Use 7-Zip (Windows) or p7zip (POSIX-like systems) to handle .7z + files. + + +Q: I have many .tar.7z files. Can I convert them to .tar.xz without + spending hours recompressing the data? + +A: In the "extra" directory, there is a script named 7z2lzma.bash which + is able to convert some .7z files to the .lzma format (not .xz). It + needs the 7za (or 7z) command from p7zip. The script may silently + produce corrupt output if certain assumptions are not met, so + decompress the resulting .lzma file and compare it against the + original before deleting the original file! + + +Q: I have many .lzma files. Can I quickly convert them to the .xz format? + +A: For now, no. Since XZ Utils supports the .lzma format, it's usually + not too bad to keep the old files in the old format. If you want to + do the conversion anyway, you need to decompress the .lzma files and + then recompress to the .xz format. + + Technically, there is a way to make the conversion relatively fast + (roughly twice the time that normal decompression takes). Writing + such a tool would take quite a bit of time though, and would probably + be useful to only a few people. If you really want such a conversion + tool, contact Lasse Collin and offer some money. + + +Q: I have installed xz, but my tar doesn't recognize .tar.xz files. + How can I extract .tar.xz files? + +A: xz -dc foo.tar.xz | tar xf - + + +Q: Can I recover parts of a broken .xz file (e.g. a corrupted CD-R)? + +A: It may be possible if the file consists of multiple blocks, which + typically is not the case if the file was created in single-threaded + mode. There is no recovery program yet. + + +Q: Is (some part of) XZ Utils patented? + +A: Lasse Collin is not aware of any patents that could affect XZ Utils. + However, due to the nature of software patents, it's not possible to + guarantee that XZ Utils isn't affected by any third party patent(s). + + +Q: Where can I find documentation about the file format and algorithms? + +A: The .xz format is documented in xz-file-format.txt. It is a container + format only, and doesn't include descriptions of any non-trivial + filters. + + Documenting LZMA and LZMA2 is planned, but for now, there is no other + documentation than the source code. Before you begin, you should know + the basics of LZ77 and range-coding algorithms. LZMA is based on LZ77, + but LZMA is a lot more complex. Range coding is used to compress + the final bitstream like Huffman coding is used in Deflate. + + +Q: I cannot find BCJ and BCJ2 filters. Don't they exist in liblzma? + +A: BCJ filter is called "x86" in liblzma. BCJ2 is not included, + because it requires using more than one encoded output stream. + + +Q: I need to use a script that runs "xz -9". On a system with 256 MiB + of RAM, xz says that it cannot allocate memory. Can I make the + script work without modifying it? + +A: Set a default memory usage limit for compression. You can do it e.g. + in a shell initialization script such as ~/.bashrc or /etc/profile: + + XZ_DEFAULTS=--memlimit-compress=150MiB + export XZ_DEFAULTS + + xz will then scale the compression settings down so that the given + memory usage limit is not reached. This way xz shouldn't run out + of memory. + + Check also that memory-related resource limits are high enough. + On most systems, "ulimit -a" will show the current resource limits. + + +Q: How do I create files that can be decompressed with XZ Embedded? + +A: See the documentation in XZ Embedded. In short, something like + this is a good start: + + xz --check=crc32 --lzma2=preset=6e,dict=64KiB + + Or if a BCJ filter is needed too, e.g. if compressing + a kernel image for PowerPC: + + xz --check=crc32 --powerpc --lzma2=preset=6e,dict=64KiB + + Adjust the dictionary size to get a good compromise between + compression ratio and decompressor memory usage. Note that + in single-call decompression mode of XZ Embedded, a big + dictionary doesn't increase memory usage. + + +Q: How is multi-threaded compression implemented in XZ Utils? + +A: The simplest method is splitting the uncompressed data into blocks + and compressing them in parallel independent from each other. + This is currently the only threading method supported in XZ Utils. + Since the blocks are compressed independently, they can also be + decompressed independently. Together with the index feature in .xz, + this allows using threads to create .xz files for random-access + reading. This also makes threaded decompression possible. + + The independent blocks method has a couple of disadvantages too. It + will compress worse than a single-block method. Often the difference + is not too big (maybe 1-2 %) but sometimes it can be too big. Also, + the memory usage of the compressor increases linearly when adding + threads. + + At least two other threading methods are possible but these haven't + been implemented in XZ Utils: + + Match finder parallelization has been in 7-Zip for ages. It doesn't + affect compression ratio or memory usage significantly. Among the + three threading methods, only this is useful when compressing small + files (files that are not significantly bigger than the dictionary). + Unfortunately this method scales only to about two CPU cores. + + The third method is pigz-style threading (I use that name, because + pigz <https://www.zlib.net/pigz/> uses that method). It doesn't + affect compression ratio significantly and scales to many cores. + The memory usage scales linearly when threads are added. This isn't + significant with pigz, because Deflate uses only a 32 KiB dictionary, + but with LZMA2 the memory usage will increase dramatically just like + with the independent-blocks method. There is also a constant + computational overhead, which may make pigz-method a bit dull on + dual-core compared to the parallel match finder method, but with more + cores the overhead is not a big deal anymore. + + Combining the threading methods will be possible and also useful. + For example, combining match finder parallelization with pigz-style + threading or independent-blocks-threading can cut the memory usage + by 50 %. + + +Q: I told xz to use many threads but it is using only one or two + processor cores. What is wrong? + +A: Since multi-threaded compression is done by splitting the data into + blocks that are compressed individually, if the input file is too + small for the block size, then many threads cannot be used. The + default block size increases when the compression level is + increased. For example, xz -6 uses 8 MiB LZMA2 dictionary and + 24 MiB blocks, and xz -9 uses 64 MiB LZMA dictionary and 192 MiB + blocks. If the input file is 100 MiB, xz -6 can use five threads + of which one will finish quickly as it has only 4 MiB to compress. + However, for the same file, xz -9 can only use one thread. + + One can adjust block size with --block-size=SIZE but making the + block size smaller than LZMA2 dictionary is waste of RAM: using + xz -9 with 6 MiB blocks isn't any better than using xz -6 with + 6 MiB blocks. The default settings use a block size bigger than + the LZMA2 dictionary size because this was seen as a reasonable + compromise between RAM usage and compression ratio. + + When decompressing, the ability to use threads depends on how the + file was created. If it was created in multi-threaded mode then + it can be decompressed in multi-threaded mode too if there are + multiple blocks in the file. + + +Q: How do I build a program that needs liblzmadec (lzmadec.h)? + +A: liblzmadec is part of LZMA Utils. XZ Utils has liblzma, but no + liblzmadec. The code using liblzmadec should be ported to use + liblzma instead. If you cannot or don't want to do that, download + LZMA Utils from <https://tukaani.org/lzma/>. + + +Q: The default build of liblzma is too big. How can I make it smaller? + +A: Give --enable-small to the configure script. Use also appropriate + --enable or --disable options to include only those filter encoders + and decoders and integrity checks that you actually need. Use + CFLAGS=-Os (with GCC) or equivalent to tell your compiler to optimize + for size. See INSTALL for information about configure options. + + If the result is still too big, take a look at XZ Embedded. It is + a separate project, which provides a limited but significantly + smaller XZ decoder implementation than XZ Utils. You can find it + at <https://tukaani.org/xz/embedded.html>. + diff --git a/doc/history.txt b/doc/history.txt new file mode 100644 index 0000000..8545e23 --- /dev/null +++ b/doc/history.txt @@ -0,0 +1,150 @@ + +History of LZMA Utils and XZ Utils +================================== + +Tukaani distribution + + In 2005, there was a small group working on the Tukaani distribution, + which was a Slackware fork. One of the project's goals was to fit the + distro on a single 700 MiB ISO-9660 image. Using LZMA instead of gzip + helped a lot. Roughly speaking, one could fit data that took 1000 MiB + in gzipped form into 700 MiB with LZMA. Naturally, the compression + ratio varied across packages, but this was what we got on average. + + Slackware packages have traditionally had .tgz as the filename suffix, + which is an abbreviation of .tar.gz. A logical naming for LZMA + compressed packages was .tlz, being an abbreviation of .tar.lzma. + + At the end of the year 2007, there was no distribution under the + Tukaani project anymore, but development of LZMA Utils was kept going. + Still, there were .tlz packages around, because at least Vector Linux + (a Slackware based distribution) used LZMA for its packages. + + First versions of the modified pkgtools used the LZMA_Alone tool from + Igor Pavlov's LZMA SDK as is. It was fine, because users wouldn't need + to interact with LZMA_Alone directly. But people soon wanted to use + LZMA for other files too, and the interface of LZMA_Alone wasn't + comfortable for those used to gzip and bzip2. + + +First steps of LZMA Utils + + The first version of LZMA Utils (4.22.0) included a shell script called + lzmash. It was a wrapper that had a gzip-like command-line interface. It + used the LZMA_Alone tool from LZMA SDK to do all the real work. zgrep, + zdiff, and related scripts from gzip were adapted to work with LZMA and + were part of the first LZMA Utils release too. + + LZMA Utils 4.22.0 included also lzmadec, which was a small (less than + 10 KiB) decoder-only command-line tool. It was written on top of the + decoder-only C code found from the LZMA SDK. lzmadec was convenient in + situations where LZMA_Alone (a few hundred KiB) would be too big. + + lzmash and lzmadec were written by Lasse Collin. + + +Second generation + + The lzmash script was an ugly and not very secure hack. The last + version of LZMA Utils to use lzmash was 4.27.1. + + LZMA Utils 4.32.0beta1 introduced a new lzma command-line tool written + by Ville Koskinen. It was written in C++, and used the encoder and + decoder from C++ LZMA SDK with some little modifications. This tool + replaced both the lzmash script and the LZMA_Alone command-line tool + in LZMA Utils. + + Introducing this new tool caused some temporary incompatibilities, + because the LZMA_Alone executable was simply named lzma like the new + command-line tool, but they had a completely different command-line + interface. The file format was still the same. + + Lasse wrote liblzmadec, which was a small decoder-only library based + on the C code found from LZMA SDK. liblzmadec had an API similar to + zlib, although there were some significant differences, which made it + non-trivial to use it in some applications designed for zlib and + libbzip2. + + The lzmadec command-line tool was converted to use liblzmadec. + + Alexandre Sauvé helped converting the build system to use GNU + Autotools. This made it easier to test for certain less portable + features needed by the new command-line tool. + + Since the new command-line tool never got completely finished (for + example, it didn't support the LZMA_OPT environment variable), the + intent was to not call 4.32.x stable. Similarly, liblzmadec wasn't + polished, but appeared to work well enough, so some people started + using it too. + + Because the development of the third generation of LZMA Utils was + delayed considerably (3-4 years), the 4.32.x branch had to be kept + maintained. It got some bug fixes now and then, and finally it was + decided to call it stable, although most of the missing features were + never added. + + +File format problems + + The file format used by LZMA_Alone was primitive. It was designed with + embedded systems in mind, and thus provided only a minimal set of + features. The two biggest problems for non-embedded use were the lack + of magic bytes and an integrity check. + + Igor and Lasse started developing a new file format with some help + from Ville Koskinen. Also Mark Adler, Mikko Pouru, H. Peter Anvin, + and Lars Wirzenius helped with some minor things at some point of the + development. Designing the new format took quite a long time (actually, + too long a time would be a more appropriate expression). It was mostly + because Lasse was quite slow at getting things done due to personal + reasons. + + Originally the new format was supposed to use the same .lzma suffix + that was already used by the old file format. Switching to the new + format wouldn't have caused much trouble when the old format wasn't + used by many people. But since the development of the new format took + such a long time, the old format got quite popular, and it was decided + that the new file format must use a different suffix. + + It was decided to use .xz as the suffix of the new file format. The + first stable .xz file format specification was finally released in + December 2008. In addition to fixing the most obvious problems of + the old .lzma format, the .xz format added some new features like + support for multiple filters (compression algorithms), filter chaining + (like piping on the command line), and limited random-access reading. + + Currently the primary compression algorithm used in .xz is LZMA2. + It is an extension on top of the original LZMA to fix some practical + problems: LZMA2 adds support for flushing the encoder, uncompressed + chunks, eases stateful decoder implementations, and improves support + for multithreading. Since LZMA2 is better than the original LZMA, the + original LZMA is not supported in .xz. + + +Transition to XZ Utils + + The early versions of XZ Utils were called LZMA Utils. The first + releases were 4.42.0alphas. They dropped the rest of the C++ LZMA SDK. + The code was still directly based on LZMA SDK but ported to C and + converted from a callback API to a stateful API. Later, Igor Pavlov + made a C version of the LZMA encoder too; these ports from C++ to C + were independent in LZMA SDK and LZMA Utils. + + The core of the new LZMA Utils was liblzma, a compression library with + a zlib-like API. liblzma supported both the old and new file format. + The gzip-like lzma command-line tool was rewritten to use liblzma. + + The new LZMA Utils code base was renamed to XZ Utils when the name + of the new file format had been decided. The liblzma compression + library retained its name though, because changing it would have + caused unnecessary breakage in applications already using the early + liblzma snapshots. + + The xz command-line tool can emulate the gzip-like lzma tool by + creating appropriate symlinks (e.g. lzma -> xz). Thus, practically + all scripts using the lzma tool from LZMA Utils will work as is with + XZ Utils (and will keep using the old .lzma format). Still, the .lzma + format is more or less deprecated. XZ Utils will keep supporting it, + but new applications should use the .xz format, and migrating old + applications to .xz is often a good idea too. + diff --git a/doc/lzma-file-format.txt b/doc/lzma-file-format.txt new file mode 100644 index 0000000..4865def --- /dev/null +++ b/doc/lzma-file-format.txt @@ -0,0 +1,173 @@ + +The .lzma File Format +===================== + + 0. Preface + 0.1. Notices and Acknowledgements + 0.2. Changes + 1. File Format + 1.1. Header + 1.1.1. Properties + 1.1.2. Dictionary Size + 1.1.3. Uncompressed Size + 1.2. LZMA Compressed Data + 2. References + + +0. Preface + + This document describes the .lzma file format, which is + sometimes also called LZMA_Alone format. It is a legacy file + format, which is being or has been replaced by the .xz format. + The MIME type of the .lzma format is `application/x-lzma'. + + The most commonly used software to handle .lzma files are + LZMA SDK, LZMA Utils, 7-Zip, and XZ Utils. This document + describes some of the differences between these implementations + and gives hints what subset of the .lzma format is the most + portable. + + +0.1. Notices and Acknowledgements + + This file format was designed by Igor Pavlov for use in + LZMA SDK. This document was written by Lasse Collin + <lasse.collin@tukaani.org> using the documentation found + from the LZMA SDK. + + This document has been put into the public domain. + + +0.2. Changes + + Last modified: 2022-07-13 21:00+0300 + + Compared to the previous version (2011-04-12 11:55+0300) + the section 1.1.3 was modified to allow End of Payload Marker + with a known Uncompressed Size. + + +1. File Format + + +-+-+-+-+-+-+-+-+-+-+-+-+-+==========================+ + | Header | LZMA Compressed Data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+==========================+ + + The .lzma format file consist of 13-byte Header followed by + the LZMA Compressed Data. + + Unlike the .gz, .bz2, and .xz formats, it is not possible to + concatenate multiple .lzma files as is and expect the + decompression tool to decode the resulting file as if it were + a single .lzma file. + + For example, the command line tools from LZMA Utils and + LZMA SDK silently ignore all the data after the first .lzma + stream. In contrast, the command line tool from XZ Utils + considers the .lzma file to be corrupt if there is data after + the first .lzma stream. + + +1.1. Header + + +------------+----+----+----+----+--+--+--+--+--+--+--+--+ + | Properties | Dictionary Size | Uncompressed Size | + +------------+----+----+----+----+--+--+--+--+--+--+--+--+ + + +1.1.1. Properties + + The Properties field contains three properties. An abbreviation + is given in parentheses, followed by the value range of the + property. The field consists of + + 1) the number of literal context bits (lc, [0, 8]); + 2) the number of literal position bits (lp, [0, 4]); and + 3) the number of position bits (pb, [0, 4]). + + The properties are encoded using the following formula: + + Properties = (pb * 5 + lp) * 9 + lc + + The following C code illustrates a straightforward way to + decode the Properties field: + + uint8_t lc, lp, pb; + uint8_t prop = get_lzma_properties(); + if (prop > (4 * 5 + 4) * 9 + 8) + return LZMA_PROPERTIES_ERROR; + + pb = prop / (9 * 5); + prop -= pb * 9 * 5; + lp = prop / 9; + lc = prop - lp * 9; + + XZ Utils has an additional requirement: lc + lp <= 4. Files + which don't follow this requirement cannot be decompressed + with XZ Utils. Usually this isn't a problem since the most + common lc/lp/pb values are 3/0/2. It is the only lc/lp/pb + combination that the files created by LZMA Utils can have, + but LZMA Utils can decompress files with any lc/lp/pb. + + +1.1.2. Dictionary Size + + Dictionary Size is stored as an unsigned 32-bit little endian + integer. Any 32-bit value is possible, but for maximum + portability, only sizes of 2^n and 2^n + 2^(n-1) should be + used. + + LZMA Utils creates only files with dictionary size 2^n, + 16 <= n <= 25. LZMA Utils can decompress files with any + dictionary size. + + XZ Utils creates and decompresses .lzma files only with + dictionary sizes 2^n and 2^n + 2^(n-1). If some other + dictionary size is specified when compressing, the value + stored in the Dictionary Size field is a rounded up, but the + specified value is still used in the actual compression code. + + +1.1.3. Uncompressed Size + + Uncompressed Size is stored as unsigned 64-bit little endian + integer. A special value of 0xFFFF_FFFF_FFFF_FFFF indicates + that Uncompressed Size is unknown. End of Payload Marker (*) + is used if Uncompressed Size is unknown. End of Payload Marker + is allowed but rarely used if Uncompressed Size is known. + XZ Utils 5.2.5 and older don't support .lzma files that have + End of Payload Marker together with a known Uncompressed Size. + + XZ Utils rejects files whose Uncompressed Size field specifies + a known size that is 256 GiB or more. This is to reject false + positives when trying to guess if the input file is in the + .lzma format. When Uncompressed Size is unknown, there is no + limit for the uncompressed size of the file. + + (*) Some tools use the term End of Stream (EOS) marker + instead of End of Payload Marker. + + +1.2. LZMA Compressed Data + + Detailed description of the format of this field is out of + scope of this document. + + +2. References + + LZMA SDK - The original LZMA implementation + http://7-zip.org/sdk.html + + 7-Zip + http://7-zip.org/ + + LZMA Utils - LZMA adapted to POSIX-like systems + http://tukaani.org/lzma/ + + XZ Utils - The next generation of LZMA Utils + http://tukaani.org/xz/ + + The .xz file format - The successor of the .lzma format + http://tukaani.org/xz/xz-file-format.txt + diff --git a/doc/man/pdf-a4/lzmainfo-a4.pdf b/doc/man/pdf-a4/lzmainfo-a4.pdf Binary files differnew file mode 100644 index 0000000..91f2561 --- /dev/null +++ b/doc/man/pdf-a4/lzmainfo-a4.pdf diff --git a/doc/man/pdf-a4/xz-a4.pdf b/doc/man/pdf-a4/xz-a4.pdf Binary files differnew file mode 100644 index 0000000..6156cfe --- /dev/null +++ b/doc/man/pdf-a4/xz-a4.pdf diff --git a/doc/man/pdf-a4/xzdec-a4.pdf b/doc/man/pdf-a4/xzdec-a4.pdf Binary files differnew file mode 100644 index 0000000..b3a761a --- /dev/null +++ b/doc/man/pdf-a4/xzdec-a4.pdf diff --git a/doc/man/pdf-a4/xzdiff-a4.pdf b/doc/man/pdf-a4/xzdiff-a4.pdf Binary files differnew file mode 100644 index 0000000..084cb80 --- /dev/null +++ b/doc/man/pdf-a4/xzdiff-a4.pdf diff --git a/doc/man/pdf-a4/xzgrep-a4.pdf b/doc/man/pdf-a4/xzgrep-a4.pdf Binary files differnew file mode 100644 index 0000000..e3c54fb --- /dev/null +++ b/doc/man/pdf-a4/xzgrep-a4.pdf diff --git a/doc/man/pdf-a4/xzless-a4.pdf b/doc/man/pdf-a4/xzless-a4.pdf Binary files differnew file mode 100644 index 0000000..01176ed --- /dev/null +++ b/doc/man/pdf-a4/xzless-a4.pdf diff --git a/doc/man/pdf-a4/xzmore-a4.pdf b/doc/man/pdf-a4/xzmore-a4.pdf Binary files differnew file mode 100644 index 0000000..f829bf4 --- /dev/null +++ b/doc/man/pdf-a4/xzmore-a4.pdf diff --git a/doc/man/pdf-letter/lzmainfo-letter.pdf b/doc/man/pdf-letter/lzmainfo-letter.pdf Binary files differnew file mode 100644 index 0000000..6f2ae17 --- /dev/null +++ b/doc/man/pdf-letter/lzmainfo-letter.pdf diff --git a/doc/man/pdf-letter/xz-letter.pdf b/doc/man/pdf-letter/xz-letter.pdf Binary files differnew file mode 100644 index 0000000..9a733d9 --- /dev/null +++ b/doc/man/pdf-letter/xz-letter.pdf diff --git a/doc/man/pdf-letter/xzdec-letter.pdf b/doc/man/pdf-letter/xzdec-letter.pdf Binary files differnew file mode 100644 index 0000000..89b2ded --- /dev/null +++ b/doc/man/pdf-letter/xzdec-letter.pdf diff --git a/doc/man/pdf-letter/xzdiff-letter.pdf b/doc/man/pdf-letter/xzdiff-letter.pdf Binary files differnew file mode 100644 index 0000000..143e979 --- /dev/null +++ b/doc/man/pdf-letter/xzdiff-letter.pdf diff --git a/doc/man/pdf-letter/xzgrep-letter.pdf b/doc/man/pdf-letter/xzgrep-letter.pdf Binary files differnew file mode 100644 index 0000000..54fc897 --- /dev/null +++ b/doc/man/pdf-letter/xzgrep-letter.pdf diff --git a/doc/man/pdf-letter/xzless-letter.pdf b/doc/man/pdf-letter/xzless-letter.pdf Binary files differnew file mode 100644 index 0000000..0e48e01 --- /dev/null +++ b/doc/man/pdf-letter/xzless-letter.pdf diff --git a/doc/man/pdf-letter/xzmore-letter.pdf b/doc/man/pdf-letter/xzmore-letter.pdf Binary files differnew file mode 100644 index 0000000..1a019e0 --- /dev/null +++ b/doc/man/pdf-letter/xzmore-letter.pdf diff --git a/doc/man/txt/lzmainfo.txt b/doc/man/txt/lzmainfo.txt new file mode 100644 index 0000000..fa4e51c --- /dev/null +++ b/doc/man/txt/lzmainfo.txt @@ -0,0 +1,40 @@ +LZMAINFO(1) XZ Utils LZMAINFO(1) + + + +NAME + lzmainfo - show information stored in the .lzma file header + +SYNOPSIS + lzmainfo [--help] [--version] [file...] + +DESCRIPTION + lzmainfo shows information stored in the .lzma file header. It reads + the first 13 bytes from the specified file, decodes the header, and + prints it to standard output in human readable format. If no files are + given or file is -, standard input is read. + + Usually the most interesting information is the uncompressed size and + the dictionary size. Uncompressed size can be shown only if the file + is in the non-streamed .lzma format variant. The amount of memory re- + quired to decompress the file is a few dozen kilobytes plus the dictio- + nary size. + + lzmainfo is included in XZ Utils primarily for backward compatibility + with LZMA Utils. + +EXIT STATUS + 0 All is good. + + 1 An error occurred. + +BUGS + lzmainfo uses MB while the correct suffix would be MiB (2^20 bytes). + This is to keep the output compatible with LZMA Utils. + +SEE ALSO + xz(1) + + + +Tukaani 2013-06-30 LZMAINFO(1) diff --git a/doc/man/txt/xz.txt b/doc/man/txt/xz.txt new file mode 100644 index 0000000..be24360 --- /dev/null +++ b/doc/man/txt/xz.txt @@ -0,0 +1,1589 @@ +XZ(1) XZ Utils XZ(1) + + + +NAME + xz, unxz, xzcat, lzma, unlzma, lzcat - Compress or decompress .xz and + .lzma files + +SYNOPSIS + xz [option...] [file...] + +COMMAND ALIASES + unxz is equivalent to xz --decompress. + xzcat is equivalent to xz --decompress --stdout. + lzma is equivalent to xz --format=lzma. + unlzma is equivalent to xz --format=lzma --decompress. + lzcat is equivalent to xz --format=lzma --decompress --stdout. + + When writing scripts that need to decompress files, it is recommended + to always use the name xz with appropriate arguments (xz -d or xz -dc) + instead of the names unxz and xzcat. + +DESCRIPTION + xz is a general-purpose data compression tool with command line syntax + similar to gzip(1) and bzip2(1). The native file format is the .xz + format, but the legacy .lzma format used by LZMA Utils and raw com- + pressed streams with no container format headers are also supported. + In addition, decompression of the .lz format used by lzip is supported. + + xz compresses or decompresses each file according to the selected oper- + ation mode. If no files are given or file is -, xz reads from standard + input and writes the processed data to standard output. xz will refuse + (display an error and skip the file) to write compressed data to stan- + dard output if it is a terminal. Similarly, xz will refuse to read + compressed data from standard input if it is a terminal. + + Unless --stdout is specified, files other than - are written to a new + file whose name is derived from the source file name: + + o When compressing, the suffix of the target file format (.xz or + .lzma) is appended to the source filename to get the target file- + name. + + o When decompressing, the .xz, .lzma, or .lz suffix is removed from + the filename to get the target filename. xz also recognizes the + suffixes .txz and .tlz, and replaces them with the .tar suffix. + + If the target file already exists, an error is displayed and the file + is skipped. + + Unless writing to standard output, xz will display a warning and skip + the file if any of the following applies: + + o File is not a regular file. Symbolic links are not followed, and + thus they are not considered to be regular files. + + o File has more than one hard link. + + o File has setuid, setgid, or sticky bit set. + + o The operation mode is set to compress and the file already has a + suffix of the target file format (.xz or .txz when compressing to + the .xz format, and .lzma or .tlz when compressing to the .lzma for- + mat). + + o The operation mode is set to decompress and the file doesn't have a + suffix of any of the supported file formats (.xz, .txz, .lzma, .tlz, + or .lz). + + After successfully compressing or decompressing the file, xz copies the + owner, group, permissions, access time, and modification time from the + source file to the target file. If copying the group fails, the per- + missions are modified so that the target file doesn't become accessible + to users who didn't have permission to access the source file. xz + doesn't support copying other metadata like access control lists or ex- + tended attributes yet. + + Once the target file has been successfully closed, the source file is + removed unless --keep was specified. The source file is never removed + if the output is written to standard output or if an error occurs. + + Sending SIGINFO or SIGUSR1 to the xz process makes it print progress + information to standard error. This has only limited use since when + standard error is a terminal, using --verbose will display an automati- + cally updating progress indicator. + + Memory usage + The memory usage of xz varies from a few hundred kilobytes to several + gigabytes depending on the compression settings. The settings used + when compressing a file determine the memory requirements of the decom- + pressor. Typically the decompressor needs 5 % to 20 % of the amount of + memory that the compressor needed when creating the file. For example, + decompressing a file created with xz -9 currently requires 65 MiB of + memory. Still, it is possible to have .xz files that require several + gigabytes of memory to decompress. + + Especially users of older systems may find the possibility of very + large memory usage annoying. To prevent uncomfortable surprises, xz + has a built-in memory usage limiter, which is disabled by default. + While some operating systems provide ways to limit the memory usage of + processes, relying on it wasn't deemed to be flexible enough (for exam- + ple, using ulimit(1) to limit virtual memory tends to cripple mmap(2)). + + The memory usage limiter can be enabled with the command line option + --memlimit=limit. Often it is more convenient to enable the limiter by + default by setting the environment variable XZ_DEFAULTS, for example, + XZ_DEFAULTS=--memlimit=150MiB. It is possible to set the limits sepa- + rately for compression and decompression by using --memlimit-com- + press=limit and --memlimit-decompress=limit. Using these two options + outside XZ_DEFAULTS is rarely useful because a single run of xz cannot + do both compression and decompression and --memlimit=limit (or -M + limit) is shorter to type on the command line. + + If the specified memory usage limit is exceeded when decompressing, xz + will display an error and decompressing the file will fail. If the + limit is exceeded when compressing, xz will try to scale the settings + down so that the limit is no longer exceeded (except when using --for- + mat=raw or --no-adjust). This way the operation won't fail unless the + limit is very small. The scaling of the settings is done in steps that + don't match the compression level presets, for example, if the limit is + only slightly less than the amount required for xz -9, the settings + will be scaled down only a little, not all the way down to xz -8. + + Concatenation and padding with .xz files + It is possible to concatenate .xz files as is. xz will decompress such + files as if they were a single .xz file. + + It is possible to insert padding between the concatenated parts or af- + ter the last part. The padding must consist of null bytes and the size + of the padding must be a multiple of four bytes. This can be useful, + for example, if the .xz file is stored on a medium that measures file + sizes in 512-byte blocks. + + Concatenation and padding are not allowed with .lzma files or raw + streams. + +OPTIONS + Integer suffixes and special values + In most places where an integer argument is expected, an optional suf- + fix is supported to easily indicate large integers. There must be no + space between the integer and the suffix. + + KiB Multiply the integer by 1,024 (2^10). Ki, k, kB, K, and KB are + accepted as synonyms for KiB. + + MiB Multiply the integer by 1,048,576 (2^20). Mi, m, M, and MB are + accepted as synonyms for MiB. + + GiB Multiply the integer by 1,073,741,824 (2^30). Gi, g, G, and GB + are accepted as synonyms for GiB. + + The special value max can be used to indicate the maximum integer value + supported by the option. + + Operation mode + If multiple operation mode options are given, the last one takes ef- + fect. + + -z, --compress + Compress. This is the default operation mode when no operation + mode option is specified and no other operation mode is implied + from the command name (for example, unxz implies --decompress). + + -d, --decompress, --uncompress + Decompress. + + -t, --test + Test the integrity of compressed files. This option is equiva- + lent to --decompress --stdout except that the decompressed data + is discarded instead of being written to standard output. No + files are created or removed. + + -l, --list + Print information about compressed files. No uncompressed out- + put is produced, and no files are created or removed. In list + mode, the program cannot read the compressed data from standard + input or from other unseekable sources. + + The default listing shows basic information about files, one + file per line. To get more detailed information, use also the + --verbose option. For even more information, use --verbose + twice, but note that this may be slow, because getting all the + extra information requires many seeks. The width of verbose + output exceeds 80 characters, so piping the output to, for exam- + ple, less -S may be convenient if the terminal isn't wide + enough. + + The exact output may vary between xz versions and different lo- + cales. For machine-readable output, --robot --list should be + used. + + Operation modifiers + -k, --keep + Don't delete the input files. + + Since xz 5.2.6, this option also makes xz compress or decompress + even if the input is a symbolic link to a regular file, has more + than one hard link, or has the setuid, setgid, or sticky bit + set. The setuid, setgid, and sticky bits are not copied to the + target file. In earlier versions this was only done with + --force. + + -f, --force + This option has several effects: + + o If the target file already exists, delete it before compress- + ing or decompressing. + + o Compress or decompress even if the input is a symbolic link + to a regular file, has more than one hard link, or has the + setuid, setgid, or sticky bit set. The setuid, setgid, and + sticky bits are not copied to the target file. + + o When used with --decompress --stdout and xz cannot recognize + the type of the source file, copy the source file as is to + standard output. This allows xzcat --force to be used like + cat(1) for files that have not been compressed with xz. Note + that in future, xz might support new compressed file formats, + which may make xz decompress more types of files instead of + copying them as is to standard output. --format=format can + be used to restrict xz to decompress only a single file for- + mat. + + -c, --stdout, --to-stdout + Write the compressed or decompressed data to standard output in- + stead of a file. This implies --keep. + + --single-stream + Decompress only the first .xz stream, and silently ignore possi- + ble remaining input data following the stream. Normally such + trailing garbage makes xz display an error. + + xz never decompresses more than one stream from .lzma files or + raw streams, but this option still makes xz ignore the possible + trailing data after the .lzma file or raw stream. + + This option has no effect if the operation mode is not --decom- + press or --test. + + --no-sparse + Disable creation of sparse files. By default, if decompressing + into a regular file, xz tries to make the file sparse if the de- + compressed data contains long sequences of binary zeros. It + also works when writing to standard output as long as standard + output is connected to a regular file and certain additional + conditions are met to make it safe. Creating sparse files may + save disk space and speed up the decompression by reducing the + amount of disk I/O. + + -S .suf, --suffix=.suf + When compressing, use .suf as the suffix for the target file in- + stead of .xz or .lzma. If not writing to standard output and + the source file already has the suffix .suf, a warning is dis- + played and the file is skipped. + + When decompressing, recognize files with the suffix .suf in ad- + dition to files with the .xz, .txz, .lzma, .tlz, or .lz suffix. + If the source file has the suffix .suf, the suffix is removed to + get the target filename. + + When compressing or decompressing raw streams (--format=raw), + the suffix must always be specified unless writing to standard + output, because there is no default suffix for raw streams. + + --files[=file] + Read the filenames to process from file; if file is omitted, + filenames are read from standard input. Filenames must be ter- + minated with the newline character. A dash (-) is taken as a + regular filename; it doesn't mean standard input. If filenames + are given also as command line arguments, they are processed be- + fore the filenames read from file. + + --files0[=file] + This is identical to --files[=file] except that each filename + must be terminated with the null character. + + Basic file format and compression options + -F format, --format=format + Specify the file format to compress or decompress: + + auto This is the default. When compressing, auto is equiva- + lent to xz. When decompressing, the format of the input + file is automatically detected. Note that raw streams + (created with --format=raw) cannot be auto-detected. + + xz Compress to the .xz file format, or accept only .xz files + when decompressing. + + lzma, alone + Compress to the legacy .lzma file format, or accept only + .lzma files when decompressing. The alternative name + alone is provided for backwards compatibility with LZMA + Utils. + + lzip Accept only .lz files when decompressing. Compression is + not supported. + + The .lz format version 0 and the unextended version 1 are + supported. Version 0 files were produced by lzip 1.3 and + older. Such files aren't common but may be found from + file archives as a few source packages were released in + this format. People might have old personal files in + this format too. Decompression support for the format + version 0 was removed in lzip 1.18. + + lzip 1.4 and later create files in the format version 1. + The sync flush marker extension to the format version 1 + was added in lzip 1.6. This extension is rarely used and + isn't supported by xz (diagnosed as corrupt input). + + raw Compress or uncompress a raw stream (no headers). This + is meant for advanced users only. To decode raw streams, + you need use --format=raw and explicitly specify the fil- + ter chain, which normally would have been stored in the + container headers. + + -C check, --check=check + Specify the type of the integrity check. The check is calcu- + lated from the uncompressed data and stored in the .xz file. + This option has an effect only when compressing into the .xz + format; the .lzma format doesn't support integrity checks. The + integrity check (if any) is verified when the .xz file is decom- + pressed. + + Supported check types: + + none Don't calculate an integrity check at all. This is usu- + ally a bad idea. This can be useful when integrity of + the data is verified by other means anyway. + + crc32 Calculate CRC32 using the polynomial from IEEE-802.3 + (Ethernet). + + crc64 Calculate CRC64 using the polynomial from ECMA-182. This + is the default, since it is slightly better than CRC32 at + detecting damaged files and the speed difference is neg- + ligible. + + sha256 Calculate SHA-256. This is somewhat slower than CRC32 + and CRC64. + + Integrity of the .xz headers is always verified with CRC32. It + is not possible to change or disable it. + + --ignore-check + Don't verify the integrity check of the compressed data when de- + compressing. The CRC32 values in the .xz headers will still be + verified normally. + + Do not use this option unless you know what you are doing. Pos- + sible reasons to use this option: + + o Trying to recover data from a corrupt .xz file. + + o Speeding up decompression. This matters mostly with SHA-256 + or with files that have compressed extremely well. It's rec- + ommended to not use this option for this purpose unless the + file integrity is verified externally in some other way. + + -0 ... -9 + Select a compression preset level. The default is -6. If mul- + tiple preset levels are specified, the last one takes effect. + If a custom filter chain was already specified, setting a com- + pression preset level clears the custom filter chain. + + The differences between the presets are more significant than + with gzip(1) and bzip2(1). The selected compression settings + determine the memory requirements of the decompressor, thus us- + ing a too high preset level might make it painful to decompress + the file on an old system with little RAM. Specifically, it's + not a good idea to blindly use -9 for everything like it often + is with gzip(1) and bzip2(1). + + -0 ... -3 + These are somewhat fast presets. -0 is sometimes faster + than gzip -9 while compressing much better. The higher + ones often have speed comparable to bzip2(1) with compa- + rable or better compression ratio, although the results + depend a lot on the type of data being compressed. + + -4 ... -6 + Good to very good compression while keeping decompressor + memory usage reasonable even for old systems. -6 is the + default, which is usually a good choice for distributing + files that need to be decompressible even on systems with + only 16 MiB RAM. (-5e or -6e may be worth considering + too. See --extreme.) + + -7 ... -9 + These are like -6 but with higher compressor and decom- + pressor memory requirements. These are useful only when + compressing files bigger than 8 MiB, 16 MiB, and 32 MiB, + respectively. + + On the same hardware, the decompression speed is approximately a + constant number of bytes of compressed data per second. In + other words, the better the compression, the faster the decom- + pression will usually be. This also means that the amount of + uncompressed output produced per second can vary a lot. + + The following table summarises the features of the presets: + + Preset DictSize CompCPU CompMem DecMem + -0 256 KiB 0 3 MiB 1 MiB + -1 1 MiB 1 9 MiB 2 MiB + -2 2 MiB 2 17 MiB 3 MiB + -3 4 MiB 3 32 MiB 5 MiB + -4 4 MiB 4 48 MiB 5 MiB + -5 8 MiB 5 94 MiB 9 MiB + -6 8 MiB 6 94 MiB 9 MiB + -7 16 MiB 6 186 MiB 17 MiB + -8 32 MiB 6 370 MiB 33 MiB + -9 64 MiB 6 674 MiB 65 MiB + + Column descriptions: + + o DictSize is the LZMA2 dictionary size. It is waste of memory + to use a dictionary bigger than the size of the uncompressed + file. This is why it is good to avoid using the presets -7 + ... -9 when there's no real need for them. At -6 and lower, + the amount of memory wasted is usually low enough to not mat- + ter. + + o CompCPU is a simplified representation of the LZMA2 settings + that affect compression speed. The dictionary size affects + speed too, so while CompCPU is the same for levels -6 ... -9, + higher levels still tend to be a little slower. To get even + slower and thus possibly better compression, see --extreme. + + o CompMem contains the compressor memory requirements in the + single-threaded mode. It may vary slightly between xz ver- + sions. Memory requirements of some of the future multi- + threaded modes may be dramatically higher than that of the + single-threaded mode. + + o DecMem contains the decompressor memory requirements. That + is, the compression settings determine the memory require- + ments of the decompressor. The exact decompressor memory us- + age is slightly more than the LZMA2 dictionary size, but the + values in the table have been rounded up to the next full + MiB. + + -e, --extreme + Use a slower variant of the selected compression preset level + (-0 ... -9) to hopefully get a little bit better compression ra- + tio, but with bad luck this can also make it worse. Decompres- + sor memory usage is not affected, but compressor memory usage + increases a little at preset levels -0 ... -3. + + Since there are two presets with dictionary sizes 4 MiB and + 8 MiB, the presets -3e and -5e use slightly faster settings + (lower CompCPU) than -4e and -6e, respectively. That way no two + presets are identical. + + Preset DictSize CompCPU CompMem DecMem + -0e 256 KiB 8 4 MiB 1 MiB + -1e 1 MiB 8 13 MiB 2 MiB + -2e 2 MiB 8 25 MiB 3 MiB + -3e 4 MiB 7 48 MiB 5 MiB + -4e 4 MiB 8 48 MiB 5 MiB + -5e 8 MiB 7 94 MiB 9 MiB + -6e 8 MiB 8 94 MiB 9 MiB + -7e 16 MiB 8 186 MiB 17 MiB + -8e 32 MiB 8 370 MiB 33 MiB + -9e 64 MiB 8 674 MiB 65 MiB + + For example, there are a total of four presets that use 8 MiB + dictionary, whose order from the fastest to the slowest is -5, + -6, -5e, and -6e. + + --fast + --best These are somewhat misleading aliases for -0 and -9, respec- + tively. These are provided only for backwards compatibility + with LZMA Utils. Avoid using these options. + + --block-size=size + When compressing to the .xz format, split the input data into + blocks of size bytes. The blocks are compressed independently + from each other, which helps with multi-threading and makes lim- + ited random-access decompression possible. This option is typi- + cally used to override the default block size in multi-threaded + mode, but this option can be used in single-threaded mode too. + + In multi-threaded mode about three times size bytes will be al- + located in each thread for buffering input and output. The de- + fault size is three times the LZMA2 dictionary size or 1 MiB, + whichever is more. Typically a good value is 2-4 times the size + of the LZMA2 dictionary or at least 1 MiB. Using size less than + the LZMA2 dictionary size is waste of RAM because then the LZMA2 + dictionary buffer will never get fully used. The sizes of the + blocks are stored in the block headers, which a future version + of xz will use for multi-threaded decompression. + + In single-threaded mode no block splitting is done by default. + Setting this option doesn't affect memory usage. No size infor- + mation is stored in block headers, thus files created in single- + threaded mode won't be identical to files created in multi- + threaded mode. The lack of size information also means that a + future version of xz won't be able decompress the files in + multi-threaded mode. + + --block-list=sizes + When compressing to the .xz format, start a new block after the + given intervals of uncompressed data. + + The uncompressed sizes of the blocks are specified as a comma- + separated list. Omitting a size (two or more consecutive com- + mas) is a shorthand to use the size of the previous block. + + If the input file is bigger than the sum of sizes, the last + value in sizes is repeated until the end of the file. A special + value of 0 may be used as the last value to indicate that the + rest of the file should be encoded as a single block. + + If one specifies sizes that exceed the encoder's block size (ei- + ther the default value in threaded mode or the value specified + with --block-size=size), the encoder will create additional + blocks while keeping the boundaries specified in sizes. For ex- + ample, if one specifies --block-size=10MiB + --block-list=5MiB,10MiB,8MiB,12MiB,24MiB and the input file is + 80 MiB, one will get 11 blocks: 5, 10, 8, 10, 2, 10, 10, 4, 10, + 10, and 1 MiB. + + In multi-threaded mode the sizes of the blocks are stored in the + block headers. This isn't done in single-threaded mode, so the + encoded output won't be identical to that of the multi-threaded + mode. + + --flush-timeout=timeout + When compressing, if more than timeout milliseconds (a positive + integer) has passed since the previous flush and reading more + input would block, all the pending input data is flushed from + the encoder and made available in the output stream. This can + be useful if xz is used to compress data that is streamed over a + network. Small timeout values make the data available at the + receiving end with a small delay, but large timeout values give + better compression ratio. + + This feature is disabled by default. If this option is speci- + fied more than once, the last one takes effect. The special + timeout value of 0 can be used to explicitly disable this fea- + ture. + + This feature is not available on non-POSIX systems. + + This feature is still experimental. Currently xz is unsuitable + for decompressing the stream in real time due to how xz does + buffering. + + --memlimit-compress=limit + Set a memory usage limit for compression. If this option is + specified multiple times, the last one takes effect. + + If the compression settings exceed the limit, xz will attempt to + adjust the settings downwards so that the limit is no longer ex- + ceeded and display a notice that automatic adjustment was done. + The adjustments are done in this order: reducing the number of + threads, switching to single-threaded mode if even one thread in + multi-threaded mode exceeds the limit, and finally reducing the + LZMA2 dictionary size. + + When compressing with --format=raw or if --no-adjust has been + specified, only the number of threads may be reduced since it + can be done without affecting the compressed output. + + If the limit cannot be met even with the adjustments described + above, an error is displayed and xz will exit with exit status + 1. + + The limit can be specified in multiple ways: + + o The limit can be an absolute value in bytes. Using an inte- + ger suffix like MiB can be useful. Example: --memlimit-com- + press=80MiB + + o The limit can be specified as a percentage of total physical + memory (RAM). This can be useful especially when setting the + XZ_DEFAULTS environment variable in a shell initialization + script that is shared between different computers. That way + the limit is automatically bigger on systems with more mem- + ory. Example: --memlimit-compress=70% + + o The limit can be reset back to its default value by setting + it to 0. This is currently equivalent to setting the limit + to max (no memory usage limit). + + For 32-bit xz there is a special case: if the limit would be + over 4020 MiB, the limit is set to 4020 MiB. On MIPS32 2000 MiB + is used instead. (The values 0 and max aren't affected by this. + A similar feature doesn't exist for decompression.) This can be + helpful when a 32-bit executable has access to 4 GiB address + space (2 GiB on MIPS32) while hopefully doing no harm in other + situations. + + See also the section Memory usage. + + --memlimit-decompress=limit + Set a memory usage limit for decompression. This also affects + the --list mode. If the operation is not possible without ex- + ceeding the limit, xz will display an error and decompressing + the file will fail. See --memlimit-compress=limit for possible + ways to specify the limit. + + --memlimit-mt-decompress=limit + Set a memory usage limit for multi-threaded decompression. This + can only affect the number of threads; this will never make xz + refuse to decompress a file. If limit is too low to allow any + multi-threading, the limit is ignored and xz will continue in + single-threaded mode. Note that if also --memlimit-decompress + is used, it will always apply to both single-threaded and multi- + threaded modes, and so the effective limit for multi-threading + will never be higher than the limit set with --memlimit-decom- + press. + + In contrast to the other memory usage limit options, --mem- + limit-mt-decompress=limit has a system-specific default limit. + xz --info-memory can be used to see the current value. + + This option and its default value exist because without any + limit the threaded decompressor could end up allocating an in- + sane amount of memory with some input files. If the default + limit is too low on your system, feel free to increase the limit + but never set it to a value larger than the amount of usable RAM + as with appropriate input files xz will attempt to use that + amount of memory even with a low number of threads. Running out + of memory or swapping will not improve decompression perfor- + mance. + + See --memlimit-compress=limit for possible ways to specify the + limit. Setting limit to 0 resets the limit to the default sys- + tem-specific value. + + + + -M limit, --memlimit=limit, --memory=limit + This is equivalent to specifying --memlimit-compress=limit + --memlimit-decompress=limit --memlimit-mt-decompress=limit. + + --no-adjust + Display an error and exit if the memory usage limit cannot be + met without adjusting settings that affect the compressed out- + put. That is, this prevents xz from switching the encoder from + multi-threaded mode to single-threaded mode and from reducing + the LZMA2 dictionary size. Even when this option is used the + number of threads may be reduced to meet the memory usage limit + as that won't affect the compressed output. + + Automatic adjusting is always disabled when creating raw streams + (--format=raw). + + -T threads, --threads=threads + Specify the number of worker threads to use. Setting threads to + a special value 0 makes xz use up to as many threads as the pro- + cessor(s) on the system support. The actual number of threads + can be fewer than threads if the input file is not big enough + for threading with the given settings or if using more threads + would exceed the memory usage limit. + + The single-threaded and multi-threaded compressors produce dif- + ferent output. Single-threaded compressor will give the small- + est file size but only the output from the multi-threaded com- + pressor can be decompressed using multiple threads. Setting + threads to 1 will use the single-threaded mode. Setting threads + to any other value, including 0, will use the multi-threaded + compressor even if the system supports only one hardware thread. + (xz 5.2.x used single-threaded mode in this situation.) + + To use multi-threaded mode with only one thread, set threads to + +1. The + prefix has no effect with values other than 1. A + memory usage limit can still make xz switch to single-threaded + mode unless --no-adjust is used. Support for the + prefix was + added in xz 5.4.0. + + If an automatic number of threads has been requested and no mem- + ory usage limit has been specified, then a system-specific de- + fault soft limit will be used to possibly limit the number of + threads. It is a soft limit in sense that it is ignored if the + number of threads becomes one, thus a soft limit will never stop + xz from compressing or decompressing. This default soft limit + will not make xz switch from multi-threaded mode to single- + threaded mode. The active limits can be seen with xz + --info-memory. + + Currently the only threading method is to split the input into + blocks and compress them independently from each other. The de- + fault block size depends on the compression level and can be + overridden with the --block-size=size option. + + Threaded decompression only works on files that contain multiple + blocks with size information in block headers. All large enough + files compressed in multi-threaded mode meet this condition, but + files compressed in single-threaded mode don't even if + --block-size=size has been used. + + Custom compressor filter chains + A custom filter chain allows specifying the compression settings in de- + tail instead of relying on the settings associated to the presets. + When a custom filter chain is specified, preset options (-0 ... -9 and + --extreme) earlier on the command line are forgotten. If a preset op- + tion is specified after one or more custom filter chain options, the + new preset takes effect and the custom filter chain options specified + earlier are forgotten. + + A filter chain is comparable to piping on the command line. When com- + pressing, the uncompressed input goes to the first filter, whose output + goes to the next filter (if any). The output of the last filter gets + written to the compressed file. The maximum number of filters in the + chain is four, but typically a filter chain has only one or two fil- + ters. + + Many filters have limitations on where they can be in the filter chain: + some filters can work only as the last filter in the chain, some only + as a non-last filter, and some work in any position in the chain. De- + pending on the filter, this limitation is either inherent to the filter + design or exists to prevent security issues. + + A custom filter chain is specified by using one or more filter options + in the order they are wanted in the filter chain. That is, the order + of filter options is significant! When decoding raw streams (--for- + mat=raw), the filter chain is specified in the same order as it was + specified when compressing. + + Filters take filter-specific options as a comma-separated list. Extra + commas in options are ignored. Every option has a default value, so + you need to specify only those you want to change. + + To see the whole filter chain and options, use xz -vv (that is, use + --verbose twice). This works also for viewing the filter chain options + used by presets. + + --lzma1[=options] + --lzma2[=options] + Add LZMA1 or LZMA2 filter to the filter chain. These filters + can be used only as the last filter in the chain. + + LZMA1 is a legacy filter, which is supported almost solely due + to the legacy .lzma file format, which supports only LZMA1. + LZMA2 is an updated version of LZMA1 to fix some practical is- + sues of LZMA1. The .xz format uses LZMA2 and doesn't support + LZMA1 at all. Compression speed and ratios of LZMA1 and LZMA2 + are practically the same. + + LZMA1 and LZMA2 share the same set of options: + + preset=preset + Reset all LZMA1 or LZMA2 options to preset. Preset con- + sist of an integer, which may be followed by single-let- + ter preset modifiers. The integer can be from 0 to 9, + matching the command line options -0 ... -9. The only + supported modifier is currently e, which matches --ex- + treme. If no preset is specified, the default values of + LZMA1 or LZMA2 options are taken from the preset 6. + + dict=size + Dictionary (history buffer) size indicates how many bytes + of the recently processed uncompressed data is kept in + memory. The algorithm tries to find repeating byte se- + quences (matches) in the uncompressed data, and replace + them with references to the data currently in the dictio- + nary. The bigger the dictionary, the higher is the + chance to find a match. Thus, increasing dictionary size + usually improves compression ratio, but a dictionary big- + ger than the uncompressed file is waste of memory. + + Typical dictionary size is from 64 KiB to 64 MiB. The + minimum is 4 KiB. The maximum for compression is cur- + rently 1.5 GiB (1536 MiB). The decompressor already sup- + ports dictionaries up to one byte less than 4 GiB, which + is the maximum for the LZMA1 and LZMA2 stream formats. + + Dictionary size and match finder (mf) together determine + the memory usage of the LZMA1 or LZMA2 encoder. The same + (or bigger) dictionary size is required for decompressing + that was used when compressing, thus the memory usage of + the decoder is determined by the dictionary size used + when compressing. The .xz headers store the dictionary + size either as 2^n or 2^n + 2^(n-1), so these sizes are + somewhat preferred for compression. Other sizes will get + rounded up when stored in the .xz headers. + + lc=lc Specify the number of literal context bits. The minimum + is 0 and the maximum is 4; the default is 3. In addi- + tion, the sum of lc and lp must not exceed 4. + + All bytes that cannot be encoded as matches are encoded + as literals. That is, literals are simply 8-bit bytes + that are encoded one at a time. + + The literal coding makes an assumption that the highest + lc bits of the previous uncompressed byte correlate with + the next byte. For example, in typical English text, an + upper-case letter is often followed by a lower-case let- + ter, and a lower-case letter is usually followed by an- + other lower-case letter. In the US-ASCII character set, + the highest three bits are 010 for upper-case letters and + 011 for lower-case letters. When lc is at least 3, the + literal coding can take advantage of this property in the + uncompressed data. + + The default value (3) is usually good. If you want maxi- + mum compression, test lc=4. Sometimes it helps a little, + and sometimes it makes compression worse. If it makes it + worse, test lc=2 too. + + lp=lp Specify the number of literal position bits. The minimum + is 0 and the maximum is 4; the default is 0. + + Lp affects what kind of alignment in the uncompressed + data is assumed when encoding literals. See pb below for + more information about alignment. + + pb=pb Specify the number of position bits. The minimum is 0 + and the maximum is 4; the default is 2. + + Pb affects what kind of alignment in the uncompressed + data is assumed in general. The default means four-byte + alignment (2^pb=2^2=4), which is often a good choice when + there's no better guess. + + When the alignment is known, setting pb accordingly may + reduce the file size a little. For example, with text + files having one-byte alignment (US-ASCII, ISO-8859-*, + UTF-8), setting pb=0 can improve compression slightly. + For UTF-16 text, pb=1 is a good choice. If the alignment + is an odd number like 3 bytes, pb=0 might be the best + choice. + + Even though the assumed alignment can be adjusted with pb + and lp, LZMA1 and LZMA2 still slightly favor 16-byte + alignment. It might be worth taking into account when + designing file formats that are likely to be often com- + pressed with LZMA1 or LZMA2. + + mf=mf Match finder has a major effect on encoder speed, memory + usage, and compression ratio. Usually Hash Chain match + finders are faster than Binary Tree match finders. The + default depends on the preset: 0 uses hc3, 1-3 use hc4, + and the rest use bt4. + + The following match finders are supported. The memory + usage formulas below are rough approximations, which are + closest to the reality when dict is a power of two. + + hc3 Hash Chain with 2- and 3-byte hashing + Minimum value for nice: 3 + Memory usage: + dict * 7.5 (if dict <= 16 MiB); + dict * 5.5 + 64 MiB (if dict > 16 MiB) + + hc4 Hash Chain with 2-, 3-, and 4-byte hashing + Minimum value for nice: 4 + Memory usage: + dict * 7.5 (if dict <= 32 MiB); + dict * 6.5 (if dict > 32 MiB) + + bt2 Binary Tree with 2-byte hashing + Minimum value for nice: 2 + Memory usage: dict * 9.5 + + bt3 Binary Tree with 2- and 3-byte hashing + Minimum value for nice: 3 + Memory usage: + dict * 11.5 (if dict <= 16 MiB); + dict * 9.5 + 64 MiB (if dict > 16 MiB) + + bt4 Binary Tree with 2-, 3-, and 4-byte hashing + Minimum value for nice: 4 + Memory usage: + dict * 11.5 (if dict <= 32 MiB); + dict * 10.5 (if dict > 32 MiB) + + mode=mode + Compression mode specifies the method to analyze the data + produced by the match finder. Supported modes are fast + and normal. The default is fast for presets 0-3 and nor- + mal for presets 4-9. + + Usually fast is used with Hash Chain match finders and + normal with Binary Tree match finders. This is also what + the presets do. + + nice=nice + Specify what is considered to be a nice length for a + match. Once a match of at least nice bytes is found, the + algorithm stops looking for possibly better matches. + + Nice can be 2-273 bytes. Higher values tend to give bet- + ter compression ratio at the expense of speed. The de- + fault depends on the preset. + + depth=depth + Specify the maximum search depth in the match finder. + The default is the special value of 0, which makes the + compressor determine a reasonable depth from mf and nice. + + Reasonable depth for Hash Chains is 4-100 and 16-1000 for + Binary Trees. Using very high values for depth can make + the encoder extremely slow with some files. Avoid set- + ting the depth over 1000 unless you are prepared to in- + terrupt the compression in case it is taking far too + long. + + When decoding raw streams (--format=raw), LZMA2 needs only the + dictionary size. LZMA1 needs also lc, lp, and pb. + + --x86[=options] + --arm[=options] + --armthumb[=options] + --arm64[=options] + --powerpc[=options] + --ia64[=options] + --sparc[=options] + Add a branch/call/jump (BCJ) filter to the filter chain. These + filters can be used only as a non-last filter in the filter + chain. + + A BCJ filter converts relative addresses in the machine code to + their absolute counterparts. This doesn't change the size of + the data but it increases redundancy, which can help LZMA2 to + produce 0-15 % smaller .xz file. The BCJ filters are always re- + versible, so using a BCJ filter for wrong type of data doesn't + cause any data loss, although it may make the compression ratio + slightly worse. The BCJ filters are very fast and use an in- + significant amount of memory. + + These BCJ filters have known problems related to the compression + ratio: + + o Some types of files containing executable code (for example, + object files, static libraries, and Linux kernel modules) + have the addresses in the instructions filled with filler + values. These BCJ filters will still do the address conver- + sion, which will make the compression worse with these files. + + o If a BCJ filter is applied on an archive, it is possible that + it makes the compression ratio worse than not using a BCJ + filter. For example, if there are similar or even identical + executables then filtering will likely make the files less + similar and thus compression is worse. The contents of non- + executable files in the same archive can matter too. In + practice one has to try with and without a BCJ filter to see + which is better in each situation. + + Different instruction sets have different alignment: the exe- + cutable file must be aligned to a multiple of this value in the + input data to make the filter work. + + Filter Alignment Notes + x86 1 32-bit or 64-bit x86 + ARM 4 + ARM-Thumb 2 + ARM64 4 4096-byte alignment is best + PowerPC 4 Big endian only + IA-64 16 Itanium + SPARC 4 + + Since the BCJ-filtered data is usually compressed with LZMA2, + the compression ratio may be improved slightly if the LZMA2 op- + tions are set to match the alignment of the selected BCJ filter. + For example, with the IA-64 filter, it's good to set pb=4 or + even pb=4,lp=4,lc=0 with LZMA2 (2^4=16). The x86 filter is an + exception; it's usually good to stick to LZMA2's default four- + byte alignment when compressing x86 executables. + + All BCJ filters support the same options: + + start=offset + Specify the start offset that is used when converting be- + tween relative and absolute addresses. The offset must + be a multiple of the alignment of the filter (see the ta- + ble above). The default is zero. In practice, the de- + fault is good; specifying a custom offset is almost never + useful. + + --delta[=options] + Add the Delta filter to the filter chain. The Delta filter can + be only used as a non-last filter in the filter chain. + + Currently only simple byte-wise delta calculation is supported. + It can be useful when compressing, for example, uncompressed + bitmap images or uncompressed PCM audio. However, special pur- + pose algorithms may give significantly better results than Delta + + LZMA2. This is true especially with audio, which compresses + faster and better, for example, with flac(1). + + Supported options: + + dist=distance + Specify the distance of the delta calculation in bytes. + distance must be 1-256. The default is 1. + + For example, with dist=2 and eight-byte input A1 B1 A2 B3 + A3 B5 A4 B7, the output will be A1 B1 01 02 01 02 01 02. + + Other options + -q, --quiet + Suppress warnings and notices. Specify this twice to suppress + errors too. This option has no effect on the exit status. That + is, even if a warning was suppressed, the exit status to indi- + cate a warning is still used. + + -v, --verbose + Be verbose. If standard error is connected to a terminal, xz + will display a progress indicator. Specifying --verbose twice + will give even more verbose output. + + The progress indicator shows the following information: + + o Completion percentage is shown if the size of the input file + is known. That is, the percentage cannot be shown in pipes. + + o Amount of compressed data produced (compressing) or consumed + (decompressing). + + o Amount of uncompressed data consumed (compressing) or pro- + duced (decompressing). + + o Compression ratio, which is calculated by dividing the amount + of compressed data processed so far by the amount of uncom- + pressed data processed so far. + + o Compression or decompression speed. This is measured as the + amount of uncompressed data consumed (compression) or pro- + duced (decompression) per second. It is shown after a few + seconds have passed since xz started processing the file. + + o Elapsed time in the format M:SS or H:MM:SS. + + o Estimated remaining time is shown only when the size of the + input file is known and a couple of seconds have already + passed since xz started processing the file. The time is + shown in a less precise format which never has any colons, + for example, 2 min 30 s. + + When standard error is not a terminal, --verbose will make xz + print the filename, compressed size, uncompressed size, compres- + sion ratio, and possibly also the speed and elapsed time on a + single line to standard error after compressing or decompressing + the file. The speed and elapsed time are included only when the + operation took at least a few seconds. If the operation didn't + finish, for example, due to user interruption, also the comple- + tion percentage is printed if the size of the input file is + known. + + -Q, --no-warn + Don't set the exit status to 2 even if a condition worth a warn- + ing was detected. This option doesn't affect the verbosity + level, thus both --quiet and --no-warn have to be used to not + display warnings and to not alter the exit status. + + --robot + Print messages in a machine-parsable format. This is intended + to ease writing frontends that want to use xz instead of li- + blzma, which may be the case with various scripts. The output + with this option enabled is meant to be stable across xz re- + leases. See the section ROBOT MODE for details. + + --info-memory + Display, in human-readable format, how much physical memory + (RAM) and how many processor threads xz thinks the system has + and the memory usage limits for compression and decompression, + and exit successfully. + + -h, --help + Display a help message describing the most commonly used op- + tions, and exit successfully. + + -H, --long-help + Display a help message describing all features of xz, and exit + successfully + + -V, --version + Display the version number of xz and liblzma in human readable + format. To get machine-parsable output, specify --robot before + --version. + +ROBOT MODE + The robot mode is activated with the --robot option. It makes the out- + put of xz easier to parse by other programs. Currently --robot is sup- + ported only together with --version, --info-memory, and --list. It + will be supported for compression and decompression in the future. + + Version + xz --robot --version will print the version number of xz and liblzma in + the following format: + + XZ_VERSION=XYYYZZZS + LIBLZMA_VERSION=XYYYZZZS + + X Major version. + + YYY Minor version. Even numbers are stable. Odd numbers are alpha + or beta versions. + + ZZZ Patch level for stable releases or just a counter for develop- + ment releases. + + S Stability. 0 is alpha, 1 is beta, and 2 is stable. S should be + always 2 when YYY is even. + + XYYYZZZS are the same on both lines if xz and liblzma are from the same + XZ Utils release. + + Examples: 4.999.9beta is 49990091 and 5.0.0 is 50000002. + + Memory limit information + xz --robot --info-memory prints a single line with three tab-separated + columns: + + 1. Total amount of physical memory (RAM) in bytes. + + 2. Memory usage limit for compression in bytes (--memlimit-compress). + A special value of 0 indicates the default setting which for sin- + gle-threaded mode is the same as no limit. + + 3. Memory usage limit for decompression in bytes (--memlimit-decom- + press). A special value of 0 indicates the default setting which + for single-threaded mode is the same as no limit. + + 4. Since xz 5.3.4alpha: Memory usage for multi-threaded decompression + in bytes (--memlimit-mt-decompress). This is never zero because a + system-specific default value shown in the column 5 is used if no + limit has been specified explicitly. This is also never greater + than the value in the column 3 even if a larger value has been + specified with --memlimit-mt-decompress. + + 5. Since xz 5.3.4alpha: A system-specific default memory usage limit + that is used to limit the number of threads when compressing with + an automatic number of threads (--threads=0) and no memory usage + limit has been specified (--memlimit-compress). This is also used + as the default value for --memlimit-mt-decompress. + + 6. Since xz 5.3.4alpha: Number of available processor threads. + + In the future, the output of xz --robot --info-memory may have more + columns, but never more than a single line. + + List mode + xz --robot --list uses tab-separated output. The first column of every + line has a string that indicates the type of the information found on + that line: + + name This is always the first line when starting to list a file. The + second column on the line is the filename. + + file This line contains overall information about the .xz file. This + line is always printed after the name line. + + stream This line type is used only when --verbose was specified. There + are as many stream lines as there are streams in the .xz file. + + block This line type is used only when --verbose was specified. There + are as many block lines as there are blocks in the .xz file. + The block lines are shown after all the stream lines; different + line types are not interleaved. + + summary + This line type is used only when --verbose was specified twice. + This line is printed after all block lines. Like the file line, + the summary line contains overall information about the .xz + file. + + totals This line is always the very last line of the list output. It + shows the total counts and sizes. + + The columns of the file lines: + 2. Number of streams in the file + 3. Total number of blocks in the stream(s) + 4. Compressed size of the file + 5. Uncompressed size of the file + 6. Compression ratio, for example, 0.123. If ratio is over + 9.999, three dashes (---) are displayed instead of the ra- + tio. + 7. Comma-separated list of integrity check names. The follow- + ing strings are used for the known check types: None, CRC32, + CRC64, and SHA-256. For unknown check types, Unknown-N is + used, where N is the Check ID as a decimal number (one or + two digits). + 8. Total size of stream padding in the file + + The columns of the stream lines: + 2. Stream number (the first stream is 1) + 3. Number of blocks in the stream + 4. Compressed start offset + 5. Uncompressed start offset + 6. Compressed size (does not include stream padding) + 7. Uncompressed size + 8. Compression ratio + 9. Name of the integrity check + 10. Size of stream padding + + The columns of the block lines: + 2. Number of the stream containing this block + 3. Block number relative to the beginning of the stream (the + first block is 1) + 4. Block number relative to the beginning of the file + 5. Compressed start offset relative to the beginning of the + file + 6. Uncompressed start offset relative to the beginning of the + file + 7. Total compressed size of the block (includes headers) + 8. Uncompressed size + 9. Compression ratio + 10. Name of the integrity check + + If --verbose was specified twice, additional columns are included on + the block lines. These are not displayed with a single --verbose, be- + cause getting this information requires many seeks and can thus be + slow: + 11. Value of the integrity check in hexadecimal + 12. Block header size + 13. Block flags: c indicates that compressed size is present, + and u indicates that uncompressed size is present. If the + flag is not set, a dash (-) is shown instead to keep the + string length fixed. New flags may be added to the end of + the string in the future. + 14. Size of the actual compressed data in the block (this ex- + cludes the block header, block padding, and check fields) + 15. Amount of memory (in bytes) required to decompress this + block with this xz version + 16. Filter chain. Note that most of the options used at com- + pression time cannot be known, because only the options that + are needed for decompression are stored in the .xz headers. + + The columns of the summary lines: + 2. Amount of memory (in bytes) required to decompress this file + with this xz version + 3. yes or no indicating if all block headers have both com- + pressed size and uncompressed size stored in them + Since xz 5.1.2alpha: + 4. Minimum xz version required to decompress the file + + The columns of the totals line: + 2. Number of streams + 3. Number of blocks + 4. Compressed size + 5. Uncompressed size + 6. Average compression ratio + 7. Comma-separated list of integrity check names that were + present in the files + 8. Stream padding size + 9. Number of files. This is here to keep the order of the ear- + lier columns the same as on file lines. + + If --verbose was specified twice, additional columns are included on + the totals line: + 10. Maximum amount of memory (in bytes) required to decompress + the files with this xz version + 11. yes or no indicating if all block headers have both com- + pressed size and uncompressed size stored in them + Since xz 5.1.2alpha: + 12. Minimum xz version required to decompress the file + + Future versions may add new line types and new columns can be added to + the existing line types, but the existing columns won't be changed. + +EXIT STATUS + 0 All is good. + + 1 An error occurred. + + 2 Something worth a warning occurred, but no actual errors oc- + curred. + + Notices (not warnings or errors) printed on standard error don't affect + the exit status. + +ENVIRONMENT + xz parses space-separated lists of options from the environment vari- + ables XZ_DEFAULTS and XZ_OPT, in this order, before parsing the options + from the command line. Note that only options are parsed from the en- + vironment variables; all non-options are silently ignored. Parsing is + done with getopt_long(3) which is used also for the command line argu- + ments. + + XZ_DEFAULTS + User-specific or system-wide default options. Typically this is + set in a shell initialization script to enable xz's memory usage + limiter by default. Excluding shell initialization scripts and + similar special cases, scripts must never set or unset XZ_DE- + FAULTS. + + XZ_OPT This is for passing options to xz when it is not possible to set + the options directly on the xz command line. This is the case + when xz is run by a script or tool, for example, GNU tar(1): + + XZ_OPT=-2v tar caf foo.tar.xz foo + + Scripts may use XZ_OPT, for example, to set script-specific de- + fault compression options. It is still recommended to allow + users to override XZ_OPT if that is reasonable. For example, in + sh(1) scripts one may use something like this: + + XZ_OPT=${XZ_OPT-"-7e"} + export XZ_OPT + +LZMA UTILS COMPATIBILITY + The command line syntax of xz is practically a superset of lzma, un- + lzma, and lzcat as found from LZMA Utils 4.32.x. In most cases, it is + possible to replace LZMA Utils with XZ Utils without breaking existing + scripts. There are some incompatibilities though, which may sometimes + cause problems. + + Compression preset levels + The numbering of the compression level presets is not identical in xz + and LZMA Utils. The most important difference is how dictionary sizes + are mapped to different presets. Dictionary size is roughly equal to + the decompressor memory usage. + + Level xz LZMA Utils + -0 256 KiB N/A + -1 1 MiB 64 KiB + -2 2 MiB 1 MiB + -3 4 MiB 512 KiB + -4 4 MiB 1 MiB + -5 8 MiB 2 MiB + -6 8 MiB 4 MiB + -7 16 MiB 8 MiB + -8 32 MiB 16 MiB + -9 64 MiB 32 MiB + + The dictionary size differences affect the compressor memory usage too, + but there are some other differences between LZMA Utils and XZ Utils, + which make the difference even bigger: + + Level xz LZMA Utils 4.32.x + -0 3 MiB N/A + -1 9 MiB 2 MiB + -2 17 MiB 12 MiB + -3 32 MiB 12 MiB + -4 48 MiB 16 MiB + -5 94 MiB 26 MiB + -6 94 MiB 45 MiB + -7 186 MiB 83 MiB + -8 370 MiB 159 MiB + -9 674 MiB 311 MiB + + The default preset level in LZMA Utils is -7 while in XZ Utils it is + -6, so both use an 8 MiB dictionary by default. + + Streamed vs. non-streamed .lzma files + The uncompressed size of the file can be stored in the .lzma header. + LZMA Utils does that when compressing regular files. The alternative + is to mark that uncompressed size is unknown and use end-of-payload + marker to indicate where the decompressor should stop. LZMA Utils uses + this method when uncompressed size isn't known, which is the case, for + example, in pipes. + + xz supports decompressing .lzma files with or without end-of-payload + marker, but all .lzma files created by xz will use end-of-payload + marker and have uncompressed size marked as unknown in the .lzma + header. This may be a problem in some uncommon situations. For exam- + ple, a .lzma decompressor in an embedded device might work only with + files that have known uncompressed size. If you hit this problem, you + need to use LZMA Utils or LZMA SDK to create .lzma files with known un- + compressed size. + + Unsupported .lzma files + The .lzma format allows lc values up to 8, and lp values up to 4. LZMA + Utils can decompress files with any lc and lp, but always creates files + with lc=3 and lp=0. Creating files with other lc and lp is possible + with xz and with LZMA SDK. + + The implementation of the LZMA1 filter in liblzma requires that the sum + of lc and lp must not exceed 4. Thus, .lzma files, which exceed this + limitation, cannot be decompressed with xz. + + LZMA Utils creates only .lzma files which have a dictionary size of 2^n + (a power of 2) but accepts files with any dictionary size. liblzma ac- + cepts only .lzma files which have a dictionary size of 2^n or 2^n + + 2^(n-1). This is to decrease false positives when detecting .lzma + files. + + These limitations shouldn't be a problem in practice, since practically + all .lzma files have been compressed with settings that liblzma will + accept. + + Trailing garbage + When decompressing, LZMA Utils silently ignore everything after the + first .lzma stream. In most situations, this is a bug. This also + means that LZMA Utils don't support decompressing concatenated .lzma + files. + + If there is data left after the first .lzma stream, xz considers the + file to be corrupt unless --single-stream was used. This may break ob- + scure scripts which have assumed that trailing garbage is ignored. + +NOTES + Compressed output may vary + The exact compressed output produced from the same uncompressed input + file may vary between XZ Utils versions even if compression options are + identical. This is because the encoder can be improved (faster or bet- + ter compression) without affecting the file format. The output can + vary even between different builds of the same XZ Utils version, if + different build options are used. + + The above means that once --rsyncable has been implemented, the result- + ing files won't necessarily be rsyncable unless both old and new files + have been compressed with the same xz version. This problem can be + fixed if a part of the encoder implementation is frozen to keep rsynca- + ble output stable across xz versions. + + Embedded .xz decompressors + Embedded .xz decompressor implementations like XZ Embedded don't neces- + sarily support files created with integrity check types other than none + and crc32. Since the default is --check=crc64, you must use + --check=none or --check=crc32 when creating files for embedded systems. + + Outside embedded systems, all .xz format decompressors support all the + check types, or at least are able to decompress the file without veri- + fying the integrity check if the particular check is not supported. + + XZ Embedded supports BCJ filters, but only with the default start off- + set. + +EXAMPLES + Basics + Compress the file foo into foo.xz using the default compression level + (-6), and remove foo if compression is successful: + + xz foo + + Decompress bar.xz into bar and don't remove bar.xz even if decompres- + sion is successful: + + xz -dk bar.xz + + Create baz.tar.xz with the preset -4e (-4 --extreme), which is slower + than the default -6, but needs less memory for compression and decom- + pression (48 MiB and 5 MiB, respectively): + + tar cf - baz | xz -4e > baz.tar.xz + + A mix of compressed and uncompressed files can be decompressed to stan- + dard output with a single command: + + xz -dcf a.txt b.txt.xz c.txt d.txt.lzma > abcd.txt + + Parallel compression of many files + On GNU and *BSD, find(1) and xargs(1) can be used to parallelize com- + pression of many files: + + find . -type f \! -name '*.xz' -print0 \ + | xargs -0r -P4 -n16 xz -T1 + + The -P option to xargs(1) sets the number of parallel xz processes. + The best value for the -n option depends on how many files there are to + be compressed. If there are only a couple of files, the value should + probably be 1; with tens of thousands of files, 100 or even more may be + appropriate to reduce the number of xz processes that xargs(1) will + eventually create. + + The option -T1 for xz is there to force it to single-threaded mode, be- + cause xargs(1) is used to control the amount of parallelization. + + Robot mode + Calculate how many bytes have been saved in total after compressing + multiple files: + + xz --robot --list *.xz | awk '/^totals/{print $5-$4}' + + A script may want to know that it is using new enough xz. The follow- + ing sh(1) script checks that the version number of the xz tool is at + least 5.0.0. This method is compatible with old beta versions, which + didn't support the --robot option: + + if ! eval "$(xz --robot --version 2> /dev/null)" || + [ "$XZ_VERSION" -lt 50000002 ]; then + echo "Your xz is too old." + fi + unset XZ_VERSION LIBLZMA_VERSION + + Set a memory usage limit for decompression using XZ_OPT, but if a limit + has already been set, don't increase it: + + NEWLIM=$((123 << 20)) # 123 MiB + OLDLIM=$(xz --robot --info-memory | cut -f3) + if [ $OLDLIM -eq 0 -o $OLDLIM -gt $NEWLIM ]; then + XZ_OPT="$XZ_OPT --memlimit-decompress=$NEWLIM" + export XZ_OPT + fi + + Custom compressor filter chains + The simplest use for custom filter chains is customizing a LZMA2 pre- + set. This can be useful, because the presets cover only a subset of + the potentially useful combinations of compression settings. + + The CompCPU columns of the tables from the descriptions of the options + -0 ... -9 and --extreme are useful when customizing LZMA2 presets. + Here are the relevant parts collected from those two tables: + + Preset CompCPU + -0 0 + + -1 1 + -2 2 + -3 3 + -4 4 + -5 5 + -6 6 + -5e 7 + -6e 8 + + If you know that a file requires somewhat big dictionary (for example, + 32 MiB) to compress well, but you want to compress it quicker than xz + -8 would do, a preset with a low CompCPU value (for example, 1) can be + modified to use a bigger dictionary: + + xz --lzma2=preset=1,dict=32MiB foo.tar + + With certain files, the above command may be faster than xz -6 while + compressing significantly better. However, it must be emphasized that + only some files benefit from a big dictionary while keeping the CompCPU + value low. The most obvious situation, where a big dictionary can help + a lot, is an archive containing very similar files of at least a few + megabytes each. The dictionary size has to be significantly bigger + than any individual file to allow LZMA2 to take full advantage of the + similarities between consecutive files. + + If very high compressor and decompressor memory usage is fine, and the + file being compressed is at least several hundred megabytes, it may be + useful to use an even bigger dictionary than the 64 MiB that xz -9 + would use: + + xz -vv --lzma2=dict=192MiB big_foo.tar + + Using -vv (--verbose --verbose) like in the above example can be useful + to see the memory requirements of the compressor and decompressor. Re- + member that using a dictionary bigger than the size of the uncompressed + file is waste of memory, so the above command isn't useful for small + files. + + Sometimes the compression time doesn't matter, but the decompressor + memory usage has to be kept low, for example, to make it possible to + decompress the file on an embedded system. The following command uses + -6e (-6 --extreme) as a base and sets the dictionary to only 64 KiB. + The resulting file can be decompressed with XZ Embedded (that's why + there is --check=crc32) using about 100 KiB of memory. + + xz --check=crc32 --lzma2=preset=6e,dict=64KiB foo + + If you want to squeeze out as many bytes as possible, adjusting the + number of literal context bits (lc) and number of position bits (pb) + can sometimes help. Adjusting the number of literal position bits (lp) + might help too, but usually lc and pb are more important. For example, + a source code archive contains mostly US-ASCII text, so something like + the following might give slightly (like 0.1 %) smaller file than xz -6e + (try also without lc=4): + + xz --lzma2=preset=6e,pb=0,lc=4 source_code.tar + + Using another filter together with LZMA2 can improve compression with + certain file types. For example, to compress a x86-32 or x86-64 shared + library using the x86 BCJ filter: + + xz --x86 --lzma2 libfoo.so + + Note that the order of the filter options is significant. If --x86 is + specified after --lzma2, xz will give an error, because there cannot be + any filter after LZMA2, and also because the x86 BCJ filter cannot be + used as the last filter in the chain. + + The Delta filter together with LZMA2 can give good results with bitmap + images. It should usually beat PNG, which has a few more advanced fil- + ters than simple delta but uses Deflate for the actual compression. + + The image has to be saved in uncompressed format, for example, as un- + compressed TIFF. The distance parameter of the Delta filter is set to + match the number of bytes per pixel in the image. For example, 24-bit + RGB bitmap needs dist=3, and it is also good to pass pb=0 to LZMA2 to + accommodate the three-byte alignment: + + xz --delta=dist=3 --lzma2=pb=0 foo.tiff + + If multiple images have been put into a single archive (for example, + .tar), the Delta filter will work on that too as long as all images + have the same number of bytes per pixel. + +SEE ALSO + xzdec(1), xzdiff(1), xzgrep(1), xzless(1), xzmore(1), gzip(1), + bzip2(1), 7z(1) + + XZ Utils: <https://tukaani.org/xz/> + XZ Embedded: <https://tukaani.org/xz/embedded.html> + LZMA SDK: <http://7-zip.org/sdk.html> + + + +Tukaani 2022-12-01 XZ(1) diff --git a/doc/man/txt/xzdec.txt b/doc/man/txt/xzdec.txt new file mode 100644 index 0000000..a914e20 --- /dev/null +++ b/doc/man/txt/xzdec.txt @@ -0,0 +1,80 @@ +XZDEC(1) XZ Utils XZDEC(1) + + + +NAME + xzdec, lzmadec - Small .xz and .lzma decompressors + +SYNOPSIS + xzdec [option...] [file...] + lzmadec [option...] [file...] + +DESCRIPTION + xzdec is a liblzma-based decompression-only tool for .xz (and only .xz) + files. xzdec is intended to work as a drop-in replacement for xz(1) in + the most common situations where a script has been written to use xz + --decompress --stdout (and possibly a few other commonly used options) + to decompress .xz files. lzmadec is identical to xzdec except that lz- + madec supports .lzma files instead of .xz files. + + To reduce the size of the executable, xzdec doesn't support multi- + threading or localization, and doesn't read options from XZ_DEFAULTS + and XZ_OPT environment variables. xzdec doesn't support displaying in- + termediate progress information: sending SIGINFO to xzdec does nothing, + but sending SIGUSR1 terminates the process instead of displaying + progress information. + +OPTIONS + -d, --decompress, --uncompress + Ignored for xz(1) compatibility. xzdec supports only decompres- + sion. + + -k, --keep + Ignored for xz(1) compatibility. xzdec never creates or removes + any files. + + -c, --stdout, --to-stdout + Ignored for xz(1) compatibility. xzdec always writes the decom- + pressed data to standard output. + + -q, --quiet + Specifying this once does nothing since xzdec never displays any + warnings or notices. Specify this twice to suppress errors. + + -Q, --no-warn + Ignored for xz(1) compatibility. xzdec never uses the exit sta- + tus 2. + + -h, --help + Display a help message and exit successfully. + + -V, --version + Display the version number of xzdec and liblzma. + +EXIT STATUS + 0 All was good. + + 1 An error occurred. + + xzdec doesn't have any warning messages like xz(1) has, thus the exit + status 2 is not used by xzdec. + +NOTES + Use xz(1) instead of xzdec or lzmadec for normal everyday use. xzdec + or lzmadec are meant only for situations where it is important to have + a smaller decompressor than the full-featured xz(1). + + xzdec and lzmadec are not really that small. The size can be reduced + further by dropping features from liblzma at compile time, but that + shouldn't usually be done for executables distributed in typical non- + embedded operating system distributions. If you need a truly small .xz + decompressor, consider using XZ Embedded. + +SEE ALSO + xz(1) + + XZ Embedded: <https://tukaani.org/xz/embedded.html> + + + +Tukaani 2017-04-19 XZDEC(1) diff --git a/doc/man/txt/xzdiff.txt b/doc/man/txt/xzdiff.txt new file mode 100644 index 0000000..681b00c --- /dev/null +++ b/doc/man/txt/xzdiff.txt @@ -0,0 +1,37 @@ +XZDIFF(1) XZ Utils XZDIFF(1) + + + +NAME + xzcmp, xzdiff, lzcmp, lzdiff - compare compressed files + +SYNOPSIS + xzcmp [cmp_options] file1 [file2] + xzdiff [diff_options] file1 [file2] + lzcmp [cmp_options] file1 [file2] + lzdiff [diff_options] file1 [file2] + +DESCRIPTION + xzcmp and xzdiff invoke cmp(1) or diff(1) on files compressed with + xz(1), lzma(1), gzip(1), bzip2(1), lzop(1), or zstd(1). All options + specified are passed directly to cmp(1) or diff(1). If only one file + is specified, then the files compared are file1 (which must have a suf- + fix of a supported compression format) and file1 from which the com- + pression format suffix has been stripped. If two files are specified, + then they are uncompressed if necessary and fed to cmp(1) or diff(1). + The exit status from cmp(1) or diff(1) is preserved unless a decompres- + sion error occurs; then exit status is 2. + + The names lzcmp and lzdiff are provided for backward compatibility with + LZMA Utils. + +SEE ALSO + cmp(1), diff(1), xz(1), gzip(1), bzip2(1), lzop(1), zstd(1), zdiff(1) + +BUGS + Messages from the cmp(1) or diff(1) programs refer to temporary file- + names instead of those specified. + + + +Tukaani 2021-06-04 XZDIFF(1) diff --git a/doc/man/txt/xzgrep.txt b/doc/man/txt/xzgrep.txt new file mode 100644 index 0000000..596520c --- /dev/null +++ b/doc/man/txt/xzgrep.txt @@ -0,0 +1,49 @@ +XZGREP(1) XZ Utils XZGREP(1) + + + +NAME + xzgrep - search compressed files for a regular expression + +SYNOPSIS + xzgrep [grep_options] [-e] pattern [file...] + xzegrep ... + xzfgrep ... + lzgrep ... + lzegrep ... + lzfgrep ... + +DESCRIPTION + xzgrep invokes grep(1) on files which may be either uncompressed or + compressed with xz(1), lzma(1), gzip(1), bzip2(1), lzop(1), or zstd(1). + All options specified are passed directly to grep(1). + + If no file is specified, then standard input is decompressed if neces- + sary and fed to grep(1). When reading from standard input, gzip(1), + bzip2(1), lzop(1), and zstd(1) compressed files are not supported. + + If xzgrep is invoked as xzegrep or xzfgrep then grep -E or grep -F is + used instead of grep(1). The same applies to names lzgrep, lzegrep, + and lzfgrep, which are provided for backward compatibility with LZMA + Utils. + +EXIT STATUS + 0 At least one match was found from at least one of the input + files. No errors occurred. + + 1 No matches were found from any of the input files. No errors + occurred. + + >1 One or more errors occurred. It is unknown if matches were + found. + +ENVIRONMENT + GREP If the GREP environment variable is set, xzgrep uses it instead + of grep(1), grep -E, or grep -F. + +SEE ALSO + grep(1), xz(1), gzip(1), bzip2(1), lzop(1), zstd(1), zgrep(1) + + + +Tukaani 2022-07-19 XZGREP(1) diff --git a/doc/man/txt/xzless.txt b/doc/man/txt/xzless.txt new file mode 100644 index 0000000..5c14c80 --- /dev/null +++ b/doc/man/txt/xzless.txt @@ -0,0 +1,39 @@ +XZLESS(1) XZ Utils XZLESS(1) + + + +NAME + xzless, lzless - view xz or lzma compressed (text) files + +SYNOPSIS + xzless [file...] + lzless [file...] + +DESCRIPTION + xzless is a filter that displays text from compressed files to a termi- + nal. It works on files compressed with xz(1) or lzma(1). If no files + are given, xzless reads from standard input. + + xzless uses less(1) to present its output. Unlike xzmore, its choice + of pager cannot be altered by setting an environment variable. Com- + mands are based on both more(1) and vi(1) and allow back and forth + movement and searching. See the less(1) manual for more information. + + The command named lzless is provided for backward compatibility with + LZMA Utils. + +ENVIRONMENT + LESSMETACHARS + A list of characters special to the shell. Set by xzless unless + it is already set in the environment. + + LESSOPEN + Set to a command line to invoke the xz(1) decompressor for pre- + processing the input files to less(1). + +SEE ALSO + less(1), xz(1), xzmore(1), zless(1) + + + +Tukaani 2010-09-27 XZLESS(1) diff --git a/doc/man/txt/xzmore.txt b/doc/man/txt/xzmore.txt new file mode 100644 index 0000000..5a9d86c --- /dev/null +++ b/doc/man/txt/xzmore.txt @@ -0,0 +1,34 @@ +XZMORE(1) XZ Utils XZMORE(1) + + + +NAME + xzmore, lzmore - view xz or lzma compressed (text) files + +SYNOPSIS + xzmore [file...] + lzmore [file...] + +DESCRIPTION + xzmore is a filter which allows examination of xz(1) or lzma(1) com- + pressed text files one screenful at a time on a soft-copy terminal. + + To use a pager other than the default more, set environment variable + PAGER to the name of the desired program. The name lzmore is provided + for backward compatibility with LZMA Utils. + + e or q When the prompt --More--(Next file: file) is printed, this com- + mand causes xzmore to exit. + + s When the prompt --More--(Next file: file) is printed, this com- + mand causes xzmore to skip the next file and continue. + + For list of keyboard commands supported while actually viewing the con- + tent of a file, refer to manual of the pager you use, usually more(1). + +SEE ALSO + more(1), xz(1), xzless(1), zmore(1) + + + +Tukaani 2013-06-30 XZMORE(1) diff --git a/doc/xz-file-format.txt b/doc/xz-file-format.txt new file mode 100644 index 0000000..09c83e0 --- /dev/null +++ b/doc/xz-file-format.txt @@ -0,0 +1,1165 @@ + +The .xz File Format +=================== + +Version 1.1.0 (2022-12-11) + + + 0. Preface + 0.1. Notices and Acknowledgements + 0.2. Getting the Latest Version + 0.3. Version History + 1. Conventions + 1.1. Byte and Its Representation + 1.2. Multibyte Integers + 2. Overall Structure of .xz File + 2.1. Stream + 2.1.1. Stream Header + 2.1.1.1. Header Magic Bytes + 2.1.1.2. Stream Flags + 2.1.1.3. CRC32 + 2.1.2. Stream Footer + 2.1.2.1. CRC32 + 2.1.2.2. Backward Size + 2.1.2.3. Stream Flags + 2.1.2.4. Footer Magic Bytes + 2.2. Stream Padding + 3. Block + 3.1. Block Header + 3.1.1. Block Header Size + 3.1.2. Block Flags + 3.1.3. Compressed Size + 3.1.4. Uncompressed Size + 3.1.5. List of Filter Flags + 3.1.6. Header Padding + 3.1.7. CRC32 + 3.2. Compressed Data + 3.3. Block Padding + 3.4. Check + 4. Index + 4.1. Index Indicator + 4.2. Number of Records + 4.3. List of Records + 4.3.1. Unpadded Size + 4.3.2. Uncompressed Size + 4.4. Index Padding + 4.5. CRC32 + 5. Filter Chains + 5.1. Alignment + 5.2. Security + 5.3. Filters + 5.3.1. LZMA2 + 5.3.2. Branch/Call/Jump Filters for Executables + 5.3.3. Delta + 5.3.3.1. Format of the Encoded Output + 5.4. Custom Filter IDs + 5.4.1. Reserved Custom Filter ID Ranges + 6. Cyclic Redundancy Checks + 7. References + + +0. Preface + + This document describes the .xz file format (filename suffix + ".xz", MIME type "application/x-xz"). It is intended that this + this format replace the old .lzma format used by LZMA SDK and + LZMA Utils. + + +0.1. Notices and Acknowledgements + + This file format was designed by Lasse Collin + <lasse.collin@tukaani.org> and Igor Pavlov. + + Special thanks for helping with this document goes to + Ville Koskinen. Thanks for helping with this document goes to + Mark Adler, H. Peter Anvin, Mikko Pouru, and Lars Wirzenius. + + This document has been put into the public domain. + + +0.2. Getting the Latest Version + + The latest official version of this document can be downloaded + from <http://tukaani.org/xz/xz-file-format.txt>. + + Specific versions of this document have a filename + xz-file-format-X.Y.Z.txt where X.Y.Z is the version number. + For example, the version 1.0.0 of this document is available + at <http://tukaani.org/xz/xz-file-format-1.0.0.txt>. + + +0.3. Version History + + Version Date Description + + 1.1.0 2022-12-11 Added ARM64 filter and clarified 32-bit + ARM endianness in Section 5.3.2, + language improvements in Section 5.4 + + 1.0.4 2009-08-27 Language improvements in Sections 1.2, + 2.1.1.2, 3.1.1, 3.1.2, and 5.3.1 + + 1.0.3 2009-06-05 Spelling fixes in Sections 5.1 and 5.4 + + 1.0.2 2009-06-04 Typo fixes in Sections 4 and 5.3.1 + + 1.0.1 2009-06-01 Typo fix in Section 0.3 and minor + clarifications to Sections 2, 2.2, + 3.3, 4.4, and 5.3.2 + + 1.0.0 2009-01-14 The first official version + + +1. Conventions + + The key words "MUST", "MUST NOT", "REQUIRED", "SHOULD", + "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in [RFC-2119]. + + Indicating a warning means displaying a message, returning + appropriate exit status, or doing something else to let the + user know that something worth warning occurred. The operation + SHOULD still finish if a warning is indicated. + + Indicating an error means displaying a message, returning + appropriate exit status, or doing something else to let the + user know that something prevented successfully finishing the + operation. The operation MUST be aborted once an error has + been indicated. + + +1.1. Byte and Its Representation + + In this document, byte is always 8 bits. + + A "null byte" has all bits unset. That is, the value of a null + byte is 0x00. + + To represent byte blocks, this document uses notation that + is similar to the notation used in [RFC-1952]: + + +-------+ + | Foo | One byte. + +-------+ + + +---+---+ + | Foo | Two bytes; that is, some of the vertical bars + +---+---+ can be missing. + + +=======+ + | Foo | Zero or more bytes. + +=======+ + + In this document, a boxed byte or a byte sequence declared + using this notation is called "a field". The example field + above would be called "the Foo field" or plain "Foo". + + If there are many fields, they may be split to multiple lines. + This is indicated with an arrow ("--->"): + + +=====+ + | Foo | + +=====+ + + +=====+ + ---> | Bar | + +=====+ + + The above is equivalent to this: + + +=====+=====+ + | Foo | Bar | + +=====+=====+ + + +1.2. Multibyte Integers + + Multibyte integers of static length, such as CRC values, + are stored in little endian byte order (least significant + byte first). + + When smaller values are more likely than bigger values (for + example file sizes), multibyte integers are encoded in a + variable-length representation: + - Numbers in the range [0, 127] are copied as is, and take + one byte of space. + - Bigger numbers will occupy two or more bytes. All but the + last byte of the multibyte representation have the highest + (eighth) bit set. + + For now, the value of the variable-length integers is limited + to 63 bits, which limits the encoded size of the integer to + nine bytes. These limits may be increased in the future if + needed. + + The following C code illustrates encoding and decoding of + variable-length integers. The functions return the number of + bytes occupied by the integer (1-9), or zero on error. + + #include <stddef.h> + #include <inttypes.h> + + size_t + encode(uint8_t buf[static 9], uint64_t num) + { + if (num > UINT64_MAX / 2) + return 0; + + size_t i = 0; + + while (num >= 0x80) { + buf[i++] = (uint8_t)(num) | 0x80; + num >>= 7; + } + + buf[i++] = (uint8_t)(num); + + return i; + } + + size_t + decode(const uint8_t buf[], size_t size_max, uint64_t *num) + { + if (size_max == 0) + return 0; + + if (size_max > 9) + size_max = 9; + + *num = buf[0] & 0x7F; + size_t i = 0; + + while (buf[i++] & 0x80) { + if (i >= size_max || buf[i] == 0x00) + return 0; + + *num |= (uint64_t)(buf[i] & 0x7F) << (i * 7); + } + + return i; + } + + +2. Overall Structure of .xz File + + A standalone .xz files consist of one or more Streams which may + have Stream Padding between or after them: + + +========+================+========+================+ + | Stream | Stream Padding | Stream | Stream Padding | ... + +========+================+========+================+ + + The sizes of Stream and Stream Padding are always multiples + of four bytes, thus the size of every valid .xz file MUST be + a multiple of four bytes. + + While a typical file contains only one Stream and no Stream + Padding, a decoder handling standalone .xz files SHOULD support + files that have more than one Stream or Stream Padding. + + In contrast to standalone .xz files, when the .xz file format + is used as an internal part of some other file format or + communication protocol, it usually is expected that the decoder + stops after the first Stream, and doesn't look for Stream + Padding or possibly other Streams. + + +2.1. Stream + + +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+ + | Stream Header | Block | Block | ... | Block | + +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+ + + +=======+-+-+-+-+-+-+-+-+-+-+-+-+ + ---> | Index | Stream Footer | + +=======+-+-+-+-+-+-+-+-+-+-+-+-+ + + All the above fields have a size that is a multiple of four. If + Stream is used as an internal part of another file format, it + is RECOMMENDED to make the Stream start at an offset that is + a multiple of four bytes. + + Stream Header, Index, and Stream Footer are always present in + a Stream. The maximum size of the Index field is 16 GiB (2^34). + + There are zero or more Blocks. The maximum number of Blocks is + limited only by the maximum size of the Index field. + + Total size of a Stream MUST be less than 8 EiB (2^63 bytes). + The same limit applies to the total amount of uncompressed + data stored in a Stream. + + If an implementation supports handling .xz files with multiple + concatenated Streams, it MAY apply the above limits to the file + as a whole instead of limiting per Stream basis. + + +2.1.1. Stream Header + + +---+---+---+---+---+---+-------+------+--+--+--+--+ + | Header Magic Bytes | Stream Flags | CRC32 | + +---+---+---+---+---+---+-------+------+--+--+--+--+ + + +2.1.1.1. Header Magic Bytes + + The first six (6) bytes of the Stream are so called Header + Magic Bytes. They can be used to identify the file type. + + Using a C array and ASCII: + const uint8_t HEADER_MAGIC[6] + = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; + + In plain hexadecimal: + FD 37 7A 58 5A 00 + + Notes: + - The first byte (0xFD) was chosen so that the files cannot + be erroneously detected as being in .lzma format, in which + the first byte is in the range [0x00, 0xE0]. + - The sixth byte (0x00) was chosen to prevent applications + from misdetecting the file as a text file. + + If the Header Magic Bytes don't match, the decoder MUST + indicate an error. + + +2.1.1.2. Stream Flags + + The first byte of Stream Flags is always a null byte. In the + future, this byte may be used to indicate a new Stream version + or other Stream properties. + + The second byte of Stream Flags is a bit field: + + Bit(s) Mask Description + 0-3 0x0F Type of Check (see Section 3.4): + ID Size Check name + 0x00 0 bytes None + 0x01 4 bytes CRC32 + 0x02 4 bytes (Reserved) + 0x03 4 bytes (Reserved) + 0x04 8 bytes CRC64 + 0x05 8 bytes (Reserved) + 0x06 8 bytes (Reserved) + 0x07 16 bytes (Reserved) + 0x08 16 bytes (Reserved) + 0x09 16 bytes (Reserved) + 0x0A 32 bytes SHA-256 + 0x0B 32 bytes (Reserved) + 0x0C 32 bytes (Reserved) + 0x0D 64 bytes (Reserved) + 0x0E 64 bytes (Reserved) + 0x0F 64 bytes (Reserved) + 4-7 0xF0 Reserved for future use; MUST be zero for now. + + Implementations SHOULD support at least the Check IDs 0x00 + (None) and 0x01 (CRC32). Supporting other Check IDs is + OPTIONAL. If an unsupported Check is used, the decoder SHOULD + indicate a warning or error. + + If any reserved bit is set, the decoder MUST indicate an error. + It is possible that there is a new field present which the + decoder is not aware of, and can thus parse the Stream Header + incorrectly. + + +2.1.1.3. CRC32 + + The CRC32 is calculated from the Stream Flags field. It is + stored as an unsigned 32-bit little endian integer. If the + calculated value does not match the stored one, the decoder + MUST indicate an error. + + The idea is that Stream Flags would always be two bytes, even + if new features are needed. This way old decoders will be able + to verify the CRC32 calculated from Stream Flags, and thus + distinguish between corrupt files (CRC32 doesn't match) and + files that the decoder doesn't support (CRC32 matches but + Stream Flags has reserved bits set). + + +2.1.2. Stream Footer + + +-+-+-+-+---+---+---+---+-------+------+----------+---------+ + | CRC32 | Backward Size | Stream Flags | Footer Magic Bytes | + +-+-+-+-+---+---+---+---+-------+------+----------+---------+ + + +2.1.2.1. CRC32 + + The CRC32 is calculated from the Backward Size and Stream Flags + fields. It is stored as an unsigned 32-bit little endian + integer. If the calculated value does not match the stored one, + the decoder MUST indicate an error. + + The reason to have the CRC32 field before the Backward Size and + Stream Flags fields is to keep the four-byte fields aligned to + a multiple of four bytes. + + +2.1.2.2. Backward Size + + Backward Size is stored as a 32-bit little endian integer, + which indicates the size of the Index field as multiple of + four bytes, minimum value being four bytes: + + real_backward_size = (stored_backward_size + 1) * 4; + + If the stored value does not match the real size of the Index + field, the decoder MUST indicate an error. + + Using a fixed-size integer to store Backward Size makes + it slightly simpler to parse the Stream Footer when the + application needs to parse the Stream backwards. + + +2.1.2.3. Stream Flags + + This is a copy of the Stream Flags field from the Stream + Header. The information stored to Stream Flags is needed + when parsing the Stream backwards. The decoder MUST compare + the Stream Flags fields in both Stream Header and Stream + Footer, and indicate an error if they are not identical. + + +2.1.2.4. Footer Magic Bytes + + As the last step of the decoding process, the decoder MUST + verify the existence of Footer Magic Bytes. If they don't + match, an error MUST be indicated. + + Using a C array and ASCII: + const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' }; + + In hexadecimal: + 59 5A + + The primary reason to have Footer Magic Bytes is to make + it easier to detect incomplete files quickly, without + uncompressing. If the file does not end with Footer Magic Bytes + (excluding Stream Padding described in Section 2.2), it cannot + be undamaged, unless someone has intentionally appended garbage + after the end of the Stream. + + +2.2. Stream Padding + + Only the decoders that support decoding of concatenated Streams + MUST support Stream Padding. + + Stream Padding MUST contain only null bytes. To preserve the + four-byte alignment of consecutive Streams, the size of Stream + Padding MUST be a multiple of four bytes. Empty Stream Padding + is allowed. If these requirements are not met, the decoder MUST + indicate an error. + + Note that non-empty Stream Padding is allowed at the end of the + file; there doesn't need to be a new Stream after non-empty + Stream Padding. This can be convenient in certain situations + [GNU-tar]. + + The possibility of Stream Padding MUST be taken into account + when designing an application that parses Streams backwards, + and the application supports concatenated Streams. + + +3. Block + + +==============+=================+===============+=======+ + | Block Header | Compressed Data | Block Padding | Check | + +==============+=================+===============+=======+ + + +3.1. Block Header + + +-------------------+-------------+=================+ + | Block Header Size | Block Flags | Compressed Size | + +-------------------+-------------+=================+ + + +===================+======================+ + ---> | Uncompressed Size | List of Filter Flags | + +===================+======================+ + + +================+--+--+--+--+ + ---> | Header Padding | CRC32 | + +================+--+--+--+--+ + + +3.1.1. Block Header Size + + This field overlaps with the Index Indicator field (see + Section 4.1). + + This field contains the size of the Block Header field, + including the Block Header Size field itself. Valid values are + in the range [0x01, 0xFF], which indicate the size of the Block + Header as multiples of four bytes, minimum size being eight + bytes: + + real_header_size = (encoded_header_size + 1) * 4; + + If a Block Header bigger than 1024 bytes is needed in the + future, a new field can be added between the Block Header and + Compressed Data fields. The presence of this new field would + be indicated in the Block Header field. + + +3.1.2. Block Flags + + The Block Flags field is a bit field: + + Bit(s) Mask Description + 0-1 0x03 Number of filters (1-4) + 2-5 0x3C Reserved for future use; MUST be zero for now. + 6 0x40 The Compressed Size field is present. + 7 0x80 The Uncompressed Size field is present. + + If any reserved bit is set, the decoder MUST indicate an error. + It is possible that there is a new field present which the + decoder is not aware of, and can thus parse the Block Header + incorrectly. + + +3.1.3. Compressed Size + + This field is present only if the appropriate bit is set in + the Block Flags field (see Section 3.1.2). + + The Compressed Size field contains the size of the Compressed + Data field, which MUST be non-zero. Compressed Size is stored + using the encoding described in Section 1.2. If the Compressed + Size doesn't match the size of the Compressed Data field, the + decoder MUST indicate an error. + + +3.1.4. Uncompressed Size + + This field is present only if the appropriate bit is set in + the Block Flags field (see Section 3.1.2). + + The Uncompressed Size field contains the size of the Block + after uncompressing. Uncompressed Size is stored using the + encoding described in Section 1.2. If the Uncompressed Size + does not match the real uncompressed size, the decoder MUST + indicate an error. + + Storing the Compressed Size and Uncompressed Size fields serves + several purposes: + - The decoder knows how much memory it needs to allocate + for a temporary buffer in multithreaded mode. + - Simple error detection: wrong size indicates a broken file. + - Seeking forwards to a specific location in streamed mode. + + It should be noted that the only reliable way to determine + the real uncompressed size is to uncompress the Block, + because the Block Header and Index fields may contain + (intentionally or unintentionally) invalid information. + + +3.1.5. List of Filter Flags + + +================+================+ +================+ + | Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags | + +================+================+ +================+ + + The number of Filter Flags fields is stored in the Block Flags + field (see Section 3.1.2). + + The format of each Filter Flags field is as follows: + + +===========+====================+===================+ + | Filter ID | Size of Properties | Filter Properties | + +===========+====================+===================+ + + Both Filter ID and Size of Properties are stored using the + encoding described in Section 1.2. Size of Properties indicates + the size of the Filter Properties field as bytes. The list of + officially defined Filter IDs and the formats of their Filter + Properties are described in Section 5.3. + + Filter IDs greater than or equal to 0x4000_0000_0000_0000 + (2^62) are reserved for implementation-specific internal use. + These Filter IDs MUST never be used in List of Filter Flags. + + +3.1.6. Header Padding + + This field contains as many null byte as it is needed to make + the Block Header have the size specified in Block Header Size. + If any of the bytes are not null bytes, the decoder MUST + indicate an error. It is possible that there is a new field + present which the decoder is not aware of, and can thus parse + the Block Header incorrectly. + + +3.1.7. CRC32 + + The CRC32 is calculated over everything in the Block Header + field except the CRC32 field itself. It is stored as an + unsigned 32-bit little endian integer. If the calculated + value does not match the stored one, the decoder MUST indicate + an error. + + By verifying the CRC32 of the Block Header before parsing the + actual contents allows the decoder to distinguish between + corrupt and unsupported files. + + +3.2. Compressed Data + + The format of Compressed Data depends on Block Flags and List + of Filter Flags. Excluding the descriptions of the simplest + filters in Section 5.3, the format of the filter-specific + encoded data is out of scope of this document. + + +3.3. Block Padding + + Block Padding MUST contain 0-3 null bytes to make the size of + the Block a multiple of four bytes. This can be needed when + the size of Compressed Data is not a multiple of four. If any + of the bytes in Block Padding are not null bytes, the decoder + MUST indicate an error. + + +3.4. Check + + The type and size of the Check field depends on which bits + are set in the Stream Flags field (see Section 2.1.1.2). + + The Check, when used, is calculated from the original + uncompressed data. If the calculated Check does not match the + stored one, the decoder MUST indicate an error. If the selected + type of Check is not supported by the decoder, it SHOULD + indicate a warning or error. + + +4. Index + + +-----------------+===================+ + | Index Indicator | Number of Records | + +-----------------+===================+ + + +=================+===============+-+-+-+-+ + ---> | List of Records | Index Padding | CRC32 | + +=================+===============+-+-+-+-+ + + Index serves several purposes. Using it, one can + - verify that all Blocks in a Stream have been processed; + - find out the uncompressed size of a Stream; and + - quickly access the beginning of any Block (random access). + + +4.1. Index Indicator + + This field overlaps with the Block Header Size field (see + Section 3.1.1). The value of Index Indicator is always 0x00. + + +4.2. Number of Records + + This field indicates how many Records there are in the List + of Records field, and thus how many Blocks there are in the + Stream. The value is stored using the encoding described in + Section 1.2. If the decoder has decoded all the Blocks of the + Stream, and then notices that the Number of Records doesn't + match the real number of Blocks, the decoder MUST indicate an + error. + + +4.3. List of Records + + List of Records consists of as many Records as indicated by the + Number of Records field: + + +========+========+ + | Record | Record | ... + +========+========+ + + Each Record contains information about one Block: + + +===============+===================+ + | Unpadded Size | Uncompressed Size | + +===============+===================+ + + If the decoder has decoded all the Blocks of the Stream, it + MUST verify that the contents of the Records match the real + Unpadded Size and Uncompressed Size of the respective Blocks. + + Implementation hint: It is possible to verify the Index with + constant memory usage by calculating for example SHA-256 of + both the real size values and the List of Records, then + comparing the hash values. Implementing this using + non-cryptographic hash like CRC32 SHOULD be avoided unless + small code size is important. + + If the decoder supports random-access reading, it MUST verify + that Unpadded Size and Uncompressed Size of every completely + decoded Block match the sizes stored in the Index. If only + partial Block is decoded, the decoder MUST verify that the + processed sizes don't exceed the sizes stored in the Index. + + +4.3.1. Unpadded Size + + This field indicates the size of the Block excluding the Block + Padding field. That is, Unpadded Size is the size of the Block + Header, Compressed Data, and Check fields. Unpadded Size is + stored using the encoding described in Section 1.2. The value + MUST never be zero; with the current structure of Blocks, the + actual minimum value for Unpadded Size is five. + + Implementation note: Because the size of the Block Padding + field is not included in Unpadded Size, calculating the total + size of a Stream or doing random-access reading requires + calculating the actual size of the Blocks by rounding Unpadded + Sizes up to the next multiple of four. + + The reason to exclude Block Padding from Unpadded Size is to + ease making a raw copy of Compressed Data without Block + Padding. This can be useful, for example, if someone wants + to convert Streams to some other file format quickly. + + +4.3.2. Uncompressed Size + + This field indicates the Uncompressed Size of the respective + Block as bytes. The value is stored using the encoding + described in Section 1.2. + + +4.4. Index Padding + + This field MUST contain 0-3 null bytes to pad the Index to + a multiple of four bytes. If any of the bytes are not null + bytes, the decoder MUST indicate an error. + + +4.5. CRC32 + + The CRC32 is calculated over everything in the Index field + except the CRC32 field itself. The CRC32 is stored as an + unsigned 32-bit little endian integer. If the calculated + value does not match the stored one, the decoder MUST indicate + an error. + + +5. Filter Chains + + The Block Flags field defines how many filters are used. When + more than one filter is used, the filters are chained; that is, + the output of one filter is the input of another filter. The + following figure illustrates the direction of data flow. + + v Uncompressed Data ^ + | Filter 0 | + Encoder | Filter 1 | Decoder + | Filter n | + v Compressed Data ^ + + +5.1. Alignment + + Alignment of uncompressed input data is usually the job of + the application producing the data. For example, to get the + best results, an archiver tool should make sure that all + PowerPC executable files in the archive stream start at + offsets that are multiples of four bytes. + + Some filters, for example LZMA2, can be configured to take + advantage of specified alignment of input data. Note that + taking advantage of aligned input can be beneficial also when + a filter is not the first filter in the chain. For example, + if you compress PowerPC executables, you may want to use the + PowerPC filter and chain that with the LZMA2 filter. Because + not only the input but also the output alignment of the PowerPC + filter is four bytes, it is now beneficial to set LZMA2 + settings so that the LZMA2 encoder can take advantage of its + four-byte-aligned input data. + + The output of the last filter in the chain is stored to the + Compressed Data field, which is is guaranteed to be aligned + to a multiple of four bytes relative to the beginning of the + Stream. This can increase + - speed, if the filtered data is handled multiple bytes at + a time by the filter-specific encoder and decoder, + because accessing aligned data in computer memory is + usually faster; and + - compression ratio, if the output data is later compressed + with an external compression tool. + + +5.2. Security + + If filters would be allowed to be chained freely, it would be + possible to create malicious files, that would be very slow to + decode. Such files could be used to create denial of service + attacks. + + Slow files could occur when multiple filters are chained: + + v Compressed input data + | Filter 1 decoder (last filter) + | Filter 0 decoder (non-last filter) + v Uncompressed output data + + The decoder of the last filter in the chain produces a lot of + output from little input. Another filter in the chain takes the + output of the last filter, and produces very little output + while consuming a lot of input. As a result, a lot of data is + moved inside the filter chain, but the filter chain as a whole + gets very little work done. + + To prevent this kind of slow files, there are restrictions on + how the filters can be chained. These restrictions MUST be + taken into account when designing new filters. + + The maximum number of filters in the chain has been limited to + four, thus there can be at maximum of three non-last filters. + Of these three non-last filters, only two are allowed to change + the size of the data. + + The non-last filters, that change the size of the data, MUST + have a limit how much the decoder can compress the data: the + decoder SHOULD produce at least n bytes of output when the + filter is given 2n bytes of input. This limit is not + absolute, but significant deviations MUST be avoided. + + The above limitations guarantee that if the last filter in the + chain produces 4n bytes of output, the chain as a whole will + produce at least n bytes of output. + + +5.3. Filters + +5.3.1. LZMA2 + + LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purpose + compression algorithm with high compression ratio and fast + decompression. LZMA is based on LZ77 and range coding + algorithms. + + LZMA2 is an extension on top of the original LZMA. LZMA2 uses + LZMA internally, but adds support for flushing the encoder, + uncompressed chunks, eases stateful decoder implementations, + and improves support for multithreading. Thus, the plain LZMA + will not be supported in this file format. + + Filter ID: 0x21 + Size of Filter Properties: 1 byte + Changes size of data: Yes + Allow as a non-last filter: No + Allow as the last filter: Yes + + Preferred alignment: + Input data: Adjustable to 1/2/4/8/16 byte(s) + Output data: 1 byte + + The format of the one-byte Filter Properties field is as + follows: + + Bits Mask Description + 0-5 0x3F Dictionary Size + 6-7 0xC0 Reserved for future use; MUST be zero for now. + + Dictionary Size is encoded with one-bit mantissa and five-bit + exponent. The smallest dictionary size is 4 KiB and the biggest + is 4 GiB. + + Raw value Mantissa Exponent Dictionary size + 0 2 11 4 KiB + 1 3 11 6 KiB + 2 2 12 8 KiB + 3 3 12 12 KiB + 4 2 13 16 KiB + 5 3 13 24 KiB + 6 2 14 32 KiB + ... ... ... ... + 35 3 27 768 MiB + 36 2 28 1024 MiB + 37 3 29 1536 MiB + 38 2 30 2048 MiB + 39 3 30 3072 MiB + 40 2 31 4096 MiB - 1 B + + Instead of having a table in the decoder, the dictionary size + can be decoded using the following C code: + + const uint8_t bits = get_dictionary_flags() & 0x3F; + if (bits > 40) + return DICTIONARY_TOO_BIG; // Bigger than 4 GiB + + uint32_t dictionary_size; + if (bits == 40) { + dictionary_size = UINT32_MAX; + } else { + dictionary_size = 2 | (bits & 1); + dictionary_size <<= bits / 2 + 11; + } + + +5.3.2. Branch/Call/Jump Filters for Executables + + These filters convert relative branch, call, and jump + instructions to their absolute counterparts in executable + files. This conversion increases redundancy and thus + compression ratio. + + Size of Filter Properties: 0 or 4 bytes + Changes size of data: No + Allow as a non-last filter: Yes + Allow as the last filter: No + + Below is the list of filters in this category. The alignment + is the same for both input and output data. + + Filter ID Alignment Description + 0x04 1 byte x86 filter (BCJ) + 0x05 4 bytes PowerPC (big endian) filter + 0x06 16 bytes IA64 filter + 0x07 4 bytes ARM filter [1] + 0x08 2 bytes ARM Thumb filter [1] + 0x09 4 bytes SPARC filter + 0x0A 4 bytes ARM64 filter [2] + + [1] These are for little endian instruction encoding. + This must not be confused with data endianness. + A processor configured for big endian data access + may still use little endian instruction encoding. + The filters don't care about the data endianness. + + [2] 4096-byte alignment gives the best results + because the address in the ADRP instruction + is a multiple of 4096 bytes. + + If the size of Filter Properties is four bytes, the Filter + Properties field contains the start offset used for address + conversions. It is stored as an unsigned 32-bit little endian + integer. The start offset MUST be a multiple of the alignment + of the filter as listed in the table above; if it isn't, the + decoder MUST indicate an error. If the size of Filter + Properties is zero, the start offset is zero. + + Setting the start offset may be useful if an executable has + multiple sections, and there are many cross-section calls. + Taking advantage of this feature usually requires usage of + the Subblock filter, whose design is not complete yet. + + +5.3.3. Delta + + The Delta filter may increase compression ratio when the value + of the next byte correlates with the value of an earlier byte + at specified distance. + + Filter ID: 0x03 + Size of Filter Properties: 1 byte + Changes size of data: No + Allow as a non-last filter: Yes + Allow as the last filter: No + + Preferred alignment: + Input data: 1 byte + Output data: Same as the original input data + + The Properties byte indicates the delta distance, which can be + 1-256 bytes backwards from the current byte: 0x00 indicates + distance of 1 byte and 0xFF distance of 256 bytes. + + +5.3.3.1. Format of the Encoded Output + + The code below illustrates both encoding and decoding with + the Delta filter. + + // Distance is in the range [1, 256]. + const unsigned int distance = get_properties_byte() + 1; + uint8_t pos = 0; + uint8_t delta[256]; + + memset(delta, 0, sizeof(delta)); + + while (1) { + const int byte = read_byte(); + if (byte == EOF) + break; + + uint8_t tmp = delta[(uint8_t)(distance + pos)]; + if (is_encoder) { + tmp = (uint8_t)(byte) - tmp; + delta[pos] = (uint8_t)(byte); + } else { + tmp = (uint8_t)(byte) + tmp; + delta[pos] = tmp; + } + + write_byte(tmp); + --pos; + } + + +5.4. Custom Filter IDs + + If a developer wants to use custom Filter IDs, there are two + choices. The first choice is to contact Lasse Collin and ask + him to allocate a range of IDs for the developer. + + The second choice is to generate a 40-bit random integer + which the developer can use as a personal Developer ID. + To minimize the risk of collisions, Developer ID has to be + a randomly generated integer, not manually selected "hex word". + The following command, which works on many free operating + systems, can be used to generate Developer ID: + + dd if=/dev/urandom bs=5 count=1 | hexdump + + The developer can then use the Developer ID to create unique + (well, hopefully unique) Filter IDs. + + Bits Mask Description + 0-15 0x0000_0000_0000_FFFF Filter ID + 16-55 0x00FF_FFFF_FFFF_0000 Developer ID + 56-62 0x3F00_0000_0000_0000 Static prefix: 0x3F + + The resulting 63-bit integer will use 9 bytes of space when + stored using the encoding described in Section 1.2. To get + a shorter ID, see the beginning of this Section how to + request a custom ID range. + + +5.4.1. Reserved Custom Filter ID Ranges + + Range Description + 0x0000_0300 - 0x0000_04FF Reserved to ease .7z compatibility + 0x0002_0000 - 0x0007_FFFF Reserved to ease .7z compatibility + 0x0200_0000 - 0x07FF_FFFF Reserved to ease .7z compatibility + + +6. Cyclic Redundancy Checks + + There are several incompatible variations to calculate CRC32 + and CRC64. For simplicity and clarity, complete examples are + provided to calculate the checks as they are used in this file + format. Implementations MAY use different code as long as it + gives identical results. + + The program below reads data from standard input, calculates + the CRC32 and CRC64 values, and prints the calculated values + as big endian hexadecimal strings to standard output. + + #include <stddef.h> + #include <inttypes.h> + #include <stdio.h> + + uint32_t crc32_table[256]; + uint64_t crc64_table[256]; + + void + init(void) + { + static const uint32_t poly32 = UINT32_C(0xEDB88320); + static const uint64_t poly64 + = UINT64_C(0xC96C5795D7870F42); + + for (size_t i = 0; i < 256; ++i) { + uint32_t crc32 = i; + uint64_t crc64 = i; + + for (size_t j = 0; j < 8; ++j) { + if (crc32 & 1) + crc32 = (crc32 >> 1) ^ poly32; + else + crc32 >>= 1; + + if (crc64 & 1) + crc64 = (crc64 >> 1) ^ poly64; + else + crc64 >>= 1; + } + + crc32_table[i] = crc32; + crc64_table[i] = crc64; + } + } + + uint32_t + crc32(const uint8_t *buf, size_t size, uint32_t crc) + { + crc = ~crc; + for (size_t i = 0; i < size; ++i) + crc = crc32_table[buf[i] ^ (crc & 0xFF)] + ^ (crc >> 8); + return ~crc; + } + + uint64_t + crc64(const uint8_t *buf, size_t size, uint64_t crc) + { + crc = ~crc; + for (size_t i = 0; i < size; ++i) + crc = crc64_table[buf[i] ^ (crc & 0xFF)] + ^ (crc >> 8); + return ~crc; + } + + int + main() + { + init(); + + uint32_t value32 = 0; + uint64_t value64 = 0; + uint64_t total_size = 0; + uint8_t buf[8192]; + + while (1) { + const size_t buf_size + = fread(buf, 1, sizeof(buf), stdin); + if (buf_size == 0) + break; + + total_size += buf_size; + value32 = crc32(buf, buf_size, value32); + value64 = crc64(buf, buf_size, value64); + } + + printf("Bytes: %" PRIu64 "\n", total_size); + printf("CRC-32: 0x%08" PRIX32 "\n", value32); + printf("CRC-64: 0x%016" PRIX64 "\n", value64); + + return 0; + } + + +7. References + + LZMA SDK - The original LZMA implementation + http://7-zip.org/sdk.html + + LZMA Utils - LZMA adapted to POSIX-like systems + http://tukaani.org/lzma/ + + XZ Utils - The next generation of LZMA Utils + http://tukaani.org/xz/ + + [RFC-1952] + GZIP file format specification version 4.3 + http://www.ietf.org/rfc/rfc1952.txt + - Notation of byte boxes in section "2.1. Overall conventions" + + [RFC-2119] + Key words for use in RFCs to Indicate Requirement Levels + http://www.ietf.org/rfc/rfc2119.txt + + [GNU-tar] + GNU tar 1.21 manual + http://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html + - Node 9.4.2 "Blocking Factor", paragraph that begins + "gzip will complain about trailing garbage" + - Note that this URL points to the latest version of the + manual, and may some day not contain the note which is in + 1.21. For the exact version of the manual, download GNU + tar 1.21: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.21.tar.gz + |