summaryrefslogtreecommitdiffstats
path: root/doc
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 21:12:04 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 21:12:04 +0000
commit: eac54b7c4aec25060d7bd856f7cdc290943d6aae (patch)
tree: 9a6d81c9f88df4698e746d63d14ddafeddd918b8 /doc
parent: Initial commit. (diff)
download: xz-utils-eac54b7c4aec25060d7bd856f7cdc290943d6aae.tar.xz
xz-utils-eac54b7c4aec25060d7bd856f7cdc290943d6aae.zip
Adding upstream version 5.4.1. (upstream/5.4.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'doc')
-rw-r--r-- doc/examples/00_README.txt 31
-rw-r--r-- doc/examples/01_compress_easy.c 297
-rw-r--r-- doc/examples/02_decompress.c 287
-rw-r--r-- doc/examples/03_compress_custom.c 193
-rw-r--r-- doc/examples/04_compress_easy_mt.c 206
-rw-r--r-- doc/examples/Makefile 25
-rw-r--r-- doc/examples_old/xz_pipe_comp.c 127
-rw-r--r-- doc/examples_old/xz_pipe_decomp.c 123
-rw-r--r-- doc/faq.txt 244
-rw-r--r-- doc/history.txt 150
-rw-r--r-- doc/lzma-file-format.txt 173
-rw-r--r-- doc/man/pdf-a4/lzmainfo-a4.pdf bin 0 -> 16382 bytes
-rw-r--r-- doc/man/pdf-a4/xz-a4.pdf bin 0 -> 114914 bytes
-rw-r--r-- doc/man/pdf-a4/xzdec-a4.pdf bin 0 -> 19981 bytes
-rw-r--r-- doc/man/pdf-a4/xzdiff-a4.pdf bin 0 -> 17198 bytes
-rw-r--r-- doc/man/pdf-a4/xzgrep-a4.pdf bin 0 -> 18339 bytes
-rw-r--r-- doc/man/pdf-a4/xzless-a4.pdf bin 0 -> 15033 bytes
-rw-r--r-- doc/man/pdf-a4/xzmore-a4.pdf bin 0 -> 15207 bytes
-rw-r--r-- doc/man/pdf-letter/lzmainfo-letter.pdf bin 0 -> 16392 bytes
-rw-r--r-- doc/man/pdf-letter/xz-letter.pdf bin 0 -> 115913 bytes
-rw-r--r-- doc/man/pdf-letter/xzdec-letter.pdf bin 0 -> 19973 bytes
-rw-r--r-- doc/man/pdf-letter/xzdiff-letter.pdf bin 0 -> 17225 bytes
-rw-r--r-- doc/man/pdf-letter/xzgrep-letter.pdf bin 0 -> 18323 bytes
-rw-r--r-- doc/man/pdf-letter/xzless-letter.pdf bin 0 -> 15026 bytes
-rw-r--r-- doc/man/pdf-letter/xzmore-letter.pdf bin 0 -> 15163 bytes
-rw-r--r-- doc/man/txt/lzmainfo.txt 40
-rw-r--r-- doc/man/txt/xz.txt 1589
-rw-r--r-- doc/man/txt/xzdec.txt 80
-rw-r--r-- doc/man/txt/xzdiff.txt 37
-rw-r--r-- doc/man/txt/xzgrep.txt 49
-rw-r--r-- doc/man/txt/xzless.txt 39
-rw-r--r-- doc/man/txt/xzmore.txt 34
-rw-r--r-- doc/xz-file-format.txt 1165
33 files changed, 4889 insertions, 0 deletions
diff --git a/doc/examples/00_README.txt b/doc/examples/00_README.txt
new file mode 100644
index 0000000..120e1eb
--- /dev/null
+++ b/doc/examples/00_README.txt
@@ -0,0 +1,31 @@
+
+liblzma example programs
+========================
+
+Introduction
+
+ The examples are written so that the same comments aren't
+ repeated (much) in later files.
+
+ On POSIX systems, the examples should build by just typing "make".
+
+ The examples that use stdin or stdout don't set stdin and stdout
+ to binary mode. On systems where it matters (e.g. Windows) it is
+ possible that the examples won't work without modification.
+
+
+List of examples
+
+ 01_compress_easy.c Multi-call compression using
+ a compression preset
+
+ 02_decompress.c Multi-call decompression
+
+ 03_compress_custom.c Like 01_compress_easy.c but using
+ a custom filter chain
+ (x86 BCJ + LZMA2)
+
+ 04_compress_easy_mt.c Multi-threaded multi-call
+ compression using a compression
+ preset
+
diff --git a/doc/examples/01_compress_easy.c b/doc/examples/01_compress_easy.c
new file mode 100644
index 0000000..ec32a37
--- /dev/null
+++ b/doc/examples/01_compress_easy.c
@@ -0,0 +1,297 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file 01_compress_easy.c
+/// \brief Compress from stdin to stdout in multi-call mode
+///
+/// Usage: ./01_compress_easy PRESET < INFILE > OUTFILE
+///
+/// Example: ./01_compress_easy 6 < foo > foo.xz
+//
+// Author: Lasse Collin
+//
+// This file has been put into the public domain.
+// You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <lzma.h>
+
+
+static void
+show_usage_and_exit(const char *argv0)
+{
+ fprintf(stderr, "Usage: %s PRESET < INFILE > OUTFILE\n"
+ "PRESET is a number 0-9 and can optionally be "
+ "followed by `e' to indicate extreme preset\n",
+ argv0);
+ exit(EXIT_FAILURE);
+}
+
+
+static uint32_t
+get_preset(int argc, char **argv)
+{
+ // One argument whose first char must be 0-9.
+ if (argc != 2 || argv[1][0] < '0' || argv[1][0] > '9')
+ show_usage_and_exit(argv[0]);
+
+ // Calculate the preset level 0-9.
+ uint32_t preset = argv[1][0] - '0';
+
+ // If there is a second char, it must be 'e'. It will set
+ // the LZMA_PRESET_EXTREME flag.
+ if (argv[1][1] != '\0') {
+ if (argv[1][1] != 'e' || argv[1][2] != '\0')
+ show_usage_and_exit(argv[0]);
+
+ preset |= LZMA_PRESET_EXTREME;
+ }
+
+ return preset;
+}
+
+
+static bool
+init_encoder(lzma_stream *strm, uint32_t preset)
+{
+ // Initialize the encoder using a preset. Set the integrity check
+ // to CRC64, which is the default in the xz command line tool. If
+ // the .xz file needs to be decompressed with XZ Embedded, use
+ // LZMA_CHECK_CRC32 instead.
+ lzma_ret ret = lzma_easy_encoder(strm, preset, LZMA_CHECK_CRC64);
+
+ // Return successfully if the initialization went fine.
+ if (ret == LZMA_OK)
+ return true;
+
+ // Something went wrong. The possible errors are documented in
+ // lzma/container.h (src/liblzma/api/lzma/container.h in the source
+ // package or e.g. /usr/include/lzma/container.h depending on the
+ // install prefix).
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_OPTIONS_ERROR:
+ msg = "Specified preset is not supported";
+ break;
+
+ case LZMA_UNSUPPORTED_CHECK:
+ msg = "Specified integrity check is not supported";
+ break;
+
+ default:
+ // This is most likely LZMA_PROG_ERROR indicating a bug in
+ // this program or in liblzma. It is inconvenient to have a
+ // separate error message for errors that should be impossible
+ // to occur, but knowing the error code is important for
+ // debugging. That's why it is good to print the error code
+ // at least when there is no good error message to show.
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Error initializing the encoder: %s (error code %u)\n",
+ msg, ret);
+ return false;
+}
+
+
+static bool
+compress(lzma_stream *strm, FILE *infile, FILE *outfile)
+{
+ // This will be LZMA_RUN until the end of the input file is reached.
+ // This tells lzma_code() when there will be no more input.
+ lzma_action action = LZMA_RUN;
+
+ // Buffers to temporarily hold uncompressed input
+ // and compressed output.
+ uint8_t inbuf[BUFSIZ];
+ uint8_t outbuf[BUFSIZ];
+
+ // Initialize the input and output pointers. Initializing next_in
+ // and avail_in isn't really necessary when we are going to encode
+ // just one file since LZMA_STREAM_INIT takes care of initializing
+ // those already. But it doesn't hurt much and it will be needed
+ // if processing more than one file like we will in 02_decompress.c.
+ //
+ // While we don't care about strm->total_in or strm->total_out in this
+ // example, it is worth noting that initializing the encoder will
+ // always reset total_in and total_out to zero. But the encoder
+ // initialization doesn't touch next_in, avail_in, next_out, or
+ // avail_out.
+ strm->next_in = NULL;
+ strm->avail_in = 0;
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+
+ // Loop until the file has been successfully compressed or until
+ // an error occurs.
+ while (true) {
+ // Fill the input buffer if it is empty.
+ if (strm->avail_in == 0 && !feof(infile)) {
+ strm->next_in = inbuf;
+ strm->avail_in = fread(inbuf, 1, sizeof(inbuf),
+ infile);
+
+ if (ferror(infile)) {
+ fprintf(stderr, "Read error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ // Once the end of the input file has been reached,
+ // we need to tell lzma_code() that no more input
+ // will be coming and that it should finish the
+ // encoding.
+ if (feof(infile))
+ action = LZMA_FINISH;
+ }
+
+ // Tell liblzma to do the actual encoding.
+ //
+ // This reads up to strm->avail_in bytes of input starting
+ // from strm->next_in. avail_in will be decremented and
+ // next_in incremented by an equal amount to match the
+ // number of input bytes consumed.
+ //
+ // Up to strm->avail_out bytes of compressed output will be
+ // written starting from strm->next_out. avail_out will be
+ // decremented and next_out incremented by an equal amount to
+ // match the number of output bytes written.
+ //
+ // The encoder has to do internal buffering, which means that
+ // it may take quite a bit of input before the same data is
+ // available in compressed form in the output buffer.
+ lzma_ret ret = lzma_code(strm, action);
+
+ // If the output buffer is full or if the compression finished
+ // successfully, write the data from the output buffer to
+ // the output file.
+ if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
+ // When lzma_code() has returned LZMA_STREAM_END,
+ // the output buffer is likely to be only partially
+ // full. Calculate how much new data there is to
+ // be written to the output file.
+ size_t write_size = sizeof(outbuf) - strm->avail_out;
+
+ if (fwrite(outbuf, 1, write_size, outfile)
+ != write_size) {
+ fprintf(stderr, "Write error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ // Reset next_out and avail_out.
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+ }
+
+ // Normally the return value of lzma_code() will be LZMA_OK
+ // until everything has been encoded.
+ if (ret != LZMA_OK) {
+ // Once everything has been encoded successfully, the
+ // return value of lzma_code() will be LZMA_STREAM_END.
+ //
+ // It is important to check for LZMA_STREAM_END. Do not
+ // assume that getting ret != LZMA_OK would mean that
+ // everything has gone well.
+ if (ret == LZMA_STREAM_END)
+ return true;
+
+ // It's not LZMA_OK nor LZMA_STREAM_END,
+ // so it must be an error code. See lzma/base.h
+ // (src/liblzma/api/lzma/base.h in the source package
+ // or e.g. /usr/include/lzma/base.h depending on the
+ // install prefix) for the list and documentation of
+ // possible values. Most values listed in lzma_ret
+ // enumeration aren't possible in this example.
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_DATA_ERROR:
+ // This error is returned if the compressed
+ // or uncompressed size gets near 8 EiB
+ // (2^63 bytes) because that's where the .xz
+ // file format size limits currently are.
+ // That is, the possibility of this error
+ // is mostly theoretical unless you are doing
+ // something very unusual.
+ //
+ // Note that strm->total_in and strm->total_out
+ // have nothing to do with this error. Changing
+ // those variables won't increase or decrease
+ // the chance of getting this error.
+ msg = "File size limits exceeded";
+ break;
+
+ default:
+ // This is most likely LZMA_PROG_ERROR, but
+ // if this program is buggy (or liblzma has
+ // a bug), it may be e.g. LZMA_BUF_ERROR or
+ // LZMA_OPTIONS_ERROR too.
+ //
+ // It is inconvenient to have a separate
+ // error message for errors that should be
+ // impossible to occur, but knowing the error
+ // code is important for debugging. That's why
+ // it is good to print the error code at least
+ // when there is no good error message to show.
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Encoder error: %s (error code %u)\n",
+ msg, ret);
+ return false;
+ }
+ }
+}
+
+
+extern int
+main(int argc, char **argv)
+{
+ // Get the preset number from the command line.
+ uint32_t preset = get_preset(argc, argv);
+
+ // Initialize a lzma_stream structure. When it is allocated on stack,
+ // it is simplest to use LZMA_STREAM_INIT macro like below. When it
+ // is allocated on heap, using memset(strmptr, 0, sizeof(*strmptr))
+ // works (as long as NULL pointers are represented with zero bits
+ // as they are on practically all computers today).
+ lzma_stream strm = LZMA_STREAM_INIT;
+
+ // Initialize the encoder. If it succeeds, compress from
+ // stdin to stdout.
+ bool success = init_encoder(&strm, preset);
+ if (success)
+ success = compress(&strm, stdin, stdout);
+
+ // Free the memory allocated for the encoder. If we were encoding
+ // multiple files, this would only need to be done after the last
+ // file. See 02_decompress.c for handling of multiple files.
+ //
+ // It is OK to call lzma_end() multiple times or when it hasn't been
+ // actually used except initialized with LZMA_STREAM_INIT.
+ lzma_end(&strm);
+
+ // Close stdout to catch possible write errors that can occur
+ // when pending data is flushed from the stdio buffers.
+ if (fclose(stdout)) {
+ fprintf(stderr, "Write error: %s\n", strerror(errno));
+ success = false;
+ }
+
+ return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/doc/examples/02_decompress.c b/doc/examples/02_decompress.c
new file mode 100644
index 0000000..98339be
--- /dev/null
+++ b/doc/examples/02_decompress.c
@@ -0,0 +1,287 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file 02_decompress.c
+/// \brief Decompress .xz files to stdout
+///
+/// Usage: ./02_decompress INPUT_FILES... > OUTFILE
+///
+/// Example: ./02_decompress foo.xz bar.xz > foobar
+//
+// Author: Lasse Collin
+//
+// This file has been put into the public domain.
+// You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <lzma.h>
+
+
+static bool
+init_decoder(lzma_stream *strm)
+{
+ // Initialize a .xz decoder. The decoder supports a memory usage limit
+ // and a set of flags.
+ //
+ // The memory usage of the decompressor depends on the settings used
+ // to compress a .xz file. It can vary from less than a megabyte to
+ // a few gigabytes, but in practice (at least for now) it rarely
+ // exceeds 65 MiB because that's how much memory is required to
+ // decompress files created with "xz -9". Settings requiring more
+ // memory take extra effort to use and don't (at least for now)
+ // provide significantly better compression in most cases.
+ //
+ // Memory usage limit is useful if it is important that the
+ // decompressor won't consume gigabytes of memory. The need
+ // for limiting depends on the application. In this example,
+ // no memory usage limiting is used. This is done by setting
+ // the limit to UINT64_MAX.
+ //
+ // The .xz format allows concatenating compressed files as is:
+ //
+ // echo foo | xz > foobar.xz
+ // echo bar | xz >> foobar.xz
+ //
+ // When decompressing normal standalone .xz files, LZMA_CONCATENATED
+ // should always be used to support decompression of concatenated
+ // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop
+ // after the first .xz stream. This can be useful when .xz data has
+ // been embedded inside another file format.
+ //
+ // Flags other than LZMA_CONCATENATED are supported too, and can
+ // be combined with bitwise-or. See lzma/container.h
+ // (src/liblzma/api/lzma/container.h in the source package or e.g.
+ // /usr/include/lzma/container.h depending on the install prefix)
+ // for details.
+ lzma_ret ret = lzma_stream_decoder(
+ strm, UINT64_MAX, LZMA_CONCATENATED);
+
+ // Return successfully if the initialization went fine.
+ if (ret == LZMA_OK)
+ return true;
+
+ // Something went wrong. The possible errors are documented in
+ // lzma/container.h (src/liblzma/api/lzma/container.h in the source
+ // package or e.g. /usr/include/lzma/container.h depending on the
+ // install prefix).
+ //
+ // Note that LZMA_MEMLIMIT_ERROR is never possible here. If you
+ // specify a very tiny limit, the error will be delayed until
+ // the first headers have been parsed by a call to lzma_code().
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_OPTIONS_ERROR:
+ msg = "Unsupported decompressor flags";
+ break;
+
+ default:
+ // This is most likely LZMA_PROG_ERROR indicating a bug in
+ // this program or in liblzma. It is inconvenient to have a
+ // separate error message for errors that should be impossible
+ // to occur, but knowing the error code is important for
+ // debugging. That's why it is good to print the error code
+ // at least when there is no good error message to show.
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Error initializing the decoder: %s (error code %u)\n",
+ msg, ret);
+ return false;
+}
+
+
+static bool
+decompress(lzma_stream *strm, const char *inname, FILE *infile, FILE *outfile)
+{
+ // When LZMA_CONCATENATED flag was used when initializing the decoder,
+ // we need to tell lzma_code() when there will be no more input.
+ // This is done by setting action to LZMA_FINISH instead of LZMA_RUN
+ // in the same way as it is done when encoding.
+ //
+ // When LZMA_CONCATENATED isn't used, there is no need to use
+ // LZMA_FINISH to tell when all the input has been read, but it
+ // is still OK to use it if you want. When LZMA_CONCATENATED isn't
+ // used, the decoder will stop after the first .xz stream. In that
+ // case some unused data may be left in strm->next_in.
+ lzma_action action = LZMA_RUN;
+
+ uint8_t inbuf[BUFSIZ];
+ uint8_t outbuf[BUFSIZ];
+
+ strm->next_in = NULL;
+ strm->avail_in = 0;
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+
+ while (true) {
+ if (strm->avail_in == 0 && !feof(infile)) {
+ strm->next_in = inbuf;
+ strm->avail_in = fread(inbuf, 1, sizeof(inbuf),
+ infile);
+
+ if (ferror(infile)) {
+ fprintf(stderr, "%s: Read error: %s\n",
+ inname, strerror(errno));
+ return false;
+ }
+
+ // Once the end of the input file has been reached,
+ // we need to tell lzma_code() that no more input
+ // will be coming. As said before, this isn't required
+ // if the LZMA_CONCATENATED flag isn't used when
+ // initializing the decoder.
+ if (feof(infile))
+ action = LZMA_FINISH;
+ }
+
+ lzma_ret ret = lzma_code(strm, action);
+
+ if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
+ size_t write_size = sizeof(outbuf) - strm->avail_out;
+
+ if (fwrite(outbuf, 1, write_size, outfile)
+ != write_size) {
+ fprintf(stderr, "Write error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+ }
+
+ if (ret != LZMA_OK) {
+ // Once everything has been decoded successfully, the
+ // return value of lzma_code() will be LZMA_STREAM_END.
+ //
+ // It is important to check for LZMA_STREAM_END. Do not
+ // assume that getting ret != LZMA_OK would mean that
+ // everything has gone well or that when you aren't
+ // getting more output it must have successfully
+ // decoded everything.
+ if (ret == LZMA_STREAM_END)
+ return true;
+
+ // It's not LZMA_OK nor LZMA_STREAM_END,
+ // so it must be an error code. See lzma/base.h
+ // (src/liblzma/api/lzma/base.h in the source package
+ // or e.g. /usr/include/lzma/base.h depending on the
+ // install prefix) for the list and documentation of
+ // possible values. Many values listed in lzma_ret
+ // enumeration aren't possible in this example, but
+ // can be made possible by enabling memory usage limit
+ // or adding flags to the decoder initialization.
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_FORMAT_ERROR:
+ // .xz magic bytes weren't found.
+ msg = "The input is not in the .xz format";
+ break;
+
+ case LZMA_OPTIONS_ERROR:
+ // For example, the headers specify a filter
+ // that isn't supported by this liblzma
+ // version (or it hasn't been enabled when
+ // building liblzma, but no-one sane does
+ // that unless building liblzma for an
+ // embedded system). Upgrading to a newer
+ // liblzma might help.
+ //
+ // Note that it is unlikely that the file has
+ // accidentally become corrupt if you get this
+ // error. The integrity of the .xz headers is
+ // always verified with a CRC32, so
+ // unintentionally corrupt files can be
+ // distinguished from unsupported files.
+ msg = "Unsupported compression options";
+ break;
+
+ case LZMA_DATA_ERROR:
+ msg = "Compressed file is corrupt";
+ break;
+
+ case LZMA_BUF_ERROR:
+ // Typically this error means that a valid
+ // file has got truncated, but it might also
+ // be a damaged part in the file that makes
+ // the decoder think the file is truncated.
+ // If you prefer, you can use the same error
+ // message for this as for LZMA_DATA_ERROR.
+ msg = "Compressed file is truncated or "
+ "otherwise corrupt";
+ break;
+
+ default:
+ // This is most likely LZMA_PROG_ERROR.
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "%s: Decoder error: "
+ "%s (error code %u)\n",
+ inname, msg, ret);
+ return false;
+ }
+ }
+}
+
+
+extern int
+main(int argc, char **argv)
+{
+ if (argc <= 1) {
+ fprintf(stderr, "Usage: %s FILES...\n", argv[0]);
+ return EXIT_FAILURE;
+ }
+
+ lzma_stream strm = LZMA_STREAM_INIT;
+
+ bool success = true;
+
+ // Try to decompress all files.
+ for (int i = 1; i < argc; ++i) {
+ if (!init_decoder(&strm)) {
+ // Decoder initialization failed. There's no point
+ // in retrying it so we need to exit.
+ success = false;
+ break;
+ }
+
+ FILE *infile = fopen(argv[i], "rb");
+
+ if (infile == NULL) {
+ fprintf(stderr, "%s: Error opening the "
+ "input file: %s\n",
+ argv[i], strerror(errno));
+ success = false;
+ } else {
+ success &= decompress(&strm, argv[i], infile, stdout);
+ fclose(infile);
+ }
+ }
+
+ // Free the memory allocated for the decoder. This only needs to be
+ // done after the last file.
+ lzma_end(&strm);
+
+ if (fclose(stdout)) {
+ fprintf(stderr, "Write error: %s\n", strerror(errno));
+ success = false;
+ }
+
+ return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/doc/examples/03_compress_custom.c b/doc/examples/03_compress_custom.c
new file mode 100644
index 0000000..40c85e3
--- /dev/null
+++ b/doc/examples/03_compress_custom.c
@@ -0,0 +1,193 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file 03_compress_custom.c
+/// \brief Compress in multi-call mode using x86 BCJ and LZMA2
+///
+/// Usage: ./03_compress_custom < INFILE > OUTFILE
+///
+/// Example: ./03_compress_custom < foo > foo.xz
+//
+// Author: Lasse Collin
+//
+// This file has been put into the public domain.
+// You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <lzma.h>
+
+
+static bool
+init_encoder(lzma_stream *strm)
+{
+ // Use the default preset (6) for LZMA2.
+ //
+ // The lzma_options_lzma structure and the lzma_lzma_preset() function
+ // are declared in lzma/lzma12.h (src/liblzma/api/lzma/lzma12.h in the
+ // source package or e.g. /usr/include/lzma/lzma12.h depending on
+ // the install prefix).
+ lzma_options_lzma opt_lzma2;
+ if (lzma_lzma_preset(&opt_lzma2, LZMA_PRESET_DEFAULT)) {
+ // It should never fail because the default preset
+ // (and presets 0-9 optionally with LZMA_PRESET_EXTREME)
+ // are supported by all stable liblzma versions.
+ //
+ // (The encoder initialization later in this function may
+ // still fail due to unsupported preset *if* the features
+ // required by the preset have been disabled at build time,
+ // but no-one does such things except on embedded systems.)
+ fprintf(stderr, "Unsupported preset, possibly a bug\n");
+ return false;
+ }
+
+ // Now we could customize the LZMA2 options if we wanted. For example,
+ // we could set the dictionary size (opt_lzma2.dict_size) to
+ // something other than the default (8 MiB) of the default preset.
+ // See lzma/lzma12.h for details of all LZMA2 options.
+ //
+ // The x86 BCJ filter will try to modify the x86 instruction stream so
+ // that LZMA2 can compress it better. The x86 BCJ filter doesn't need
+ // any options so it will be set to NULL below.
+ //
+ // Construct the filter chain. The uncompressed data goes first to
+ // the first filter in the array, in this case the x86 BCJ filter.
+ // The array is always terminated by setting .id = LZMA_VLI_UNKNOWN.
+ //
+ // See lzma/filter.h for more information about the lzma_filter
+ // structure.
+ lzma_filter filters[] = {
+ { .id = LZMA_FILTER_X86, .options = NULL },
+ { .id = LZMA_FILTER_LZMA2, .options = &opt_lzma2 },
+ { .id = LZMA_VLI_UNKNOWN, .options = NULL },
+ };
+
+ // Initialize the encoder using the custom filter chain.
+ lzma_ret ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64);
+
+ if (ret == LZMA_OK)
+ return true;
+
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_OPTIONS_ERROR:
+ // We are no longer using a plain preset so this error
+ // message has been edited accordingly compared to
+ // 01_compress_easy.c.
+ msg = "Specified filter chain is not supported";
+ break;
+
+ case LZMA_UNSUPPORTED_CHECK:
+ msg = "Specified integrity check is not supported";
+ break;
+
+ default:
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Error initializing the encoder: %s (error code %u)\n",
+ msg, ret);
+ return false;
+}
+
+
+// This function is identical to the one in 01_compress_easy.c.
+static bool
+compress(lzma_stream *strm, FILE *infile, FILE *outfile)
+{
+ lzma_action action = LZMA_RUN;
+
+ uint8_t inbuf[BUFSIZ];
+ uint8_t outbuf[BUFSIZ];
+
+ strm->next_in = NULL;
+ strm->avail_in = 0;
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+
+ while (true) {
+ if (strm->avail_in == 0 && !feof(infile)) {
+ strm->next_in = inbuf;
+ strm->avail_in = fread(inbuf, 1, sizeof(inbuf),
+ infile);
+
+ if (ferror(infile)) {
+ fprintf(stderr, "Read error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ if (feof(infile))
+ action = LZMA_FINISH;
+ }
+
+ lzma_ret ret = lzma_code(strm, action);
+
+ if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
+ size_t write_size = sizeof(outbuf) - strm->avail_out;
+
+ if (fwrite(outbuf, 1, write_size, outfile)
+ != write_size) {
+ fprintf(stderr, "Write error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+ }
+
+ if (ret != LZMA_OK) {
+ if (ret == LZMA_STREAM_END)
+ return true;
+
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_DATA_ERROR:
+ msg = "File size limits exceeded";
+ break;
+
+ default:
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Encoder error: %s (error code %u)\n",
+ msg, ret);
+ return false;
+ }
+ }
+}
+
+
+extern int
+main(void)
+{
+ lzma_stream strm = LZMA_STREAM_INIT;
+
+ bool success = init_encoder(&strm);
+ if (success)
+ success = compress(&strm, stdin, stdout);
+
+ lzma_end(&strm);
+
+ if (fclose(stdout)) {
+ fprintf(stderr, "Write error: %s\n", strerror(errno));
+ success = false;
+ }
+
+ return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/doc/examples/04_compress_easy_mt.c b/doc/examples/04_compress_easy_mt.c
new file mode 100644
index 0000000..efe5697
--- /dev/null
+++ b/doc/examples/04_compress_easy_mt.c
@@ -0,0 +1,206 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file 04_compress_easy_mt.c
+/// \brief Compress in multi-call mode using LZMA2 in multi-threaded mode
+///
+/// Usage: ./04_compress_easy_mt < INFILE > OUTFILE
+///
+/// Example: ./04_compress_easy_mt < foo > foo.xz
+//
+// Author: Lasse Collin
+//
+// This file has been put into the public domain.
+// You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <lzma.h>
+
+
+static bool
+init_encoder(lzma_stream *strm)
+{
+ // The threaded encoder takes the options as a pointer to
+ // an lzma_mt structure.
+ lzma_mt mt = {
+ // No flags are needed.
+ .flags = 0,
+
+ // Let liblzma determine a sane block size.
+ .block_size = 0,
+
+ // Use no timeout for lzma_code() calls by setting timeout
+ // to zero. That is, sometimes lzma_code() might block for
+ // a long time (from several seconds to even minutes).
+ // If this is not OK, for example due to progress indicator
+ // needing updates, specify a timeout in milliseconds here.
+ // See the documentation of lzma_mt in lzma/container.h for
+ // information on how to choose a reasonable timeout.
+ .timeout = 0,
+
+ // Use the default preset (6) for LZMA2.
+ // To use a preset, filters must be set to NULL.
+ .preset = LZMA_PRESET_DEFAULT,
+ .filters = NULL,
+
+ // Use CRC64 for integrity checking. See also
+ // 01_compress_easy.c about choosing the integrity check.
+ .check = LZMA_CHECK_CRC64,
+ };
+
+ // Detect how many threads the CPU supports.
+ mt.threads = lzma_cputhreads();
+
+ // If the number of CPU cores/threads cannot be detected,
+ // use one thread. Note that this isn't the same as the normal
+ // single-threaded mode as this will still split the data into
+ // blocks and use more RAM than the normal single-threaded mode.
+ // You may want to consider using lzma_easy_encoder() or
+ // lzma_stream_encoder() instead of lzma_stream_encoder_mt() if
+ // lzma_cputhreads() returns 0 or 1.
+ if (mt.threads == 0)
+ mt.threads = 1;
+
+ // If the number of CPU cores/threads exceeds threads_max,
+ // limit the number of threads to keep memory usage lower.
+ // The number 8 is arbitrarily chosen and may be too low or
+ // high depending on the compression preset and the computer
+ // being used.
+ //
+ // FIXME: A better way could be to check the amount of RAM
+ // (or available RAM) and use lzma_stream_encoder_mt_memusage()
+ // to determine if the number of threads should be reduced.
+ const uint32_t threads_max = 8;
+ if (mt.threads > threads_max)
+ mt.threads = threads_max;
+
+ // Initialize the threaded encoder.
+ lzma_ret ret = lzma_stream_encoder_mt(strm, &mt);
+
+ if (ret == LZMA_OK)
+ return true;
+
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_OPTIONS_ERROR:
+ // NOTE: this example uses a preset (mt.filters is NULL), so
+ // the "filter chain" wording below appears to be carried over
+ // from 03_compress_custom.c rather than describing this case.
+ msg = "Specified filter chain is not supported";
+ break;
+
+ case LZMA_UNSUPPORTED_CHECK:
+ msg = "Specified integrity check is not supported";
+ break;
+
+ default:
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Error initializing the encoder: %s (error code %u)\n",
+ msg, ret);
+ return false;
+}
+
+
+// This function is identical to the one in 01_compress_easy.c.
+static bool
+compress(lzma_stream *strm, FILE *infile, FILE *outfile)
+{
+ lzma_action action = LZMA_RUN;
+
+ uint8_t inbuf[BUFSIZ];
+ uint8_t outbuf[BUFSIZ];
+
+ strm->next_in = NULL;
+ strm->avail_in = 0;
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+
+ while (true) {
+ if (strm->avail_in == 0 && !feof(infile)) {
+ strm->next_in = inbuf;
+ strm->avail_in = fread(inbuf, 1, sizeof(inbuf),
+ infile);
+
+ if (ferror(infile)) {
+ fprintf(stderr, "Read error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ if (feof(infile))
+ action = LZMA_FINISH;
+ }
+
+ lzma_ret ret = lzma_code(strm, action);
+
+ if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
+ size_t write_size = sizeof(outbuf) - strm->avail_out;
+
+ if (fwrite(outbuf, 1, write_size, outfile)
+ != write_size) {
+ fprintf(stderr, "Write error: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ strm->next_out = outbuf;
+ strm->avail_out = sizeof(outbuf);
+ }
+
+ if (ret != LZMA_OK) {
+ if (ret == LZMA_STREAM_END)
+ return true;
+
+ const char *msg;
+ switch (ret) {
+ case LZMA_MEM_ERROR:
+ msg = "Memory allocation failed";
+ break;
+
+ case LZMA_DATA_ERROR:
+ msg = "File size limits exceeded";
+ break;
+
+ default:
+ msg = "Unknown error, possibly a bug";
+ break;
+ }
+
+ fprintf(stderr, "Encoder error: %s (error code %u)\n",
+ msg, ret);
+ return false;
+ }
+ }
+}
+
+
+extern int
+main(void)
+{
+ lzma_stream strm = LZMA_STREAM_INIT;
+
+ bool success = init_encoder(&strm);
+ if (success)
+ success = compress(&strm, stdin, stdout);
+
+ lzma_end(&strm);
+
+ if (fclose(stdout)) {
+ fprintf(stderr, "Write error: %s\n", strerror(errno));
+ success = false;
+ }
+
+ return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/doc/examples/Makefile b/doc/examples/Makefile
new file mode 100644
index 0000000..e8839d8
--- /dev/null
+++ b/doc/examples/Makefile
@@ -0,0 +1,25 @@
+#
+# Author: Lasse Collin
+#
+# This file has been put into the public domain.
+# You can do whatever you want with this file.
+#
+
+CC = c99
+CFLAGS = -g
+LDFLAGS = -llzma
+
+# List only programs whose source file is present in this directory:
+# "all" depends on every entry, and make stops with an error at the
+# first entry that has no matching .c file.
+PROGS = \
+	01_compress_easy \
+	02_decompress \
+	03_compress_custom \
+	04_compress_easy_mt
+
+all: $(PROGS)
+
+# Single-suffix rule: build each program directly from its one source
+# file. $(LDFLAGS) comes last so that -llzma follows the source file on
+# the command line.
+.c:
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+clean:
+	-rm -f $(PROGS)
diff --git a/doc/examples_old/xz_pipe_comp.c b/doc/examples_old/xz_pipe_comp.c
new file mode 100644
index 0000000..9f9224b
--- /dev/null
+++ b/doc/examples_old/xz_pipe_comp.c
@@ -0,0 +1,127 @@
+/*
+ * xz_pipe_comp.c
+ * A simple example of pipe-only xz compressor implementation.
+ * version: 2010-07-12 - by Daniel Mealha Cabrita
+ * Not copyrighted -- provided to the public domain.
+ *
+ * Compiling:
+ * Link with liblzma. GCC example:
+ * $ gcc xz_pipe_comp.c -llzma -o xz_pipe_comp
+ *
+ * Usage example:
+ * $ cat some_file | ./xz_pipe_comp > some_file.xz
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <lzma.h>
+
+
+/* COMPRESSION SETTINGS (compile-time; xz_compress passes them to lzma_easy_encoder) */
+
+/* preset level, analogous to xz CLI options: -0 to -9 */
+#define COMPRESSION_LEVEL 6
+
+/* boolean setting, analogous to xz CLI option: -e; when true, LZMA_PRESET_EXTREME is OR'ed into the preset */
+#define COMPRESSION_EXTREME true
+
+/* integrity check type given to the encoder; see: /usr/include/lzma/check.h LZMA_CHECK_* */
+#define INTEGRITY_CHECK LZMA_CHECK_CRC64
+
+
+/* read/write buffer sizes, in bytes (sizes of in_buf and out_buf in xz_compress) */
+#define IN_BUF_MAX 4096
+#define OUT_BUF_MAX 4096
+
+/* error codes: returned by xz_compress and used by main as the process exit status */
+#define RET_OK 0
+#define RET_ERROR_INIT 1
+#define RET_ERROR_INPUT 2
+#define RET_ERROR_OUTPUT 3
+#define RET_ERROR_COMPRESSION 4
+
+
+/*
+ * Compress everything that can be read from in_file into a single .xz
+ * stream written to out_file, using COMPRESSION_LEVEL,
+ * COMPRESSION_EXTREME and INTEGRITY_CHECK.
+ *
+ * note: in_file and out_file must be open already; this function
+ * neither flushes nor closes them.
+ *
+ * Returns RET_OK on success, otherwise the RET_ERROR_* code of the
+ * first failure. Processing stops at the first failure, so after an
+ * error out_file may hold an unfinished stream.
+ */
+int xz_compress (FILE *in_file, FILE *out_file)
+{
+	uint32_t preset = COMPRESSION_LEVEL | (COMPRESSION_EXTREME ? LZMA_PRESET_EXTREME : 0);
+	lzma_check check = INTEGRITY_CHECK;
+	lzma_stream strm = LZMA_STREAM_INIT; /* alloc and init lzma_stream struct */
+	uint8_t in_buf [IN_BUF_MAX];
+	uint8_t out_buf [OUT_BUF_MAX];
+	size_t in_len;	/* length of useful data in in_buf */
+	size_t out_len;	/* length of useful data in out_buf */
+	bool in_finished = false;
+	lzma_action action;
+	lzma_ret ret_xz;
+	int ret = RET_OK;
+
+	/* initialize xz encoder */
+	ret_xz = lzma_easy_encoder (&strm, preset, check);
+	if (ret_xz != LZMA_OK) {
+		fprintf (stderr, "lzma_easy_encoder error: %d\n", (int) ret_xz);
+		return RET_ERROR_INIT;
+	}
+
+	while ((ret == RET_OK) && (! in_finished)) {
+		/* read incoming data */
+		in_len = fread (in_buf, 1, IN_BUF_MAX, in_file);
+
+		/* a read error aborts at once: finishing the stream here
+		   would produce a valid-looking file from partial input */
+		if (ferror (in_file)) {
+			ret = RET_ERROR_INPUT;
+			break;
+		}
+		in_finished = (feof (in_file) != 0);
+
+		strm.next_in = in_buf;
+		strm.avail_in = in_len;
+
+		/* if no more data from in_buf, flushes the
+		   internal xz buffers and closes the xz data
+		   with LZMA_FINISH */
+		action = in_finished ? LZMA_FINISH : LZMA_RUN;
+
+		/* loop until there's no pending compressed output */
+		do {
+			/* out_buf is clean at this point */
+			strm.next_out = out_buf;
+			strm.avail_out = OUT_BUF_MAX;
+
+			/* compress data */
+			ret_xz = lzma_code (&strm, action);
+			if ((ret_xz != LZMA_OK) && (ret_xz != LZMA_STREAM_END)) {
+				fprintf (stderr, "lzma_code error: %d\n", (int) ret_xz);
+				ret = RET_ERROR_COMPRESSION;
+				break;
+			}
+
+			/* write compressed data; a short write is an error */
+			out_len = OUT_BUF_MAX - strm.avail_out;
+			if ((fwrite (out_buf, 1, out_len, out_file) != out_len)
+					|| ferror (out_file)) {
+				ret = RET_ERROR_OUTPUT;
+				break;
+			}
+
+			/* when finishing, keep calling lzma_code until it
+			   returns LZMA_STREAM_END; liblzma documents that a
+			   second consecutive call without progress returns
+			   LZMA_BUF_ERROR, which is handled as an error above */
+		} while ((strm.avail_out == 0)
+				|| (in_finished && (ret_xz == LZMA_OK)));
+	}
+
+	lzma_end (&strm);
+	return ret;
+}
+
+/*
+ * Compress stdin to stdout. The exit status is the RET_* code returned
+ * by xz_compress, or RET_ERROR_OUTPUT if compression succeeded but
+ * closing stdout failed.
+ */
+int main (void)
+{
+	int ret = xz_compress (stdin, stdout);
+
+	/* stdout is buffered by stdio, so the last bytes are written only
+	   when it is flushed; fclose reports a failure of that write */
+	if ((fclose (stdout) != 0) && (ret == RET_OK)) {
+		ret = RET_ERROR_OUTPUT;
+	}
+
+	return ret;
+}
+
diff --git a/doc/examples_old/xz_pipe_decomp.c b/doc/examples_old/xz_pipe_decomp.c
new file mode 100644
index 0000000..fb5ad89
--- /dev/null
+++ b/doc/examples_old/xz_pipe_decomp.c
@@ -0,0 +1,123 @@
+/*
+ * xz_pipe_decomp.c
+ * A simple example of pipe-only xz decompressor implementation.
+ * version: 2012-06-14 - by Daniel Mealha Cabrita
+ * Not copyrighted -- provided to the public domain.
+ *
+ * Compiling:
+ * Link with liblzma. GCC example:
+ * $ gcc xz_pipe_decomp.c -llzma -o xz_pipe_decomp
+ *
+ * Usage example:
+ * $ cat some_file.xz | ./xz_pipe_decomp > some_file
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <lzma.h>
+
+
+/* read/write buffer sizes, in bytes (sizes of in_buf and out_buf in xz_decompress) */
+#define IN_BUF_MAX 4096
+#define OUT_BUF_MAX 4096
+
+/* error codes: returned by xz_decompress and used by main as the process exit status */
+#define RET_OK 0
+#define RET_ERROR_INIT 1
+#define RET_ERROR_INPUT 2
+#define RET_ERROR_OUTPUT 3
+#define RET_ERROR_DECOMPRESSION 4
+
+
+/*
+ * Decompress the .xz data read from in_file and write the result to
+ * out_file. The decoder is created with LZMA_CONCATENATED and without
+ * a memory limit.
+ *
+ * note: in_file and out_file must be open already; this function
+ * neither flushes nor closes them.
+ *
+ * Returns RET_OK on success, otherwise the RET_ERROR_* code of the
+ * first failure. Processing stops at the first failure.
+ */
+int xz_decompress (FILE *in_file, FILE *out_file)
+{
+	lzma_stream strm = LZMA_STREAM_INIT; /* alloc and init lzma_stream struct */
+	const uint32_t flags = LZMA_TELL_UNSUPPORTED_CHECK | LZMA_CONCATENATED;
+	const uint64_t memory_limit = UINT64_MAX; /* no memory limit */
+	uint8_t in_buf [IN_BUF_MAX];
+	uint8_t out_buf [OUT_BUF_MAX];
+	size_t in_len;	/* length of useful data in in_buf */
+	size_t out_len;	/* length of useful data in out_buf */
+	bool in_finished = false;
+	lzma_action action;
+	lzma_ret ret_xz;
+	int ret = RET_OK;
+
+	/* initialize xz decoder */
+	ret_xz = lzma_stream_decoder (&strm, memory_limit, flags);
+	if (ret_xz != LZMA_OK) {
+		fprintf (stderr, "lzma_stream_decoder error: %d\n", (int) ret_xz);
+		return RET_ERROR_INIT;
+	}
+
+	while ((ret == RET_OK) && (! in_finished)) {
+		/* read incoming data */
+		in_len = fread (in_buf, 1, IN_BUF_MAX, in_file);
+
+		/* a read error aborts at once: decoding the partial input
+		   with LZMA_FINISH would only add a misleading decoder
+		   error and hide RET_ERROR_INPUT */
+		if (ferror (in_file)) {
+			ret = RET_ERROR_INPUT;
+			break;
+		}
+		in_finished = (feof (in_file) != 0);
+
+		strm.next_in = in_buf;
+		strm.avail_in = in_len;
+
+		/* if no more data from in_buf, flushes the
+		   internal xz buffers and closes the decompressed data
+		   with LZMA_FINISH */
+		action = in_finished ? LZMA_FINISH : LZMA_RUN;
+
+		/* loop until there's no pending decompressed output */
+		do {
+			/* out_buf is clean at this point */
+			strm.next_out = out_buf;
+			strm.avail_out = OUT_BUF_MAX;
+
+			/* decompress data */
+			ret_xz = lzma_code (&strm, action);
+			if ((ret_xz != LZMA_OK) && (ret_xz != LZMA_STREAM_END)) {
+				fprintf (stderr, "lzma_code error: %d\n", (int) ret_xz);
+				ret = RET_ERROR_DECOMPRESSION;
+				break;
+			}
+
+			/* write decompressed data; a short write is an error */
+			out_len = OUT_BUF_MAX - strm.avail_out;
+			if ((fwrite (out_buf, 1, out_len, out_file) != out_len)
+					|| ferror (out_file)) {
+				ret = RET_ERROR_OUTPUT;
+				break;
+			}
+		} while (strm.avail_out == 0);
+	}
+
+	/* Bug fix (2012-06-14): If no errors were detected, check
+	   that the last lzma_code() call returned LZMA_STREAM_END.
+	   If not, the file is probably truncated. */
+	if ((ret == RET_OK) && (ret_xz != LZMA_STREAM_END)) {
+		fprintf (stderr, "Input truncated or corrupt\n");
+		ret = RET_ERROR_DECOMPRESSION;
+	}
+
+	lzma_end (&strm);
+	return ret;
+}
+
+/*
+ * Decompress stdin to stdout. The exit status is the RET_* code
+ * returned by xz_decompress, or RET_ERROR_OUTPUT if decompression
+ * succeeded but closing stdout failed.
+ */
+int main (void)
+{
+	int ret = xz_decompress (stdin, stdout);
+
+	/* stdout is buffered by stdio, so the last bytes are written only
+	   when it is flushed; fclose reports a failure of that write */
+	if ((fclose (stdout) != 0) && (ret == RET_OK)) {
+		ret = RET_ERROR_OUTPUT;
+	}
+
+	return ret;
+}
+
diff --git a/doc/faq.txt b/doc/faq.txt
new file mode 100644
index 0000000..3f9068b
--- /dev/null
+++ b/doc/faq.txt
@@ -0,0 +1,244 @@
+
+XZ Utils FAQ
+============
+
+Q: What do the letters XZ mean?
+
+A: Nothing. They are just two letters, which come from the file format
+ suffix .xz. The .xz suffix was selected, because it seemed to be
+ pretty much unused. It has no deeper meaning.
+
+
+Q: What are LZMA and LZMA2?
+
+A: LZMA stands for Lempel-Ziv-Markov chain-Algorithm. It is the name
+ of the compression algorithm designed by Igor Pavlov for 7-Zip.
+ LZMA is based on LZ77 and range encoding.
+
+ LZMA2 is an updated version of the original LZMA to fix a couple of
+ practical issues. In the context of XZ Utils, LZMA is called LZMA1 to
+ emphasize that LZMA is not the same thing as LZMA2. LZMA2 is the
+ primary compression algorithm in the .xz file format.
+
+
+Q: There are many LZMA related projects. How does XZ Utils relate to them?
+
+A: 7-Zip and LZMA SDK are the original projects. LZMA SDK is roughly
+ a subset of the 7-Zip source tree.
+
+ p7zip is 7-Zip's command-line tools ported to POSIX-like systems.
+
+ LZMA Utils provide a gzip-like lzma tool for POSIX-like systems.
+ LZMA Utils are based on LZMA SDK. XZ Utils are the successor to
+ LZMA Utils.
+
+ There are several other projects using LZMA. Most are more or less
+ based on LZMA SDK. See <https://7-zip.org/links.html>.
+
+
+Q: Why is liblzma named liblzma if its primary file format is .xz?
+ Shouldn't it be e.g. libxz?
+
+A: When the designing of the .xz format began, the idea was to replace
+ the .lzma format and use the same .lzma suffix. It would have been
+ quite OK to reuse the suffix when there were very few .lzma files
+ around. However, the old .lzma format became popular before the
+ new format was finished. The new format was renamed to .xz but the
+ name of liblzma wasn't changed.
+
+
+Q: Do XZ Utils support the .7z format?
+
+A: No. Use 7-Zip (Windows) or p7zip (POSIX-like systems) to handle .7z
+ files.
+
+
+Q: I have many .tar.7z files. Can I convert them to .tar.xz without
+ spending hours recompressing the data?
+
+A: In the "extra" directory, there is a script named 7z2lzma.bash which
+ is able to convert some .7z files to the .lzma format (not .xz). It
+ needs the 7za (or 7z) command from p7zip. The script may silently
+ produce corrupt output if certain assumptions are not met, so
+ decompress the resulting .lzma file and compare it against the
+ original before deleting the original file!
+
+
+Q: I have many .lzma files. Can I quickly convert them to the .xz format?
+
+A: For now, no. Since XZ Utils supports the .lzma format, it's usually
+ not too bad to keep the old files in the old format. If you want to
+ do the conversion anyway, you need to decompress the .lzma files and
+ then recompress to the .xz format.
+
+ Technically, there is a way to make the conversion relatively fast
+ (roughly twice the time that normal decompression takes). Writing
+ such a tool would take quite a bit of time though, and would probably
+ be useful to only a few people. If you really want such a conversion
+ tool, contact Lasse Collin and offer some money.
+
+
+Q: I have installed xz, but my tar doesn't recognize .tar.xz files.
+ How can I extract .tar.xz files?
+
+A: xz -dc foo.tar.xz | tar xf -
+
+
+Q: Can I recover parts of a broken .xz file (e.g. a corrupted CD-R)?
+
+A: It may be possible if the file consists of multiple blocks, which
+ typically is not the case if the file was created in single-threaded
+ mode. There is no recovery program yet.
+
+
+Q: Is (some part of) XZ Utils patented?
+
+A: Lasse Collin is not aware of any patents that could affect XZ Utils.
+ However, due to the nature of software patents, it's not possible to
+ guarantee that XZ Utils isn't affected by any third party patent(s).
+
+
+Q: Where can I find documentation about the file format and algorithms?
+
+A: The .xz format is documented in xz-file-format.txt. It is a container
+ format only, and doesn't include descriptions of any non-trivial
+ filters.
+
+ Documenting LZMA and LZMA2 is planned, but for now, there is no other
+ documentation than the source code. Before you begin, you should know
+ the basics of LZ77 and range-coding algorithms. LZMA is based on LZ77,
+ but LZMA is a lot more complex. Range coding is used to compress
+ the final bitstream like Huffman coding is used in Deflate.
+
+
+Q: I cannot find BCJ and BCJ2 filters. Don't they exist in liblzma?
+
+A: BCJ filter is called "x86" in liblzma. BCJ2 is not included,
+ because it requires using more than one encoded output stream.
+
+
+Q: I need to use a script that runs "xz -9". On a system with 256 MiB
+ of RAM, xz says that it cannot allocate memory. Can I make the
+ script work without modifying it?
+
+A: Set a default memory usage limit for compression. You can do it e.g.
+ in a shell initialization script such as ~/.bashrc or /etc/profile:
+
+ XZ_DEFAULTS=--memlimit-compress=150MiB
+ export XZ_DEFAULTS
+
+ xz will then scale the compression settings down so that the given
+ memory usage limit is not reached. This way xz shouldn't run out
+ of memory.
+
+ Check also that memory-related resource limits are high enough.
+ On most systems, "ulimit -a" will show the current resource limits.
+
+
+Q: How do I create files that can be decompressed with XZ Embedded?
+
+A: See the documentation in XZ Embedded. In short, something like
+ this is a good start:
+
+ xz --check=crc32 --lzma2=preset=6e,dict=64KiB
+
+ Or if a BCJ filter is needed too, e.g. if compressing
+ a kernel image for PowerPC:
+
+ xz --check=crc32 --powerpc --lzma2=preset=6e,dict=64KiB
+
+ Adjust the dictionary size to get a good compromise between
+ compression ratio and decompressor memory usage. Note that
+ in single-call decompression mode of XZ Embedded, a big
+ dictionary doesn't increase memory usage.
+
+
+Q: How is multi-threaded compression implemented in XZ Utils?
+
+A: The simplest method is splitting the uncompressed data into blocks
+ and compressing them in parallel independent from each other.
+ This is currently the only threading method supported in XZ Utils.
+ Since the blocks are compressed independently, they can also be
+ decompressed independently. Together with the index feature in .xz,
+ this allows using threads to create .xz files for random-access
+ reading. This also makes threaded decompression possible.
+
+ The independent blocks method has a couple of disadvantages too. It
+ will compress worse than a single-block method. Often the difference
+ is not too big (maybe 1-2 %) but sometimes it can be too big. Also,
+ the memory usage of the compressor increases linearly when adding
+ threads.
+
+ At least two other threading methods are possible but these haven't
+ been implemented in XZ Utils:
+
+ Match finder parallelization has been in 7-Zip for ages. It doesn't
+ affect compression ratio or memory usage significantly. Among the
+ three threading methods, only this is useful when compressing small
+ files (files that are not significantly bigger than the dictionary).
+ Unfortunately this method scales only to about two CPU cores.
+
+ The third method is pigz-style threading (I use that name, because
+ pigz <https://www.zlib.net/pigz/> uses that method). It doesn't
+ affect compression ratio significantly and scales to many cores.
+ The memory usage scales linearly when threads are added. This isn't
+ significant with pigz, because Deflate uses only a 32 KiB dictionary,
+ but with LZMA2 the memory usage will increase dramatically just like
+ with the independent-blocks method. There is also a constant
+ computational overhead, which may make pigz-method a bit dull on
+ dual-core compared to the parallel match finder method, but with more
+ cores the overhead is not a big deal anymore.
+
+ Combining the threading methods will be possible and also useful.
+ For example, combining match finder parallelization with pigz-style
+ threading or independent-blocks-threading can cut the memory usage
+ by 50 %.
+
+
+Q: I told xz to use many threads but it is using only one or two
+ processor cores. What is wrong?
+
+A: Since multi-threaded compression is done by splitting the data into
+ blocks that are compressed individually, if the input file is too
+ small for the block size, then many threads cannot be used. The
+ default block size increases when the compression level is
+ increased. For example, xz -6 uses 8 MiB LZMA2 dictionary and
+ 24 MiB blocks, and xz -9 uses 64 MiB LZMA2 dictionary and 192 MiB
+ blocks. If the input file is 100 MiB, xz -6 can use five threads
+ of which one will finish quickly as it has only 4 MiB to compress.
+ However, for the same file, xz -9 can only use one thread.
+
+ One can adjust block size with --block-size=SIZE but making the
+ block size smaller than LZMA2 dictionary is a waste of RAM: using
+ xz -9 with 6 MiB blocks isn't any better than using xz -6 with
+ 6 MiB blocks. The default settings use a block size bigger than
+ the LZMA2 dictionary size because this was seen as a reasonable
+ compromise between RAM usage and compression ratio.
+
+ When decompressing, the ability to use threads depends on how the
+ file was created. If it was created in multi-threaded mode then
+ it can be decompressed in multi-threaded mode too if there are
+ multiple blocks in the file.
+
+
+Q: How do I build a program that needs liblzmadec (lzmadec.h)?
+
+A: liblzmadec is part of LZMA Utils. XZ Utils has liblzma, but no
+ liblzmadec. The code using liblzmadec should be ported to use
+ liblzma instead. If you cannot or don't want to do that, download
+ LZMA Utils from <https://tukaani.org/lzma/>.
+
+
+Q: The default build of liblzma is too big. How can I make it smaller?
+
+A: Give --enable-small to the configure script. Use also appropriate
+ --enable or --disable options to include only those filter encoders
+ and decoders and integrity checks that you actually need. Use
+ CFLAGS=-Os (with GCC) or equivalent to tell your compiler to optimize
+ for size. See INSTALL for information about configure options.
+
+ If the result is still too big, take a look at XZ Embedded. It is
+ a separate project, which provides a limited but significantly
+ smaller XZ decoder implementation than XZ Utils. You can find it
+ at <https://tukaani.org/xz/embedded.html>.
+
diff --git a/doc/history.txt b/doc/history.txt
new file mode 100644
index 0000000..8545e23
--- /dev/null
+++ b/doc/history.txt
@@ -0,0 +1,150 @@
+
+History of LZMA Utils and XZ Utils
+==================================
+
+Tukaani distribution
+
+ In 2005, there was a small group working on the Tukaani distribution,
+ which was a Slackware fork. One of the project's goals was to fit the
+ distro on a single 700 MiB ISO-9660 image. Using LZMA instead of gzip
+ helped a lot. Roughly speaking, one could fit data that took 1000 MiB
+ in gzipped form into 700 MiB with LZMA. Naturally, the compression
+ ratio varied across packages, but this was what we got on average.
+
+ Slackware packages have traditionally had .tgz as the filename suffix,
+ which is an abbreviation of .tar.gz. A logical naming for LZMA
+ compressed packages was .tlz, being an abbreviation of .tar.lzma.
+
+ At the end of the year 2007, there was no distribution under the
+ Tukaani project anymore, but development of LZMA Utils was kept going.
+ Still, there were .tlz packages around, because at least Vector Linux
+ (a Slackware based distribution) used LZMA for its packages.
+
+ First versions of the modified pkgtools used the LZMA_Alone tool from
+ Igor Pavlov's LZMA SDK as is. It was fine, because users wouldn't need
+ to interact with LZMA_Alone directly. But people soon wanted to use
+ LZMA for other files too, and the interface of LZMA_Alone wasn't
+ comfortable for those used to gzip and bzip2.
+
+
+First steps of LZMA Utils
+
+ The first version of LZMA Utils (4.22.0) included a shell script called
+ lzmash. It was a wrapper that had a gzip-like command-line interface. It
+ used the LZMA_Alone tool from LZMA SDK to do all the real work. zgrep,
+ zdiff, and related scripts from gzip were adapted to work with LZMA and
+ were part of the first LZMA Utils release too.
+
+ LZMA Utils 4.22.0 included also lzmadec, which was a small (less than
+ 10 KiB) decoder-only command-line tool. It was written on top of the
+ decoder-only C code found from the LZMA SDK. lzmadec was convenient in
+ situations where LZMA_Alone (a few hundred KiB) would be too big.
+
+ lzmash and lzmadec were written by Lasse Collin.
+
+
+Second generation
+
+ The lzmash script was an ugly and not very secure hack. The last
+ version of LZMA Utils to use lzmash was 4.27.1.
+
+ LZMA Utils 4.32.0beta1 introduced a new lzma command-line tool written
+ by Ville Koskinen. It was written in C++, and used the encoder and
+ decoder from C++ LZMA SDK with some little modifications. This tool
+ replaced both the lzmash script and the LZMA_Alone command-line tool
+ in LZMA Utils.
+
+ Introducing this new tool caused some temporary incompatibilities,
+ because the LZMA_Alone executable was simply named lzma like the new
+ command-line tool, but they had a completely different command-line
+ interface. The file format was still the same.
+
+ Lasse wrote liblzmadec, which was a small decoder-only library based
+ on the C code found from LZMA SDK. liblzmadec had an API similar to
+ zlib, although there were some significant differences, which made it
+ non-trivial to use it in some applications designed for zlib and
+ libbzip2.
+
+ The lzmadec command-line tool was converted to use liblzmadec.
+
+ Alexandre Sauvé helped converting the build system to use GNU
+ Autotools. This made it easier to test for certain less portable
+ features needed by the new command-line tool.
+
+ Since the new command-line tool never got completely finished (for
+ example, it didn't support the LZMA_OPT environment variable), the
+ intent was to not call 4.32.x stable. Similarly, liblzmadec wasn't
+ polished, but appeared to work well enough, so some people started
+ using it too.
+
+ Because the development of the third generation of LZMA Utils was
+ delayed considerably (3-4 years), the 4.32.x branch had to be kept
+ maintained. It got some bug fixes now and then, and finally it was
+ decided to call it stable, although most of the missing features were
+ never added.
+
+
+File format problems
+
+ The file format used by LZMA_Alone was primitive. It was designed with
+ embedded systems in mind, and thus provided only a minimal set of
+ features. The two biggest problems for non-embedded use were the lack
+ of magic bytes and an integrity check.
+
+ Igor and Lasse started developing a new file format with some help
+ from Ville Koskinen. Also Mark Adler, Mikko Pouru, H. Peter Anvin,
+ and Lars Wirzenius helped with some minor things at some point of the
+ development. Designing the new format took quite a long time (actually,
+ too long a time would be a more appropriate expression). It was mostly
+ because Lasse was quite slow at getting things done due to personal
+ reasons.
+
+ Originally the new format was supposed to use the same .lzma suffix
+ that was already used by the old file format. Switching to the new
+ format wouldn't have caused much trouble when the old format wasn't
+ used by many people. But since the development of the new format took
+ such a long time, the old format got quite popular, and it was decided
+ that the new file format must use a different suffix.
+
+ It was decided to use .xz as the suffix of the new file format. The
+ first stable .xz file format specification was finally released in
+ December 2008. In addition to fixing the most obvious problems of
+ the old .lzma format, the .xz format added some new features like
+ support for multiple filters (compression algorithms), filter chaining
+ (like piping on the command line), and limited random-access reading.
+
+ Currently the primary compression algorithm used in .xz is LZMA2.
+ It is an extension on top of the original LZMA to fix some practical
+ problems: LZMA2 adds support for flushing the encoder, uncompressed
+ chunks, eases stateful decoder implementations, and improves support
+ for multithreading. Since LZMA2 is better than the original LZMA, the
+ original LZMA is not supported in .xz.
+
+
+Transition to XZ Utils
+
+ The early versions of XZ Utils were called LZMA Utils. The first
+ releases were 4.42.0alphas. They dropped the rest of the C++ LZMA SDK.
+ The code was still directly based on LZMA SDK but ported to C and
+ converted from a callback API to a stateful API. Later, Igor Pavlov
+ made a C version of the LZMA encoder too; these ports from C++ to C
+ were independent in LZMA SDK and LZMA Utils.
+
+ The core of the new LZMA Utils was liblzma, a compression library with
+ a zlib-like API. liblzma supported both the old and new file format.
+ The gzip-like lzma command-line tool was rewritten to use liblzma.
+
+ The new LZMA Utils code base was renamed to XZ Utils when the name
+ of the new file format had been decided. The liblzma compression
+ library retained its name though, because changing it would have
+ caused unnecessary breakage in applications already using the early
+ liblzma snapshots.
+
+ The xz command-line tool can emulate the gzip-like lzma tool by
+ creating appropriate symlinks (e.g. lzma -> xz). Thus, practically
+ all scripts using the lzma tool from LZMA Utils will work as is with
+ XZ Utils (and will keep using the old .lzma format). Still, the .lzma
+ format is more or less deprecated. XZ Utils will keep supporting it,
+ but new applications should use the .xz format, and migrating old
+ applications to .xz is often a good idea too.
+
diff --git a/doc/lzma-file-format.txt b/doc/lzma-file-format.txt
new file mode 100644
index 0000000..4865def
--- /dev/null
+++ b/doc/lzma-file-format.txt
@@ -0,0 +1,173 @@
+
+The .lzma File Format
+=====================
+
+ 0. Preface
+ 0.1. Notices and Acknowledgements
+ 0.2. Changes
+ 1. File Format
+ 1.1. Header
+ 1.1.1. Properties
+ 1.1.2. Dictionary Size
+ 1.1.3. Uncompressed Size
+ 1.2. LZMA Compressed Data
+ 2. References
+
+
+0. Preface
+
+ This document describes the .lzma file format, which is
+ sometimes also called LZMA_Alone format. It is a legacy file
+ format, which is being or has been replaced by the .xz format.
+ The MIME type of the .lzma format is `application/x-lzma'.
+
+ The most commonly used software to handle .lzma files are
+ LZMA SDK, LZMA Utils, 7-Zip, and XZ Utils. This document
+ describes some of the differences between these implementations
+ and gives hints what subset of the .lzma format is the most
+ portable.
+
+
+0.1. Notices and Acknowledgements
+
+ This file format was designed by Igor Pavlov for use in
+ LZMA SDK. This document was written by Lasse Collin
+ <lasse.collin@tukaani.org> using the documentation found
+ from the LZMA SDK.
+
+ This document has been put into the public domain.
+
+
+0.2. Changes
+
+ Last modified: 2022-07-13 21:00+0300
+
+ Compared to the previous version (2011-04-12 11:55+0300)
+ the section 1.1.3 was modified to allow End of Payload Marker
+ with a known Uncompressed Size.
+
+
+1. File Format
+
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+==========================+
+ | Header | LZMA Compressed Data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+==========================+
+
+ The .lzma format file consists of a 13-byte Header followed by
+ the LZMA Compressed Data.
+
+ Unlike the .gz, .bz2, and .xz formats, it is not possible to
+ concatenate multiple .lzma files as is and expect the
+ decompression tool to decode the resulting file as if it were
+ a single .lzma file.
+
+ For example, the command line tools from LZMA Utils and
+ LZMA SDK silently ignore all the data after the first .lzma
+ stream. In contrast, the command line tool from XZ Utils
+ considers the .lzma file to be corrupt if there is data after
+ the first .lzma stream.
+
+
+1.1. Header
+
+ +------------+----+----+----+----+--+--+--+--+--+--+--+--+
+ | Properties | Dictionary Size | Uncompressed Size |
+ +------------+----+----+----+----+--+--+--+--+--+--+--+--+
+
+
+1.1.1. Properties
+
+ The Properties field contains three properties. An abbreviation
+ is given in parentheses, followed by the value range of the
+ property. The field consists of
+
+ 1) the number of literal context bits (lc, [0, 8]);
+ 2) the number of literal position bits (lp, [0, 4]); and
+ 3) the number of position bits (pb, [0, 4]).
+
+ The properties are encoded using the following formula:
+
+ Properties = (pb * 5 + lp) * 9 + lc
+
+ The following C code illustrates a straightforward way to
+ decode the Properties field:
+
+ uint8_t lc, lp, pb;
+ uint8_t prop = get_lzma_properties();
+ if (prop > (4 * 5 + 4) * 9 + 8)
+ return LZMA_PROPERTIES_ERROR;
+
+ pb = prop / (9 * 5);
+ prop -= pb * 9 * 5;
+ lp = prop / 9;
+ lc = prop - lp * 9;
+
+ XZ Utils has an additional requirement: lc + lp <= 4. Files
+ which don't follow this requirement cannot be decompressed
+ with XZ Utils. Usually this isn't a problem since the most
+ common lc/lp/pb values are 3/0/2. It is the only lc/lp/pb
+ combination that the files created by LZMA Utils can have,
+ but LZMA Utils can decompress files with any lc/lp/pb.
+
+
+1.1.2. Dictionary Size
+
+ Dictionary Size is stored as an unsigned 32-bit little endian
+ integer. Any 32-bit value is possible, but for maximum
+ portability, only sizes of 2^n and 2^n + 2^(n-1) should be
+ used.
+
+ LZMA Utils creates only files with dictionary size 2^n,
+ 16 <= n <= 25. LZMA Utils can decompress files with any
+ dictionary size.
+
+ XZ Utils creates and decompresses .lzma files only with
+ dictionary sizes 2^n and 2^n + 2^(n-1). If some other
+ dictionary size is specified when compressing, the value
+ stored in the Dictionary Size field is rounded up, but the
+ specified value is still used in the actual compression code.
+
+
+1.1.3. Uncompressed Size
+
+ Uncompressed Size is stored as an unsigned 64-bit little endian
+ integer. A special value of 0xFFFF_FFFF_FFFF_FFFF indicates
+ that Uncompressed Size is unknown. End of Payload Marker (*)
+ is used if Uncompressed Size is unknown. End of Payload Marker
+ is allowed but rarely used if Uncompressed Size is known.
+ XZ Utils 5.2.5 and older don't support .lzma files that have
+ End of Payload Marker together with a known Uncompressed Size.
+
+ XZ Utils rejects files whose Uncompressed Size field specifies
+ a known size that is 256 GiB or more. This is to reject false
+ positives when trying to guess if the input file is in the
+ .lzma format. When Uncompressed Size is unknown, there is no
+ limit for the uncompressed size of the file.
+
+ (*) Some tools use the term End of Stream (EOS) marker
+ instead of End of Payload Marker.
+
+
+1.2. LZMA Compressed Data
+
+ Detailed description of the format of this field is out of
+ scope of this document.
+
+
+2. References
+
+ LZMA SDK - The original LZMA implementation
+ http://7-zip.org/sdk.html
+
+ 7-Zip
+ http://7-zip.org/
+
+ LZMA Utils - LZMA adapted to POSIX-like systems
+ http://tukaani.org/lzma/
+
+ XZ Utils - The next generation of LZMA Utils
+ http://tukaani.org/xz/
+
+ The .xz file format - The successor of the .lzma format
+ http://tukaani.org/xz/xz-file-format.txt
+
diff --git a/doc/man/pdf-a4/lzmainfo-a4.pdf b/doc/man/pdf-a4/lzmainfo-a4.pdf
new file mode 100644
index 0000000..91f2561
--- /dev/null
+++ b/doc/man/pdf-a4/lzmainfo-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-a4/xz-a4.pdf b/doc/man/pdf-a4/xz-a4.pdf
new file mode 100644
index 0000000..6156cfe
--- /dev/null
+++ b/doc/man/pdf-a4/xz-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-a4/xzdec-a4.pdf b/doc/man/pdf-a4/xzdec-a4.pdf
new file mode 100644
index 0000000..b3a761a
--- /dev/null
+++ b/doc/man/pdf-a4/xzdec-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-a4/xzdiff-a4.pdf b/doc/man/pdf-a4/xzdiff-a4.pdf
new file mode 100644
index 0000000..084cb80
--- /dev/null
+++ b/doc/man/pdf-a4/xzdiff-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-a4/xzgrep-a4.pdf b/doc/man/pdf-a4/xzgrep-a4.pdf
new file mode 100644
index 0000000..e3c54fb
--- /dev/null
+++ b/doc/man/pdf-a4/xzgrep-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-a4/xzless-a4.pdf b/doc/man/pdf-a4/xzless-a4.pdf
new file mode 100644
index 0000000..01176ed
--- /dev/null
+++ b/doc/man/pdf-a4/xzless-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-a4/xzmore-a4.pdf b/doc/man/pdf-a4/xzmore-a4.pdf
new file mode 100644
index 0000000..f829bf4
--- /dev/null
+++ b/doc/man/pdf-a4/xzmore-a4.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/lzmainfo-letter.pdf b/doc/man/pdf-letter/lzmainfo-letter.pdf
new file mode 100644
index 0000000..6f2ae17
--- /dev/null
+++ b/doc/man/pdf-letter/lzmainfo-letter.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/xz-letter.pdf b/doc/man/pdf-letter/xz-letter.pdf
new file mode 100644
index 0000000..9a733d9
--- /dev/null
+++ b/doc/man/pdf-letter/xz-letter.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/xzdec-letter.pdf b/doc/man/pdf-letter/xzdec-letter.pdf
new file mode 100644
index 0000000..89b2ded
--- /dev/null
+++ b/doc/man/pdf-letter/xzdec-letter.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/xzdiff-letter.pdf b/doc/man/pdf-letter/xzdiff-letter.pdf
new file mode 100644
index 0000000..143e979
--- /dev/null
+++ b/doc/man/pdf-letter/xzdiff-letter.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/xzgrep-letter.pdf b/doc/man/pdf-letter/xzgrep-letter.pdf
new file mode 100644
index 0000000..54fc897
--- /dev/null
+++ b/doc/man/pdf-letter/xzgrep-letter.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/xzless-letter.pdf b/doc/man/pdf-letter/xzless-letter.pdf
new file mode 100644
index 0000000..0e48e01
--- /dev/null
+++ b/doc/man/pdf-letter/xzless-letter.pdf
Binary files differ
diff --git a/doc/man/pdf-letter/xzmore-letter.pdf b/doc/man/pdf-letter/xzmore-letter.pdf
new file mode 100644
index 0000000..1a019e0
--- /dev/null
+++ b/doc/man/pdf-letter/xzmore-letter.pdf
Binary files differ
diff --git a/doc/man/txt/lzmainfo.txt b/doc/man/txt/lzmainfo.txt
new file mode 100644
index 0000000..fa4e51c
--- /dev/null
+++ b/doc/man/txt/lzmainfo.txt
@@ -0,0 +1,40 @@
+LZMAINFO(1) XZ Utils LZMAINFO(1)
+
+
+
+NAME
+ lzmainfo - show information stored in the .lzma file header
+
+SYNOPSIS
+ lzmainfo [--help] [--version] [file...]
+
+DESCRIPTION
+ lzmainfo shows information stored in the .lzma file header. It reads
+ the first 13 bytes from the specified file, decodes the header, and
+ prints it to standard output in human readable format. If no files are
+ given or file is -, standard input is read.
+
+ Usually the most interesting information is the uncompressed size and
+ the dictionary size. Uncompressed size can be shown only if the file
+ is in the non-streamed .lzma format variant. The amount of memory re-
+ quired to decompress the file is a few dozen kilobytes plus the dictio-
+ nary size.
+
+ lzmainfo is included in XZ Utils primarily for backward compatibility
+ with LZMA Utils.
+
+EXIT STATUS
+ 0 All is good.
+
+ 1 An error occurred.
+
+BUGS
+ lzmainfo uses MB while the correct suffix would be MiB (2^20 bytes).
+ This is to keep the output compatible with LZMA Utils.
+
+SEE ALSO
+ xz(1)
+
+
+
+Tukaani 2013-06-30 LZMAINFO(1)
diff --git a/doc/man/txt/xz.txt b/doc/man/txt/xz.txt
new file mode 100644
index 0000000..be24360
--- /dev/null
+++ b/doc/man/txt/xz.txt
@@ -0,0 +1,1589 @@
+XZ(1) XZ Utils XZ(1)
+
+
+
+NAME
+ xz, unxz, xzcat, lzma, unlzma, lzcat - Compress or decompress .xz and
+ .lzma files
+
+SYNOPSIS
+ xz [option...] [file...]
+
+COMMAND ALIASES
+ unxz is equivalent to xz --decompress.
+ xzcat is equivalent to xz --decompress --stdout.
+ lzma is equivalent to xz --format=lzma.
+ unlzma is equivalent to xz --format=lzma --decompress.
+ lzcat is equivalent to xz --format=lzma --decompress --stdout.
+
+ When writing scripts that need to decompress files, it is recommended
+ to always use the name xz with appropriate arguments (xz -d or xz -dc)
+ instead of the names unxz and xzcat.
+
+DESCRIPTION
+ xz is a general-purpose data compression tool with command line syntax
+ similar to gzip(1) and bzip2(1). The native file format is the .xz
+ format, but the legacy .lzma format used by LZMA Utils and raw com-
+ pressed streams with no container format headers are also supported.
+ In addition, decompression of the .lz format used by lzip is supported.
+
+ xz compresses or decompresses each file according to the selected oper-
+ ation mode. If no files are given or file is -, xz reads from standard
+ input and writes the processed data to standard output. xz will refuse
+ (display an error and skip the file) to write compressed data to stan-
+ dard output if it is a terminal. Similarly, xz will refuse to read
+ compressed data from standard input if it is a terminal.
+
+ Unless --stdout is specified, files other than - are written to a new
+ file whose name is derived from the source file name:
+
+ o When compressing, the suffix of the target file format (.xz or
+ .lzma) is appended to the source filename to get the target file-
+ name.
+
+ o When decompressing, the .xz, .lzma, or .lz suffix is removed from
+ the filename to get the target filename. xz also recognizes the
+ suffixes .txz and .tlz, and replaces them with the .tar suffix.
+
+ If the target file already exists, an error is displayed and the file
+ is skipped.
+
+ Unless writing to standard output, xz will display a warning and skip
+ the file if any of the following applies:
+
+ o File is not a regular file. Symbolic links are not followed, and
+ thus they are not considered to be regular files.
+
+ o File has more than one hard link.
+
+ o File has setuid, setgid, or sticky bit set.
+
+ o The operation mode is set to compress and the file already has a
+ suffix of the target file format (.xz or .txz when compressing to
+ the .xz format, and .lzma or .tlz when compressing to the .lzma for-
+ mat).
+
+ o The operation mode is set to decompress and the file doesn't have a
+ suffix of any of the supported file formats (.xz, .txz, .lzma, .tlz,
+ or .lz).
+
+ After successfully compressing or decompressing the file, xz copies the
+ owner, group, permissions, access time, and modification time from the
+ source file to the target file. If copying the group fails, the per-
+ missions are modified so that the target file doesn't become accessible
+ to users who didn't have permission to access the source file. xz
+ doesn't support copying other metadata like access control lists or ex-
+ tended attributes yet.
+
+ Once the target file has been successfully closed, the source file is
+ removed unless --keep was specified. The source file is never removed
+ if the output is written to standard output or if an error occurs.
+
+ Sending SIGINFO or SIGUSR1 to the xz process makes it print progress
+ information to standard error. This has only limited use since when
+ standard error is a terminal, using --verbose will display an automati-
+ cally updating progress indicator.
+
+ Memory usage
+ The memory usage of xz varies from a few hundred kilobytes to several
+ gigabytes depending on the compression settings. The settings used
+ when compressing a file determine the memory requirements of the decom-
+ pressor. Typically the decompressor needs 5 % to 20 % of the amount of
+ memory that the compressor needed when creating the file. For example,
+ decompressing a file created with xz -9 currently requires 65 MiB of
+ memory. Still, it is possible to have .xz files that require several
+ gigabytes of memory to decompress.
+
+ Especially users of older systems may find the possibility of very
+ large memory usage annoying. To prevent uncomfortable surprises, xz
+ has a built-in memory usage limiter, which is disabled by default.
+ While some operating systems provide ways to limit the memory usage of
+ processes, relying on it wasn't deemed to be flexible enough (for exam-
+ ple, using ulimit(1) to limit virtual memory tends to cripple mmap(2)).
+
+ The memory usage limiter can be enabled with the command line option
+ --memlimit=limit. Often it is more convenient to enable the limiter by
+ default by setting the environment variable XZ_DEFAULTS, for example,
+ XZ_DEFAULTS=--memlimit=150MiB. It is possible to set the limits sepa-
+ rately for compression and decompression by using --memlimit-com-
+ press=limit and --memlimit-decompress=limit. Using these two options
+ outside XZ_DEFAULTS is rarely useful because a single run of xz cannot
+ do both compression and decompression and --memlimit=limit (or -M
+ limit) is shorter to type on the command line.
+
+ If the specified memory usage limit is exceeded when decompressing, xz
+ will display an error and decompressing the file will fail. If the
+ limit is exceeded when compressing, xz will try to scale the settings
+ down so that the limit is no longer exceeded (except when using --for-
+ mat=raw or --no-adjust). This way the operation won't fail unless the
+ limit is very small. The scaling of the settings is done in steps that
+ don't match the compression level presets, for example, if the limit is
+ only slightly less than the amount required for xz -9, the settings
+ will be scaled down only a little, not all the way down to xz -8.
+
+ Concatenation and padding with .xz files
+ It is possible to concatenate .xz files as is. xz will decompress such
+ files as if they were a single .xz file.
+
+ It is possible to insert padding between the concatenated parts or af-
+ ter the last part. The padding must consist of null bytes and the size
+ of the padding must be a multiple of four bytes. This can be useful,
+ for example, if the .xz file is stored on a medium that measures file
+ sizes in 512-byte blocks.
+
+ Concatenation and padding are not allowed with .lzma files or raw
+ streams.
+
+OPTIONS
+ Integer suffixes and special values
+ In most places where an integer argument is expected, an optional suf-
+ fix is supported to easily indicate large integers. There must be no
+ space between the integer and the suffix.
+
+ KiB Multiply the integer by 1,024 (2^10). Ki, k, kB, K, and KB are
+ accepted as synonyms for KiB.
+
+ MiB Multiply the integer by 1,048,576 (2^20). Mi, m, M, and MB are
+ accepted as synonyms for MiB.
+
+ GiB Multiply the integer by 1,073,741,824 (2^30). Gi, g, G, and GB
+ are accepted as synonyms for GiB.
+
+ The special value max can be used to indicate the maximum integer value
+ supported by the option.
+
+ Operation mode
+ If multiple operation mode options are given, the last one takes ef-
+ fect.
+
+ -z, --compress
+ Compress. This is the default operation mode when no operation
+ mode option is specified and no other operation mode is implied
+ from the command name (for example, unxz implies --decompress).
+
+ -d, --decompress, --uncompress
+ Decompress.
+
+ -t, --test
+ Test the integrity of compressed files. This option is equiva-
+ lent to --decompress --stdout except that the decompressed data
+ is discarded instead of being written to standard output. No
+ files are created or removed.
+
+ -l, --list
+ Print information about compressed files. No uncompressed out-
+ put is produced, and no files are created or removed. In list
+ mode, the program cannot read the compressed data from standard
+ input or from other unseekable sources.
+
+ The default listing shows basic information about files, one
+ file per line. To get more detailed information, also use the
+ --verbose option. For even more information, use --verbose
+ twice, but note that this may be slow, because getting all the
+ extra information requires many seeks. The width of verbose
+ output exceeds 80 characters, so piping the output to, for exam-
+ ple, less -S may be convenient if the terminal isn't wide
+ enough.
+
+ The exact output may vary between xz versions and different lo-
+ cales. For machine-readable output, --robot --list should be
+ used.
+
+ Operation modifiers
+ -k, --keep
+ Don't delete the input files.
+
+ Since xz 5.2.6, this option also makes xz compress or decompress
+ even if the input is a symbolic link to a regular file, has more
+ than one hard link, or has the setuid, setgid, or sticky bit
+ set. The setuid, setgid, and sticky bits are not copied to the
+ target file. In earlier versions this was only done with
+ --force.
+
+ -f, --force
+ This option has several effects:
+
+ o If the target file already exists, delete it before compress-
+ ing or decompressing.
+
+ o Compress or decompress even if the input is a symbolic link
+ to a regular file, has more than one hard link, or has the
+ setuid, setgid, or sticky bit set. The setuid, setgid, and
+ sticky bits are not copied to the target file.
+
+ o When used with --decompress --stdout and xz cannot recognize
+ the type of the source file, copy the source file as is to
+ standard output. This allows xzcat --force to be used like
+ cat(1) for files that have not been compressed with xz. Note
+ that in future, xz might support new compressed file formats,
+ which may make xz decompress more types of files instead of
+ copying them as is to standard output. --format=format can
+ be used to restrict xz to decompress only a single file for-
+ mat.
+
+ -c, --stdout, --to-stdout
+ Write the compressed or decompressed data to standard output in-
+ stead of a file. This implies --keep.
+
+ --single-stream
+ Decompress only the first .xz stream, and silently ignore possi-
+ ble remaining input data following the stream. Normally such
+ trailing garbage makes xz display an error.
+
+ xz never decompresses more than one stream from .lzma files or
+ raw streams, but this option still makes xz ignore the possible
+ trailing data after the .lzma file or raw stream.
+
+ This option has no effect if the operation mode is not --decom-
+ press or --test.
+
+ --no-sparse
+ Disable creation of sparse files. By default, if decompressing
+ into a regular file, xz tries to make the file sparse if the de-
+ compressed data contains long sequences of binary zeros. It
+ also works when writing to standard output as long as standard
+ output is connected to a regular file and certain additional
+ conditions are met to make it safe. Creating sparse files may
+ save disk space and speed up the decompression by reducing the
+ amount of disk I/O.
+
+ -S .suf, --suffix=.suf
+ When compressing, use .suf as the suffix for the target file in-
+ stead of .xz or .lzma. If not writing to standard output and
+ the source file already has the suffix .suf, a warning is dis-
+ played and the file is skipped.
+
+ When decompressing, recognize files with the suffix .suf in ad-
+ dition to files with the .xz, .txz, .lzma, .tlz, or .lz suffix.
+ If the source file has the suffix .suf, the suffix is removed to
+ get the target filename.
+
+ When compressing or decompressing raw streams (--format=raw),
+ the suffix must always be specified unless writing to standard
+ output, because there is no default suffix for raw streams.
+
+ --files[=file]
+ Read the filenames to process from file; if file is omitted,
+ filenames are read from standard input. Filenames must be ter-
+ minated with the newline character. A dash (-) is taken as a
+ regular filename; it doesn't mean standard input. If filenames
+ are given also as command line arguments, they are processed be-
+ fore the filenames read from file.
+
+ --files0[=file]
+ This is identical to --files[=file] except that each filename
+ must be terminated with the null character.
+
+ Basic file format and compression options
+ -F format, --format=format
+ Specify the file format to compress or decompress:
+
+ auto This is the default. When compressing, auto is equiva-
+ lent to xz. When decompressing, the format of the input
+ file is automatically detected. Note that raw streams
+ (created with --format=raw) cannot be auto-detected.
+
+ xz Compress to the .xz file format, or accept only .xz files
+ when decompressing.
+
+ lzma, alone
+ Compress to the legacy .lzma file format, or accept only
+ .lzma files when decompressing. The alternative name
+ alone is provided for backwards compatibility with LZMA
+ Utils.
+
+ lzip Accept only .lz files when decompressing. Compression is
+ not supported.
+
+ The .lz format version 0 and the unextended version 1 are
+ supported. Version 0 files were produced by lzip 1.3 and
+ older. Such files aren't common but may be found from
+ file archives as a few source packages were released in
+ this format. People might have old personal files in
+ this format too. Decompression support for the format
+ version 0 was removed in lzip 1.18.
+
+ lzip 1.4 and later create files in the format version 1.
+ The sync flush marker extension to the format version 1
+ was added in lzip 1.6. This extension is rarely used and
+ isn't supported by xz (diagnosed as corrupt input).
+
+ raw Compress or uncompress a raw stream (no headers). This
+ is meant for advanced users only. To decode raw streams,
+ you need to use --format=raw and explicitly specify the fil-
+ ter chain, which normally would have been stored in the
+ container headers.
+
+ -C check, --check=check
+ Specify the type of the integrity check. The check is calcu-
+ lated from the uncompressed data and stored in the .xz file.
+ This option has an effect only when compressing into the .xz
+ format; the .lzma format doesn't support integrity checks. The
+ integrity check (if any) is verified when the .xz file is decom-
+ pressed.
+
+ Supported check types:
+
+ none Don't calculate an integrity check at all. This is usu-
+ ally a bad idea. This can be useful when integrity of
+ the data is verified by other means anyway.
+
+ crc32 Calculate CRC32 using the polynomial from IEEE-802.3
+ (Ethernet).
+
+ crc64 Calculate CRC64 using the polynomial from ECMA-182. This
+ is the default, since it is slightly better than CRC32 at
+ detecting damaged files and the speed difference is neg-
+ ligible.
+
+ sha256 Calculate SHA-256. This is somewhat slower than CRC32
+ and CRC64.
+
+ Integrity of the .xz headers is always verified with CRC32. It
+ is not possible to change or disable it.
+
+ --ignore-check
+ Don't verify the integrity check of the compressed data when de-
+ compressing. The CRC32 values in the .xz headers will still be
+ verified normally.
+
+ Do not use this option unless you know what you are doing. Pos-
+ sible reasons to use this option:
+
+ o Trying to recover data from a corrupt .xz file.
+
+ o Speeding up decompression. This matters mostly with SHA-256
+ or with files that have compressed extremely well. It's rec-
+ ommended to not use this option for this purpose unless the
+ file integrity is verified externally in some other way.
+
+ -0 ... -9
+ Select a compression preset level. The default is -6. If mul-
+ tiple preset levels are specified, the last one takes effect.
+ If a custom filter chain was already specified, setting a com-
+ pression preset level clears the custom filter chain.
+
+ The differences between the presets are more significant than
+ with gzip(1) and bzip2(1). The selected compression settings
+ determine the memory requirements of the decompressor, thus us-
+ ing a too high preset level might make it painful to decompress
+ the file on an old system with little RAM. Specifically, it's
+ not a good idea to blindly use -9 for everything like it often
+ is with gzip(1) and bzip2(1).
+
+ -0 ... -3
+ These are somewhat fast presets. -0 is sometimes faster
+ than gzip -9 while compressing much better. The higher
+ ones often have speed comparable to bzip2(1) with compa-
+ rable or better compression ratio, although the results
+ depend a lot on the type of data being compressed.
+
+ -4 ... -6
+ Good to very good compression while keeping decompressor
+ memory usage reasonable even for old systems. -6 is the
+ default, which is usually a good choice for distributing
+ files that need to be decompressible even on systems with
+ only 16 MiB RAM. (-5e or -6e may be worth considering
+ too. See --extreme.)
+
+ -7 ... -9
+ These are like -6 but with higher compressor and decom-
+ pressor memory requirements. These are useful only when
+ compressing files bigger than 8 MiB, 16 MiB, and 32 MiB,
+ respectively.
+
+ On the same hardware, the decompression speed is approximately a
+ constant number of bytes of compressed data per second. In
+ other words, the better the compression, the faster the decom-
+ pression will usually be. This also means that the amount of
+ uncompressed output produced per second can vary a lot.
+
+ The following table summarises the features of the presets:
+
+ Preset DictSize CompCPU CompMem DecMem
+ -0 256 KiB 0 3 MiB 1 MiB
+ -1 1 MiB 1 9 MiB 2 MiB
+ -2 2 MiB 2 17 MiB 3 MiB
+ -3 4 MiB 3 32 MiB 5 MiB
+ -4 4 MiB 4 48 MiB 5 MiB
+ -5 8 MiB 5 94 MiB 9 MiB
+ -6 8 MiB 6 94 MiB 9 MiB
+ -7 16 MiB 6 186 MiB 17 MiB
+ -8 32 MiB 6 370 MiB 33 MiB
+ -9 64 MiB 6 674 MiB 65 MiB
+
+ Column descriptions:
+
+ o DictSize is the LZMA2 dictionary size. It is a waste of memory
+ to use a dictionary bigger than the size of the uncompressed
+ file. This is why it is good to avoid using the presets -7
+ ... -9 when there's no real need for them. At -6 and lower,
+ the amount of memory wasted is usually low enough to not mat-
+ ter.
+
+ o CompCPU is a simplified representation of the LZMA2 settings
+ that affect compression speed. The dictionary size affects
+ speed too, so while CompCPU is the same for levels -6 ... -9,
+ higher levels still tend to be a little slower. To get even
+ slower and thus possibly better compression, see --extreme.
+
+ o CompMem contains the compressor memory requirements in the
+ single-threaded mode. It may vary slightly between xz ver-
+ sions. Memory requirements of some of the future multi-
+ threaded modes may be dramatically higher than that of the
+ single-threaded mode.
+
+ o DecMem contains the decompressor memory requirements. That
+ is, the compression settings determine the memory require-
+ ments of the decompressor. The exact decompressor memory us-
+ age is slightly more than the LZMA2 dictionary size, but the
+ values in the table have been rounded up to the next full
+ MiB.
+
+ -e, --extreme
+ Use a slower variant of the selected compression preset level
+ (-0 ... -9) to hopefully get a little bit better compression ra-
+ tio, but with bad luck this can also make it worse. Decompres-
+ sor memory usage is not affected, but compressor memory usage
+ increases a little at preset levels -0 ... -3.
+
+ Since there are two presets with dictionary sizes 4 MiB and
+ 8 MiB, the presets -3e and -5e use slightly faster settings
+ (lower CompCPU) than -4e and -6e, respectively. That way no two
+ presets are identical.
+
+ Preset DictSize CompCPU CompMem DecMem
+ -0e 256 KiB 8 4 MiB 1 MiB
+ -1e 1 MiB 8 13 MiB 2 MiB
+ -2e 2 MiB 8 25 MiB 3 MiB
+ -3e 4 MiB 7 48 MiB 5 MiB
+ -4e 4 MiB 8 48 MiB 5 MiB
+ -5e 8 MiB 7 94 MiB 9 MiB
+ -6e 8 MiB 8 94 MiB 9 MiB
+ -7e 16 MiB 8 186 MiB 17 MiB
+ -8e 32 MiB 8 370 MiB 33 MiB
+ -9e 64 MiB 8 674 MiB 65 MiB
+
+ For example, there are a total of four presets that use 8 MiB
+ dictionary, whose order from the fastest to the slowest is -5,
+ -6, -5e, and -6e.
+
+ --fast
+ --best These are somewhat misleading aliases for -0 and -9, respec-
+ tively. These are provided only for backwards compatibility
+ with LZMA Utils. Avoid using these options.
+
+ --block-size=size
+ When compressing to the .xz format, split the input data into
+ blocks of size bytes. The blocks are compressed independently
+ from each other, which helps with multi-threading and makes lim-
+ ited random-access decompression possible. This option is typi-
+ cally used to override the default block size in multi-threaded
+ mode, but this option can be used in single-threaded mode too.
+
+ In multi-threaded mode about three times size bytes will be al-
+ located in each thread for buffering input and output. The de-
+ fault size is three times the LZMA2 dictionary size or 1 MiB,
+ whichever is more. Typically a good value is 2-4 times the size
+ of the LZMA2 dictionary or at least 1 MiB. Using a size less than
+ the LZMA2 dictionary size is a waste of RAM because then the LZMA2
+ dictionary buffer will never get fully used. The sizes of the
+ blocks are stored in the block headers, which a future version
+ of xz will use for multi-threaded decompression.
+
+ In single-threaded mode no block splitting is done by default.
+ Setting this option doesn't affect memory usage. No size infor-
+ mation is stored in block headers, thus files created in single-
+ threaded mode won't be identical to files created in multi-
+ threaded mode. The lack of size information also means that a
+ future version of xz won't be able to decompress the files in
+ multi-threaded mode.
+
+ --block-list=sizes
+ When compressing to the .xz format, start a new block after the
+ given intervals of uncompressed data.
+
+ The uncompressed sizes of the blocks are specified as a comma-
+ separated list. Omitting a size (two or more consecutive com-
+ mas) is a shorthand to use the size of the previous block.
+
+ If the input file is bigger than the sum of sizes, the last
+ value in sizes is repeated until the end of the file. A special
+ value of 0 may be used as the last value to indicate that the
+ rest of the file should be encoded as a single block.
+
+ If one specifies sizes that exceed the encoder's block size (ei-
+ ther the default value in threaded mode or the value specified
+ with --block-size=size), the encoder will create additional
+ blocks while keeping the boundaries specified in sizes. For ex-
+ ample, if one specifies --block-size=10MiB
+ --block-list=5MiB,10MiB,8MiB,12MiB,24MiB and the input file is
+ 80 MiB, one will get 11 blocks: 5, 10, 8, 10, 2, 10, 10, 4, 10,
+ 10, and 1 MiB.
+
+ In multi-threaded mode the sizes of the blocks are stored in the
+ block headers. This isn't done in single-threaded mode, so the
+ encoded output won't be identical to that of the multi-threaded
+ mode.
+
+ --flush-timeout=timeout
+ When compressing, if more than timeout milliseconds (a positive
+ integer) has passed since the previous flush and reading more
+ input would block, all the pending input data is flushed from
+ the encoder and made available in the output stream. This can
+ be useful if xz is used to compress data that is streamed over a
+ network. Small timeout values make the data available at the
+ receiving end with a small delay, but large timeout values give
+ better compression ratio.
+
+ This feature is disabled by default. If this option is speci-
+ fied more than once, the last one takes effect. The special
+ timeout value of 0 can be used to explicitly disable this fea-
+ ture.
+
+ This feature is not available on non-POSIX systems.
+
+ This feature is still experimental. Currently xz is unsuitable
+ for decompressing the stream in real time due to how xz does
+ buffering.
+
+ --memlimit-compress=limit
+ Set a memory usage limit for compression. If this option is
+ specified multiple times, the last one takes effect.
+
+ If the compression settings exceed the limit, xz will attempt to
+ adjust the settings downwards so that the limit is no longer ex-
+ ceeded and display a notice that automatic adjustment was done.
+ The adjustments are done in this order: reducing the number of
+ threads, switching to single-threaded mode if even one thread in
+ multi-threaded mode exceeds the limit, and finally reducing the
+ LZMA2 dictionary size.
+
+ When compressing with --format=raw or if --no-adjust has been
+ specified, only the number of threads may be reduced since it
+ can be done without affecting the compressed output.
+
+ If the limit cannot be met even with the adjustments described
+ above, an error is displayed and xz will exit with exit status
+ 1.
+
+ The limit can be specified in multiple ways:
+
+ o The limit can be an absolute value in bytes. Using an inte-
+ ger suffix like MiB can be useful. Example: --memlimit-com-
+ press=80MiB
+
+ o The limit can be specified as a percentage of total physical
+ memory (RAM). This can be useful especially when setting the
+ XZ_DEFAULTS environment variable in a shell initialization
+ script that is shared between different computers. That way
+ the limit is automatically bigger on systems with more mem-
+ ory. Example: --memlimit-compress=70%
+
+ o The limit can be reset back to its default value by setting
+ it to 0. This is currently equivalent to setting the limit
+ to max (no memory usage limit).
+
+ For 32-bit xz there is a special case: if the limit would be
+ over 4020 MiB, the limit is set to 4020 MiB. On MIPS32 2000 MiB
+ is used instead. (The values 0 and max aren't affected by this.
+ A similar feature doesn't exist for decompression.) This can be
+ helpful when a 32-bit executable has access to 4 GiB address
+ space (2 GiB on MIPS32) while hopefully doing no harm in other
+ situations.
+
+ See also the section Memory usage.
+
+ --memlimit-decompress=limit
+ Set a memory usage limit for decompression. This also affects
+ the --list mode. If the operation is not possible without ex-
+ ceeding the limit, xz will display an error and decompressing
+ the file will fail. See --memlimit-compress=limit for possible
+ ways to specify the limit.
+
+ --memlimit-mt-decompress=limit
+ Set a memory usage limit for multi-threaded decompression. This
+ can only affect the number of threads; this will never make xz
+ refuse to decompress a file. If limit is too low to allow any
+ multi-threading, the limit is ignored and xz will continue in
+ single-threaded mode. Note that if also --memlimit-decompress
+ is used, it will always apply to both single-threaded and multi-
+ threaded modes, and so the effective limit for multi-threading
+ will never be higher than the limit set with --memlimit-decom-
+ press.
+
+ In contrast to the other memory usage limit options, --mem-
+ limit-mt-decompress=limit has a system-specific default limit.
+ xz --info-memory can be used to see the current value.
+
+ This option and its default value exist because without any
+ limit the threaded decompressor could end up allocating an in-
+ sane amount of memory with some input files. If the default
+ limit is too low on your system, feel free to increase the limit
+ but never set it to a value larger than the amount of usable RAM
+ as with appropriate input files xz will attempt to use that
+ amount of memory even with a low number of threads. Running out
+ of memory or swapping will not improve decompression perfor-
+ mance.
+
+ See --memlimit-compress=limit for possible ways to specify the
+ limit. Setting limit to 0 resets the limit to the default sys-
+ tem-specific value.
+
+
+
+ -M limit, --memlimit=limit, --memory=limit
+ This is equivalent to specifying --memlimit-compress=limit
+ --memlimit-decompress=limit --memlimit-mt-decompress=limit.
+
+ --no-adjust
+ Display an error and exit if the memory usage limit cannot be
+ met without adjusting settings that affect the compressed out-
+ put. That is, this prevents xz from switching the encoder from
+ multi-threaded mode to single-threaded mode and from reducing
+ the LZMA2 dictionary size. Even when this option is used the
+ number of threads may be reduced to meet the memory usage limit
+ as that won't affect the compressed output.
+
+ Automatic adjusting is always disabled when creating raw streams
+ (--format=raw).
+
+ -T threads, --threads=threads
+ Specify the number of worker threads to use. Setting threads to
+ a special value 0 makes xz use up to as many threads as the pro-
+ cessor(s) on the system support. The actual number of threads
+ can be fewer than threads if the input file is not big enough
+ for threading with the given settings or if using more threads
+ would exceed the memory usage limit.
+
+ The single-threaded and multi-threaded compressors produce dif-
+ ferent output. Single-threaded compressor will give the small-
+ est file size but only the output from the multi-threaded com-
+ pressor can be decompressed using multiple threads. Setting
+ threads to 1 will use the single-threaded mode. Setting threads
+ to any other value, including 0, will use the multi-threaded
+ compressor even if the system supports only one hardware thread.
+ (xz 5.2.x used single-threaded mode in this situation.)
+
+ To use multi-threaded mode with only one thread, set threads to
+ +1. The + prefix has no effect with values other than 1. A
+ memory usage limit can still make xz switch to single-threaded
+ mode unless --no-adjust is used. Support for the + prefix was
+ added in xz 5.4.0.
+
+ If an automatic number of threads has been requested and no mem-
+ ory usage limit has been specified, then a system-specific de-
+ fault soft limit will be used to possibly limit the number of
+ threads. It is a soft limit in the sense that it is ignored if the
+ number of threads becomes one, thus a soft limit will never stop
+ xz from compressing or decompressing. This default soft limit
+ will not make xz switch from multi-threaded mode to single-
+ threaded mode. The active limits can be seen with xz
+ --info-memory.
+
+ Currently the only threading method is to split the input into
+ blocks and compress them independently from each other. The de-
+ fault block size depends on the compression level and can be
+ overridden with the --block-size=size option.
+
+ Threaded decompression only works on files that contain multiple
+ blocks with size information in block headers. All large enough
+ files compressed in multi-threaded mode meet this condition, but
+ files compressed in single-threaded mode don't even if
+ --block-size=size has been used.
+
+ Custom compressor filter chains
+ A custom filter chain allows specifying the compression settings in de-
+ tail instead of relying on the settings associated to the presets.
+ When a custom filter chain is specified, preset options (-0 ... -9 and
+ --extreme) earlier on the command line are forgotten. If a preset op-
+ tion is specified after one or more custom filter chain options, the
+ new preset takes effect and the custom filter chain options specified
+ earlier are forgotten.
+
+ A filter chain is comparable to piping on the command line. When com-
+ pressing, the uncompressed input goes to the first filter, whose output
+ goes to the next filter (if any). The output of the last filter gets
+ written to the compressed file. The maximum number of filters in the
+ chain is four, but typically a filter chain has only one or two fil-
+ ters.
+
+ Many filters have limitations on where they can be in the filter chain:
+ some filters can work only as the last filter in the chain, some only
+ as a non-last filter, and some work in any position in the chain. De-
+ pending on the filter, this limitation is either inherent to the filter
+ design or exists to prevent security issues.
+
+ A custom filter chain is specified by using one or more filter options
+ in the order they are wanted in the filter chain. That is, the order
+ of filter options is significant! When decoding raw streams (--for-
+ mat=raw), the filter chain is specified in the same order as it was
+ specified when compressing.
+
+ Filters take filter-specific options as a comma-separated list. Extra
+ commas in options are ignored. Every option has a default value, so
+ you need to specify only those you want to change.
+
+ To see the whole filter chain and options, use xz -vv (that is, use
+ --verbose twice). This works also for viewing the filter chain options
+ used by presets.
+
+ --lzma1[=options]
+ --lzma2[=options]
+ Add LZMA1 or LZMA2 filter to the filter chain. These filters
+ can be used only as the last filter in the chain.
+
+ LZMA1 is a legacy filter, which is supported almost solely due
+ to the legacy .lzma file format, which supports only LZMA1.
+ LZMA2 is an updated version of LZMA1 to fix some practical is-
+ sues of LZMA1. The .xz format uses LZMA2 and doesn't support
+ LZMA1 at all. Compression speed and ratios of LZMA1 and LZMA2
+ are practically the same.
+
+ LZMA1 and LZMA2 share the same set of options:
+
+ preset=preset
+ Reset all LZMA1 or LZMA2 options to preset. A preset con-
+ sists of an integer, which may be followed by single-let-
+ ter preset modifiers. The integer can be from 0 to 9,
+ matching the command line options -0 ... -9. The only
+ supported modifier is currently e, which matches --ex-
+ treme. If no preset is specified, the default values of
+ LZMA1 or LZMA2 options are taken from the preset 6.
+
+ dict=size
+ Dictionary (history buffer) size indicates how many bytes
+ of the recently processed uncompressed data is kept in
+ memory. The algorithm tries to find repeating byte se-
+ quences (matches) in the uncompressed data, and replace
+ them with references to the data currently in the dictio-
+ nary. The bigger the dictionary, the higher is the
+ chance to find a match. Thus, increasing dictionary size
+ usually improves compression ratio, but a dictionary big-
+ ger than the uncompressed file is a waste of memory.
+
+ Typical dictionary size is from 64 KiB to 64 MiB. The
+ minimum is 4 KiB. The maximum for compression is cur-
+ rently 1.5 GiB (1536 MiB). The decompressor already sup-
+ ports dictionaries up to one byte less than 4 GiB, which
+ is the maximum for the LZMA1 and LZMA2 stream formats.
+
+ Dictionary size and match finder (mf) together determine
+ the memory usage of the LZMA1 or LZMA2 encoder. The same
+ (or bigger) dictionary size is required for decompressing
+ that was used when compressing, thus the memory usage of
+ the decoder is determined by the dictionary size used
+ when compressing. The .xz headers store the dictionary
+ size either as 2^n or 2^n + 2^(n-1), so these sizes are
+ somewhat preferred for compression. Other sizes will get
+ rounded up when stored in the .xz headers.
+
+ lc=lc Specify the number of literal context bits. The minimum
+ is 0 and the maximum is 4; the default is 3. In addi-
+ tion, the sum of lc and lp must not exceed 4.
+
+ All bytes that cannot be encoded as matches are encoded
+ as literals. That is, literals are simply 8-bit bytes
+ that are encoded one at a time.
+
+ The literal coding makes an assumption that the highest
+ lc bits of the previous uncompressed byte correlate with
+ the next byte. For example, in typical English text, an
+ upper-case letter is often followed by a lower-case let-
+ ter, and a lower-case letter is usually followed by an-
+ other lower-case letter. In the US-ASCII character set,
+ the highest three bits are 010 for upper-case letters and
+ 011 for lower-case letters. When lc is at least 3, the
+ literal coding can take advantage of this property in the
+ uncompressed data.
+
+ The default value (3) is usually good. If you want maxi-
+ mum compression, test lc=4. Sometimes it helps a little,
+ and sometimes it makes compression worse. If it makes it
+ worse, test lc=2 too.
+
+ lp=lp Specify the number of literal position bits. The minimum
+ is 0 and the maximum is 4; the default is 0.
+
+ Lp affects what kind of alignment in the uncompressed
+ data is assumed when encoding literals. See pb below for
+ more information about alignment.
+
+ pb=pb Specify the number of position bits. The minimum is 0
+ and the maximum is 4; the default is 2.
+
+ Pb affects what kind of alignment in the uncompressed
+ data is assumed in general. The default means four-byte
+ alignment (2^pb=2^2=4), which is often a good choice when
+ there's no better guess.
+
+ When the alignment is known, setting pb accordingly may
+ reduce the file size a little. For example, with text
+ files having one-byte alignment (US-ASCII, ISO-8859-*,
+ UTF-8), setting pb=0 can improve compression slightly.
+ For UTF-16 text, pb=1 is a good choice. If the alignment
+ is an odd number like 3 bytes, pb=0 might be the best
+ choice.
+
+ Even though the assumed alignment can be adjusted with pb
+ and lp, LZMA1 and LZMA2 still slightly favor 16-byte
+ alignment. It might be worth taking into account when
+ designing file formats that are likely to be often com-
+ pressed with LZMA1 or LZMA2.
+
+ mf=mf Match finder has a major effect on encoder speed, memory
+ usage, and compression ratio. Usually Hash Chain match
+ finders are faster than Binary Tree match finders. The
+ default depends on the preset: 0 uses hc3, 1-3 use hc4,
+ and the rest use bt4.
+
+ The following match finders are supported. The memory
+ usage formulas below are rough approximations, which are
+ closest to the reality when dict is a power of two.
+
+ hc3 Hash Chain with 2- and 3-byte hashing
+ Minimum value for nice: 3
+ Memory usage:
+ dict * 7.5 (if dict <= 16 MiB);
+ dict * 5.5 + 64 MiB (if dict > 16 MiB)
+
+ hc4 Hash Chain with 2-, 3-, and 4-byte hashing
+ Minimum value for nice: 4
+ Memory usage:
+ dict * 7.5 (if dict <= 32 MiB);
+ dict * 6.5 (if dict > 32 MiB)
+
+ bt2 Binary Tree with 2-byte hashing
+ Minimum value for nice: 2
+ Memory usage: dict * 9.5
+
+ bt3 Binary Tree with 2- and 3-byte hashing
+ Minimum value for nice: 3
+ Memory usage:
+ dict * 11.5 (if dict <= 16 MiB);
+ dict * 9.5 + 64 MiB (if dict > 16 MiB)
+
+ bt4 Binary Tree with 2-, 3-, and 4-byte hashing
+ Minimum value for nice: 4
+ Memory usage:
+ dict * 11.5 (if dict <= 32 MiB);
+ dict * 10.5 (if dict > 32 MiB)
+
+ mode=mode
+ Compression mode specifies the method to analyze the data
+ produced by the match finder. Supported modes are fast
+ and normal. The default is fast for presets 0-3 and nor-
+ mal for presets 4-9.
+
+ Usually fast is used with Hash Chain match finders and
+ normal with Binary Tree match finders. This is also what
+ the presets do.
+
+ nice=nice
+ Specify what is considered to be a nice length for a
+ match. Once a match of at least nice bytes is found, the
+ algorithm stops looking for possibly better matches.
+
+ Nice can be 2-273 bytes. Higher values tend to give bet-
+ ter compression ratio at the expense of speed. The de-
+ fault depends on the preset.
+
+ depth=depth
+ Specify the maximum search depth in the match finder.
+ The default is the special value of 0, which makes the
+ compressor determine a reasonable depth from mf and nice.
+
+ Reasonable depth for Hash Chains is 4-100 and 16-1000 for
+ Binary Trees. Using very high values for depth can make
+ the encoder extremely slow with some files. Avoid set-
+ ting the depth over 1000 unless you are prepared to in-
+ terrupt the compression in case it is taking far too
+ long.
+
+ When decoding raw streams (--format=raw), LZMA2 needs only the
+ dictionary size. LZMA1 needs also lc, lp, and pb.
+
+ --x86[=options]
+ --arm[=options]
+ --armthumb[=options]
+ --arm64[=options]
+ --powerpc[=options]
+ --ia64[=options]
+ --sparc[=options]
+ Add a branch/call/jump (BCJ) filter to the filter chain. These
+ filters can be used only as a non-last filter in the filter
+ chain.
+
+ A BCJ filter converts relative addresses in the machine code to
+ their absolute counterparts. This doesn't change the size of
+ the data but it increases redundancy, which can help LZMA2 to
+ produce a 0-15 % smaller .xz file. The BCJ filters are always re-
+ versible, so using a BCJ filter for the wrong type of data doesn't
+ cause any data loss, although it may make the compression ratio
+ slightly worse. The BCJ filters are very fast and use an in-
+ significant amount of memory.
+
+ These BCJ filters have known problems related to the compression
+ ratio:
+
+ o Some types of files containing executable code (for example,
+ object files, static libraries, and Linux kernel modules)
+ have the addresses in the instructions filled with filler
+ values. These BCJ filters will still do the address conver-
+ sion, which will make the compression worse with these files.
+
+ o If a BCJ filter is applied on an archive, it is possible that
+ it makes the compression ratio worse than not using a BCJ
+ filter. For example, if there are similar or even identical
+ executables then filtering will likely make the files less
+ similar and thus compression is worse. The contents of non-
+ executable files in the same archive can matter too. In
+ practice one has to try with and without a BCJ filter to see
+ which is better in each situation.
+
+ Different instruction sets have different alignment: the exe-
+ cutable file must be aligned to a multiple of this value in the
+ input data to make the filter work.
+
+ Filter Alignment Notes
+ x86 1 32-bit or 64-bit x86
+ ARM 4
+ ARM-Thumb 2
+ ARM64 4 4096-byte alignment is best
+ PowerPC 4 Big endian only
+ IA-64 16 Itanium
+ SPARC 4
+
+ Since the BCJ-filtered data is usually compressed with LZMA2,
+ the compression ratio may be improved slightly if the LZMA2 op-
+ tions are set to match the alignment of the selected BCJ filter.
+ For example, with the IA-64 filter, it's good to set pb=4 or
+ even pb=4,lp=4,lc=0 with LZMA2 (2^4=16). The x86 filter is an
+ exception; it's usually good to stick to LZMA2's default four-
+ byte alignment when compressing x86 executables.
+
+ All BCJ filters support the same options:
+
+ start=offset
+ Specify the start offset that is used when converting be-
+ tween relative and absolute addresses. The offset must
+ be a multiple of the alignment of the filter (see the ta-
+ ble above). The default is zero. In practice, the de-
+ fault is good; specifying a custom offset is almost never
+ useful.
+
+ --delta[=options]
+ Add the Delta filter to the filter chain. The Delta filter can
+ be only used as a non-last filter in the filter chain.
+
+ Currently only simple byte-wise delta calculation is supported.
+ It can be useful when compressing, for example, uncompressed
+ bitmap images or uncompressed PCM audio. However, special pur-
+ pose algorithms may give significantly better results than Delta
+ + LZMA2. This is true especially with audio, which compresses
+ faster and better, for example, with flac(1).
+
+ Supported options:
+
+ dist=distance
+ Specify the distance of the delta calculation in bytes.
+ distance must be 1-256. The default is 1.
+
+ For example, with dist=2 and eight-byte input A1 B1 A2 B3
+ A3 B5 A4 B7, the output will be A1 B1 01 02 01 02 01 02.
+
+ Other options
+ -q, --quiet
+ Suppress warnings and notices. Specify this twice to suppress
+ errors too. This option has no effect on the exit status. That
+ is, even if a warning was suppressed, the exit status to indi-
+ cate a warning is still used.
+
+ -v, --verbose
+ Be verbose. If standard error is connected to a terminal, xz
+ will display a progress indicator. Specifying --verbose twice
+ will give even more verbose output.
+
+ The progress indicator shows the following information:
+
+ o Completion percentage is shown if the size of the input file
+ is known. That is, the percentage cannot be shown in pipes.
+
+ o Amount of compressed data produced (compressing) or consumed
+ (decompressing).
+
+ o Amount of uncompressed data consumed (compressing) or pro-
+ duced (decompressing).
+
+ o Compression ratio, which is calculated by dividing the amount
+ of compressed data processed so far by the amount of uncom-
+ pressed data processed so far.
+
+ o Compression or decompression speed. This is measured as the
+ amount of uncompressed data consumed (compression) or pro-
+ duced (decompression) per second. It is shown after a few
+ seconds have passed since xz started processing the file.
+
+ o Elapsed time in the format M:SS or H:MM:SS.
+
+ o Estimated remaining time is shown only when the size of the
+ input file is known and a couple of seconds have already
+ passed since xz started processing the file. The time is
+ shown in a less precise format which never has any colons,
+ for example, 2 min 30 s.
+
+ When standard error is not a terminal, --verbose will make xz
+ print the filename, compressed size, uncompressed size, compres-
+ sion ratio, and possibly also the speed and elapsed time on a
+ single line to standard error after compressing or decompressing
+ the file. The speed and elapsed time are included only when the
+ operation took at least a few seconds. If the operation didn't
+ finish, for example, due to user interruption, also the comple-
+ tion percentage is printed if the size of the input file is
+ known.
+
+ -Q, --no-warn
+ Don't set the exit status to 2 even if a condition worth a warn-
+ ing was detected. This option doesn't affect the verbosity
+ level, thus both --quiet and --no-warn have to be used to not
+ display warnings and to not alter the exit status.
+
+ --robot
+ Print messages in a machine-parsable format. This is intended
+ to ease writing frontends that want to use xz instead of li-
+ blzma, which may be the case with various scripts. The output
+ with this option enabled is meant to be stable across xz re-
+ leases. See the section ROBOT MODE for details.
+
+ --info-memory
+ Display, in human-readable format, how much physical memory
+ (RAM) and how many processor threads xz thinks the system has
+ and the memory usage limits for compression and decompression,
+ and exit successfully.
+
+ -h, --help
+ Display a help message describing the most commonly used op-
+ tions, and exit successfully.
+
+ -H, --long-help
+ Display a help message describing all features of xz, and exit
+ successfully.
+
+ -V, --version
+ Display the version number of xz and liblzma in human-readable
+ format. To get machine-parsable output, specify --robot before
+ --version.
+
+ROBOT MODE
+ The robot mode is activated with the --robot option. It makes the out-
+ put of xz easier to parse by other programs. Currently --robot is sup-
+ ported only together with --version, --info-memory, and --list. It
+ will be supported for compression and decompression in the future.
+
+ Version
+ xz --robot --version will print the version number of xz and liblzma in
+ the following format:
+
+ XZ_VERSION=XYYYZZZS
+ LIBLZMA_VERSION=XYYYZZZS
+
+ X Major version.
+
+ YYY Minor version. Even numbers are stable. Odd numbers are alpha
+ or beta versions.
+
+ ZZZ Patch level for stable releases or just a counter for develop-
+ ment releases.
+
+ S Stability. 0 is alpha, 1 is beta, and 2 is stable. S should be
+ always 2 when YYY is even.
+
+ XYYYZZZS are the same on both lines if xz and liblzma are from the same
+ XZ Utils release.
+
+ Examples: 4.999.9beta is 49990091 and 5.0.0 is 50000002.
+
+ Memory limit information
+ xz --robot --info-memory prints a single line with multiple tab-separated
+ columns:
+
+ 1. Total amount of physical memory (RAM) in bytes.
+
+ 2. Memory usage limit for compression in bytes (--memlimit-compress).
+ A special value of 0 indicates the default setting which for sin-
+ gle-threaded mode is the same as no limit.
+
+ 3. Memory usage limit for decompression in bytes (--memlimit-decom-
+ press). A special value of 0 indicates the default setting which
+ for single-threaded mode is the same as no limit.
+
+ 4. Since xz 5.3.4alpha: Memory usage for multi-threaded decompression
+ in bytes (--memlimit-mt-decompress). This is never zero because a
+ system-specific default value shown in the column 5 is used if no
+ limit has been specified explicitly. This is also never greater
+ than the value in the column 3 even if a larger value has been
+ specified with --memlimit-mt-decompress.
+
+ 5. Since xz 5.3.4alpha: A system-specific default memory usage limit
+ that is used to limit the number of threads when compressing with
+ an automatic number of threads (--threads=0) and no memory usage
+ limit has been specified (--memlimit-compress). This is also used
+ as the default value for --memlimit-mt-decompress.
+
+ 6. Since xz 5.3.4alpha: Number of available processor threads.
+
+ In the future, the output of xz --robot --info-memory may have more
+ columns, but never more than a single line.
+
+ List mode
+ xz --robot --list uses tab-separated output. The first column of every
+ line has a string that indicates the type of the information found on
+ that line:
+
+ name This is always the first line when starting to list a file. The
+ second column on the line is the filename.
+
+ file This line contains overall information about the .xz file. This
+ line is always printed after the name line.
+
+ stream This line type is used only when --verbose was specified. There
+ are as many stream lines as there are streams in the .xz file.
+
+ block This line type is used only when --verbose was specified. There
+ are as many block lines as there are blocks in the .xz file.
+ The block lines are shown after all the stream lines; different
+ line types are not interleaved.
+
+ summary
+ This line type is used only when --verbose was specified twice.
+ This line is printed after all block lines. Like the file line,
+ the summary line contains overall information about the .xz
+ file.
+
+ totals This line is always the very last line of the list output. It
+ shows the total counts and sizes.
+
+ The columns of the file lines:
+ 2. Number of streams in the file
+ 3. Total number of blocks in the stream(s)
+ 4. Compressed size of the file
+ 5. Uncompressed size of the file
+ 6. Compression ratio, for example, 0.123. If ratio is over
+ 9.999, three dashes (---) are displayed instead of the ra-
+ tio.
+ 7. Comma-separated list of integrity check names. The follow-
+ ing strings are used for the known check types: None, CRC32,
+ CRC64, and SHA-256. For unknown check types, Unknown-N is
+ used, where N is the Check ID as a decimal number (one or
+ two digits).
+ 8. Total size of stream padding in the file
+
+ The columns of the stream lines:
+ 2. Stream number (the first stream is 1)
+ 3. Number of blocks in the stream
+ 4. Compressed start offset
+ 5. Uncompressed start offset
+ 6. Compressed size (does not include stream padding)
+ 7. Uncompressed size
+ 8. Compression ratio
+ 9. Name of the integrity check
+ 10. Size of stream padding
+
+ The columns of the block lines:
+ 2. Number of the stream containing this block
+ 3. Block number relative to the beginning of the stream (the
+ first block is 1)
+ 4. Block number relative to the beginning of the file
+ 5. Compressed start offset relative to the beginning of the
+ file
+ 6. Uncompressed start offset relative to the beginning of the
+ file
+ 7. Total compressed size of the block (includes headers)
+ 8. Uncompressed size
+ 9. Compression ratio
+ 10. Name of the integrity check
+
+ If --verbose was specified twice, additional columns are included on
+ the block lines. These are not displayed with a single --verbose, be-
+ cause getting this information requires many seeks and can thus be
+ slow:
+ 11. Value of the integrity check in hexadecimal
+ 12. Block header size
+ 13. Block flags: c indicates that compressed size is present,
+ and u indicates that uncompressed size is present. If the
+ flag is not set, a dash (-) is shown instead to keep the
+ string length fixed. New flags may be added to the end of
+ the string in the future.
+ 14. Size of the actual compressed data in the block (this ex-
+ cludes the block header, block padding, and check fields)
+ 15. Amount of memory (in bytes) required to decompress this
+ block with this xz version
+ 16. Filter chain. Note that most of the options used at com-
+ pression time cannot be known, because only the options that
+ are needed for decompression are stored in the .xz headers.
+
+ The columns of the summary lines:
+ 2. Amount of memory (in bytes) required to decompress this file
+ with this xz version
+ 3. yes or no indicating if all block headers have both com-
+ pressed size and uncompressed size stored in them
+ Since xz 5.1.2alpha:
+ 4. Minimum xz version required to decompress the file
+
+ The columns of the totals line:
+ 2. Number of streams
+ 3. Number of blocks
+ 4. Compressed size
+ 5. Uncompressed size
+ 6. Average compression ratio
+ 7. Comma-separated list of integrity check names that were
+ present in the files
+ 8. Stream padding size
+ 9. Number of files. This is here to keep the order of the ear-
+ lier columns the same as on file lines.
+
+ If --verbose was specified twice, additional columns are included on
+ the totals line:
+ 10. Maximum amount of memory (in bytes) required to decompress
+ the files with this xz version
+ 11. yes or no indicating if all block headers have both com-
+ pressed size and uncompressed size stored in them
+ Since xz 5.1.2alpha:
+ 12. Minimum xz version required to decompress the file
+
+ Future versions may add new line types and new columns can be added to
+ the existing line types, but the existing columns won't be changed.
+
+EXIT STATUS
+ 0 All is good.
+
+ 1 An error occurred.
+
+ 2 Something worth a warning occurred, but no actual errors oc-
+ curred.
+
+ Notices (not warnings or errors) printed on standard error don't affect
+ the exit status.
+
+ENVIRONMENT
+ xz parses space-separated lists of options from the environment vari-
+ ables XZ_DEFAULTS and XZ_OPT, in this order, before parsing the options
+ from the command line. Note that only options are parsed from the en-
+ vironment variables; all non-options are silently ignored. Parsing is
+ done with getopt_long(3) which is used also for the command line argu-
+ ments.
+
+ XZ_DEFAULTS
+ User-specific or system-wide default options. Typically this is
+ set in a shell initialization script to enable xz's memory usage
+ limiter by default. Excluding shell initialization scripts and
+ similar special cases, scripts must never set or unset XZ_DE-
+ FAULTS.
+
+ XZ_OPT This is for passing options to xz when it is not possible to set
+ the options directly on the xz command line. This is the case
+ when xz is run by a script or tool, for example, GNU tar(1):
+
+ XZ_OPT=-2v tar caf foo.tar.xz foo
+
+ Scripts may use XZ_OPT, for example, to set script-specific de-
+ fault compression options. It is still recommended to allow
+ users to override XZ_OPT if that is reasonable. For example, in
+ sh(1) scripts one may use something like this:
+
+ XZ_OPT=${XZ_OPT-"-7e"}
+ export XZ_OPT
+
+LZMA UTILS COMPATIBILITY
+ The command line syntax of xz is practically a superset of lzma, un-
+ lzma, and lzcat as found in LZMA Utils 4.32.x. In most cases, it is
+ possible to replace LZMA Utils with XZ Utils without breaking existing
+ scripts. There are some incompatibilities though, which may sometimes
+ cause problems.
+
+ Compression preset levels
+ The numbering of the compression level presets is not identical in xz
+ and LZMA Utils. The most important difference is how dictionary sizes
+ are mapped to different presets. Dictionary size is roughly equal to
+ the decompressor memory usage.
+
+ Level xz LZMA Utils
+ -0 256 KiB N/A
+ -1 1 MiB 64 KiB
+ -2 2 MiB 1 MiB
+ -3 4 MiB 512 KiB
+ -4 4 MiB 1 MiB
+ -5 8 MiB 2 MiB
+ -6 8 MiB 4 MiB
+ -7 16 MiB 8 MiB
+ -8 32 MiB 16 MiB
+ -9 64 MiB 32 MiB
+
+ The dictionary size differences affect the compressor memory usage too,
+ but there are some other differences between LZMA Utils and XZ Utils,
+ which make the difference even bigger:
+
+ Level xz LZMA Utils 4.32.x
+ -0 3 MiB N/A
+ -1 9 MiB 2 MiB
+ -2 17 MiB 12 MiB
+ -3 32 MiB 12 MiB
+ -4 48 MiB 16 MiB
+ -5 94 MiB 26 MiB
+ -6 94 MiB 45 MiB
+ -7 186 MiB 83 MiB
+ -8 370 MiB 159 MiB
+ -9 674 MiB 311 MiB
+
+ The default preset level in LZMA Utils is -7 while in XZ Utils it is
+ -6, so both use an 8 MiB dictionary by default.
+
+ Streamed vs. non-streamed .lzma files
+ The uncompressed size of the file can be stored in the .lzma header.
+ LZMA Utils does that when compressing regular files. The alternative
+ is to mark that uncompressed size is unknown and use end-of-payload
+ marker to indicate where the decompressor should stop. LZMA Utils uses
+ this method when uncompressed size isn't known, which is the case, for
+ example, in pipes.
+
+ xz supports decompressing .lzma files with or without end-of-payload
+ marker, but all .lzma files created by xz will use end-of-payload
+ marker and have uncompressed size marked as unknown in the .lzma
+ header. This may be a problem in some uncommon situations. For exam-
+ ple, a .lzma decompressor in an embedded device might work only with
+ files that have known uncompressed size. If you hit this problem, you
+ need to use LZMA Utils or LZMA SDK to create .lzma files with known un-
+ compressed size.
+
+ Unsupported .lzma files
+ The .lzma format allows lc values up to 8, and lp values up to 4. LZMA
+ Utils can decompress files with any lc and lp, but always creates files
+ with lc=3 and lp=0. Creating files with other lc and lp is possible
+ with xz and with LZMA SDK.
+
+ The implementation of the LZMA1 filter in liblzma requires that the sum
+ of lc and lp must not exceed 4. Thus, .lzma files that exceed this
+ limitation cannot be decompressed with xz.
+
+ LZMA Utils creates only .lzma files which have a dictionary size of 2^n
+ (a power of 2) but accepts files with any dictionary size. liblzma ac-
+ cepts only .lzma files which have a dictionary size of 2^n or 2^n +
+ 2^(n-1). This is to decrease false positives when detecting .lzma
+ files.
+
+ These limitations shouldn't be a problem in practice, since practically
+ all .lzma files have been compressed with settings that liblzma will
+ accept.
+
+ Trailing garbage
+ When decompressing, LZMA Utils silently ignore everything after the
+ first .lzma stream. In most situations, this is a bug. This also
+ means that LZMA Utils don't support decompressing concatenated .lzma
+ files.
+
+ If there is data left after the first .lzma stream, xz considers the
+ file to be corrupt unless --single-stream was used. This may break ob-
+ scure scripts which have assumed that trailing garbage is ignored.
+
+NOTES
+ Compressed output may vary
+ The exact compressed output produced from the same uncompressed input
+ file may vary between XZ Utils versions even if compression options are
+ identical. This is because the encoder can be improved (faster or bet-
+ ter compression) without affecting the file format. The output can
+ vary even between different builds of the same XZ Utils version, if
+ different build options are used.
+
+ The above means that once --rsyncable has been implemented, the result-
+ ing files won't necessarily be rsyncable unless both old and new files
+ have been compressed with the same xz version. This problem can be
+ fixed if a part of the encoder implementation is frozen to keep rsynca-
+ ble output stable across xz versions.
+
+ Embedded .xz decompressors
+ Embedded .xz decompressor implementations like XZ Embedded don't neces-
+ sarily support files created with integrity check types other than none
+ and crc32. Since the default is --check=crc64, you must use
+ --check=none or --check=crc32 when creating files for embedded systems.
+
+ Outside embedded systems, all .xz format decompressors support all the
+ check types, or at least are able to decompress the file without veri-
+ fying the integrity check if the particular check is not supported.
+
+ XZ Embedded supports BCJ filters, but only with the default start off-
+ set.
+
+EXAMPLES
+ Basics
+ Compress the file foo into foo.xz using the default compression level
+ (-6), and remove foo if compression is successful:
+
+ xz foo
+
+ Decompress bar.xz into bar and don't remove bar.xz even if decompres-
+ sion is successful:
+
+ xz -dk bar.xz
+
+ Create baz.tar.xz with the preset -4e (-4 --extreme), which is slower
+ than the default -6, but needs less memory for compression and decom-
+ pression (48 MiB and 5 MiB, respectively):
+
+ tar cf - baz | xz -4e > baz.tar.xz
+
+ A mix of compressed and uncompressed files can be decompressed to stan-
+ dard output with a single command:
+
+ xz -dcf a.txt b.txt.xz c.txt d.txt.lzma > abcd.txt
+
+ Parallel compression of many files
+ On GNU and *BSD, find(1) and xargs(1) can be used to parallelize com-
+ pression of many files:
+
+ find . -type f \! -name '*.xz' -print0 \
+ | xargs -0r -P4 -n16 xz -T1
+
+ The -P option to xargs(1) sets the number of parallel xz processes.
+ The best value for the -n option depends on how many files there are to
+ be compressed. If there are only a couple of files, the value should
+ probably be 1; with tens of thousands of files, 100 or even more may be
+ appropriate to reduce the number of xz processes that xargs(1) will
+ eventually create.
+
+ The option -T1 for xz is there to force it to single-threaded mode, be-
+ cause xargs(1) is used to control the amount of parallelization.
+
+ Robot mode
+ Calculate how many bytes have been saved in total after compressing
+ multiple files:
+
+ xz --robot --list *.xz | awk '/^totals/{print $5-$4}'
+
+ A script may want to know that it is using new enough xz. The follow-
+ ing sh(1) script checks that the version number of the xz tool is at
+ least 5.0.0. This method is compatible with old beta versions, which
+ didn't support the --robot option:
+
+ if ! eval "$(xz --robot --version 2> /dev/null)" ||
+ [ "$XZ_VERSION" -lt 50000002 ]; then
+ echo "Your xz is too old."
+ fi
+ unset XZ_VERSION LIBLZMA_VERSION
+
+ Set a memory usage limit for decompression using XZ_OPT, but if a limit
+ has already been set, don't increase it:
+
+ NEWLIM=$((123 << 20)) # 123 MiB
+ OLDLIM=$(xz --robot --info-memory | cut -f3)
+ if [ $OLDLIM -eq 0 -o $OLDLIM -gt $NEWLIM ]; then
+ XZ_OPT="$XZ_OPT --memlimit-decompress=$NEWLIM"
+ export XZ_OPT
+ fi
+
+ Custom compressor filter chains
+ The simplest use for custom filter chains is customizing a LZMA2 pre-
+ set. This can be useful, because the presets cover only a subset of
+ the potentially useful combinations of compression settings.
+
+ The CompCPU columns of the tables from the descriptions of the options
+ -0 ... -9 and --extreme are useful when customizing LZMA2 presets.
+ Here are the relevant parts collected from those two tables:
+
+ Preset CompCPU
+ -0 0
+
+ -1 1
+ -2 2
+ -3 3
+ -4 4
+ -5 5
+ -6 6
+ -5e 7
+ -6e 8
+
+ If you know that a file requires a somewhat big dictionary (for example,
+ 32 MiB) to compress well, but you want to compress it quicker than xz
+ -8 would do, a preset with a low CompCPU value (for example, 1) can be
+ modified to use a bigger dictionary:
+
+ xz --lzma2=preset=1,dict=32MiB foo.tar
+
+ With certain files, the above command may be faster than xz -6 while
+ compressing significantly better. However, it must be emphasized that
+ only some files benefit from a big dictionary while keeping the CompCPU
+ value low. The most obvious situation, where a big dictionary can help
+ a lot, is an archive containing very similar files of at least a few
+ megabytes each. The dictionary size has to be significantly bigger
+ than any individual file to allow LZMA2 to take full advantage of the
+ similarities between consecutive files.
+
+ If very high compressor and decompressor memory usage is fine, and the
+ file being compressed is at least several hundred megabytes, it may be
+ useful to use an even bigger dictionary than the 64 MiB that xz -9
+ would use:
+
+ xz -vv --lzma2=dict=192MiB big_foo.tar
+
+ Using -vv (--verbose --verbose) like in the above example can be useful
+ to see the memory requirements of the compressor and decompressor. Re-
+ member that using a dictionary bigger than the size of the uncompressed
+ file is a waste of memory, so the above command isn't useful for small
+ files.
+
+ Sometimes the compression time doesn't matter, but the decompressor
+ memory usage has to be kept low, for example, to make it possible to
+ decompress the file on an embedded system. The following command uses
+ -6e (-6 --extreme) as a base and sets the dictionary to only 64 KiB.
+ The resulting file can be decompressed with XZ Embedded (that's why
+ there is --check=crc32) using about 100 KiB of memory.
+
+ xz --check=crc32 --lzma2=preset=6e,dict=64KiB foo
+
+ If you want to squeeze out as many bytes as possible, adjusting the
+ number of literal context bits (lc) and number of position bits (pb)
+ can sometimes help. Adjusting the number of literal position bits (lp)
+ might help too, but usually lc and pb are more important. For example,
+ a source code archive contains mostly US-ASCII text, so something like
+ the following might give a slightly (like 0.1 %) smaller file than xz -6e
+ (try also without lc=4):
+
+ xz --lzma2=preset=6e,pb=0,lc=4 source_code.tar
+
+ Using another filter together with LZMA2 can improve compression with
+ certain file types. For example, to compress an x86-32 or x86-64 shared
+ library using the x86 BCJ filter:
+
+ xz --x86 --lzma2 libfoo.so
+
+ Note that the order of the filter options is significant. If --x86 is
+ specified after --lzma2, xz will give an error, because there cannot be
+ any filter after LZMA2, and also because the x86 BCJ filter cannot be
+ used as the last filter in the chain.
+
+ The Delta filter together with LZMA2 can give good results with bitmap
+ images. It should usually beat PNG, which has a few more advanced fil-
+ ters than simple delta but uses Deflate for the actual compression.
+
+ The image has to be saved in uncompressed format, for example, as un-
+ compressed TIFF. The distance parameter of the Delta filter is set to
+ match the number of bytes per pixel in the image. For example, a 24-bit
+ RGB bitmap needs dist=3, and it is also good to pass pb=0 to LZMA2 to
+ accommodate the three-byte alignment:
+
+ xz --delta=dist=3 --lzma2=pb=0 foo.tiff
+
+ If multiple images have been put into a single archive (for example,
+ .tar), the Delta filter will work on that too as long as all images
+ have the same number of bytes per pixel.
+
+SEE ALSO
+ xzdec(1), xzdiff(1), xzgrep(1), xzless(1), xzmore(1), gzip(1),
+ bzip2(1), 7z(1)
+
+ XZ Utils: <https://tukaani.org/xz/>
+ XZ Embedded: <https://tukaani.org/xz/embedded.html>
+ LZMA SDK: <http://7-zip.org/sdk.html>
+
+
+
+Tukaani 2022-12-01 XZ(1)
diff --git a/doc/man/txt/xzdec.txt b/doc/man/txt/xzdec.txt
new file mode 100644
index 0000000..a914e20
--- /dev/null
+++ b/doc/man/txt/xzdec.txt
@@ -0,0 +1,80 @@
+XZDEC(1) XZ Utils XZDEC(1)
+
+
+
+NAME
+ xzdec, lzmadec - Small .xz and .lzma decompressors
+
+SYNOPSIS
+ xzdec [option...] [file...]
+ lzmadec [option...] [file...]
+
+DESCRIPTION
+ xzdec is a liblzma-based decompression-only tool for .xz (and only .xz)
+ files. xzdec is intended to work as a drop-in replacement for xz(1) in
+ the most common situations where a script has been written to use xz
+ --decompress --stdout (and possibly a few other commonly used options)
+ to decompress .xz files. lzmadec is identical to xzdec except that lz-
+ madec supports .lzma files instead of .xz files.
+
+ To reduce the size of the executable, xzdec doesn't support multi-
+ threading or localization, and doesn't read options from XZ_DEFAULTS
+ and XZ_OPT environment variables. xzdec doesn't support displaying in-
+ termediate progress information: sending SIGINFO to xzdec does nothing,
+ but sending SIGUSR1 terminates the process instead of displaying
+ progress information.
+
+OPTIONS
+ -d, --decompress, --uncompress
+ Ignored for xz(1) compatibility. xzdec supports only decompres-
+ sion.
+
+ -k, --keep
+ Ignored for xz(1) compatibility. xzdec never creates or removes
+ any files.
+
+ -c, --stdout, --to-stdout
+ Ignored for xz(1) compatibility. xzdec always writes the decom-
+ pressed data to standard output.
+
+ -q, --quiet
+ Specifying this once does nothing since xzdec never displays any
+ warnings or notices. Specify this twice to suppress errors.
+
+ -Q, --no-warn
+ Ignored for xz(1) compatibility. xzdec never uses the exit sta-
+ tus 2.
+
+ -h, --help
+ Display a help message and exit successfully.
+
+ -V, --version
+ Display the version number of xzdec and liblzma.
+
+EXIT STATUS
+ 0 All was good.
+
+ 1 An error occurred.
+
+ xzdec doesn't have any warning messages like xz(1) has, thus the exit
+ status 2 is not used by xzdec.
+
+NOTES
+ Use xz(1) instead of xzdec or lzmadec for normal everyday use. xzdec
+ or lzmadec are meant only for situations where it is important to have
+ a smaller decompressor than the full-featured xz(1).
+
+ xzdec and lzmadec are not really that small. The size can be reduced
+ further by dropping features from liblzma at compile time, but that
+ shouldn't usually be done for executables distributed in typical non-
+ embedded operating system distributions. If you need a truly small .xz
+ decompressor, consider using XZ Embedded.
+
+SEE ALSO
+ xz(1)
+
+ XZ Embedded: <https://tukaani.org/xz/embedded.html>
+
+
+
+Tukaani 2017-04-19 XZDEC(1)
diff --git a/doc/man/txt/xzdiff.txt b/doc/man/txt/xzdiff.txt
new file mode 100644
index 0000000..681b00c
--- /dev/null
+++ b/doc/man/txt/xzdiff.txt
@@ -0,0 +1,37 @@
+XZDIFF(1) XZ Utils XZDIFF(1)
+
+
+
+NAME
+ xzcmp, xzdiff, lzcmp, lzdiff - compare compressed files
+
+SYNOPSIS
+ xzcmp [cmp_options] file1 [file2]
+ xzdiff [diff_options] file1 [file2]
+ lzcmp [cmp_options] file1 [file2]
+ lzdiff [diff_options] file1 [file2]
+
+DESCRIPTION
+ xzcmp and xzdiff invoke cmp(1) or diff(1) on files compressed with
+ xz(1), lzma(1), gzip(1), bzip2(1), lzop(1), or zstd(1). All options
+ specified are passed directly to cmp(1) or diff(1). If only one file
+ is specified, then the files compared are file1 (which must have a suf-
+ fix of a supported compression format) and file1 from which the com-
+ pression format suffix has been stripped. If two files are specified,
+ then they are uncompressed if necessary and fed to cmp(1) or diff(1).
+ The exit status from cmp(1) or diff(1) is preserved unless a decompres-
+ sion error occurs; then exit status is 2.
+
+ The names lzcmp and lzdiff are provided for backward compatibility with
+ LZMA Utils.
+
+SEE ALSO
+ cmp(1), diff(1), xz(1), gzip(1), bzip2(1), lzop(1), zstd(1), zdiff(1)
+
+BUGS
+ Messages from the cmp(1) or diff(1) programs refer to temporary file-
+ names instead of those specified.
+
+
+
+Tukaani 2021-06-04 XZDIFF(1)
diff --git a/doc/man/txt/xzgrep.txt b/doc/man/txt/xzgrep.txt
new file mode 100644
index 0000000..596520c
--- /dev/null
+++ b/doc/man/txt/xzgrep.txt
@@ -0,0 +1,49 @@
+XZGREP(1) XZ Utils XZGREP(1)
+
+
+
+NAME
+ xzgrep - search compressed files for a regular expression
+
+SYNOPSIS
+ xzgrep [grep_options] [-e] pattern [file...]
+ xzegrep ...
+ xzfgrep ...
+ lzgrep ...
+ lzegrep ...
+ lzfgrep ...
+
+DESCRIPTION
+ xzgrep invokes grep(1) on files which may be either uncompressed or
+ compressed with xz(1), lzma(1), gzip(1), bzip2(1), lzop(1), or zstd(1).
+ All options specified are passed directly to grep(1).
+
+ If no file is specified, then standard input is decompressed if neces-
+ sary and fed to grep(1). When reading from standard input, gzip(1),
+ bzip2(1), lzop(1), and zstd(1) compressed files are not supported.
+
+ If xzgrep is invoked as xzegrep or xzfgrep then grep -E or grep -F is
+ used instead of grep(1). The same applies to names lzgrep, lzegrep,
+ and lzfgrep, which are provided for backward compatibility with LZMA
+ Utils.
+
+EXIT STATUS
+ 0 At least one match was found from at least one of the input
+ files. No errors occurred.
+
+ 1 No matches were found from any of the input files. No errors
+ occurred.
+
+ >1 One or more errors occurred. It is unknown if matches were
+ found.
+
+ENVIRONMENT
+ GREP If the GREP environment variable is set, xzgrep uses it instead
+ of grep(1), grep -E, or grep -F.
+
+SEE ALSO
+ grep(1), xz(1), gzip(1), bzip2(1), lzop(1), zstd(1), zgrep(1)
+
+
+
+Tukaani 2022-07-19 XZGREP(1)
diff --git a/doc/man/txt/xzless.txt b/doc/man/txt/xzless.txt
new file mode 100644
index 0000000..5c14c80
--- /dev/null
+++ b/doc/man/txt/xzless.txt
@@ -0,0 +1,39 @@
+XZLESS(1) XZ Utils XZLESS(1)
+
+
+
+NAME
+ xzless, lzless - view xz or lzma compressed (text) files
+
+SYNOPSIS
+ xzless [file...]
+ lzless [file...]
+
+DESCRIPTION
+ xzless is a filter that displays text from compressed files to a termi-
+ nal. It works on files compressed with xz(1) or lzma(1). If no files
+ are given, xzless reads from standard input.
+
+ xzless uses less(1) to present its output. Unlike xzmore, its choice
+ of pager cannot be altered by setting an environment variable. Com-
+ mands are based on both more(1) and vi(1) and allow back and forth
+ movement and searching. See the less(1) manual for more information.
+
+ The command named lzless is provided for backward compatibility with
+ LZMA Utils.
+
+ENVIRONMENT
+ LESSMETACHARS
+ A list of characters special to the shell. Set by xzless unless
+ it is already set in the environment.
+
+ LESSOPEN
+ Set to a command line to invoke the xz(1) decompressor for pre-
+ processing the input files to less(1).
+
+SEE ALSO
+ less(1), xz(1), xzmore(1), zless(1)
+
+
+
+Tukaani 2010-09-27 XZLESS(1)
diff --git a/doc/man/txt/xzmore.txt b/doc/man/txt/xzmore.txt
new file mode 100644
index 0000000..5a9d86c
--- /dev/null
+++ b/doc/man/txt/xzmore.txt
@@ -0,0 +1,34 @@
+XZMORE(1) XZ Utils XZMORE(1)
+
+
+
+NAME
+ xzmore, lzmore - view xz or lzma compressed (text) files
+
+SYNOPSIS
+ xzmore [file...]
+ lzmore [file...]
+
+DESCRIPTION
+ xzmore is a filter which allows examination of xz(1) or lzma(1) com-
+ pressed text files one screenful at a time on a soft-copy terminal.
+
+ To use a pager other than the default more, set environment variable
+ PAGER to the name of the desired program. The name lzmore is provided
+ for backward compatibility with LZMA Utils.
+
+ e or q When the prompt --More--(Next file: file) is printed, this com-
+ mand causes xzmore to exit.
+
+ s When the prompt --More--(Next file: file) is printed, this com-
+ mand causes xzmore to skip the next file and continue.
+
+ For a list of keyboard commands supported while actually viewing the con-
+ tent of a file, refer to the manual of the pager you use, usually more(1).
+
+SEE ALSO
+ more(1), xz(1), xzless(1), zmore(1)
+
+
+
+Tukaani 2013-06-30 XZMORE(1)
diff --git a/doc/xz-file-format.txt b/doc/xz-file-format.txt
new file mode 100644
index 0000000..09c83e0
--- /dev/null
+++ b/doc/xz-file-format.txt
@@ -0,0 +1,1165 @@
+
+The .xz File Format
+===================
+
+Version 1.1.0 (2022-12-11)
+
+
+ 0. Preface
+ 0.1. Notices and Acknowledgements
+ 0.2. Getting the Latest Version
+ 0.3. Version History
+ 1. Conventions
+ 1.1. Byte and Its Representation
+ 1.2. Multibyte Integers
+ 2. Overall Structure of .xz File
+ 2.1. Stream
+ 2.1.1. Stream Header
+ 2.1.1.1. Header Magic Bytes
+ 2.1.1.2. Stream Flags
+ 2.1.1.3. CRC32
+ 2.1.2. Stream Footer
+ 2.1.2.1. CRC32
+ 2.1.2.2. Backward Size
+ 2.1.2.3. Stream Flags
+ 2.1.2.4. Footer Magic Bytes
+ 2.2. Stream Padding
+ 3. Block
+ 3.1. Block Header
+ 3.1.1. Block Header Size
+ 3.1.2. Block Flags
+ 3.1.3. Compressed Size
+ 3.1.4. Uncompressed Size
+ 3.1.5. List of Filter Flags
+ 3.1.6. Header Padding
+ 3.1.7. CRC32
+ 3.2. Compressed Data
+ 3.3. Block Padding
+ 3.4. Check
+ 4. Index
+ 4.1. Index Indicator
+ 4.2. Number of Records
+ 4.3. List of Records
+ 4.3.1. Unpadded Size
+ 4.3.2. Uncompressed Size
+ 4.4. Index Padding
+ 4.5. CRC32
+ 5. Filter Chains
+ 5.1. Alignment
+ 5.2. Security
+ 5.3. Filters
+ 5.3.1. LZMA2
+ 5.3.2. Branch/Call/Jump Filters for Executables
+ 5.3.3. Delta
+ 5.3.3.1. Format of the Encoded Output
+ 5.4. Custom Filter IDs
+ 5.4.1. Reserved Custom Filter ID Ranges
+ 6. Cyclic Redundancy Checks
+ 7. References
+
+
+0. Preface
+
+ This document describes the .xz file format (filename suffix
+ ".xz", MIME type "application/x-xz"). It is intended that
+ this format replace the old .lzma format used by LZMA SDK and
+ LZMA Utils.
+
+
+0.1. Notices and Acknowledgements
+
+ This file format was designed by Lasse Collin
+ <lasse.collin@tukaani.org> and Igor Pavlov.
+
+ Special thanks for helping with this document goes to
+ Ville Koskinen. Thanks for helping with this document goes to
+ Mark Adler, H. Peter Anvin, Mikko Pouru, and Lars Wirzenius.
+
+ This document has been put into the public domain.
+
+
+0.2. Getting the Latest Version
+
+ The latest official version of this document can be downloaded
+ from <http://tukaani.org/xz/xz-file-format.txt>.
+
+ Specific versions of this document have a filename
+ xz-file-format-X.Y.Z.txt where X.Y.Z is the version number.
+ For example, the version 1.0.0 of this document is available
+ at <http://tukaani.org/xz/xz-file-format-1.0.0.txt>.
+
+
+0.3. Version History
+
+ Version Date Description
+
+ 1.1.0 2022-12-11 Added ARM64 filter and clarified 32-bit
+ ARM endianness in Section 5.3.2,
+ language improvements in Section 5.4
+
+ 1.0.4 2009-08-27 Language improvements in Sections 1.2,
+ 2.1.1.2, 3.1.1, 3.1.2, and 5.3.1
+
+ 1.0.3 2009-06-05 Spelling fixes in Sections 5.1 and 5.4
+
+ 1.0.2 2009-06-04 Typo fixes in Sections 4 and 5.3.1
+
+ 1.0.1 2009-06-01 Typo fix in Section 0.3 and minor
+ clarifications to Sections 2, 2.2,
+ 3.3, 4.4, and 5.3.2
+
+ 1.0.0 2009-01-14 The first official version
+
+
+1. Conventions
+
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHOULD",
+ "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+ document are to be interpreted as described in [RFC-2119].
+
+ Indicating a warning means displaying a message, returning
+ appropriate exit status, or doing something else to let the
+ user know that something worth warning occurred. The operation
+ SHOULD still finish if a warning is indicated.
+
+ Indicating an error means displaying a message, returning
+ appropriate exit status, or doing something else to let the
+ user know that something prevented successfully finishing the
+ operation. The operation MUST be aborted once an error has
+ been indicated.
+
+
+1.1. Byte and Its Representation
+
+ In this document, byte is always 8 bits.
+
+ A "null byte" has all bits unset. That is, the value of a null
+ byte is 0x00.
+
+ To represent byte blocks, this document uses notation that
+ is similar to the notation used in [RFC-1952]:
+
+ +-------+
+ | Foo | One byte.
+ +-------+
+
+ +---+---+
+ | Foo | Two bytes; that is, some of the vertical bars
+ +---+---+ can be missing.
+
+ +=======+
+ | Foo | Zero or more bytes.
+ +=======+
+
+ In this document, a boxed byte or a byte sequence declared
+ using this notation is called "a field". The example field
+ above would be called "the Foo field" or plain "Foo".
+
+ If there are many fields, they may be split to multiple lines.
+ This is indicated with an arrow ("--->"):
+
+ +=====+
+ | Foo |
+ +=====+
+
+ +=====+
+ ---> | Bar |
+ +=====+
+
+ The above is equivalent to this:
+
+ +=====+=====+
+ | Foo | Bar |
+ +=====+=====+
+
+
+1.2. Multibyte Integers
+
+ Multibyte integers of static length, such as CRC values,
+ are stored in little endian byte order (least significant
+ byte first).
+
+ When smaller values are more likely than bigger values (for
+ example file sizes), multibyte integers are encoded in a
+ variable-length representation:
+ - Numbers in the range [0, 127] are copied as is, and take
+ one byte of space.
+ - Bigger numbers will occupy two or more bytes. All but the
+ last byte of the multibyte representation have the highest
+ (eighth) bit set.
+
+ For now, the value of the variable-length integers is limited
+ to 63 bits, which limits the encoded size of the integer to
+ nine bytes. These limits may be increased in the future if
+ needed.
+
+ The following C code illustrates encoding and decoding of
+ variable-length integers. The functions return the number of
+ bytes occupied by the integer (1-9), or zero on error.
+
+ #include <stddef.h>
+ #include <inttypes.h>
+
+ size_t
+ encode(uint8_t buf[static 9], uint64_t num)
+ {
+ if (num > UINT64_MAX / 2)
+ return 0;
+
+ size_t i = 0;
+
+ while (num >= 0x80) {
+ buf[i++] = (uint8_t)(num) | 0x80;
+ num >>= 7;
+ }
+
+ buf[i++] = (uint8_t)(num);
+
+ return i;
+ }
+
+ size_t
+ decode(const uint8_t buf[], size_t size_max, uint64_t *num)
+ {
+ if (size_max == 0)
+ return 0;
+
+ if (size_max > 9)
+ size_max = 9;
+
+ *num = buf[0] & 0x7F;
+ size_t i = 0;
+
+ while (buf[i++] & 0x80) {
+ if (i >= size_max || buf[i] == 0x00)
+ return 0;
+
+ *num |= (uint64_t)(buf[i] & 0x7F) << (i * 7);
+ }
+
+ return i;
+ }
+
+
+2. Overall Structure of .xz File
+
+ A standalone .xz file consists of one or more Streams which may
+ have Stream Padding between or after them:
+
+ +========+================+========+================+
+ | Stream | Stream Padding | Stream | Stream Padding | ...
+ +========+================+========+================+
+
+ The sizes of Stream and Stream Padding are always multiples
+ of four bytes, thus the size of every valid .xz file MUST be
+ a multiple of four bytes.
+
+ While a typical file contains only one Stream and no Stream
+ Padding, a decoder handling standalone .xz files SHOULD support
+ files that have more than one Stream or Stream Padding.
+
+ In contrast to standalone .xz files, when the .xz file format
+ is used as an internal part of some other file format or
+ communication protocol, it usually is expected that the decoder
+ stops after the first Stream, and doesn't look for Stream
+ Padding or possibly other Streams.
+
+
+2.1. Stream
+
+ +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
+ | Stream Header | Block | Block | ... | Block |
+ +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
+
+ +=======+-+-+-+-+-+-+-+-+-+-+-+-+
+ ---> | Index | Stream Footer |
+ +=======+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ All the above fields have a size that is a multiple of four. If
+ Stream is used as an internal part of another file format, it
+ is RECOMMENDED to make the Stream start at an offset that is
+ a multiple of four bytes.
+
+ Stream Header, Index, and Stream Footer are always present in
+ a Stream. The maximum size of the Index field is 16 GiB (2^34).
+
+ There are zero or more Blocks. The maximum number of Blocks is
+ limited only by the maximum size of the Index field.
+
+ Total size of a Stream MUST be less than 8 EiB (2^63 bytes).
+ The same limit applies to the total amount of uncompressed
+ data stored in a Stream.
+
+ If an implementation supports handling .xz files with multiple
+ concatenated Streams, it MAY apply the above limits to the file
+ as a whole instead of applying them on a per-Stream basis.
+
+
+2.1.1. Stream Header
+
+ +---+---+---+---+---+---+-------+------+--+--+--+--+
+ | Header Magic Bytes | Stream Flags | CRC32 |
+ +---+---+---+---+---+---+-------+------+--+--+--+--+
+
+
+2.1.1.1. Header Magic Bytes
+
+ The first six (6) bytes of the Stream are so-called Header
+ Magic Bytes. They can be used to identify the file type.
+
+ Using a C array and ASCII:
+ const uint8_t HEADER_MAGIC[6]
+ = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+
+ In plain hexadecimal:
+ FD 37 7A 58 5A 00
+
+ Notes:
+ - The first byte (0xFD) was chosen so that the files cannot
+ be erroneously detected as being in .lzma format, in which
+ the first byte is in the range [0x00, 0xE0].
+ - The sixth byte (0x00) was chosen to prevent applications
+ from misdetecting the file as a text file.
+
+ If the Header Magic Bytes don't match, the decoder MUST
+ indicate an error.
+
+
+2.1.1.2. Stream Flags
+
+ The first byte of Stream Flags is always a null byte. In the
+ future, this byte may be used to indicate a new Stream version
+ or other Stream properties.
+
+ The second byte of Stream Flags is a bit field:
+
+ Bit(s) Mask Description
+ 0-3 0x0F Type of Check (see Section 3.4):
+ ID Size Check name
+ 0x00 0 bytes None
+ 0x01 4 bytes CRC32
+ 0x02 4 bytes (Reserved)
+ 0x03 4 bytes (Reserved)
+ 0x04 8 bytes CRC64
+ 0x05 8 bytes (Reserved)
+ 0x06 8 bytes (Reserved)
+ 0x07 16 bytes (Reserved)
+ 0x08 16 bytes (Reserved)
+ 0x09 16 bytes (Reserved)
+ 0x0A 32 bytes SHA-256
+ 0x0B 32 bytes (Reserved)
+ 0x0C 32 bytes (Reserved)
+ 0x0D 64 bytes (Reserved)
+ 0x0E 64 bytes (Reserved)
+ 0x0F 64 bytes (Reserved)
+ 4-7 0xF0 Reserved for future use; MUST be zero for now.
+
+ Implementations SHOULD support at least the Check IDs 0x00
+ (None) and 0x01 (CRC32). Supporting other Check IDs is
+ OPTIONAL. If an unsupported Check is used, the decoder SHOULD
+ indicate a warning or error.
+
+ If any reserved bit is set, the decoder MUST indicate an error.
+ It is possible that there is a new field present which the
+ decoder is not aware of, and can thus parse the Stream Header
+ incorrectly.
+
+
+2.1.1.3. CRC32
+
+ The CRC32 is calculated from the Stream Flags field. It is
+ stored as an unsigned 32-bit little endian integer. If the
+ calculated value does not match the stored one, the decoder
+ MUST indicate an error.
+
+ The idea is that Stream Flags would always be two bytes, even
+ if new features are needed. This way old decoders will be able
+ to verify the CRC32 calculated from Stream Flags, and thus
+ distinguish between corrupt files (CRC32 doesn't match) and
+ files that the decoder doesn't support (CRC32 matches but
+ Stream Flags has reserved bits set).
+
+
+2.1.2. Stream Footer
+
+ +-+-+-+-+---+---+---+---+-------+------+----------+---------+
+ | CRC32 | Backward Size | Stream Flags | Footer Magic Bytes |
+ +-+-+-+-+---+---+---+---+-------+------+----------+---------+
+
+
+2.1.2.1. CRC32
+
+ The CRC32 is calculated from the Backward Size and Stream Flags
+ fields. It is stored as an unsigned 32-bit little endian
+ integer. If the calculated value does not match the stored one,
+ the decoder MUST indicate an error.
+
+ The reason to have the CRC32 field before the Backward Size and
+ Stream Flags fields is to keep the four-byte fields aligned to
+ a multiple of four bytes.
+
+
+2.1.2.2. Backward Size
+
+ Backward Size is stored as a 32-bit little endian integer,
+ which indicates the size of the Index field as multiple of
+ four bytes, minimum value being four bytes:
+
+ real_backward_size = (stored_backward_size + 1) * 4;
+
+ If the stored value does not match the real size of the Index
+ field, the decoder MUST indicate an error.
+
+ Using a fixed-size integer to store Backward Size makes
+ it slightly simpler to parse the Stream Footer when the
+ application needs to parse the Stream backwards.
+
+
+2.1.2.3. Stream Flags
+
+ This is a copy of the Stream Flags field from the Stream
+ Header. The information stored to Stream Flags is needed
+ when parsing the Stream backwards. The decoder MUST compare
+ the Stream Flags fields in both Stream Header and Stream
+ Footer, and indicate an error if they are not identical.
+
+
+2.1.2.4. Footer Magic Bytes
+
+ As the last step of the decoding process, the decoder MUST
+ verify the existence of Footer Magic Bytes. If they don't
+ match, an error MUST be indicated.
+
+ Using a C array and ASCII:
+ const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' };
+
+ In hexadecimal:
+ 59 5A
+
+ The primary reason to have Footer Magic Bytes is to make
+ it easier to detect incomplete files quickly, without
+ uncompressing. If the file does not end with Footer Magic Bytes
+ (excluding Stream Padding described in Section 2.2), it cannot
+ be undamaged, unless someone has intentionally appended garbage
+ after the end of the Stream.
+
+
+2.2. Stream Padding
+
+ Only the decoders that support decoding of concatenated Streams
+ MUST support Stream Padding.
+
+ Stream Padding MUST contain only null bytes. To preserve the
+ four-byte alignment of consecutive Streams, the size of Stream
+ Padding MUST be a multiple of four bytes. Empty Stream Padding
+ is allowed. If these requirements are not met, the decoder MUST
+ indicate an error.
+
+ Note that non-empty Stream Padding is allowed at the end of the
+ file; there doesn't need to be a new Stream after non-empty
+ Stream Padding. This can be convenient in certain situations
+ [GNU-tar].
+
+ The possibility of Stream Padding MUST be taken into account
+ when designing an application that parses Streams backwards
+ and supports concatenated Streams.
+
+
+3. Block
+
+ +==============+=================+===============+=======+
+ | Block Header | Compressed Data | Block Padding | Check |
+ +==============+=================+===============+=======+
+
+
+3.1. Block Header
+
+ +-------------------+-------------+=================+
+ | Block Header Size | Block Flags | Compressed Size |
+ +-------------------+-------------+=================+
+
+ +===================+======================+
+ ---> | Uncompressed Size | List of Filter Flags |
+ +===================+======================+
+
+ +================+--+--+--+--+
+ ---> | Header Padding | CRC32 |
+ +================+--+--+--+--+
+
+
+3.1.1. Block Header Size
+
+ This field overlaps with the Index Indicator field (see
+ Section 4.1).
+
+ This field contains the size of the Block Header field,
+ including the Block Header Size field itself. Valid values are
+ in the range [0x01, 0xFF], which indicate the size of the Block
+ Header as multiples of four bytes, minimum size being eight
+ bytes:
+
+ real_header_size = (encoded_header_size + 1) * 4;
+
+ If a Block Header bigger than 1024 bytes is needed in the
+ future, a new field can be added between the Block Header and
+ Compressed Data fields. The presence of this new field would
+ be indicated in the Block Header field.
+
+
+3.1.2. Block Flags
+
+ The Block Flags field is a bit field:
+
+ Bit(s) Mask Description
+ 0-1 0x03 Number of filters (1-4)
+ 2-5 0x3C Reserved for future use; MUST be zero for now.
+ 6 0x40 The Compressed Size field is present.
+ 7 0x80 The Uncompressed Size field is present.
+
+ If any reserved bit is set, the decoder MUST indicate an error.
+ It is possible that there is a new field present which the
+ decoder is not aware of, and can thus parse the Block Header
+ incorrectly.
+
+
+3.1.3. Compressed Size
+
+ This field is present only if the appropriate bit is set in
+ the Block Flags field (see Section 3.1.2).
+
+ The Compressed Size field contains the size of the Compressed
+ Data field, which MUST be non-zero. Compressed Size is stored
+ using the encoding described in Section 1.2. If the Compressed
+ Size doesn't match the size of the Compressed Data field, the
+ decoder MUST indicate an error.
+
+
+3.1.4. Uncompressed Size
+
+ This field is present only if the appropriate bit is set in
+ the Block Flags field (see Section 3.1.2).
+
+ The Uncompressed Size field contains the size of the Block
+ after uncompressing. Uncompressed Size is stored using the
+ encoding described in Section 1.2. If the Uncompressed Size
+ does not match the real uncompressed size, the decoder MUST
+ indicate an error.
+
+ Storing the Compressed Size and Uncompressed Size fields serves
+ several purposes:
+ - The decoder knows how much memory it needs to allocate
+ for a temporary buffer in multithreaded mode.
+ - Simple error detection: wrong size indicates a broken file.
+ - Seeking forwards to a specific location in streamed mode.
+
+ It should be noted that the only reliable way to determine
+ the real uncompressed size is to uncompress the Block,
+ because the Block Header and Index fields may contain
+ (intentionally or unintentionally) invalid information.
+
+
+3.1.5. List of Filter Flags
+
+ +================+================+ +================+
+ | Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags |
+ +================+================+ +================+
+
+ The number of Filter Flags fields is stored in the Block Flags
+ field (see Section 3.1.2).
+
+ The format of each Filter Flags field is as follows:
+
+ +===========+====================+===================+
+ | Filter ID | Size of Properties | Filter Properties |
+ +===========+====================+===================+
+
+ Both Filter ID and Size of Properties are stored using the
+ encoding described in Section 1.2. Size of Properties indicates
+ the size of the Filter Properties field as bytes. The list of
+ officially defined Filter IDs and the formats of their Filter
+ Properties are described in Section 5.3.
+
+ Filter IDs greater than or equal to 0x4000_0000_0000_0000
+ (2^62) are reserved for implementation-specific internal use.
+ These Filter IDs MUST never be used in List of Filter Flags.
+
+
+3.1.6. Header Padding
+
+ This field contains as many null bytes as are needed to make
+ the Block Header have the size specified in Block Header Size.
+ If any of the bytes are not null bytes, the decoder MUST
+ indicate an error. It is possible that there is a new field
+ present which the decoder is not aware of, and can thus parse
+ the Block Header incorrectly.
+
+
+3.1.7. CRC32
+
+ The CRC32 is calculated over everything in the Block Header
+ field except the CRC32 field itself. It is stored as an
+ unsigned 32-bit little endian integer. If the calculated
+ value does not match the stored one, the decoder MUST indicate
+ an error.
+
+ Verifying the CRC32 of the Block Header before parsing the
+ actual contents allows the decoder to distinguish between
+ corrupt and unsupported files.
+
+
+3.2. Compressed Data
+
+ The format of Compressed Data depends on Block Flags and List
+ of Filter Flags. Excluding the descriptions of the simplest
+ filters in Section 5.3, the format of the filter-specific
+ encoded data is out of scope of this document.
+
+
+3.3. Block Padding
+
+ Block Padding MUST contain 0-3 null bytes to make the size of
+ the Block a multiple of four bytes. This can be needed when
+ the size of Compressed Data is not a multiple of four. If any
+ of the bytes in Block Padding are not null bytes, the decoder
+ MUST indicate an error.
+
+
+3.4. Check
+
+ The type and size of the Check field depend on which bits
+ are set in the Stream Flags field (see Section 2.1.1.2).
+
+ The Check, when used, is calculated from the original
+ uncompressed data. If the calculated Check does not match the
+ stored one, the decoder MUST indicate an error. If the selected
+ type of Check is not supported by the decoder, it SHOULD
+ indicate a warning or error.
+
+
+4. Index
+
+ +-----------------+===================+
+ | Index Indicator | Number of Records |
+ +-----------------+===================+
+
+ +=================+===============+-+-+-+-+
+ ---> | List of Records | Index Padding | CRC32 |
+ +=================+===============+-+-+-+-+
+
+ Index serves several purposes. Using it, one can
+ - verify that all Blocks in a Stream have been processed;
+ - find out the uncompressed size of a Stream; and
+ - quickly access the beginning of any Block (random access).
+
+
+4.1. Index Indicator
+
+ This field overlaps with the Block Header Size field (see
+ Section 3.1.1). The value of Index Indicator is always 0x00.
+
+
+4.2. Number of Records
+
+ This field indicates how many Records there are in the List
+ of Records field, and thus how many Blocks there are in the
+ Stream. The value is stored using the encoding described in
+ Section 1.2. If the decoder has decoded all the Blocks of the
+ Stream, and then notices that the Number of Records doesn't
+ match the real number of Blocks, the decoder MUST indicate an
+ error.
+
+
+4.3. List of Records
+
+ List of Records consists of as many Records as indicated by the
+ Number of Records field:
+
+ +========+========+
+ | Record | Record | ...
+ +========+========+
+
+ Each Record contains information about one Block:
+
+ +===============+===================+
+ | Unpadded Size | Uncompressed Size |
+ +===============+===================+
+
+ If the decoder has decoded all the Blocks of the Stream, it
+ MUST verify that the contents of the Records match the real
+ Unpadded Size and Uncompressed Size of the respective Blocks.
+
+ Implementation hint: It is possible to verify the Index with
+ constant memory usage by calculating for example SHA-256 of
+ both the real size values and the List of Records, then
+ comparing the hash values. Implementing this using
+ non-cryptographic hash like CRC32 SHOULD be avoided unless
+ small code size is important.
+
+ If the decoder supports random-access reading, it MUST verify
+ that Unpadded Size and Uncompressed Size of every completely
+ decoded Block match the sizes stored in the Index. If a
+ Block is only partially decoded, the decoder MUST verify that the
+ processed sizes don't exceed the sizes stored in the Index.
+
+
+4.3.1. Unpadded Size
+
+ This field indicates the size of the Block excluding the Block
+ Padding field. That is, Unpadded Size is the size of the Block
+ Header, Compressed Data, and Check fields. Unpadded Size is
+ stored using the encoding described in Section 1.2. The value
+ MUST never be zero; with the current structure of Blocks, the
+ actual minimum value for Unpadded Size is five.
+
+ Implementation note: Because the size of the Block Padding
+ field is not included in Unpadded Size, calculating the total
+ size of a Stream or doing random-access reading requires
+ calculating the actual size of the Blocks by rounding Unpadded
+ Sizes up to the next multiple of four.
+
+ The reason to exclude Block Padding from Unpadded Size is to
+ ease making a raw copy of Compressed Data without Block
+ Padding. This can be useful, for example, if someone wants
+ to convert Streams to some other file format quickly.
+
+
+4.3.2. Uncompressed Size
+
+ This field indicates the Uncompressed Size of the respective
+ Block as bytes. The value is stored using the encoding
+ described in Section 1.2.
+
+
+4.4. Index Padding
+
+ This field MUST contain 0-3 null bytes to pad the Index to
+ a multiple of four bytes. If any of the bytes are not null
+ bytes, the decoder MUST indicate an error.
+
+
+4.5. CRC32
+
+ The CRC32 is calculated over everything in the Index field
+ except the CRC32 field itself. The CRC32 is stored as an
+ unsigned 32-bit little endian integer. If the calculated
+ value does not match the stored one, the decoder MUST indicate
+ an error.
+
+
+5. Filter Chains
+
+ The Block Flags field defines how many filters are used. When
+ more than one filter is used, the filters are chained; that is,
+ the output of one filter is the input of another filter. The
+ following figure illustrates the direction of data flow.
+
+ v Uncompressed Data ^
+ | Filter 0 |
+ Encoder | Filter 1 | Decoder
+ | Filter n |
+ v Compressed Data ^
+
+
+5.1. Alignment
+
+ Alignment of uncompressed input data is usually the job of
+ the application producing the data. For example, to get the
+ best results, an archiver tool should make sure that all
+ PowerPC executable files in the archive stream start at
+ offsets that are multiples of four bytes.
+
+ Some filters, for example LZMA2, can be configured to take
+ advantage of specified alignment of input data. Note that
+ taking advantage of aligned input can be beneficial also when
+ a filter is not the first filter in the chain. For example,
+ if you compress PowerPC executables, you may want to use the
+ PowerPC filter and chain that with the LZMA2 filter. Because
+ not only the input but also the output alignment of the PowerPC
+ filter is four bytes, it is now beneficial to set LZMA2
+ settings so that the LZMA2 encoder can take advantage of its
+ four-byte-aligned input data.
+
+ The output of the last filter in the chain is stored to the
+ Compressed Data field, which is guaranteed to be aligned
+ to a multiple of four bytes relative to the beginning of the
+ Stream. This can increase
+ - speed, if the filtered data is handled multiple bytes at
+ a time by the filter-specific encoder and decoder,
+ because accessing aligned data in computer memory is
+ usually faster; and
+ - compression ratio, if the output data is later compressed
+ with an external compression tool.
+
+
+5.2. Security
+
+ If filters were allowed to be chained freely, it would be
+ possible to create malicious files that would be very slow to
+ decode. Such files could be used to create denial of service
+ attacks.
+
+ Slow files could occur when multiple filters are chained:
+
+ v Compressed input data
+ | Filter 1 decoder (last filter)
+ | Filter 0 decoder (non-last filter)
+ v Uncompressed output data
+
+ The decoder of the last filter in the chain produces a lot of
+ output from little input. Another filter in the chain takes the
+ output of the last filter, and produces very little output
+ while consuming a lot of input. As a result, a lot of data is
+ moved inside the filter chain, but the filter chain as a whole
+ gets very little work done.
+
+ To prevent this kind of slow file, there are restrictions on
+ how the filters can be chained. These restrictions MUST be
+ taken into account when designing new filters.
+
+ The maximum number of filters in the chain has been limited to
+ four, thus there can be a maximum of three non-last filters.
+ Of these three non-last filters, only two are allowed to change
+ the size of the data.
+
+ Non-last filters that change the size of the data MUST
+ have a limit on how much the decoder can compress the data: the
+ decoder SHOULD produce at least n bytes of output when the
+ filter is given 2n bytes of input. This limit is not
+ absolute, but significant deviations MUST be avoided.
+
+ The above limitations guarantee that if the last filter in the
+ chain produces 4n bytes of output, the chain as a whole will
+ produce at least n bytes of output.
+
+
+5.3. Filters
+
+5.3.1. LZMA2
+
+ LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purpose
+ compression algorithm with high compression ratio and fast
+ decompression. LZMA is based on LZ77 and range coding
+ algorithms.
+
+ LZMA2 is an extension on top of the original LZMA. LZMA2 uses
+ LZMA internally, but adds support for flushing the encoder and
+ for uncompressed chunks, eases stateful decoder implementations,
+ and improves support for multithreading. Thus, the plain LZMA
+ will not be supported in this file format.
+
+ Filter ID: 0x21
+ Size of Filter Properties: 1 byte
+ Changes size of data: Yes
+ Allow as a non-last filter: No
+ Allow as the last filter: Yes
+
+ Preferred alignment:
+ Input data: Adjustable to 1/2/4/8/16 byte(s)
+ Output data: 1 byte
+
+ The format of the one-byte Filter Properties field is as
+ follows:
+
+ Bits Mask Description
+ 0-5 0x3F Dictionary Size
+ 6-7 0xC0 Reserved for future use; MUST be zero for now.
+
+ Dictionary Size is encoded with a one-bit mantissa and a five-bit
+ exponent. The smallest dictionary size is 4 KiB and the biggest
+ is 4 GiB - 1 B.
+
+ Raw value Mantissa Exponent Dictionary size
+ 0 2 11 4 KiB
+ 1 3 11 6 KiB
+ 2 2 12 8 KiB
+ 3 3 12 12 KiB
+ 4 2 13 16 KiB
+ 5 3 13 24 KiB
+ 6 2 14 32 KiB
+ ... ... ... ...
+ 35 3 28 768 MiB
+ 36 2 29 1024 MiB
+ 37 3 29 1536 MiB
+ 38 2 30 2048 MiB
+ 39 3 30 3072 MiB
+ 40 2 31 4096 MiB - 1 B
+
+ Instead of having a table in the decoder, the dictionary size
+ can be decoded using the following C code:
+
+ const uint8_t bits = get_dictionary_flags() & 0x3F;
+ if (bits > 40)
+ return DICTIONARY_TOO_BIG; // Bigger than 4 GiB
+
+ uint32_t dictionary_size;
+ if (bits == 40) {
+ dictionary_size = UINT32_MAX;
+ } else {
+ dictionary_size = 2 | (bits & 1);
+ dictionary_size <<= bits / 2 + 11;
+ }
+
+
+5.3.2. Branch/Call/Jump Filters for Executables
+
+ These filters convert relative branch, call, and jump
+ instructions to their absolute counterparts in executable
+ files. This conversion increases redundancy and thus
+ compression ratio.
+
+ Size of Filter Properties: 0 or 4 bytes
+ Changes size of data: No
+ Allow as a non-last filter: Yes
+ Allow as the last filter: No
+
+ Below is the list of filters in this category. The alignment
+ is the same for both input and output data.
+
+ Filter ID Alignment Description
+ 0x04 1 byte x86 filter (BCJ)
+ 0x05 4 bytes PowerPC (big endian) filter
+ 0x06 16 bytes IA64 filter
+ 0x07 4 bytes ARM filter [1]
+ 0x08 2 bytes ARM Thumb filter [1]
+ 0x09 4 bytes SPARC filter
+ 0x0A 4 bytes ARM64 filter [2]
+
+ [1] These are for little endian instruction encoding.
+ This must not be confused with data endianness.
+ A processor configured for big endian data access
+ may still use little endian instruction encoding.
+ The filters don't care about the data endianness.
+
+ [2] 4096-byte alignment gives the best results
+ because the address in the ADRP instruction
+ is a multiple of 4096 bytes.
+
+ If the size of Filter Properties is four bytes, the Filter
+ Properties field contains the start offset used for address
+ conversions. It is stored as an unsigned 32-bit little endian
+ integer. The start offset MUST be a multiple of the alignment
+ of the filter as listed in the table above; if it isn't, the
+ decoder MUST indicate an error. If the size of Filter
+ Properties is zero, the start offset is zero.
+
+ Setting the start offset may be useful if an executable has
+ multiple sections, and there are many cross-section calls.
+ Taking advantage of this feature usually requires usage of
+ the Subblock filter, whose design is not complete yet.
+
+
+5.3.3. Delta
+
+ The Delta filter may increase compression ratio when the value
+ of the next byte correlates with the value of an earlier byte
+ at a specified distance.
+
+ Filter ID: 0x03
+ Size of Filter Properties: 1 byte
+ Changes size of data: No
+ Allow as a non-last filter: Yes
+ Allow as the last filter: No
+
+ Preferred alignment:
+ Input data: 1 byte
+ Output data: Same as the original input data
+
+ The Properties byte indicates the delta distance, which can be
+ 1-256 bytes backwards from the current byte: 0x00 indicates
+ a distance of 1 byte and 0xFF a distance of 256 bytes.
+
+
+5.3.3.1. Format of the Encoded Output
+
+ The code below illustrates both encoding and decoding with
+ the Delta filter.
+
+ // Distance is in the range [1, 256].
+ const unsigned int distance = get_properties_byte() + 1;
+ uint8_t pos = 0;
+ uint8_t delta[256];
+
+ memset(delta, 0, sizeof(delta));
+
+ while (1) {
+ const int byte = read_byte();
+ if (byte == EOF)
+ break;
+
+ uint8_t tmp = delta[(uint8_t)(distance + pos)];
+ if (is_encoder) {
+ tmp = (uint8_t)(byte) - tmp;
+ delta[pos] = (uint8_t)(byte);
+ } else {
+ tmp = (uint8_t)(byte) + tmp;
+ delta[pos] = tmp;
+ }
+
+ write_byte(tmp);
+ --pos;
+ }
+
+
+5.4. Custom Filter IDs
+
+ If a developer wants to use custom Filter IDs, there are two
+ choices. The first choice is to contact Lasse Collin and ask
+ him to allocate a range of IDs for the developer.
+
+ The second choice is to generate a 40-bit random integer
+ which the developer can use as a personal Developer ID.
+ To minimize the risk of collisions, the Developer ID has to be
+ a randomly generated integer, not a manually selected "hex word".
+ The following command, which works on many free operating
+ systems, can be used to generate a Developer ID:
+
+ dd if=/dev/urandom bs=5 count=1 | hexdump
+
+ The developer can then use the Developer ID to create unique
+ (well, hopefully unique) Filter IDs.
+
+ Bits Mask Description
+ 0-15 0x0000_0000_0000_FFFF Filter ID
+ 16-55 0x00FF_FFFF_FFFF_0000 Developer ID
+ 56-62 0x3F00_0000_0000_0000 Static prefix: 0x3F
+
+ The resulting 63-bit integer will use 9 bytes of space when
+ stored using the encoding described in Section 1.2. To get
+ a shorter ID, see the beginning of this Section for how to
+ request a custom ID range.
+
+
+5.4.1. Reserved Custom Filter ID Ranges
+
+ Range Description
+ 0x0000_0300 - 0x0000_04FF Reserved to ease .7z compatibility
+ 0x0002_0000 - 0x0007_FFFF Reserved to ease .7z compatibility
+ 0x0200_0000 - 0x07FF_FFFF Reserved to ease .7z compatibility
+
+
+6. Cyclic Redundancy Checks
+
+ There are several incompatible variations to calculate CRC32
+ and CRC64. For simplicity and clarity, complete examples are
+ provided to calculate the checks as they are used in this file
+ format. Implementations MAY use different code as long as it
+ gives identical results.
+
+ The program below reads data from standard input, calculates
+ the CRC32 and CRC64 values, and prints the calculated values
+ as big endian hexadecimal strings to standard output.
+
+ #include <stddef.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+
+ uint32_t crc32_table[256];
+ uint64_t crc64_table[256];
+
+ void
+ init(void)
+ {
+ static const uint32_t poly32 = UINT32_C(0xEDB88320);
+ static const uint64_t poly64
+ = UINT64_C(0xC96C5795D7870F42);
+
+ for (size_t i = 0; i < 256; ++i) {
+ uint32_t crc32 = i;
+ uint64_t crc64 = i;
+
+ for (size_t j = 0; j < 8; ++j) {
+ if (crc32 & 1)
+ crc32 = (crc32 >> 1) ^ poly32;
+ else
+ crc32 >>= 1;
+
+ if (crc64 & 1)
+ crc64 = (crc64 >> 1) ^ poly64;
+ else
+ crc64 >>= 1;
+ }
+
+ crc32_table[i] = crc32;
+ crc64_table[i] = crc64;
+ }
+ }
+
+ uint32_t
+ crc32(const uint8_t *buf, size_t size, uint32_t crc)
+ {
+ crc = ~crc;
+ for (size_t i = 0; i < size; ++i)
+ crc = crc32_table[buf[i] ^ (crc & 0xFF)]
+ ^ (crc >> 8);
+ return ~crc;
+ }
+
+ uint64_t
+ crc64(const uint8_t *buf, size_t size, uint64_t crc)
+ {
+ crc = ~crc;
+ for (size_t i = 0; i < size; ++i)
+ crc = crc64_table[buf[i] ^ (crc & 0xFF)]
+ ^ (crc >> 8);
+ return ~crc;
+ }
+
+ int
+ main()
+ {
+ init();
+
+ uint32_t value32 = 0;
+ uint64_t value64 = 0;
+ uint64_t total_size = 0;
+ uint8_t buf[8192];
+
+ while (1) {
+ const size_t buf_size
+ = fread(buf, 1, sizeof(buf), stdin);
+ if (buf_size == 0)
+ break;
+
+ total_size += buf_size;
+ value32 = crc32(buf, buf_size, value32);
+ value64 = crc64(buf, buf_size, value64);
+ }
+
+ printf("Bytes: %" PRIu64 "\n", total_size);
+ printf("CRC-32: 0x%08" PRIX32 "\n", value32);
+ printf("CRC-64: 0x%016" PRIX64 "\n", value64);
+
+ return 0;
+ }
+
+
+7. References
+
+ LZMA SDK - The original LZMA implementation
+ http://7-zip.org/sdk.html
+
+ LZMA Utils - LZMA adapted to POSIX-like systems
+ http://tukaani.org/lzma/
+
+ XZ Utils - The next generation of LZMA Utils
+ http://tukaani.org/xz/
+
+ [RFC-1952]
+ GZIP file format specification version 4.3
+ http://www.ietf.org/rfc/rfc1952.txt
+ - Notation of byte boxes in section "2.1. Overall conventions"
+
+ [RFC-2119]
+ Key words for use in RFCs to Indicate Requirement Levels
+ http://www.ietf.org/rfc/rfc2119.txt
+
+ [GNU-tar]
+ GNU tar 1.21 manual
+ http://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html
+ - Node 9.4.2 "Blocking Factor", paragraph that begins
+ "gzip will complain about trailing garbage"
+ - Note that this URL points to the latest version of the
+ manual, and may some day not contain the note which is in
+ 1.21. For the exact version of the manual, download GNU
+ tar 1.21: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.21.tar.gz
+