From efe11df4b26426c3db4f96592e899b1f005a878e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 May 2017 17:50:01 +0200 Subject: Merging upstream version 1.9. Signed-off-by: Daniel Baumann --- doc/clzip.texi | 1004 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 965 insertions(+), 39 deletions(-) (limited to 'doc/clzip.texi') diff --git a/doc/clzip.texi b/doc/clzip.texi index 331d4eb..2cac199 100644 --- a/doc/clzip.texi +++ b/doc/clzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 13 May 2016 -@set VERSION 1.8 +@set UPDATED 13 April 2017 +@set VERSION 1.9 @dircategory Data Compression @direntry @@ -37,16 +37,19 @@ This manual is for Clzip (version @value{VERSION}, @value{UPDATED}). @menu * Introduction:: Purpose and features of clzip * Invoking clzip:: Command line interface +* Quality assurance:: Design, development and testing of lzip * File format:: Detailed format of the compressed file * Algorithm:: How clzip compresses the data +* Stream format:: Format of the LZMA stream in lzip files * Trailing data:: Extra data appended to the file * Examples:: A small tutorial with examples * Problems:: Reporting bugs +* Reference source code:: Source code illustrating stream format * Concept index:: Index of concepts @end menu @sp 1 -Copyright @copyright{} 2010-2016 Antonio Diaz Diaz. +Copyright @copyright{} 2010-2017 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -56,15 +59,16 @@ to copy, distribute and modify it. @chapter Introduction @cindex introduction -Clzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Clzip is about as fast as gzip, compresses most -files more than bzip2, and is better than both from a data recovery -perspective. +Clzip is a C language version of lzip, fully compatible with lzip-1.4 or +newer. As clzip is written in C, it may be easier to integrate in +applications like package managers, embedded devices, or systems lacking +a C++ compiler. -Clzip uses the lzip file format; the files produced by clzip are fully -compatible with lzip-1.4 or newer, and can be rescued with lziprecover. -Clzip is in fact a C language version of lzip, intended for embedded -devices or systems lacking a C++ compiler. +Lzip is a lossless data compressor with a user interface similar to the +one of gzip or bzip2. Lzip can compress about as fast as gzip +@w{(lzip -0)}, or compress most files more than bzip2 @w{(lzip -9)}. +Decompression speed is intermediate between gzip and bzip2. Lzip is +better than gzip and bzip2 from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -84,10 +88,10 @@ including error-checked merging of damaged copies of a file. @item The lzip format is as simple as possible (but not simpler). The lzip -manual provides the code of a simple decompressor along with a detailed -explanation of how it works, so that with the only help of the lzip -manual it would be possible for a digital archaeologist to extract the -data from a lzip file long after quantum computers eventually render +manual provides the source code of a simple decompressor along with a +detailed explanation of how it works, so that with the only help of the +lzip manual it would be possible for a digital archaeologist to extract +the data from a lzip file long after quantum computers eventually render LZMA obsolete. @item @@ -158,16 +162,20 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Clzip can produce multimember files and safely recover, with -lziprecover, the undamaged members in case of file damage. Clzip can -also split the compressed output in volumes of a given size, even when -reading from standard input. This allows the direct creation of -multivolume compressed tar archives. +Clzip can produce multimember files, and lziprecover can safely recover +the undamaged members in case of file damage. Clzip can also split the +compressed output in volumes of a given size, even when reading from +standard input. This allows the direct creation of multivolume +compressed tar archives. Clzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are large, about 2 PiB each. +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. + @node Invoking clzip @chapter Invoking clzip @@ -239,6 +247,20 @@ Force re-compression of files whose name already has the @samp{.lz} or @itemx --keep Keep (don't delete) input files during compression or decompression. +@item -l +@itemx --list +Print the uncompressed size, compressed size and percentage saved of the +specified file(s). Trailing data are ignored. The values produced are +correct even for multimember files. If more than one file is given, a +final line containing the cumulative sizes is printed. With @samp{-v}, +the dictionary size, the number of members in the file, and the amount +of trailing data (if any) are also printed. With @samp{-vv}, the +positions and sizes of each member in multimember files are also +printed. @samp{-lq} can be used to verify quickly (without +decompressing) the structural integrity of the specified files. (Use +@samp{--test} to verify the data integrity). @samp{-alq} additionally +verifies that none of the specified files contain trailing data. + @item -m @var{bytes} @itemx --match-length=@var{bytes} Set the match length limit in bytes. After a match this long is found, @@ -286,7 +308,8 @@ EiB. Check integrity of the specified file(s), but don't decompress them. This really performs a trial decompression and throws away the result. Use it together with @samp{-v} to see information about the file(s). If -a file fails the test, clzip continues checking the rest of the files. +a file fails the test, does not exist, can't be opened, or is a +terminal, clzip continues checking the rest of the files. @item -v @itemx --verbose @@ -296,7 +319,8 @@ second @samp{-v} shows the progress of compression.@* When decompressing or testing, further -v's (up to 4) increase the verbosity level, showing status, compression ratio, dictionary size, trailer contents (CRC, data size, member size), and up to 6 bytes of -trailing data (if any). +trailing data (if any) both in hexadecimal and as a string of printable +ASCII characters. @item -0 .. -9 Set the compression parameters (dictionary size and match length limit) @@ -353,6 +377,190 @@ invalid input file, 3 for an internal consistency error (eg, bug) which caused clzip to panic. +@node Quality assurance +@chapter Design, development and testing of lzip +@cindex quality assurance + +There are two ways of constructing a software design: One way is to make +it so simple that there are obviously no deficiencies and the other way +is to make it so complicated that there are no obvious deficiencies. The +first method is far more difficult.@* +--- C.A.R. Hoare + +Lzip has been designed, written and tested with great care to be the +standard general-purpose compressor for unix-like systems. This chapter +describes the lessons learned from previous compressors (gzip and +bzip2), and their application to the design of lzip. + +@sp 1 +@section Format design + +When gzip was designed in 1992, computers and operating systems were +much less capable than they are today. Gzip tried to work around some of +those limitations, like 8.3 file names, with additional fields in its +file format. + +Today those limitations have mostly disappeared, and the format of gzip +has proved to be unnecessarily complicated. It includes fields that were +never used, others that have lost their usefulness, and finally others +that have become too limited. + +Bzip2 was designed 5 years later, and its format is simpler than the one +of gzip. + +Probably the worst defect of the gzip format from the point of view of +data safety is the variable size of its header. If the byte at offset 3 +(flags) of a gzip member gets corrupted, it may become very difficult to +recover the data, even if the compressed blocks are intact, because it +can't be known with certainty where the compressed blocks begin. + +By contrast, the header of a lzip member has a fixed length of 6. The +LZMA stream in a lzip member always starts at offset 6, making it +trivial to recover the data even if the whole header becomes corrupt. + +Bzip2 also provides a header of fixed length and marks the begin and end +of each compressed block with six magic bytes, making it possible to +find the compressed blocks even in case of file damage. But bzip2 does +not store the size of each compressed block, as lzip does. + +Lzip provides better data recovery capabilities than any other gzip-like +compressor because its format has been designed from the beginning to be +simple and safe. It also helps that the LZMA data stream as used by lzip +is extraordinarily safe. It provides embedded error detection. Any +distance larger than the dictionary size acts as a forbidden symbol, +allowing the decompressor to detect the approximate position of errors, +and leaving very little work for the check sequence (CRC and data sizes) +in the detection of errors. Lzip is usually able to detect all posible +bit-flips in the compressed data without resorting to the check +sequence. It would be very difficult to write an automatic recovery tool +like lziprecover for the gzip format. And, as far as I know, it has +never been written. + +Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the +decompressed data because it provides more accurate error detection than +CRC64 up to a compressed size of about 16 GiB, a size larger than that +of most files. In the case of lzip, the additional detection capability +of the decompressor reduces the probability of undetected errors more +than a million times, making CRC32 more accurate than CRC64 up to about +20 PiB of compressed size. + +The lzip format is designed for long-term archiving. Therefore it +excludes any unneeded features that may interfere with the future +extraction of the uncompressed data. + +@sp 1 +@subsection Gzip format (mis)features not present in lzip + +@table @samp +@item Multiple algorithms + +Gzip provides a CM (Compression Method) field that has never been used +because it is a bad idea to begin with. New compression methods may +require additional fields, making it impossible to implement new methods +and, at the same time, keep the same format. This field does not solve +the problem of format proliferation; it just makes the problem less +obvious. + +@item Optional fields in header + +Unless special precautions are taken, optional fields are generally a +bad idea because they produce a header of variable size. The gzip header +has 2 fields that, in addition to being optional, are zero-terminated. +This means that if any byte inside the field gets zeroed, or if the +terminating zero gets altered, gzip won't be able to find neither the +header CRC nor the compressed blocks. + +@item Optional CRC for the header + +Using an optional checksum for the header is not only a bad idea, it is +an error; it may prevent the extraction of perfectly good data. For +example, if the checksum is used and the bit enabling it is reset by a +bit-flip, the header will appear to be intact (in spite of being +corrupt) while the compressed blocks will appear to be totally +unrecoverable (in spite of being intact). Very misleading indeed. + +@end table + +@subsection Lzip format improvements over gzip and bzip2 + +@table @samp +@item 64-bit size field + +Probably the most frequently reported shortcoming of the gzip format is +that it only stores the least significant 32 bits of the uncompressed +size. The size of any file larger than 4 GiB gets truncated. + +Bzip2 does not store the uncompressed size of the file. + +The lzip format provides a 64-bit field for the uncompressed size. +Additionaly, lzip produces multimember output automatically when the +size is too large for a single member, allowing for an unlimited +uncompressed size. + +@item Distributed index + +The lzip format provides a distributed index that, among other things, +helps plzip to decompress several times faster than pigz and helps +lziprecover do its job. Neither the gzip format nor the bzip2 format do +provide an index. + +A distributed index is safer and more scalable than a monolithic index. +The monolithic index introduces a single point of failure in the +compressed file and may limit the number of members or the total +uncompressed size. + +@end table + +@section Quality of implementation + +@table @samp +@item Accurate and robust error detection + +The lzip format provides 3 factor integrity checking and the +decompressors report mismatches in each factor separately. This way if +just one byte in one factor fails but the other two factors match the +data, it probably means that the data are intact and the corruption just +affects the mismatching factor (CRC or data size) in the check sequence. + +@item Multiple implementations + +Just like the lzip format provides 3 factor protection against +undetected data corruption, the development methodology of the lzip +family of compressors provides 3 factor protection against undetected +programming errors. + +Three related but independent compressor implementations, lzip, clzip +and minilzip/lzlib, are developed concurrently. Every stable release of +any of them is subjected to a hundred hours of intensive testing to +verify that it produces identical output to the other two. This +guarantees that all three implement the same algorithm, and makes it +unlikely that any of them may contain serious undiscovered errors. In +fact, no errors have been discovered in lzip since 2009. + +Additionally, the three implementations have been extensively tested +with +@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Unzcrash,,unzcrash}, +valgrind and @samp{american fuzzy lop} without finding a single +vulnerability or false negative. +@ifnothtml +@xref{Unzcrash,,,lziprecover}. +@end ifnothtml + +@item Dictionary size + +Lzip automatically uses the smallest possible dictionary size for each +file. In addition to reducing the amount of memory required for +decompression, this feature also minimizes the probability of being +affected by RAM errors during compression. + +@item Exit status + +Returning a warning status of 2 is a design flaw of compress that leaked +into the design of gzip. Both bzip2 and lzip are free from this flaw. + +@end table + + @node File format @chapter File format @cindex file format @@ -412,15 +620,8 @@ Valid values for dictionary size range from 4 KiB to 512 MiB. @item LZMA stream The LZMA stream, finished by an end of stream marker. Uses default -values for encoder properties. -@ifnothtml -@xref{Stream format,,,lzip}, -@end ifnothtml -@ifhtml -See -@uref{http://www.nongnu.org/lzip/manual/lzip_manual.html#Stream-format,,Stream format} -@end ifhtml -for a complete description. +values for encoder properties. @xref{Stream format}, for a complete +description. @item CRC32 (4 bytes) CRC of the uncompressed original data. @@ -502,24 +703,274 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). +@node Stream format +@chapter Format of the LZMA stream in lzip files +@cindex format of the LZMA stream + +The LZMA algorithm has three parameters, called "special LZMA +properties", to adjust it for some kinds of binary data. These +parameters are; @samp{literal_context_bits} (with a default value of 3), +@samp{literal_pos_state_bits} (with a default value of 0), and +@samp{pos_state_bits} (with a default value of 2). As a general purpose +compressor, lzip only uses the default values for these parameters. In +particular @samp{literal_pos_state_bits} has been optimized away and +does not even appear in the code. + +Lzip also finishes the LZMA stream with an "End Of Stream" marker (the +distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the +"member size" field in the member trailer allows the verification of +stream integrity. The LZMA stream in lzip files always has these two +features (default properties and EOS marker) and is referred to in this +document as LZMA-302eos or LZMA-lzip. + +The second stage of LZMA is a range encoder that uses a different +probability model for each type of symbol; distances, lengths, literal +bytes, etc. Range encoding conceptually encodes all the symbols of the +message into one number. Unlike Huffman coding, which assigns to each +symbol a bit-pattern and concatenates all the bit-patterns together, +range encoding can compress one symbol to less than one bit. Therefore +the compressed data produced by a range encoder can't be split in pieces +that could be individually described. + +It seems that the only way of describing the LZMA-302eos stream is +describing the algorithm that decodes it. And given the many details +about the range decoder that need to be described accurately, the source +code of a real decoder seems the only appropriate reference to use. + +What follows is a description of the decoding algorithm for LZMA-302eos +streams using as reference the source code of "lzd", an educational +decompressor for lzip files which can be downloaded from the lzip +download directory. The source code of lzd is included in appendix A. +@xref{Reference source code}. + +@sp 1 +@section What is coded + +The LZMA stream includes literals, matches and repeated matches (matches +reusing a recently used distance). There are 7 different coding sequences: + +@multitable @columnfractions .35 .14 .51 +@headitem Bit sequence @tab Name @tab Description +@item 0 + byte @tab literal @tab literal byte +@item 1 + 0 + len + dis @tab match @tab distance-length pair +@item 1 + 1 + 0 + 0 @tab shortrep @tab 1 byte match at latest used +distance +@item 1 + 1 + 0 + 1 + len @tab rep0 @tab len bytes match at latest used +distance +@item 1 + 1 + 1 + 0 + len @tab rep1 @tab len bytes match at second +latest used distance +@item 1 + 1 + 1 + 1 + 0 + len @tab rep2 @tab len bytes match at third +latest used distance +@item 1 + 1 + 1 + 1 + 1 + len @tab rep3 @tab len bytes match at fourth +latest used distance +@end multitable + +@sp 1 +In the following tables, multibit sequences are coded in normal order, +from MSB to LSB, except where noted otherwise. + +Lengths (the @samp{len} in the table above) are coded as follows: + +@multitable @columnfractions .5 .5 +@headitem Bit sequence @tab Description +@item 0 + 3 bits @tab lengths from 2 to 9 +@item 1 + 0 + 3 bits @tab lengths from 10 to 17 +@item 1 + 1 + 8 bits @tab lengths from 18 to 273 +@end multitable + +@sp 1 +The coding of distances is a little more complicated, so I'll begin +explaining a simpler version of the encoding. + +Imagine you need to code a number from 0 to @w{2^32 - 1}, and you want +to do it in a way that produces shorter codes for the smaller numbers. +You may first send the position of the most significant bit that is set +to 1, which you may find by making a bit scan from the left (from the +MSB). A position of 0 means that the number is 0 (no bit is set), 1 +means the LSB is the first bit set (the number is 1), and 32 means the +MSB is set (i.e., the number is @w{>= 0x80000000}). Let's call this bit +position a "slot". Then, if slot is @w{> 1}, you send the remaining +@w{slot - 1} bits. Let's call these bits "direct_bits" because they are +coded directly by value instead of indirectly by position. + +The inconvenient of this simple method is that it needs 6 bits to code +the slot, but it just uses 33 of the 64 possible values, wasting almost +half of the codes. + +The intelligent trick of LZMA is that it encodes the position of the +most significant bit set, along with the value of the next bit, in the +same 6 bits that would take to encode the position alone. This seems to +need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is +no next bit, so the number of needed slots is 64 (0 to 63). + +The 6 bits representing this "slot number" are then context-coded. If +the distance is @w{>= 4}, the remaining bits are coded as follows. +@samp{direct_bits} is the amount of remaining bits (from 0 to 30) needed +to form a complete distance, and is calculated as @w{(slot >> 1) - 1}. +If a distance needs 6 or more direct_bits, the last 4 bits are coded +separately. The last piece (all the direct_bits for distances 4 to 127 +or the last 4 bits for distances @w{>= 128}) is context-coded in reverse +order (from LSB to MSB). For distances @w{>= 128}, the +@w{@samp{direct_bits - 4}} part is coded with fixed 0.5 probability. + +@multitable @columnfractions .5 .5 +@headitem Bit sequence @tab Description +@item slot @tab distances from 0 to 3 +@item slot + direct_bits @tab distances from 4 to 127 +@item slot + (direct_bits - 4) + 4 bits @tab distances from 128 to +2^32 - 1 +@end multitable + +@sp 1 +@section The coding contexts + +These contexts (@samp{Bit_model} in the source), are integers or arrays +of integers representing the probability of the corresponding bit being 0. + +The indices used in these arrays are: + +@table @samp +@item state +A state machine (@samp{State} in the source) with 12 states (0 to 11), +coding the latest 2 to 4 types of sequences processed. The initial state +is 0. + +@item pos_state +Value of the 2 least significant bits of the current position in the +decoded data. + +@item literal_state +Value of the 3 most significant bits of the latest byte decoded. + +@item len_state +Coded value of length @w{(length - 2)}, with a maximum of 3. The +resulting value is in the range 0 to 3. + +@end table + + +In the following table, @samp{!literal} is any sequence except a literal +byte. @samp{rep} is any one of @samp{rep0}, @samp{rep1}, @samp{rep2} or +@samp{rep3}. The types of previous sequences corresponding to each state +are: + +@multitable {State} {rep or (!literal, shortrep), literal, literal} +@headitem State @tab Types of previous sequences +@item 0 @tab literal, literal, literal +@item 1 @tab match, literal, literal +@item 2 @tab rep or (!literal, shortrep), literal, literal +@item 3 @tab literal, shortrep, literal, literal +@item 4 @tab match, literal +@item 5 @tab rep or (!literal, shortrep), literal +@item 6 @tab literal, shortrep, literal +@item 7 @tab literal, match +@item 8 @tab literal, rep +@item 9 @tab literal, shortrep +@item 10 @tab !literal, match +@item 11 @tab !literal, (rep or shortrep) +@end multitable + +@sp 1 +The contexts for decoding the type of coding sequence are: + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item bm_match @tab state, pos_state @tab sequence start +@item bm_rep @tab state @tab after sequence 1 +@item bm_rep0 @tab state @tab after sequence 11 +@item bm_rep1 @tab state @tab after sequence 111 +@item bm_rep2 @tab state @tab after sequence 1111 +@item bm_len @tab state, pos_state @tab after sequence 110 +@end multitable + +@sp 1 +The contexts for decoding distances are: + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item bm_dis_slot @tab len_state, bit tree @tab distance start +@item bm_dis @tab reverse bit tree @tab after slots 4 to 13 +@item bm_align @tab reverse bit tree @tab for distances >= 128, after +fixed probability bits +@end multitable + +@sp 1 +There are two separate sets of contexts for lengths (@samp{Len_model} in +the source). One for normal matches, the other for repeated matches. The +contexts in each Len_model are (see @samp{decode_len} in the source): + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item choice1 @tab none @tab length start +@item choice2 @tab none @tab after sequence 1 +@item bm_low @tab pos_state, bit tree @tab after sequence 0 +@item bm_mid @tab pos_state, bit tree @tab after sequence 10 +@item bm_high @tab bit tree @tab after sequence 11 +@end multitable + +@sp 1 +The context array @samp{bm_literal} is special. In principle it acts as +a normal bit tree context, the one selected by @samp{literal_state}. But +if the previous decoded byte was not a literal, two other bit tree +contexts are used depending on the value of each bit in +@samp{match_byte} (the byte at the latest used distance), until a bit is +decoded that is different from its corresponding bit in +@samp{match_byte}. After the first difference is found, the rest of the +byte is decoded using the normal bit tree context. (See +@samp{decode_matched} in the source). + +@sp 1 +@section The range decoder + +The LZMA stream is consumed one byte at a time by the range decoder. +(See @samp{normalize} in the source). Every byte consumed produces a +variable number of decoded bits, depending on how well these bits agree +with their context. (See @samp{decode_bit} in the source). + +The range decoder state consists of two unsigned 32-bit variables; +@code{range} (representing the most significant part of the range size +not yet decoded), and @code{code} (representing the current point within +@code{range}). @code{range} is initialized to @w{(2^32 - 1)}, and +@code{code} is initialized to 0. + +The range encoder produces a first 0 byte that must be ignored by the +range decoder. This is done by shifting 5 bytes in the initialization of +@code{code} instead of 4. (See the @samp{Range_decoder} constructor in +the source). + +@sp 1 +@section Decoding the LZMA stream + +After decoding the member header and obtaining the dictionary size, the +range decoder is initialized and then the LZMA decoder enters a loop +(See @samp{decode_member} in the source) where it invokes the range +decoder with the appropriate contexts to decode the different coding +sequences (matches, repeated matches, and literal bytes), until the "End +Of Stream" marker is decoded. + + @node Trailing data @chapter Extra data appended to the file @cindex trailing data -Sometimes extra data is found appended to a lzip file after the last +Sometimes extra data are found appended to a lzip file after the last member. Such trailing data may be: @itemize @bullet @item Padding added to make the file size a multiple of some block size, for -example when writing to a tape. +example when writing to a tape. It is safe to append any amount of +padding zero bytes to a lzip file. @item -Garbage added by some not totally successful copy operation. +Useful data added by the user; a cryptographically secure hash, a +description of file contents, etc. It is safe to append any amount of +text to a lzip file as long as the text does not begin with the string +"LZIP", and does not contain any zero bytes (null characters). Nonzero +bytes and zero bytes can't be safely mixed in trailing data. @item -Useful data added by the user; a cryptographically secure hash, a -description of file contents, etc. +Garbage added by some not totally successful copy operation. @item Malicious data added to the file in order to make its total size and @@ -534,8 +985,12 @@ integrity information itself. Therefore it can be considered to be below the noise level. @end itemize +Trailing data are in no way part of the lzip file format, but tools +reading lzip files are expected to behave as correctly and usefully as +possible in the presence of trailing data. + Trailing data can be safely ignored in most cases. In some cases, like -that of user-added data, it is expected to be ignored. In those cases +that of user-added data, they are expected to be ignored. In those cases where a file containing trailing data must be rejected, the option @samp{--trailing-error} can be used. @xref{--trailing-error}. @@ -600,8 +1055,8 @@ clzip -c /dev/sdc > file.lz @sp 1 @anchor{concat-example} @noindent -Example 6: The right way of concatenating compressed files. -@xref{Trailing data}. +Example 6: The right way of concatenating the decompressed output of two +or more compressed files. @xref{Trailing data}. @example Don't do this @@ -671,6 +1126,477 @@ If you find a bug in clzip, please send electronic mail to find by running @w{@code{clzip --version}}. +@node Reference source code +@appendix Reference source code +@cindex reference source code + +@verbatim +/* Lzd - Educational decompressor for the lzip format + Copyright (C) 2013-2017 Antonio Diaz Diaz. + + This program is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +/* + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file. +*/ + +#include +#include +#include +#include +#include +#include +#include +#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) +#include +#include +#endif + + +class State + { + int st; + +public: + enum { states = 12 }; + State() : st( 0 ) {} + int operator()() const { return st; } + bool is_char() const { return st < 7; } + + void set_char() + { + static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; + st = next[st]; + } + void set_match() { st = ( st < 7 ) ? 7 : 10; } + void set_rep() { st = ( st < 7 ) ? 8 : 11; } + void set_short_rep() { st = ( st < 7 ) ? 9 : 11; } + }; + + +enum { + min_dictionary_size = 1 << 12, + max_dictionary_size = 1 << 29, + literal_context_bits = 3, + literal_pos_state_bits = 0, // not used + pos_state_bits = 2, + pos_states = 1 << pos_state_bits, + pos_state_mask = pos_states - 1, + + len_states = 4, + dis_slot_bits = 6, + start_dis_model = 4, + end_dis_model = 14, + modeled_distances = 1 << (end_dis_model / 2), // 128 + dis_align_bits = 4, + dis_align_size = 1 << dis_align_bits, + + len_low_bits = 3, + len_mid_bits = 3, + len_high_bits = 8, + len_low_symbols = 1 << len_low_bits, + len_mid_symbols = 1 << len_mid_bits, + len_high_symbols = 1 << len_high_bits, + max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, + + min_match_len = 2, // must be 2 + + bit_model_move_bits = 5, + bit_model_total_bits = 11, + bit_model_total = 1 << bit_model_total_bits }; + +struct Bit_model + { + int probability; + Bit_model() : probability( bit_model_total / 2 ) {} + }; + +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + + +class CRC32 + { + uint32_t data[256]; // Table of CRCs of all 8-bit messages. + +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + void update_buf( uint32_t & crc, const uint8_t * const buffer, + const int size ) const + { + for( int i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + } + }; + +const CRC32 crc32; + + +typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size + +typedef uint8_t File_trailer[20]; + // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + +class Range_decoder + { + uint32_t code; + uint32_t range; + +public: + Range_decoder() : code( 0 ), range( 0xFFFFFFFFU ) + { + for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte(); + } + + uint8_t get_byte() { return std::getc( stdin ); } + + unsigned decode( const int num_bits ) + { + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) + { + range >>= 1; + symbol <<= 1; + if( code >= range ) { code -= range; symbol |= 1; } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | get_byte(); } + } + return symbol; + } + + unsigned decode_bit( Bit_model & bm ) + { + unsigned symbol; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + symbol = 0; + } + else + { + range -= bound; + code -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol = 1; + } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | get_byte(); } + return symbol; + } + + unsigned decode_tree( Bit_model bm[], const int num_bits ) + { + unsigned symbol = 1; + for( int i = 0; i < num_bits; ++i ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << num_bits); + } + + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + unsigned symbol = decode_tree( bm, num_bits ); + unsigned reversed_symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); + symbol >>= 1; + } + return reversed_symbol; + } + + unsigned decode_matched( Bit_model bm[], const unsigned match_byte ) + { + unsigned symbol = 1; + for( int i = 7; i >= 0; --i ) + { + const unsigned match_bit = ( match_byte >> i ) & 1; + const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] ); + symbol = ( symbol << 1 ) | bit; + if( match_bit != bit ) + { + while( symbol < 0x100 ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + break; + } + } + return symbol & 0xFF; + } + + unsigned decode_len( Len_model & lm, const int pos_state ) + { + if( decode_bit( lm.choice1 ) == 0 ) + return decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) + return len_low_symbols + + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); + return len_low_symbols + len_mid_symbols + + decode_tree( lm.bm_high, len_high_bits ); + } + }; + + +class LZ_decoder + { + unsigned long long partial_data_pos; + Range_decoder rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to stdout + uint32_t crc_; + bool pos_wrapped; + + void flush_data(); + + uint8_t peek( const unsigned distance ) const + { + if( pos > distance ) return buffer[pos - distance - 1]; + if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1]; + return 0; // prev_byte of first byte + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + +public: + explicit LZ_decoder( const unsigned dict_size ) + : + partial_data_pos( 0 ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ), + pos_wrapped( false ) + {} + + ~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + + bool decode_member(); + }; + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const unsigned size = pos - stream_pos; + crc32.update_buf( crc_, buffer + stream_pos, size ); + errno = 0; + if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); + std::exit( 1 ); } + if( pos >= dictionary_size ) + { partial_data_pos += pos; pos = 0; pos_wrapped = true; } + stream_pos = pos; + } + } + + +bool LZ_decoder::decode_member() // Returns false if error + { + Bit_model bm_literal[1<> ( 8 - literal_context_bits ); + Bit_model * const bm = bm_literal[literal_state]; + if( state.is_char() ) + put_byte( rdec.decode_tree( bm, 8 ) ); + else + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + state.set_char(); + } + else // match or repeated match + { + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + const int len_state = std::min( len - min_match_len, len_states - 1 ); + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) + { + const unsigned dis_slot = rep0; + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), + direct_bits ); + else + { + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // marker found + { + flush_data(); + return ( len == min_match_len ); // End Of Stream marker + } + } + } + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return false; } + } + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); + } + } + flush_data(); + return false; + } + + +int main( const int argc, const char * const argv[] ) + { + if( argc > 1 ) + { + std::printf( "Lzd %s - Educational decompressor for the lzip format.\n", + PROGVERSION ); + std::printf( "Study the source to learn how a lzip decompressor works.\n" + "See the lzip manual for an explanation of the code.\n" + "It is not safe to use lzd for any real work.\n" + "\nUsage: %s < file.lz > file\n", argv[0] ); + std::printf( "Lzd decompresses from standard input to standard output.\n" + "\nCopyright (C) 2017 Antonio Diaz Diaz.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "Report bugs to lzip-bug@nongnu.org\n" + "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" ); + return 0; + } + +#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) + setmode( fileno( stdin ), O_BINARY ); + setmode( fileno( stdout ), O_BINARY ); +#endif + + for( bool first_member = true; ; first_member = false ) + { + File_header header; // verify header + for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); + if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) + { + if( first_member ) + { std::fputs( "Bad magic number (file not in lzip format).\n", stderr ); + return 2; } + break; + } + unsigned dict_size = 1 << ( header[5] & 0x1F ); + dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); + if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) + { std::fputs( "Invalid dictionary size in member header.\n", stderr ); + return 2; } + + LZ_decoder decoder( dict_size ); // decode LZMA stream + if( !decoder.decode_member() ) + { std::fputs( "Data error\n", stderr ); return 2; } + + File_trailer trailer; // verify trailer + for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); + unsigned crc = 0; + for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } + unsigned long long data_size = 0; + for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } + if( crc != decoder.crc() || data_size != decoder.data_position() ) + { std::fputs( "CRC error\n", stderr ); return 2; } + } + + if( std::fclose( stdout ) != 0 ) + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); + return 1; } + return 0; + } +@end verbatim + + @node Concept index @unnumbered Concept index -- cgit v1.2.3