From 62327bcaed81d8c02f11aec2c49f7d09c5edddb7 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 7 Nov 2015 11:08:36 +0100 Subject: Adding upstream version 1.17. Signed-off-by: Daniel Baumann --- ChangeLog | 15 +- README | 13 +- configure | 2 +- decoder.cc | 28 +-- decoder.h | 16 +- doc/lzip.1 | 4 +- doc/lzip.info | 506 +++++++++++++++++++++++++++-------------------------- doc/lzip.texi | 475 +++++++++++++++++++++++++------------------------ encoder.cc | 42 ++--- encoder.h | 18 +- encoder_base.cc | 2 +- encoder_base.h | 40 ++--- fast_encoder.cc | 4 +- fast_encoder.h | 4 +- lzip.h | 20 +-- main.cc | 67 ++++--- testsuite/check.sh | 3 +- 17 files changed, 643 insertions(+), 616 deletions(-) diff --git a/ChangeLog b/ChangeLog index aa3faae..e3ebebe 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,17 +1,8 @@ -2015-05-25 Antonio Diaz Diaz +2015-07-12 Antonio Diaz Diaz - * Version 1.17-rc2 released. - * lzip.texi: Added chapter 'Quality assurance'. - -2015-04-17 Antonio Diaz Diaz - - * Version 1.17-rc1 released. - * main.cc (compress): Fixed spurious warning about uninitialized var. - -2015-03-26 Antonio Diaz Diaz - - * Version 1.17-pre1 released. + * Version 1.17 released. * Reorganization of the compression code. + * lzip.texi: Added chapter 'Quality assurance'. * Makefile.in: Added new targets 'install*-compress'. 2014-08-26 Antonio Diaz Diaz diff --git a/README b/README index 894b77a..8a31263 100644 --- a/README +++ b/README @@ -40,6 +40,13 @@ each file without exceeding the given limit. Keep in mind that the decompression memory requirement is affected at compression time by the choice of dictionary size limit. +The amount of memory required for compression is about 1 or 2 times the +dictionary size limit (1 if input file size is less than dictionary size +limit, else 2) plus 9 times the dictionary size really used. The option +'-0' is special and only requires about 1.5 MiB at most. 
The amount of +memory required for decompression is about 46 kB larger than the +dictionary size really used. + When compressing, lzip replaces every file given in the command line with a compressed version of itself, with the name "original_name.lz". When decompressing, lzip attempts to guess the name for the decompressed @@ -69,8 +76,8 @@ corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. Lzip can produce multi-member files and safely recover, with -lziprecover, the undamaged members in case of file damage. Lzip can also -split the compressed output in volumes of a given size, even when +lziprecover, the undamaged members in case of file damage. Lzip can +also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume compressed tar archives. @@ -88,7 +95,7 @@ used by lzip could be developed, and the resulting sequence could also be coded using the LZMA coding scheme. Lzip currently implements two variants of the LZMA algorithm; fast -(used by option -0) and normal (used by all other compression levels). +(used by option '-0') and normal (used by all other compression levels). The high compression of LZMA comes from combining two basic, well-proven compression ideas: sliding dictionaries (LZ77/78) and markov models (the diff --git a/configure b/configure index 2a09e4f..9845c11 100755 --- a/configure +++ b/configure @@ -6,7 +6,7 @@ # to copy, distribute and modify it. 
pkgname=lzip -pkgversion=1.17-rc2 +pkgversion=1.17 progname=lzip srctrigger=doc/${pkgname}.texi diff --git a/decoder.cc b/decoder.cc index 5eb9221..113479a 100644 --- a/decoder.cc +++ b/decoder.cc @@ -43,7 +43,7 @@ void Pretty_print::operator()( const char * const msg ) const first_post = false; std::fprintf( stderr, " %s: ", name_.c_str() ); for( unsigned i = 0; i < longest_name - name_.size(); ++i ) - std::fprintf( stderr, " " ); + std::fputc( ' ', stderr ); if( !msg ) std::fflush( stderr ); } if( msg ) std::fprintf( stderr, "%s\n", msg ); @@ -62,7 +62,7 @@ int readblock( const int fd, uint8_t * const buf, const int size ) { const int n = read( fd, buf + sz, size - sz ); if( n > 0 ) sz += n; - else if( n == 0 ) break; /* EOF */ + else if( n == 0 ) break; // EOF else if( errno != EINTR ) break; errno = 0; } @@ -149,7 +149,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const if( verbosity >= 0 ) { pp(); - std::fprintf( stderr, "CRC mismatch; trailer says %08X, data CRC is %08X.\n", + std::fprintf( stderr, "CRC mismatch; trailer says %08X, data CRC is %08X\n", trailer.data_crc(), crc() ); } } @@ -159,7 +159,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const if( verbosity >= 0 ) { pp(); - std::fprintf( stderr, "Data size mismatch; trailer says %llu, data size is %llu (0x%llX).\n", + std::fprintf( stderr, "Data size mismatch; trailer says %llu, data size is %llu (0x%llX)\n", trailer.data_size(), data_position(), data_position() ); } } @@ -169,7 +169,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const if( verbosity >= 0 ) { pp(); - std::fprintf( stderr, "Member size mismatch; trailer says %llu, member size is %llu (0x%llX).\n", + std::fprintf( stderr, "Member size mismatch; trailer says %llu, member size is %llu (0x%llX)\n", trailer.member_size(), member_size, member_size ); } } @@ -201,9 +201,9 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) Bit_model bm_align[dis_align_size]; Len_model match_len_model; 
Len_model rep_len_model; - unsigned rep0 = 0; /* rep[0-3] latest four distances */ - unsigned rep1 = 0; /* used for efficient coding of */ - unsigned rep2 = 0; /* repeated distances */ + unsigned rep0 = 0; // rep[0-3] latest four distances + unsigned rep1 = 0; // used for efficient coding of + unsigned rep2 = 0; // repeated distances unsigned rep3 = 0; State state; @@ -226,7 +226,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) peek( rep0 ) ) ); } } - else + else // match or repeated match { int len; if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit @@ -255,7 +255,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); } - else + else // match { const unsigned rep0_saved = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); @@ -272,23 +272,23 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) { rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed4( bm_align ); - if( rep0 == 0xFFFFFFFFU ) /* marker found */ + if( rep0 == 0xFFFFFFFFU ) // marker found { rep0 = rep0_saved; rdec.normalize(); flush_data(); - if( len == min_match_len ) /* End Of Stream marker */ + if( len == min_match_len ) // End Of Stream marker { if( verify_trailer( pp ) ) return 0; else return 3; } - if( len == min_match_len + 1 ) /* Sync Flush marker */ + if( len == min_match_len + 1 ) // Sync Flush marker { rdec.load(); continue; } if( verbosity >= 0 ) { pp(); - std::fprintf( stderr, "Unsupported marker code '%d'.\n", len ); + std::fprintf( stderr, "Unsupported marker code '%d'\n", len ); } return 4; } diff --git a/decoder.h b/decoder.h index 9419669..98d42ce 100644 --- a/decoder.h +++ b/decoder.h @@ -19,12 +19,12 @@ class Range_decoder { enum { buffer_size = 16384 }; unsigned long long partial_member_pos; - uint8_t * const buffer; /* input buffer */ - int pos; /* current pos in buffer */ - int stream_pos; 
/* when reached, a new block must be read */ + uint8_t * const buffer; // input buffer + int pos; // current pos in buffer + int stream_pos; // when reached, a new block must be read uint32_t code; uint32_t range; - const int infd; /* input file descriptor */ + const int infd; // input file descriptor bool at_stream_end; bool read_block(); @@ -213,11 +213,11 @@ class LZ_decoder Range_decoder & rdec; const unsigned dictionary_size; const int buffer_size; - uint8_t * const buffer; /* output buffer */ - int pos; /* current pos in buffer */ - int stream_pos; /* first byte not yet written to file */ + uint8_t * const buffer; // output buffer + int pos; // current pos in buffer + int stream_pos; // first byte not yet written to file uint32_t crc_; - const int outfd; /* output file descriptor */ + const int outfd; // output file descriptor const int member_version; void flush_data(); diff --git a/doc/lzip.1 b/doc/lzip.1 index 6b779f1..b6acca6 100644 --- a/doc/lzip.1 +++ b/doc/lzip.1 @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH LZIP "1" "May 2015" "lzip 1.17-rc2" "User Commands" +.TH LZIP "1" "July 2015" "lzip 1.17" "User Commands" .SH NAME lzip \- reduces the size of files .SH SYNOPSIS @@ -28,7 +28,7 @@ decompress overwrite existing output files .TP \fB\-F\fR, \fB\-\-recompress\fR -force recompression of compressed files +force re\-compression of compressed files .TP \fB\-k\fR, \fB\-\-keep\fR keep (don't delete) input files diff --git a/doc/lzip.info b/doc/lzip.info index 6854503..f0aa011 100644 --- a/doc/lzip.info +++ b/doc/lzip.info @@ -11,16 +11,16 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir) Lzip Manual *********** -This manual is for Lzip (version 1.17-rc2, 25 May 2015). +This manual is for Lzip (version 1.17, 12 July 2015). 
* Menu: * Introduction:: Purpose and features of lzip -* Algorithm:: How lzip compresses the data * Invoking lzip:: Command line interface +* Quality assurance:: Design, development and testing of lzip * File format:: Detailed format of the compressed file +* Algorithm:: How lzip compresses the data * Stream format:: Format of the LZMA stream in lzip files -* Quality assurance:: Design, development and testing of lzip * Examples:: A small tutorial with examples * Problems:: Reporting bugs * Reference source code:: Source code illustrating stream format @@ -33,7 +33,7 @@ This manual is for Lzip (version 1.17-rc2, 25 May 2015). copy, distribute and modify it.  -File: lzip.info, Node: Introduction, Next: Algorithm, Prev: Top, Up: Top +File: lzip.info, Node: Introduction, Next: Invoking lzip, Prev: Top, Up: Top 1 Introduction ************** @@ -51,7 +51,8 @@ availability: recovery means. The lziprecover program can repair bit-flip errors (one of the most common forms of data corruption) in lzip files, and provides data recovery capabilities, including error-checked - merging of damaged copies of a file. + merging of damaged copies of a file. *note Data safety: + (lziprecover)Data safety. * The lzip format is as simple as possible (but not simpler). The lzip manual provides the code of a simple decompressor along with @@ -85,6 +86,11 @@ which makes it safer than compressors returning ambiguous warning values (like gzip) when it is used as a back end for other programs like tar or zutils. + Lzip will automatically use the smallest possible dictionary size for +each file without exceeding the given limit. Keep in mind that the +decompression memory requirement is affected at compression time by the +choice of dictionary size limit. + The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary size limit, else 2) plus 9 times the dictionary size really used. 
The @@ -92,11 +98,6 @@ option '-0' is special and only requires about 1.5 MiB at most. The amount of memory required for decompression is about 46 kB larger than the dictionary size really used. - Lzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. - When compressing, lzip replaces every file given in the command line with a compressed version of itself, with the name "original_name.lz". When decompressing, lzip attempts to guess the name for the decompressed @@ -126,8 +127,8 @@ corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. Lzip can produce multi-member files and safely recover, with -lziprecover, the undamaged members in case of file damage. Lzip can also -split the compressed output in volumes of a given size, even when +lziprecover, the undamaged members in case of file damage. Lzip can +also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume compressed tar archives. @@ -136,75 +137,9 @@ automatically creating multi-member output. The members so created are large, about 2 PiB each.  -File: lzip.info, Node: Algorithm, Next: Invoking lzip, Prev: Introduction, Up: Top - -2 Algorithm -*********** - -In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a -concrete algorithm; it is more like "any algorithm using the LZMA coding -scheme". For example, the option '-0' of lzip uses the scheme in almost -the simplest way possible; issuing the longest match it can find, or a -literal byte if it can't find a match. 
Inversely, a much more elaborated -way of finding coding sequences of minimum size than the one currently -used by lzip could be developed, and the resulting sequence could also -be coded using the LZMA coding scheme. - - Lzip currently implements two variants of the LZMA algorithm; fast -(used by option -0) and normal (used by all other compression levels). - - The high compression of LZMA comes from combining two basic, -well-proven compression ideas: sliding dictionaries (LZ77/78) and -markov models (the thing used by every compression algorithm that uses -a range encoder or similar order-0 entropy coder as its last stage) -with segregation of contexts according to what the bits are used for. - - Lzip is a two stage compressor. The first stage is a Lempel-Ziv -coder, which reduces redundancy by translating chunks of data to their -corresponding distance-length pairs. The second stage is a range encoder -that uses a different probability model for each type of data; -distances, lengths, literal bytes, etc. - - Here is how it works, step by step: - - 1) The member header is written to the output stream. - - 2) The first byte is coded literally, because there are no previous -bytes to which the match finder can refer to. - - 3) The main encoder advances to the next byte in the input data and -calls the match finder. - - 4) The match finder fills an array with the minimum distances before -the current byte where a match of a given length can be found. - - 5) Go back to step 3 until a sequence (formed of pairs, repeated -distances and literal bytes) of minimum price has been formed. Where the -price represents the number of output bits produced. - - 6) The range encoder encodes the sequence produced by the main -encoder and sends the produced bytes to the output stream. - - 7) Go back to step 3 until the input data are finished or until the -member or volume size limits are reached. - - 8) The range encoder is flushed. 
- - 9) The member trailer is written to the output stream. - - 10) If there are more data to compress, go back to step 1. - - -The ideas embodied in lzip are due to (at least) the following people: -Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for -the definition of Markov chains), G.N.N. Martin (for the definition of -range encoding), Igor Pavlov (for putting all the above together in -LZMA), and Julian Seward (for bzip2's CLI). - - -File: lzip.info, Node: Invoking lzip, Next: File format, Prev: Algorithm, Up: Top +File: lzip.info, Node: Invoking lzip, Next: Quality assurance, Prev: Introduction, Up: Top -3 Invoking lzip +2 Invoking lzip *************** The format for running lzip is: @@ -244,7 +179,7 @@ The format for running lzip is: '-F' '--recompress' - Force recompression of files whose name already has the '.lz' or + Force re-compression of files whose name already has the '.lz' or '.tlz' suffix. '-k' @@ -362,7 +297,155 @@ invalid input file, 3 for an internal consistency error (eg, bug) which caused lzip to panic.  -File: lzip.info, Node: File format, Next: Stream format, Prev: Invoking lzip, Up: Top +File: lzip.info, Node: Quality assurance, Next: File format, Prev: Invoking lzip, Up: Top + +3 Design, development and testing of lzip +***************************************** + +There are two ways of constructing a software design. One way is to make +it so simple that there are obviously no deficiencies and the other is +to make it so complicated that there are no obvious deficiencies. +-- C.A.R. Hoare + + Lzip has been designed, written and tested with great care to be the +standard general-purpose compressor for unix-like systems. This chapter +describes the lessons learned from previous compressors (gzip and +bzip2), and their application to the design of lzip. + + +3.1 Format design +================= + +When gzip was designed in 1992, computers and operating systems were +much less capable than they are today. 
Gzip tried to work around some of +those limitations, like 8.3 file names, with additional fields in its +file format. + + Today those limitations have mostly disappeared, and the format of +gzip has proved to be unnecessarily complicated. It includes fields +that were never used, others that have lost their usefulness, and finally +others that have become too limited. + + Bzip2 was designed 5 years later, and its format is simpler than the +one of gzip. + + Probably the worst defect of the gzip format from the point of view +of data safety is the variable size of its header. If the byte at +offset 3 (flags) of a gzip member gets corrupted, it may become very +difficult to recover the data, even if the compressed blocks are +intact, because it can't be known with certainty where the compressed +blocks begin. + + By contrast, the header of a lzip member has a fixed length of 6. The +lzma stream in a lzip member always starts at offset 6, making it +trivial to recover the data even if the whole header becomes corrupt. + + Bzip2 also provides a header of fixed length and marks the begin and +end of each compressed block with six magic bytes, making it possible to +find the compressed blocks even in case of file damage. But bzip2 does +not store the size of each compressed block, as lzip does. + + Lzip provides better data recovery capabilities than any other +gzip-like compressor because its format has been designed from the +beginning to be simple and safe. It would be very difficult to write an +automatic recovery tool like lziprecover for the gzip format. And, as +far as I know, it has never been written. + + The lzip format is designed for long-term archiving. Therefore it +excludes any unneeded features that may interfere with the future +extraction of the uncompressed data. 
+ + +3.1.1 Gzip format (mis)features not present in lzip +--------------------------------------------------- + +'Multiple algorithms' + Gzip provides a CM (Compression Method) field that has never been + used because it is a bad idea to begin with. New compression + methods may require additional fields, making it impossible to + implement new methods and, at the same time, keep the same format. + This field does not solve the problem of format proliferation; it + just makes the problem less obvious. + +'Optional fields in header' + Unless special precautions are taken, optional fields are + generally a bad idea because they produce a header of variable + size. The gzip header has 2 fields that, in addition to being + optional, are zero-terminated. This means that if any byte inside + the field gets zeroed, or if the terminating zero gets altered, + gzip won't be able to find neither the header CRC nor the + compressed blocks. + +'Optional CRC for the header' + Using an optional checksum for the header is not only a bad idea, + it is an error; it may prevent the extraction of perfectly good + data. For example, if the checksum is used and the bit enabling it + is reset by a bit-flip, the header will appear to be intact (in + spite of being corrupt) while the compressed blocks will appear to + be totally unrecoverable (in spite of being intact). Very + misleading indeed. + + +3.1.2 Lzip format improvements over gzip and bzip2 +-------------------------------------------------- + +'64-bit size field' + Probably the most frequently reported shortcoming of the gzip + format is that it only stores the least significant 32 bits of the + uncompressed size. The size of any file larger than 4 GiB gets + truncated. + + Bzip2 does not store the uncompressed size of the file. + + The lzip format provides a 64-bit field for the uncompressed size. 
+ Additionally, lzip produces multi-member output automatically when + the size is too large for a single member, allowing for an + unlimited uncompressed size. + +'Distributed index' + The lzip format provides a distributed index that, among other + things, helps plzip to decompress several times faster than pigz + and helps lziprecover do its job. Neither the gzip format nor the + bzip2 format provides an index. + + A distributed index is safer and more scalable than a monolithic + index. The monolithic index introduces a single point of failure + in the compressed file and may limit the number of members or the + total uncompressed size. + + +3.2 Quality of implementation +============================= + +'Multiple implementations' + Just like the lzip format provides 4 factor protection against + undetected data corruption, the development methodology of the lzip + family of compressors provides 3 factor protection against + undetected programming errors. + + Three related but independent compressor implementations, lzip, + clzip and minilzip/lzlib, are developed concurrently. Every stable + release of any of them is subjected to a hundred hours of + intensive testing to verify that it produces identical output to + the other two. This guarantees that all three implement the same + algorithm, and makes it unlikely that any of them may contain + serious undiscovered errors. In fact, no errors have been + discovered in lzip since 2009. + +'Dictionary size' + Lzip automatically uses the smallest possible dictionary size for + each file. In addition to reducing the amount of memory required + for decompression, this feature also minimizes the probability of + being affected by RAM errors during compression. + +'Exit status' + Returning a warning status of 2 is a design flaw of compress that + leaked into the design of gzip. Both bzip2 and lzip are free from + this flaw. 
+ + + +File: lzip.info, Node: File format, Next: Algorithm, Prev: Quality assurance, Up: Top 4 File format ************* @@ -433,9 +516,75 @@ additional information before, between, or after them.  -File: lzip.info, Node: Stream format, Next: Quality assurance, Prev: File format, Up: Top +File: lzip.info, Node: Algorithm, Next: Stream format, Prev: File format, Up: Top + +5 Algorithm +*********** + +In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a +concrete algorithm; it is more like "any algorithm using the LZMA coding +scheme". For example, the option '-0' of lzip uses the scheme in almost +the simplest way possible; issuing the longest match it can find, or a +literal byte if it can't find a match. Inversely, a much more elaborated +way of finding coding sequences of minimum size than the one currently +used by lzip could be developed, and the resulting sequence could also +be coded using the LZMA coding scheme. + + Lzip currently implements two variants of the LZMA algorithm; fast +(used by option '-0') and normal (used by all other compression levels). + + The high compression of LZMA comes from combining two basic, +well-proven compression ideas: sliding dictionaries (LZ77/78) and +markov models (the thing used by every compression algorithm that uses +a range encoder or similar order-0 entropy coder as its last stage) +with segregation of contexts according to what the bits are used for. + + Lzip is a two stage compressor. The first stage is a Lempel-Ziv +coder, which reduces redundancy by translating chunks of data to their +corresponding distance-length pairs. The second stage is a range encoder +that uses a different probability model for each type of data; +distances, lengths, literal bytes, etc. + + Here is how it works, step by step: + + 1) The member header is written to the output stream. + + 2) The first byte is coded literally, because there are no previous +bytes to which the match finder can refer to. 
+ + 3) The main encoder advances to the next byte in the input data and +calls the match finder. + + 4) The match finder fills an array with the minimum distances before +the current byte where a match of a given length can be found. + + 5) Go back to step 3 until a sequence (formed of pairs, repeated +distances and literal bytes) of minimum price has been formed. Where the +price represents the number of output bits produced. + + 6) The range encoder encodes the sequence produced by the main +encoder and sends the produced bytes to the output stream. + + 7) Go back to step 3 until the input data are finished or until the +member or volume size limits are reached. + + 8) The range encoder is flushed. + + 9) The member trailer is written to the output stream. + + 10) If there are more data to compress, go back to step 1. + + +The ideas embodied in lzip are due to (at least) the following people: +Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for +the definition of Markov chains), G.N.N. Martin (for the definition of +range encoding), Igor Pavlov (for putting all the above together in +LZMA), and Julian Seward (for bzip2's CLI). + + +File: lzip.info, Node: Stream format, Next: Examples, Prev: Algorithm, Up: Top -5 Format of the LZMA stream in lzip files +6 Format of the LZMA stream in lzip files ***************************************** The LZMA algorithm has three parameters, called "special LZMA @@ -473,7 +622,7 @@ the lzip download directory. The source code of lzd is included in appendix A. 
*note Reference source code:: -5.1 What is coded +6.1 What is coded ================= The LZMA stream includes literals, matches and repeated matches (matches @@ -525,7 +674,7 @@ slot + direct_bits distances from 4 to 127 slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1 -5.2 The coding contexts +6.2 The coding contexts ======================= These contexts ('Bit_model' in the source), are integers or arrays of @@ -615,7 +764,7 @@ difference is found, the rest of the byte is decoded using the normal bit tree context. (See 'decode_matched' in the source). -5.3 The range decoder +6.3 The range decoder ===================== The LZMA stream is consumed one byte at a time by the range decoder. @@ -635,7 +784,7 @@ range decoder. This is done by shifting 5 bytes in the initialization of source). -5.4 Decoding the LZMA stream +6.4 Decoding the LZMA stream ============================ After decoding the member header and obtaining the dictionary size, the @@ -646,144 +795,7 @@ with the appropriate contexts to decode the different coding sequences Stream" marker is decoded.  -File: lzip.info, Node: Quality assurance, Next: Examples, Prev: Stream format, Up: Top - -6 Design, development and testing of lzip -***************************************** - -There are two ways of constructing a software design. One way is to make -it so simple that there are obviously no deficiencies and the other is -to make it so complicated that there are no obvious deficiencies. --- C.A.R. Hoare - - Lzip has been designed, written and tested with great care to be the -standard general-purpose compressor for unix-like systems. This chapter -describes the lessons learned from previous compressors (gzip and -bzip2), and their application to the design of lzip. - - -6.1 Format design -================= - -When gzip was designed in 1992, computers and operating systems were -much less capable than they are today. 
Gzip tried to work around some of -those limitations, like 8.3 file names, with additional fields in its -file format. - - Today those limitations have mostly disappeared, and the format of -gzip has proved to be unnecessarily complicated. It includes fields -that were never used, others that have lost its usefulness, and finally -others that have become too limited. - - Bzip2 was designed 5 years later, and its format is in some aspects -simpler than the one of gzip. But bzip2 also shows complexities in its -file format which slow down decompression and, in retrospect, are -unnecessary. - - Probably the worst defect of the gzip format from the point of view -of data safety is the variable size of its header. If the byte at -offset 3 (flags) of a gzip member gets corrupted, it mat become very -difficult to recover the data, even if the compressed blocks are -intact, because it can't be known with certainty where the compressed -blocks begin. - - By contrast, the lzma stream in a lzip member always starts at -offset 6, making it trivial to recover the data even if the whole -header becomes corrupt. - - Lzip provides better data recovery capabilities than any other -gzip-like compressor because its format has been designed from the -beginning to be simple and safe. It would be very difficult to write an -automatic recovery tool like lziprecover for the gzip format. And, as -far as I know, it has never been writen. - - The lzip format is designed for long-term archiving. Therefore it -excludes any unneeded features that may interfere with the future -extraction of the uncompressed data. - - -6.1.1 Gzip format (mis)features not present in lzip ---------------------------------------------------- - -'Multiple algorithms' - Gzip provides a CM (Compression Method) field that has never been - used because it is a bad idea to begin with. 
New compression - methods may require additional fields, making it impossible to - implement new methods and, at the same time, keep the same format. - This field does not solve the problem of format proliferation; it - just makes the problem less obvious. - -'Optional fields in header' - Unless special precautions are taken, optional fields are - generally a bad idea because they produce a header of variable - size. The gzip header has 2 fields that, in addition to being - optional, are zero-terminated. This means that if any byte inside - the field gets zeroed, or if the terminating zero gets altered, - gzip won't be able to find neither the header CRC nor the - compressed blocks. - - Using an optional checksum for the header is not only a bad idea, - it is an error; it may prevent the extraction of perfectly good - data. For example, if the checksum is used and the bit enabling it - is reset by a bit-flip, the header will appear to be intact (in - spite of being corrupt) while the compressed blocks will appear to - be totally unrecoverable (in spite of being intact). Very - misleading indeed. - - -6.1.2 Lzip format improvements over gzip ----------------------------------------- - -'64-bit size field' - Probably the most frequently reported shortcoming of the gzip - format is that it only stores the least significant 32 bits of the - uncompressed size. The size of any file larger than 4 GiB gets - truncated. - - The lzip format provides a 64-bit field for the uncompressed size. - Additionaly, lzip produces multi-member output automatically when - the size is too large for a single member, allowing an unlimited - uncompressed size. - -'Distributed index' - The lzip format provides a distributed index that, among other - things, helps plzip to decompress several times faster than pigz - and helps lziprecover do its job. The gzip format does not provide - an index. - - A distributed index is safer and more scalable than a monolithic - index. 
The monolithic index introduces a single point of failure - in the compressed file and may limit the number of members or the - total uncompressed size. - - -6.2 Quality of implementation -============================= - -Three related but independent compressor implementations, lzip, clzip -and minilzip/lzlib, are developed concurrently. Every stable release of -any of them is subjected to a hundred hours of intensive testing to -verify that it produces identical output to the other two. This -guarantees that all three implement the same algorithm, and makes it -unlikely that any of them may contain serious undiscovered errors. In -fact, no errors have been discovered in lzip since 2009. - - Just like the lzip format provides 4 factor protection against -undetected data corruption, the development methodology described above -provides 3 factor protection against undetected programming errors in -lzip. - - Lzip automatically uses the smallest possible dictionary size for -each file. In addition to reducing the amount of memory required for -decompression, this feature also minimizes the probability of being -affected by RAM errors during compression. - - Returning a warning status of 2 is a design flaw of compress that -leaked into the design of gzip. Both bzip2 and lzip are free form this -flaw. - - -File: lzip.info, Node: Examples, Next: Problems, Prev: Quality assurance, Up: Top +File: lzip.info, Node: Examples, Next: Problems, Prev: Stream format, Up: Top 7 A small tutorial with examples ******************************** @@ -876,7 +888,7 @@ File: lzip.info, Node: Reference source code, Next: Concept index, Prev: Prob Appendix A Reference source code ******************************** -/* Lzd - Educational decompressor for lzip files +/* Lzd - Educational decompressor for the lzip format Copyright (C) 2013-2015 Antonio Diaz Diaz. 
This program is free software: you have unlimited permission @@ -1133,7 +1145,7 @@ class LZ_decoder } public: - LZ_decoder( const unsigned dict_size ) + explicit LZ_decoder( const unsigned dict_size ) : partial_data_pos( 0 ), dictionary_size( dict_size ), @@ -1160,7 +1172,7 @@ void LZ_decoder::flush_data() crc32.update_buf( crc_, buffer + stream_pos, size ); errno = 0; if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) - { std::fprintf( stderr, "Write error: %s.\n", std::strerror( errno ) ); + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); std::exit( 1 ); } if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; } stream_pos = pos; @@ -1202,7 +1214,7 @@ bool LZ_decoder::decode_member() // Returns false if error put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); state.set_char(); } - else + else // match or repeated match { int len; if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit @@ -1231,7 +1243,7 @@ bool LZ_decoder::decode_member() // Returns false if error state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); } - else + else // match { rep3 = rep2; rep2 = rep1; rep1 = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); @@ -1273,7 +1285,7 @@ int main( const int argc, const char * const argv[] ) { if( argc > 1 ) { - std::printf( "Lzd %s - Educational decompressor for lzip files.\n", + std::printf( "Lzd %s - Educational decompressor for the lzip format.\n", PROGVERSION ); std::printf( "Study the source to learn how a lzip decompressor works.\n" "See the lzip manual for an explanation of the code.\n" @@ -1300,19 +1312,19 @@ int main( const int argc, const char * const argv[] ) if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { if( first_member ) - { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); + { std::fputs( "Bad magic number (file not in lzip format).\n", stderr ); return 2; } break; } unsigned dict_size = 1 
<< ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) - { std::fprintf( stderr, "Invalid dictionary size in member header\n" ); + { std::fputs( "Invalid dictionary size in member header.\n", stderr ); return 2; } LZ_decoder decoder( dict_size ); if( !decoder.decode_member() ) - { std::fprintf( stderr, "Data error\n" ); return 2; } + { std::fputs( "Data error\n", stderr ); return 2; } File_trailer trailer; for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); @@ -1321,11 +1333,11 @@ int main( const int argc, const char * const argv[] ) unsigned long long data_size = 0; for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } if( crc != decoder.crc() || data_size != decoder.data_position() ) - { std::fprintf( stderr, "CRC error\n" ); return 2; } + { std::fputs( "CRC error\n", stderr ); return 2; } } if( std::fclose( stdout ) != 0 ) - { std::fprintf( stderr, "Can't close stdout: %s.\n", std::strerror( errno ) ); + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); return 1; } return 0; } @@ -1357,16 +1369,16 @@ Concept index  Tag Table: Node: Top208 -Node: Introduction1090 -Node: Algorithm6008 -Node: Invoking lzip8833 -Node: File format14421 -Node: Stream format16806 -Node: Quality assurance26247 -Node: Examples32269 -Node: Problems34230 -Node: Reference source code34760 -Node: Concept index48358 +Node: Introduction1087 +Node: Invoking lzip6060 +Node: Quality assurance11658 +Node: File format18171 +Node: Algorithm20556 +Node: Stream format23382 +Node: Examples32812 +Node: Problems34769 +Node: Reference source code35299 +Node: Concept index48952  End Tag Table diff --git a/doc/lzip.texi b/doc/lzip.texi index ac44ee9..69f44ae 100644 --- a/doc/lzip.texi +++ b/doc/lzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 25 May 2015 -@set VERSION 1.17-rc2 +@set UPDATED 12 July 2015 +@set VERSION 1.17 
@dircategory Data Compression @direntry @@ -36,11 +36,11 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}). @menu * Introduction:: Purpose and features of lzip -* Algorithm:: How lzip compresses the data * Invoking lzip:: Command line interface +* Quality assurance:: Design, development and testing of lzip * File format:: Detailed format of the compressed file +* Algorithm:: How lzip compresses the data * Stream format:: Format of the LZMA stream in lzip files -* Quality assurance:: Design, development and testing of lzip * Examples:: A small tutorial with examples * Problems:: Reporting bugs * Reference source code:: Source code illustrating stream format @@ -70,10 +70,14 @@ availability: @itemize @bullet @item The lzip format provides very safe integrity checking and some data -recovery means. The lziprecover program can repair bit-flip errors (one -of the most common forms of data corruption) in lzip files, and provides -data recovery capabilities, including error-checked merging of damaged -copies of a file. +recovery means. The +@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Data-safety,,lziprecover} +program can repair bit-flip errors (one of the most common forms of data +corruption) in lzip files, and provides data recovery capabilities, +including error-checked merging of damaged copies of a file. +@ifnothtml +@ref{Data safety,,,lziprecover}. +@end ifnothtml @item The lzip format is as simple as possible (but not simpler). The lzip @@ -109,6 +113,11 @@ makes it safer than compressors returning ambiguous warning values (like gzip) when it is used as a back end for other programs like tar or zutils. +Lzip will automatically use the smallest possible dictionary size for +each file without exceeding the given limit. Keep in mind that the +decompression memory requirement is affected at compression time by the +choice of dictionary size limit. 
+ The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary size limit, else 2) plus 9 times the dictionary size really used. The option @@ -116,11 +125,6 @@ limit, else 2) plus 9 times the dictionary size really used. The option of memory required for decompression is about 46 kB larger than the dictionary size really used. -Lzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. - When compressing, lzip replaces every file given in the command line with a compressed version of itself, with the name "original_name.lz". When decompressing, lzip attempts to guess the name for the decompressed @@ -152,8 +156,8 @@ corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. Lzip can produce multi-member files and safely recover, with -lziprecover, the undamaged members in case of file damage. Lzip can also -split the compressed output in volumes of a given size, even when +lziprecover, the undamaged members in case of file damage. Lzip can +also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume compressed tar archives. @@ -162,72 +166,6 @@ automatically creating multi-member output. The members so created are large, about 2 PiB each. -@node Algorithm -@chapter Algorithm -@cindex algorithm - -In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a -concrete algorithm; it is more like "any algorithm using the LZMA coding -scheme". For example, the option '-0' of lzip uses the scheme in almost -the simplest way possible; issuing the longest match it can find, or a -literal byte if it can't find a match. 
Inversely, a much more elaborated -way of finding coding sequences of minimum size than the one currently -used by lzip could be developed, and the resulting sequence could also -be coded using the LZMA coding scheme. - -Lzip currently implements two variants of the LZMA algorithm; fast -(used by option -0) and normal (used by all other compression levels). - -The high compression of LZMA comes from combining two basic, well-proven -compression ideas: sliding dictionaries (LZ77/78) and markov models (the -thing used by every compression algorithm that uses a range encoder or -similar order-0 entropy coder as its last stage) with segregation of -contexts according to what the bits are used for. - -Lzip is a two stage compressor. The first stage is a Lempel-Ziv coder, -which reduces redundancy by translating chunks of data to their -corresponding distance-length pairs. The second stage is a range encoder -that uses a different probability model for each type of data; -distances, lengths, literal bytes, etc. - -Here is how it works, step by step: - -1) The member header is written to the output stream. - -2) The first byte is coded literally, because there are no previous -bytes to which the match finder can refer to. - -3) The main encoder advances to the next byte in the input data and -calls the match finder. - -4) The match finder fills an array with the minimum distances before the -current byte where a match of a given length can be found. - -5) Go back to step 3 until a sequence (formed of pairs, repeated -distances and literal bytes) of minimum price has been formed. Where the -price represents the number of output bits produced. - -6) The range encoder encodes the sequence produced by the main encoder -and sends the produced bytes to the output stream. - -7) Go back to step 3 until the input data are finished or until the -member or volume size limits are reached. - -8) The range encoder is flushed. - -9) The member trailer is written to the output stream. 
- -10) If there are more data to compress, go back to step 1. - -@sp 1 -@noindent -The ideas embodied in lzip are due to (at least) the following people: -Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for -the definition of Markov chains), G.N.N. Martin (for the definition of -range encoding), Igor Pavlov (for putting all the above together in -LZMA), and Julian Seward (for bzip2's CLI). - - @node Invoking lzip @chapter Invoking lzip @cindex invoking @@ -274,7 +212,7 @@ Force overwrite of output files. @item -F @itemx --recompress -Force recompression of files whose name already has the @samp{.lz} or +Force re-compression of files whose name already has the @samp{.lz} or @samp{.tlz} suffix. @item -k @@ -392,6 +330,157 @@ invalid input file, 3 for an internal consistency error (eg, bug) which caused lzip to panic. +@node Quality assurance +@chapter Design, development and testing of lzip +@cindex quality assurance + +There are two ways of constructing a software design. One way is to make +it so simple that there are obviously no deficiencies and the other is +to make it so complicated that there are no obvious deficiencies.@* +--- C.A.R. Hoare + +Lzip has been designed, written and tested with great care to be the +standard general-purpose compressor for unix-like systems. This chapter +describes the lessons learned from previous compressors (gzip and +bzip2), and their application to the design of lzip. + +@sp 1 +@section Format design + +When gzip was designed in 1992, computers and operating systems were +much less capable than they are today. Gzip tried to work around some of +those limitations, like 8.3 file names, with additional fields in its +file format. + +Today those limitations have mostly disappeared, and the format of gzip +has proved to be unnecessarily complicated. It includes fields that were +never used, others that have lost its usefulness, and finally others +that have become too limited. 
+ +Bzip2 was designed 5 years later, and its format is simpler than the one +of gzip. + +Probably the worst defect of the gzip format from the point of view of +data safety is the variable size of its header. If the byte at offset 3 +(flags) of a gzip member gets corrupted, it may become very difficult to +recover the data, even if the compressed blocks are intact, because it +can't be known with certainty where the compressed blocks begin. + +By contrast, the header of a lzip member has a fixed length of 6. The +lzma stream in a lzip member always starts at offset 6, making it +trivial to recover the data even if the whole header becomes corrupt. + +Bzip2 also provides a header of fixed length and marks the begin and end +of each compressed block with six magic bytes, making it possible to +find the compressed blocks even in case of file damage. But bzip2 does +not store the size of each compressed block, as lzip does. + +Lzip provides better data recovery capabilities than any other gzip-like +compressor because its format has been designed from the beginning to be +simple and safe. It would be very difficult to write an automatic +recovery tool like lziprecover for the gzip format. And, as far as I +know, it has never been written. + +The lzip format is designed for long-term archiving. Therefore it +excludes any unneeded features that may interfere with the future +extraction of the uncompressed data. + +@sp 1 +@subsection Gzip format (mis)features not present in lzip + +@table @samp +@item Multiple algorithms + +Gzip provides a CM (Compression Method) field that has never been used +because it is a bad idea to begin with. New compression methods may +require additional fields, making it impossible to implement new methods +and, at the same time, keep the same format. This field does not solve +the problem of format proliferation; it just makes the problem less +obvious.
+ +@item Optional fields in header + +Unless special precautions are taken, optional fields are generally a +bad idea because they produce a header of variable size. The gzip header +has 2 fields that, in addition to being optional, are zero-terminated. +This means that if any byte inside the field gets zeroed, or if the +terminating zero gets altered, gzip won't be able to find either the +header CRC or the compressed blocks. + +@item Optional CRC for the header + +Using an optional checksum for the header is not only a bad idea, it is +an error; it may prevent the extraction of perfectly good data. For +example, if the checksum is used and the bit enabling it is reset by a +bit-flip, the header will appear to be intact (in spite of being +corrupt) while the compressed blocks will appear to be totally +unrecoverable (in spite of being intact). Very misleading indeed. + +@end table + +@subsection Lzip format improvements over gzip and bzip2 + +@table @samp +@item 64-bit size field + +Probably the most frequently reported shortcoming of the gzip format is +that it only stores the least significant 32 bits of the uncompressed +size. The size of any file larger than 4 GiB gets truncated. + +Bzip2 does not store the uncompressed size of the file. + +The lzip format provides a 64-bit field for the uncompressed size. +Additionally, lzip produces multi-member output automatically when the +size is too large for a single member, allowing for an unlimited +uncompressed size. + +@item Distributed index + +The lzip format provides a distributed index that, among other things, +helps plzip to decompress several times faster than pigz and helps +lziprecover do its job. Neither the gzip format nor the bzip2 format +provides an index. + +A distributed index is safer and more scalable than a monolithic index. +The monolithic index introduces a single point of failure in the +compressed file and may limit the number of members or the total +uncompressed size.
+ +@end table + +@section Quality of implementation + +@table @samp +@item Multiple implementations + +Just like the lzip format provides 4 factor protection against +undetected data corruption, the development methodology of the lzip +family of compressors provides 3 factor protection against undetected +programming errors. + +Three related but independent compressor implementations, lzip, clzip +and minilzip/lzlib, are developed concurrently. Every stable release of +any of them is subjected to a hundred hours of intensive testing to +verify that it produces identical output to the other two. This +guarantees that all three implement the same algorithm, and makes it +unlikely that any of them may contain serious undiscovered errors. In +fact, no errors have been discovered in lzip since 2009. + +@item Dictionary size + +Lzip automatically uses the smallest possible dictionary size for each +file. In addition to reducing the amount of memory required for +decompression, this feature also minimizes the probability of being +affected by RAM errors during compression. + +@item Exit status + +Returning a warning status of 2 is a design flaw of compress that leaked +into the design of gzip. Both bzip2 and lzip are free from this flaw. + +@end table + + @node File format @chapter File format @cindex file format @@ -468,6 +557,72 @@ facilitates safe recovery of undamaged members from multi-member files. @end table +@node Algorithm +@chapter Algorithm +@cindex algorithm + +In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a +concrete algorithm; it is more like "any algorithm using the LZMA coding +scheme". For example, the option @samp{-0} of lzip uses the scheme in almost +the simplest way possible; issuing the longest match it can find, or a +literal byte if it can't find a match. 
Inversely, a much more elaborated +way of finding coding sequences of minimum size than the one currently +used by lzip could be developed, and the resulting sequence could also +be coded using the LZMA coding scheme. + +Lzip currently implements two variants of the LZMA algorithm; fast +(used by option @samp{-0}) and normal (used by all other compression levels). + +The high compression of LZMA comes from combining two basic, well-proven +compression ideas: sliding dictionaries (LZ77/78) and markov models (the +thing used by every compression algorithm that uses a range encoder or +similar order-0 entropy coder as its last stage) with segregation of +contexts according to what the bits are used for. + +Lzip is a two stage compressor. The first stage is a Lempel-Ziv coder, +which reduces redundancy by translating chunks of data to their +corresponding distance-length pairs. The second stage is a range encoder +that uses a different probability model for each type of data; +distances, lengths, literal bytes, etc. + +Here is how it works, step by step: + +1) The member header is written to the output stream. + +2) The first byte is coded literally, because there are no previous +bytes to which the match finder can refer. + +3) The main encoder advances to the next byte in the input data and +calls the match finder. + +4) The match finder fills an array with the minimum distances before the +current byte where a match of a given length can be found. + +5) Go back to step 3 until a sequence (formed of pairs, repeated +distances and literal bytes) of minimum price has been formed. Where the +price represents the number of output bits produced. + +6) The range encoder encodes the sequence produced by the main encoder +and sends the produced bytes to the output stream. + +7) Go back to step 3 until the input data are finished or until the +member or volume size limits are reached. + +8) The range encoder is flushed.
+ +9) The member trailer is written to the output stream. + +10) If there are more data to compress, go back to step 1. + +@sp 1 +@noindent +The ideas embodied in lzip are due to (at least) the following people: +Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for +the definition of Markov chains), G.N.N. Martin (for the definition of +range encoding), Igor Pavlov (for putting all the above together in +LZMA), and Julian Seward (for bzip2's CLI). + + @node Stream format @chapter Format of the LZMA stream in lzip files @cindex format of the LZMA stream @@ -690,140 +845,6 @@ sequences (matches, repeated matches, and literal bytes), until the "End Of Stream" marker is decoded. -@node Quality assurance -@chapter Design, development and testing of lzip -@cindex quality assurance - -There are two ways of constructing a software design. One way is to make -it so simple that there are obviously no deficiencies and the other is -to make it so complicated that there are no obvious deficiencies.@* ---- C.A.R. Hoare - -Lzip has been designed, written and tested with great care to be the -standard general-purpose compressor for unix-like systems. This chapter -describes the lessons learned from previous compressors (gzip and -bzip2), and their application to the design of lzip. - -@sp 1 -@section Format design - -When gzip was designed in 1992, computers and operating systems were -much less capable than they are today. Gzip tried to work around some of -those limitations, like 8.3 file names, with additional fields in its -file format. - -Today those limitations have mostly disappeared, and the format of gzip -has proved to be unnecessarily complicated. It includes fields that were -never used, others that have lost its usefulness, and finally others -that have become too limited. - -Bzip2 was designed 5 years later, and its format is in some aspects -simpler than the one of gzip. 
But bzip2 also shows complexities in its -file format which slow down decompression and, in retrospect, are -unnecessary. - -Probably the worst defect of the gzip format from the point of view of -data safety is the variable size of its header. If the byte at offset 3 -(flags) of a gzip member gets corrupted, it mat become very difficult to -recover the data, even if the compressed blocks are intact, because it -can't be known with certainty where the compressed blocks begin. - -By contrast, the lzma stream in a lzip member always starts at offset 6, -making it trivial to recover the data even if the whole header becomes -corrupt. - -Lzip provides better data recovery capabilities than any other gzip-like -compressor because its format has been designed from the beginning to be -simple and safe. It would be very difficult to write an automatic -recovery tool like lziprecover for the gzip format. And, as far as I -know, it has never been writen. - -The lzip format is designed for long-term archiving. Therefore it -excludes any unneeded features that may interfere with the future -extraction of the uncompressed data. - -@sp 1 -@subsection Gzip format (mis)features not present in lzip - -@table @samp -@item Multiple algorithms - -Gzip provides a CM (Compression Method) field that has never been used -because it is a bad idea to begin with. New compression methods may -require additional fields, making it impossible to implement new methods -and, at the same time, keep the same format. This field does not solve -the problem of format proliferation; it just makes the problem less -obvious. - -@item Optional fields in header - -Unless special precautions are taken, optional fields are generally a -bad idea because they produce a header of variable size. The gzip header -has 2 fields that, in addition to being optional, are zero-terminated. 
-This means that if any byte inside the field gets zeroed, or if the -terminating zero gets altered, gzip won't be able to find neither the -header CRC nor the compressed blocks. - -Using an optional checksum for the header is not only a bad idea, it is -an error; it may prevent the extraction of perfectly good data. For -example, if the checksum is used and the bit enabling it is reset by a -bit-flip, the header will appear to be intact (in spite of being -corrupt) while the compressed blocks will appear to be totally -unrecoverable (in spite of being intact). Very misleading indeed. - -@end table - -@subsection Lzip format improvements over gzip - -@table @samp -@item 64-bit size field - -Probably the most frequently reported shortcoming of the gzip format is -that it only stores the least significant 32 bits of the uncompressed -size. The size of any file larger than 4 GiB gets truncated. - -The lzip format provides a 64-bit field for the uncompressed size. -Additionaly, lzip produces multi-member output automatically when the -size is too large for a single member, allowing an unlimited -uncompressed size. - -@item Distributed index - -The lzip format provides a distributed index that, among other things, -helps plzip to decompress several times faster than pigz and helps -lziprecover do its job. The gzip format does not provide an index. - -A distributed index is safer and more scalable than a monolithic index. -The monolithic index introduces a single point of failure in the -compressed file and may limit the number of members or the total -uncompressed size. - -@end table - -@section Quality of implementation - -Three related but independent compressor implementations, lzip, clzip -and minilzip/lzlib, are developed concurrently. Every stable release of -any of them is subjected to a hundred hours of intensive testing to -verify that it produces identical output to the other two. 
This -guarantees that all three implement the same algorithm, and makes it -unlikely that any of them may contain serious undiscovered errors. In -fact, no errors have been discovered in lzip since 2009. - -Just like the lzip format provides 4 factor protection against -undetected data corruption, the development methodology described above -provides 3 factor protection against undetected programming errors in -lzip. - -Lzip automatically uses the smallest possible dictionary size for each -file. In addition to reducing the amount of memory required for -decompression, this feature also minimizes the probability of being -affected by RAM errors during compression. - -Returning a warning status of 2 is a design flaw of compress that leaked -into the design of gzip. Both bzip2 and lzip are free form this flaw. - - @node Examples @chapter A small tutorial with examples @cindex examples @@ -947,7 +968,7 @@ find by running @w{@code{lzip --version}}. @cindex reference source code @verbatim -/* Lzd - Educational decompressor for lzip files +/* Lzd - Educational decompressor for the lzip format Copyright (C) 2013-2015 Antonio Diaz Diaz. 
This program is free software: you have unlimited permission @@ -1204,7 +1225,7 @@ class LZ_decoder } public: - LZ_decoder( const unsigned dict_size ) + explicit LZ_decoder( const unsigned dict_size ) : partial_data_pos( 0 ), dictionary_size( dict_size ), @@ -1231,7 +1252,7 @@ void LZ_decoder::flush_data() crc32.update_buf( crc_, buffer + stream_pos, size ); errno = 0; if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) - { std::fprintf( stderr, "Write error: %s.\n", std::strerror( errno ) ); + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); std::exit( 1 ); } if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; } stream_pos = pos; @@ -1273,7 +1294,7 @@ bool LZ_decoder::decode_member() // Returns false if error put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); state.set_char(); } - else + else // match or repeated match { int len; if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit @@ -1302,7 +1323,7 @@ bool LZ_decoder::decode_member() // Returns false if error state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); } - else + else // match { rep3 = rep2; rep2 = rep1; rep1 = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); @@ -1344,7 +1365,7 @@ int main( const int argc, const char * const argv[] ) { if( argc > 1 ) { - std::printf( "Lzd %s - Educational decompressor for lzip files.\n", + std::printf( "Lzd %s - Educational decompressor for the lzip format.\n", PROGVERSION ); std::printf( "Study the source to learn how a lzip decompressor works.\n" "See the lzip manual for an explanation of the code.\n" @@ -1371,19 +1392,19 @@ int main( const int argc, const char * const argv[] ) if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { if( first_member ) - { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); + { std::fputs( "Bad magic number (file not in lzip format).\n", stderr ); return 2; } break; } unsigned dict_size = 1 
<< ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) - { std::fprintf( stderr, "Invalid dictionary size in member header\n" ); + { std::fputs( "Invalid dictionary size in member header.\n", stderr ); return 2; } LZ_decoder decoder( dict_size ); if( !decoder.decode_member() ) - { std::fprintf( stderr, "Data error\n" ); return 2; } + { std::fputs( "Data error\n", stderr ); return 2; } File_trailer trailer; for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); @@ -1392,11 +1413,11 @@ int main( const int argc, const char * const argv[] ) unsigned long long data_size = 0; for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } if( crc != decoder.crc() || data_size != decoder.data_position() ) - { std::fprintf( stderr, "CRC error\n" ); return 2; } + { std::fputs( "CRC error\n", stderr ); return 2; } } if( std::fclose( stdout ) != 0 ) - { std::fprintf( stderr, "Can't close stdout: %s.\n", std::strerror( errno ) ); + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); return 1; } return 0; } diff --git a/encoder.cc b/encoder.cc index 3e707f3..51c0069 100644 --- a/encoder.cc +++ b/encoder.cc @@ -75,7 +75,7 @@ int LZ_encoder::get_match_pairs( Pair * pairs ) while( maxlen < len_limit && data[maxlen-delta] == data[maxlen] ) ++maxlen; pairs[num_pairs-1].len = maxlen; - if( maxlen >= len_limit ) pairs = 0; /* done. now just skip */ + if( maxlen >= len_limit ) pairs = 0; // done. 
now just skip } if( maxlen < 3 ) maxlen = 3; } @@ -269,10 +269,10 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], } int cur = 0; - while( true ) /* price optimization loop */ + while( true ) // price optimization loop { move_pos(); - if( ++cur >= num_trials ) /* no more initialized trials */ + if( ++cur >= num_trials ) // no more initialized trials { backward( cur ); return cur; @@ -287,7 +287,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], return cur; } - /* give final values to current trial */ + // give final values to current trial Trial & cur_trial = trials[cur]; State cur_state; { @@ -298,7 +298,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], if( prev_index2 == single_step_trial ) { cur_state = trials[prev_index].state; - if( prev_index + 1 == cur ) /* len == 1 */ + if( prev_index + 1 == cur ) // len == 1 { if( dis == 0 ) cur_state.set_short_rep(); else cur_state.set_char(); // literal @@ -306,14 +306,14 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], else if( dis < num_rep_distances ) cur_state.set_rep(); else cur_state.set_match(); } - else if( prev_index2 == dual_step_trial ) /* dis == 0 */ + else if( prev_index2 == dual_step_trial ) // dis == 0 { --prev_index; cur_state = trials[prev_index].state; cur_state.set_char(); cur_state.set_rep(); } - else /* if( prev_index2 >= 0 ) */ + else // if( prev_index2 >= 0 ) { prev_index = prev_index2; cur_state = trials[prev_index].state; @@ -340,7 +340,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], else next_price += price_matched( prev_byte, cur_byte, match_byte ); - /* try last updates to next trial */ + // try last updates to next trial Trial & next_trial = trials[cur+1]; next_trial.update( next_price, -1, cur ); // literal @@ -366,7 +366,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], const int len_limit = std::min( match_len_limit, triable_bytes ); - 
/* try literal + rep0 */ + // try literal + rep0 if( match_byte != cur_byte && next_trial.prev_index != cur ) { const uint8_t * const data = ptr_to_current_pos(); @@ -390,7 +390,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], int start_len = min_match_len; - /* try rep distances */ + // try rep distances for( int rep = 0; rep < num_rep_distances; ++rep ) { const uint8_t * const data = ptr_to_current_pos(); @@ -407,9 +407,9 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[cur+i].update( price + rep_len_prices.price( i, pos_state ), rep, cur ); - if( rep == 0 ) start_len = len + 1; /* discard shorter matches */ + if( rep == 0 ) start_len = len + 1; // discard shorter matches - /* try rep + literal + rep0 */ + // try rep + literal + rep0 int len2 = len + 1; const int limit = std::min( match_len_limit + len2, triable_bytes ); while( len2 < limit && data[len2-dis] == data[len2] ) ++len2; @@ -431,7 +431,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[cur+len+1+len2].update3( price, rep, cur + len + 1, cur ); } - /* try matches */ + // try matches if( newlen >= start_len && newlen <= len_limit ) { const int normal_match_price = match_price + @@ -449,7 +449,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[cur+len].update( price, dis + num_rep_distances, cur ); - /* try match + literal + rep0 */ + // try match + literal + rep0 if( len == pairs[i].len ) { const uint8_t * const data = ptr_to_current_pos(); @@ -500,7 +500,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0; if( data_position() != 0 || renc.member_position() != File_header::size ) - return false; /* can be called only once */ + return false; // can be called only once if( !data_finished() ) // encode first byte { @@ -517,7 +517,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) { 
if( price_counter <= 0 && pending_num_pairs == 0 ) { - price_counter = price_count; /* recalculate prices every these bytes */ + price_counter = price_count; // recalculate prices every these bytes if( dis_price_counter <= 0 ) { dis_price_counter = dis_price_count; update_distance_prices(); } if( align_price_counter <= 0 ) @@ -531,7 +531,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) } int ahead = sequence_optimizer( reps, state ); - if( ahead <= 0 ) return false; /* can't happen */ + if( ahead <= 0 ) return false; // can't happen price_counter -= ahead; for( int i = 0; ahead > 0; ) @@ -542,7 +542,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) bool bit = ( dis < 0 ); renc.encode_bit( bm_match[state()][pos_state], !bit ); - if( bit ) /* literal byte */ + if( bit ) // literal byte { const uint8_t prev_byte = peek( ahead + 1 ); const uint8_t cur_byte = peek( ahead ); @@ -556,13 +556,13 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) } state.set_char(); } - else /* match or repeated match */ + else // match or repeated match { crc32.update_buf( crc_, ptr_to_current_pos() - ahead, len ); mtf_reps( dis, reps ); bit = ( dis < num_rep_distances ); renc.encode_bit( bm_rep[state()], bit ); - if( bit ) /* repeated match */ + if( bit ) // repeated match { bit = ( dis == 0 ); renc.encode_bit( bm_rep0[state()], !bit ); @@ -582,7 +582,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) state.set_rep(); } } - else /* match */ + else // match { encode_pair( dis - num_rep_distances, len, pos_state ); if( get_slot( dis - num_rep_distances ) >= end_dis_model ) diff --git a/encoder.h b/encoder.h index 81cc1e0..9579a85 100644 --- a/encoder.h +++ b/encoder.h @@ -76,7 +76,7 @@ public: class LZ_encoder : public LZ_encoder_base { - struct Pair /* distance-length pair */ + struct Pair // distance-length pair { int dis; int len; @@ -90,12 +90,12 @@ class LZ_encoder : public 
LZ_encoder_base struct Trial { State state; - int price; /* dual use var; cumulative price, match length */ - int dis; /* rep index or match distance. (-1 for literal) */ - int prev_index; /* index of prev trial in trials[] */ - int prev_index2; /* -2 trial is single step */ - /* -1 literal + rep0 */ - /* >= 0 ( rep or match ) + literal + rep0 */ + int price; // dual use var; cumulative price, match length + int dis; // rep index or match distance. (-1 for literal) + int prev_index; // index of prev trial in trials[] + int prev_index2; // -2 trial is single step + // -1 literal + rep0 + // >= 0 ( rep or match ) + literal + rep0 int reps[num_rep_distances]; void update( const int pr, const int distance, const int p_i ) @@ -145,7 +145,7 @@ class LZ_encoder : public LZ_encoder_base int get_match_pairs( Pair * pairs = 0 ); void update_distance_prices(); - /* move-to-front dis in/into reps if( dis > 0 ) */ + // move-to-front dis in/into reps if( dis > 0 ) static void mtf_reps( const int dis, int reps[num_rep_distances] ) { if( dis >= num_rep_distances ) @@ -255,7 +255,7 @@ class LZ_encoder : public LZ_encoder_base const State state ); enum { before = max_num_trials + 1, - /* bytes to keep in buffer after pos */ + // bytes to keep in buffer after pos after_size = ( 2 * max_match_len ) + 1, dict_factor = 2, num_prev_positions3 = 1 << 16, diff --git a/encoder_base.cc b/encoder_base.cc index 982f12c..a8bbbd7 100644 --- a/encoder_base.cc +++ b/encoder_base.cc @@ -142,7 +142,7 @@ void Range_encoder::flush_data() } - /* End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) */ + // End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) void LZ_encoder_base::full_flush( const State state ) { const int pos_state = data_position() & pos_state_mask; diff --git a/encoder_base.h b/encoder_base.h index 27c7a90..b032fae 100644 --- a/encoder_base.h +++ b/encoder_base.h @@ -60,14 +60,14 @@ public: for( int i = 0; i < bit_model_total >> price_step_bits; ++i ) { 
unsigned val = ( i * price_step ) + ( price_step / 2 ); - int bits = 0; /* base 2 logarithm of val */ + int bits = 0; // base 2 logarithm of val for( int j = 0; j < price_shift_bits; ++j ) { val = val * val; bits <<= 1; while( val >= 1 << 16 ) { val >>= 1; ++bits; } } - bits += 15; /* remaining bits in val */ + bits += 15; // remaining bits in val data[i] = ( bit_model_total_bits << price_shift_bits ) - bits; } } @@ -155,7 +155,7 @@ inline int price_matched( const Bit_model bm[], int symbol, int match_byte ) symbol <<= 1; const int bit = symbol & 0x100; price += price_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */ + mask &= ~(match_byte ^ symbol); // if( match_bit != bit ) mask = 0; } while( symbol < 0x10000 ); return price; @@ -172,21 +172,21 @@ class Matchfinder_base protected: unsigned long long partial_data_pos; - uint8_t * buffer; /* input buffer */ - int32_t * prev_positions; /* 1 + last seen position of key. else 0 */ - int32_t * pos_array; /* may be tree or chain */ - const int before_size; /* bytes to keep in buffer before dictionary */ + uint8_t * buffer; // input buffer + int32_t * prev_positions; // 1 + last seen position of key. 
else 0 + int32_t * pos_array; // may be tree or chain + const int before_size; // bytes to keep in buffer before dictionary int buffer_size; - int dictionary_size; /* bytes to keep in buffer before pos */ - int pos; /* current pos in buffer */ - int cyclic_pos; /* cycles through [0, dictionary_size] */ - int stream_pos; /* first byte not yet read from file */ - int pos_limit; /* when reached, a new block must be read */ + int dictionary_size; // bytes to keep in buffer before pos + int pos; // current pos in buffer + int cyclic_pos; // cycles through [0, dictionary_size] + int stream_pos; // first byte not yet read from file + int pos_limit; // when reached, a new block must be read int key4_mask; - int num_prev_positions; /* size of prev_positions */ + int num_prev_positions; // size of prev_positions int pos_array_size; - const int infd; /* input file descriptor */ - bool at_stream_end; /* stream_pos shows real end of file */ + const int infd; // input file descriptor + bool at_stream_end; // stream_pos shows real end of file Matchfinder_base( const int before, const int dict_size, const int after_size, const int dict_factor, @@ -228,11 +228,11 @@ class Range_encoder enum { buffer_size = 65536 }; uint64_t low; unsigned long long partial_member_pos; - uint8_t * const buffer; /* output buffer */ - int pos; /* current pos in buffer */ + uint8_t * const buffer; // output buffer + int pos; // current pos in buffer uint32_t range; unsigned ff_count; - const int outfd; /* output file descriptor */ + const int outfd; // output file descriptor uint8_t cache; File_header header; @@ -377,7 +377,7 @@ public: symbol <<= 1; const int bit = symbol & 0x100; encode_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */ + mask &= ~(match_byte ^ symbol); // if( match_bit != bit ) mask = 0; } while( symbol < 0x10000 ); } @@ -406,7 +406,7 @@ class LZ_encoder_base : public Matchfinder_base { protected: enum { 
max_marker_size = 16, - num_rep_distances = 4 }; /* must be 4 */ + num_rep_distances = 4 }; // must be 4 uint32_t crc_; diff --git a/fast_encoder.cc b/fast_encoder.cc index 1ecd169..90361e9 100644 --- a/fast_encoder.cc +++ b/fast_encoder.cc @@ -88,7 +88,7 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0; if( data_position() != 0 || renc.member_position() != File_header::size ) - return false; /* can be called only once */ + return false; // can be called only once if( !data_finished() ) // encode first byte { @@ -179,7 +179,7 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) } } - /* literal byte */ + // literal byte renc.encode_bit( bm_match[state()][pos_state], 0 ); if( state.is_char() ) encode_literal( prev_byte, cur_byte ); diff --git a/fast_encoder.h b/fast_encoder.h index b26e388..36177df 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -17,7 +17,7 @@ class FLZ_encoder : public LZ_encoder_base { - int key4; /* key made from latest 4 bytes */ + int key4; // key made from latest 4 bytes void reset_key4() { @@ -45,7 +45,7 @@ class FLZ_encoder : public LZ_encoder_base enum { before = 0, dict_size = 65536, - /* bytes to keep in buffer after pos */ + // bytes to keep in buffer after pos after_size = max_match_len, dict_factor = 16, num_prev_positions23 = 0, diff --git a/lzip.h b/lzip.h index 4a8bc98..9c16ef2 100644 --- a/lzip.h +++ b/lzip.h @@ -40,7 +40,7 @@ public: enum { min_dictionary_bits = 12, - min_dictionary_size = 1 << min_dictionary_bits, /* >= modeled_distances */ + min_dictionary_size = 1 << min_dictionary_bits, // >= modeled_distances max_dictionary_bits = 29, max_dictionary_size = 1 << max_dictionary_bits, literal_context_bits = 3, @@ -52,7 +52,7 @@ enum { dis_slot_bits = 6, start_dis_model = 4, end_dis_model = 14, - modeled_distances = 1 << (end_dis_model / 2), /* 128 */ + modeled_distances = 1 << (end_dis_model / 2), // 128 dis_align_bits = 
4, dis_align_size = 1 << dis_align_bits, @@ -64,8 +64,8 @@ enum { len_high_symbols = 1 << len_high_bits, max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, - min_match_len = 2, /* must be 2 */ - max_match_len = min_match_len + max_len_symbols - 1, /* 273 */ + min_match_len = 2, // must be 2 + max_match_len = min_match_len + max_len_symbols - 1, // 273 min_match_len_limit = 5 }; inline int get_len_state( const int len ) @@ -185,9 +185,9 @@ const uint8_t magic_string[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" struct File_header { - uint8_t data[6]; /* 0-3 magic bytes */ - /* 4 version */ - /* 5 coded_dict_size */ + uint8_t data[6]; // 0-3 magic bytes + // 4 version + // 5 coded_dict_size enum { size = 6 }; void set_magic() { std::memcpy( data, magic_string, 4 ); data[4] = 1; } @@ -227,9 +227,9 @@ struct File_header struct File_trailer { - uint8_t data[20]; /* 0-3 CRC32 of the uncompressed data */ - /* 4-11 size of the uncompressed data */ - /* 12-19 member size including header and trailer */ + uint8_t data[20]; // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer static int size( const int version = 1 ) { return ( ( version >= 1 ) ? 20 : 12 ); } diff --git a/main.cc b/main.cc index 27cc156..ac07852 100644 --- a/main.cc +++ b/main.cc @@ -83,8 +83,8 @@ struct { const char * from; const char * to; } const known_extensions[] = { struct Lzma_options { - int dictionary_size; /* 4 KiB .. 512 MiB */ - int match_len_limit; /* 5 .. 273 */ + int dictionary_size; // 4 KiB .. 512 MiB + int match_len_limit; // 5 .. 
273 }; enum Mode { m_compress, m_decompress, m_test }; @@ -108,7 +108,7 @@ void show_help() " -c, --stdout send output to standard output\n" " -d, --decompress decompress\n" " -f, --force overwrite existing output files\n" - " -F, --recompress force recompression of compressed files\n" + " -F, --recompress force re-compression of compressed files\n" " -k, --keep keep (don't delete) input files\n" " -m, --match-length= set match length limit in bytes [36]\n" " -o, --output= if reading stdin, place the output into \n" @@ -265,7 +265,7 @@ int open_instream( const char * const name, struct stat * const in_statsp, if( infd < 0 ) { if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Can't open input file '%s': %s.\n", + std::fprintf( stderr, "%s: Can't open input file '%s': %s\n", program_name, name, std::strerror( errno ) ); } else @@ -282,7 +282,7 @@ int open_instream( const char * const name, struct stat * const in_statsp, std::fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", program_name, name, ( can_read && !no_ofile ) ? 
- " and '--stdout' was not specified" : "" ); + ",\n and '--stdout' was not specified" : "" ); close( infd ); infd = -1; } @@ -314,7 +314,7 @@ void set_d_outname( const std::string & name, const int i ) } output_filename = name; output_filename += ".out"; if( verbosity >= 1 ) - std::fprintf( stderr, "%s: Can't guess original name for '%s' -- using '%s'.\n", + std::fprintf( stderr, "%s: Can't guess original name for '%s' -- using '%s'\n", program_name, name.c_str(), output_filename.c_str() ); } @@ -331,7 +331,7 @@ bool open_outstream( const bool force ) std::fprintf( stderr, "%s: Output file '%s' already exists, skipping.\n", program_name, output_filename.c_str() ); else - std::fprintf( stderr, "%s: Can't create output file '%s': %s.\n", + std::fprintf( stderr, "%s: Can't create output file '%s': %s\n", program_name, output_filename.c_str(), std::strerror( errno ) ); } return ( outfd >= 0 ); @@ -371,14 +371,14 @@ void cleanup_and_fail( const int retval ) } - /* Set permissions, owner and times. */ + // Set permissions, owner and times. void close_and_set_permissions( const struct stat * const in_statsp ) { bool warning = false; if( in_statsp ) { const mode_t mode = in_statsp->st_mode; - /* fchown will in many cases return with EPERM, which can be safely ignored. */ + // fchown will in many cases return with EPERM, which can be safely ignored. if( fchown( outfd, in_statsp->st_uid, in_statsp->st_gid ) == 0 ) { if( fchmod( outfd, mode ) != 0 ) warning = true; } else @@ -440,7 +440,7 @@ int compress( const unsigned long long member_size, } unsigned long long in_size = 0, out_size = 0, partial_volume_size = 0; - while( true ) /* encode one member per iteration */ + while( true ) // encode one member per iteration { const unsigned long long size = ( volume_size > 0 ) ? 
std::min( member_size, volume_size - partial_volume_size ) : member_size; @@ -472,7 +472,7 @@ int compress( const unsigned long long member_size, if( retval == 0 && verbosity >= 1 ) { if( in_size == 0 || out_size == 0 ) - std::fprintf( stderr, " no data compressed.\n" ); + std::fputs( " no data compressed.\n", stderr ); else std::fprintf( stderr, "%6.3f:1, %6.3f bits/byte, " "%5.2f%% saved, %llu in, %llu out.\n", @@ -582,23 +582,20 @@ int decompress( const int infd, const Pretty_print & pp, const bool testing ) if( verbosity >= 0 && result <= 2 ) { pp(); - if( result == 2 ) - std::fprintf( stderr, "File ends unexpectedly at pos %llu.\n", - partial_file_pos ); - else - std::fprintf( stderr, "Decoder error at pos %llu.\n", - partial_file_pos ); + std::fprintf( stderr, "%s at pos %llu\n", ( result == 2 ) ? + "File ends unexpectedly" : "Decoder error", + partial_file_pos ); } retval = 2; break; } if( verbosity >= 2 ) - { std::fprintf( stderr, testing ? "ok\n" : "done\n" ); pp.reset(); } + { std::fputs( testing ? "ok\n" : "done\n", stderr ); pp.reset(); } } } catch( std::bad_alloc ) { pp( "Not enough memory." ); retval = 1; } catch( Error e ) { pp(); show_error( e.msg, errno ); retval = 1; } if( verbosity == 1 && retval == 0 ) - std::fprintf( stderr, testing ? "ok\n" : "done\n" ); + std::fputs( testing ? 
"ok\n" : "done\n", stderr ); return retval; } @@ -631,8 +628,8 @@ void show_error( const char * const msg, const int errcode, const bool help ) { std::fprintf( stderr, "%s: %s", program_name, msg ); if( errcode > 0 ) - std::fprintf( stderr, ": %s.", std::strerror( errcode ) ); - std::fprintf( stderr, "\n" ); + std::fprintf( stderr, ": %s", std::strerror( errcode ) ); + std::fputc( '\n', stderr ); } if( help ) std::fprintf( stderr, "Try '%s --help' for more information.\n", @@ -654,14 +651,14 @@ void show_progress( const unsigned long long partial_size, const Pretty_print * const p, const unsigned long long cfile_size ) { - static unsigned long long csize = 0; /* file_size / 100 */ + static unsigned long long csize = 0; // file_size / 100 static unsigned long long psize = 0; static const Matchfinder_base * mb = 0; static const Pretty_print * pp = 0; if( verbosity >= 2 ) { - if( m ) /* initialize static vars */ + if( m ) // initialize static vars { csize = cfile_size; psize = partial_size; mb = m; pp = p; } if( mb && pp ) { @@ -681,16 +678,16 @@ int main( const int argc, const char * const argv[] ) to the corresponding LZMA compression modes. 
*/ const Lzma_options option_mapping[] = { - { 1 << 16, 16 }, /* -0 entry values not used */ - { 1 << 20, 5 }, /* -1 */ - { 3 << 19, 6 }, /* -2 */ - { 1 << 21, 8 }, /* -3 */ - { 3 << 20, 12 }, /* -4 */ - { 1 << 22, 20 }, /* -5 */ - { 1 << 23, 36 }, /* -6 */ - { 1 << 24, 68 }, /* -7 */ - { 3 << 23, 132 }, /* -8 */ - { 1 << 25, 273 } }; /* -9 */ + { 1 << 16, 16 }, // -0 entry values not used + { 1 << 20, 5 }, // -1 + { 3 << 19, 6 }, // -2 + { 1 << 21, 8 }, // -3 + { 3 << 20, 12 }, // -4 + { 1 << 22, 20 }, // -5 + { 1 << 23, 36 }, // -6 + { 1 << 24, 68 }, // -7 + { 3 << 23, 132 }, // -8 + { 1 << 25, 273 } }; // -9 Lzma_options encoder_options = option_mapping[6]; // default = "-6" const unsigned long long max_member_size = 0x0008000000000000ULL; const unsigned long long max_volume_size = 0x4000000000000000ULL; @@ -746,7 +743,7 @@ int main( const int argc, const char * const argv[] ) for( ; argind < parser.arguments(); ++argind ) { const int code = parser.code( argind ); - if( !code ) break; /* no more options */ + if( !code ) break; // no more options const std::string & arg = parser.argument( argind ); switch( code ) { @@ -777,7 +774,7 @@ int main( const int argc, const char * const argv[] ) case 'V': show_version(); return 0; default : internal_error( "uncaught option." ); } - } /* end process options */ + } // end process options #if defined(__MSVCRT__) || defined(__OS2__) setmode( STDIN_FILENO, O_BINARY ); diff --git a/testsuite/check.sh b/testsuite/check.sh index 1f9b9e4..ba77d0a 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,6 +1,6 @@ #! /bin/sh # check script for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2014 Antonio Diaz Diaz. +# Copyright (C) 2008-2015 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -79,7 +79,6 @@ printf . cat in in > in2 || framework_failure "${LZIP}" -o copy2 < in2 || fail=1 "${LZIP}" -t copy2.lz || fail=1 -printf . 
"${LZIP}" -cd copy2.lz > copy2 || fail=1 cmp in2 copy2 || fail=1 printf . -- cgit v1.2.3