summaryrefslogtreecommitdiffstats
path: root/doc/lzip.texi
diff options
context:
space:
mode:
Diffstat (limited to 'doc/lzip.texi')
-rw-r--r--doc/lzip.texi187
1 files changed, 122 insertions, 65 deletions
diff --git a/doc/lzip.texi b/doc/lzip.texi
index 845cb42..27feeff 100644
--- a/doc/lzip.texi
+++ b/doc/lzip.texi
@@ -6,8 +6,8 @@
@finalout
@c %**end of header
-@set UPDATED 13 August 2015
-@set VERSION 1.18-pre1
+@set UPDATED 14 May 2016
+@set VERSION 1.18
@dircategory Data Compression
@direntry
@@ -49,7 +49,7 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}).
@end menu
@sp 1
-Copyright @copyright{} 2008-2015 Antonio Diaz Diaz.
+Copyright @copyright{} 2008-2016 Antonio Diaz Diaz.
This manual is free documentation: you have unlimited permission
to copy, distribute and modify it.
@@ -100,14 +100,14 @@ corrupt byte near the beginning is a thing of the past.
The member trailer stores the 32-bit CRC of the original data, the size
of the original data and the size of the member. These values, together
-with the value remaining in the range decoder and the end-of-stream
-marker, provide a 4 factor integrity checking which guarantees that the
-decompressed version of the data is identical to the original. This
-guards against corruption of the compressed data, and against undetected
-bugs in lzip (hopefully very unlikely). The chances of data corruption
-going undetected are microscopic. Be aware, though, that the check
-occurs upon decompression, so it can only tell you that something is
-wrong. It can't help you recover the original uncompressed data.
+with the end-of-stream marker, provide a 3 factor integrity checking
+which guarantees that the decompressed version of the data is identical
+to the original. This guards against corruption of the compressed data,
+and against undetected bugs in lzip (hopefully very unlikely). The
+chances of data corruption going undetected are microscopic. Be aware,
+though, that the check occurs upon decompression, so it can only tell
+you that something is wrong. It can't help you recover the original
+uncompressed data.
Lzip uses the same well-defined exit status values used by bzip2, which
makes it safer than compressors returning ambiguous warning values (like
@@ -156,14 +156,14 @@ or more compressed files. The result is the concatenation of the
corresponding uncompressed files. Integrity testing of concatenated
compressed files is also supported.
-Lzip can produce multi-member files and safely recover, with
-lziprecover, the undamaged members in case of file damage. Lzip can
-also split the compressed output in volumes of a given size, even when
-reading from standard input. This allows the direct creation of
-multivolume compressed tar archives.
+Lzip can produce multimember files and safely recover, with lziprecover,
+the undamaged members in case of file damage. Lzip can also split the
+compressed output in volumes of a given size, even when reading from
+standard input. This allows the direct creation of multivolume
+compressed tar archives.
Lzip is able to compress and decompress streams of unlimited size by
-automatically creating multi-member output. The members so created are
+automatically creating multimember output. The members so created are
large, about 2 PiB each.
@@ -180,6 +180,11 @@ The format for running lzip is:
lzip [@var{options}] [@var{files}]
@end example
+@noindent
+@samp{-} used as a @var{file} argument means standard input. It can be
+mixed with other @var{files} and is read just once, the first time it
+appears in the command line.
+
Lzip supports the following options:
@table @code
@@ -206,14 +211,18 @@ range from 100 kB to 2 PiB. Defaults to 2 PiB.
@item -c
@itemx --stdout
-Compress or decompress to standard output. Needed when reading from a
-named pipe (fifo) or from a device. Use it to recover as much of the
-uncompressed data as possible when decompressing a corrupt file.
+Compress or decompress to standard output; keep input files unchanged.
+If compressing several files, each file is compressed independently.
+This option is needed when reading from a named pipe (fifo) or from a
+device. Use it also to recover as much of the uncompressed data as
+possible when decompressing a corrupt file.
@item -d
@itemx --decompress
-Decompress the specified file(s). If a file fails to decompress, lzip
-exits immediately without decompressing the rest of the files.
+Decompress the specified file(s). If a file does not exist or can't be
+opened, lzip continues decompressing the rest of the files. If a file
+fails to decompress, lzip exits immediately without decompressing the
+rest of the files.
@item -f
@itemx --force
@@ -249,11 +258,13 @@ Quiet operation. Suppress all messages.
@item -s @var{bytes}
@itemx --dictionary-size=@var{bytes}
-Set the dictionary size limit in bytes. Valid values range from 4 KiB to
-512 MiB. Lzip will use the smallest possible dictionary size for each
-file without exceeding this limit. Note that dictionary sizes are
-quantized. If the specified size does not match one of the valid sizes,
-it will be rounded upwards by adding up to (@var{bytes} / 16) to it.
+Set the dictionary size limit in bytes. Lzip will use the smallest
+possible dictionary size for each file without exceeding this limit.
+Valid values range from 4 KiB to 512 MiB. Values 12 to 29 are
+interpreted as powers of two, meaning 2^12 to 2^29 bytes. Note that
+dictionary sizes are quantized. If the specified size does not match one
+of the valid sizes, it will be rounded upwards by adding up to
+@w{(@var{bytes} / 8)} to it.
For maximum compression you should use a dictionary size limit as large
as possible, but keep in mind that the decompression memory requirement
@@ -264,7 +275,7 @@ is affected at compression time by the choice of dictionary size limit.
Split the compressed output into several volume files with names
@samp{original_name00001.lz}, @samp{original_name00002.lz}, etc, and set
the volume size limit to @var{bytes}. Each volume is a complete, maybe
-multi-member, lzip file. A small volume size may degrade compression
+multimember, lzip file. A small volume size may degrade compression
ratio, so use it only when needed. Valid values range from 100 kB to 4
EiB.
@@ -287,14 +298,14 @@ trailing data (if any).
@item -0 .. -9
Set the compression parameters (dictionary size and match length limit)
-as shown in the table below. Note that @samp{-9} can be much slower than
-@samp{-0}. These options have no effect when decompressing.
+as shown in the table below. The default compression level is @samp{-6}.
+Note that @samp{-9} can be much slower than @samp{-0}. These options
+have no effect when decompressing.
The bidimensional parameter space of LZMA can't be mapped to a linear
scale optimal for all files. If your files are large, very repetitive,
-etc, you may need to use the @samp{--match-length} and
-@samp{--dictionary-size} options directly to achieve optimal
-performance.
+etc, you may need to use the @samp{--dictionary-size} and
+@samp{--match-length} options directly to achieve optimal performance.
@multitable {Level} {Dictionary size} {Match length limit}
@item Level @tab Dictionary size @tab Match length limit
@@ -365,7 +376,7 @@ file format.
Today those limitations have mostly disappeared, and the format of gzip
has proved to be unnecessarily complicated. It includes fields that were
-never used, others that have lost its usefulness, and finally others
+never used, others that have lost their usefulness, and finally others
that have become too limited.
Bzip2 was designed 5 years later, and its format is simpler than the one
@@ -373,12 +384,12 @@ of gzip.
Probably the worst defect of the gzip format from the point of view of
data safety is the variable size of its header. If the byte at offset 3
-(flags) of a gzip member gets corrupted, it mat become very difficult to
+(flags) of a gzip member gets corrupted, it may become very difficult to
recover the data, even if the compressed blocks are intact, because it
can't be known with certainty where the compressed blocks begin.
By contrast, the header of a lzip member has a fixed length of 6 bytes. The
-lzma stream in a lzip member always starts at offset 6, making it
+LZMA stream in a lzip member always starts at offset 6, making it
trivial to recover the data even if the whole header becomes corrupt.
Bzip2 also provides a header of fixed length and marks the beginning and end
@@ -388,9 +399,24 @@ not store the size of each compressed block, as lzip does.
Lzip provides better data recovery capabilities than any other gzip-like
compressor because its format has been designed from the beginning to be
-simple and safe. It would be very difficult to write an automatic
-recovery tool like lziprecover for the gzip format. And, as far as I
-know, it has never been writen.
+simple and safe. It also helps that the LZMA data stream as used by lzip
+is extraordinarily safe. It provides embedded error detection. Any
+distance larger than the dictionary size acts as a forbidden symbol,
+allowing the decompressor to detect the approximate position of errors,
+and leaving very little work for the check sequence (CRC and data sizes)
+in the detection of errors. Lzip is usually able to detect all possible
+bit-flips in the compressed data without resorting to the check
+sequence. It would be very difficult to write an automatic recovery tool
+like lziprecover for the gzip format. And, as far as I know, it has
+never been written.
+
+Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the
+decompressed data because it provides more accurate error detection than
+CRC64 up to a compressed size of about 16 GiB, a size larger than that
+of most files. In the case of lzip, the additional detection capability
+of the decompressor reduces the probability of undetected errors more
+than a million times, making CRC32 more accurate than CRC64 up to about
+20 PiB of compressed size.
The lzip format is designed for long-term archiving. Therefore it
excludes any unneeded features that may interfere with the future
@@ -441,7 +467,7 @@ size. The size of any file larger than 4 GiB gets truncated.
Bzip2 does not store the uncompressed size of the file.
The lzip format provides a 64-bit field for the uncompressed size.
-Additionaly, lzip produces multi-member output automatically when the
+Additionally, lzip produces multimember output automatically when the
size is too large for a single member, allowing for an unlimited
uncompressed size.
@@ -462,9 +488,17 @@ uncompressed size.
@section Quality of implementation
@table @samp
+@item Accurate and robust error detection
+
+The lzip format provides 3 factor integrity checking and the
+decompressors report mismatches in each factor separately. This way if
+just one byte in one factor fails but the other two factors match the
+data, it probably means that the data are intact and the corruption just
+affects the mismatching factor (CRC or data size) in the check sequence.
+
@item Multiple implementations
-Just like the lzip format provides 4 factor protection against
+Just like the lzip format provides 3 factor protection against
undetected data corruption, the development methodology of the lzip
family of compressors provides 3 factor protection against undetected
programming errors.
@@ -477,6 +511,15 @@ guarantees that all three implement the same algorithm, and makes it
unlikely that any of them may contain serious undiscovered errors. In
fact, no errors have been discovered in lzip since 2009.
+Additionally, the three implementations have been extensively tested
+with
+@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Unzcrash,,unzcrash},
+valgrind and @samp{american fuzzy lop} without finding a single
+vulnerability or false negative.
+@ifnothtml
+@xref{Unzcrash,,,lziprecover}.
+@end ifnothtml
+
@item Dictionary size
Lzip automatically uses the smallest possible dictionary size for each
@@ -525,7 +568,7 @@ additional information before, between, or after them.
Each member has the following structure:
@verbatim
+--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-| ID string | VN | DS | Lzma stream | CRC32 | Data size | Member size |
+| ID string | VN | DS | LZMA stream | CRC32 | Data size | Member size |
+--+--+--+--+----+----+=============+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@end verbatim
@@ -549,8 +592,8 @@ from the base size to obtain the dictionary size.@*
Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@*
Valid values for dictionary size range from 4 KiB to 512 MiB.
-@item Lzma stream
-The lzma stream, finished by an end of stream marker. Uses default
+@item LZMA stream
+The LZMA stream, finished by an end of stream marker. Uses default
values for encoder properties. @xref{Stream format}, for a complete
description.
@@ -563,7 +606,7 @@ Size of the uncompressed original data.
@item Member size (8 bytes)
Total size of the member, including header and trailer. This field acts
as a distributed index, allows the verification of stream integrity, and
-facilitates safe recovery of undamaged members from multi-member files.
+facilitates safe recovery of undamaged members from multimember files.
@end table
@@ -643,7 +686,9 @@ properties", to adjust it for some kinds of binary data. These
parameters are: @samp{literal_context_bits} (with a default value of 3),
@samp{literal_pos_state_bits} (with a default value of 0), and
@samp{pos_state_bits} (with a default value of 2). As a general purpose
-compressor, lzip only uses the default values for these parameters.
+compressor, lzip only uses the default values for these parameters. In
+particular @samp{literal_pos_state_bits} has been optimized away and
+does not even appear in the code.
Lzip also finishes the LZMA stream with an "End Of Stream" marker (the
distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the
@@ -695,7 +740,7 @@ latest used distance
@end multitable
@sp 1
-In the following tables, multi-bit sequences are coded in normal order,
+In the following tables, multibit sequences are coded in normal order,
from MSB to LSB, except where noted otherwise.
Lengths (the @samp{len} in the table above) are coded as follows:
@@ -717,9 +762,9 @@ first send the position of the most significant bit that is set to 1,
which you may find by making a bit scan from the left (from the MSB). A
position of 0 means that the number is 0 (no bit is set), 1 means the
LSB is the first bit set (the number is 1), and 32 means the MSB is set
-(the number is >= 0x80000000). Lets call this bit position a "slot".
-Then, if slot is > 1, you send the remaining slot - 1 bits. Lets call
-these bits "direct_bits" because they are coded directly by value
+(i.e., the number is >= 0x80000000). Let's call this bit position a
+"slot". Then, if slot is > 1, you send the remaining slot - 1 bits. Let's
+call these bits "direct_bits" because they are coded directly by value
instead of indirectly by position.
The drawback of this simple method is that it needs 6 bits to code
@@ -902,7 +947,7 @@ hash value (for a chosen hash) coincide with those of another file.
@item
In very rare cases, trailing data could be the corrupt header of another
-member. In multi-member or concatenated files the probability of
+member. In multimember or concatenated files the probability of
corruption happening in the magic bytes is 5 times smaller than the
probability of getting a false positive caused by the corruption of the
integrity information itself. Therefore it can be considered to be below
@@ -910,8 +955,8 @@ the noise level.
@end itemize
Trailing data can be safely ignored in most cases. In some cases, like
-user-added data, it is expected to be ignored. In those cases where a
-file containing trailing data must be rejected, the option
+that of user-added data, it is expected to be ignored. In those cases
+where a file containing trailing data must be rejected, the option
@samp{--trailing-error} can be used. @xref{--trailing-error}.
@@ -922,7 +967,7 @@ file containing trailing data must be rejected, the option
WARNING! Even if lzip is bug-free, other causes may result in a corrupt
compressed file (bugs in the system libraries, memory errors, etc).
Therefore, if the data you are going to compress are important, give the
-@samp{--keep} option to lzip and do not remove the original file until
+@samp{--keep} option to lzip and don't remove the original file until
you verify the compressed file with a command like
@w{@samp{lzip -cd file.lz | cmp file -}}.
@@ -937,7 +982,7 @@ lzip -v file
@sp 1
@noindent
-Example 2: Like example 1 but the created @samp{file.lz} is multi-member
+Example 2: Like example 1 but the created @samp{file.lz} is multimember
with a member size of 1 MiB. The compression ratio is not shown.
@example
@@ -965,11 +1010,11 @@ lzip -tv file.lz
@sp 1
@noindent
-Example 5: Compress a whole floppy in /dev/fd0 and send the output to
+Example 5: Compress a whole device in /dev/sdc and send the output to
@samp{file.lz}.
@example
-lzip -c /dev/fd0 > file.lz
+lzip -c /dev/sdc > file.lz
@end example
@sp 1
@@ -1023,7 +1068,7 @@ lzip -cd volume_name*.lz | tar -xf -
@sp 1
@noindent
Example 11: Create a multivolume compressed backup of a large database
-file with a volume size of 650 MB, where each volume is a multi-member
+file with a volume size of 650 MB, where each volume is a multimember
file with a member size of 32 MiB.
@example
@@ -1052,10 +1097,18 @@ find by running @w{@code{lzip --version}}.
@verbatim
/* Lzd - Educational decompressor for the lzip format
- Copyright (C) 2013-2015 Antonio Diaz Diaz.
+ Copyright (C) 2013-2016 Antonio Diaz Diaz.
+
+ This program is free software. Redistribution and use in source and
+ binary forms, with or without modification, are permitted provided
+ that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
- This program is free software: you have unlimited permission
- to copy, distribute and modify it.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -1105,6 +1158,7 @@ enum {
min_dictionary_size = 1 << 12,
max_dictionary_size = 1 << 29,
literal_context_bits = 3,
+ literal_pos_state_bits = 0, // not used
pos_state_bits = 2,
pos_states = 1 << pos_state_bits,
pos_state_mask = pos_states - 1,
@@ -1291,6 +1345,7 @@ class LZ_decoder
unsigned pos; // current pos in buffer
unsigned stream_pos; // first byte not yet written to stdout
uint32_t crc_;
+ bool pos_wrapped;
void flush_data();
@@ -1315,7 +1370,8 @@ public:
buffer( new uint8_t[dictionary_size] ),
pos( 0 ),
stream_pos( 0 ),
- crc_( 0xFFFFFFFFU )
+ crc_( 0xFFFFFFFFU ),
+ pos_wrapped( false )
{ buffer[dictionary_size-1] = 0; } // prev_byte of first byte
~LZ_decoder() { delete[] buffer; }
@@ -1337,7 +1393,8 @@ void LZ_decoder::flush_data()
if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
{ std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
std::exit( 1 ); }
- if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; }
+ if( pos >= dictionary_size )
+ { partial_data_pos += pos; pos = 0; pos_wrapped = true; }
stream_pos = pos;
}
}
@@ -1433,7 +1490,7 @@ bool LZ_decoder::decode_member() // Returns false if error
}
}
state.set_match();
- if( rep0 >= dictionary_size || rep0 >= data_position() )
+ if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) )
{ flush_data(); return false; }
}
for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) );
@@ -1455,7 +1512,7 @@ int main( const int argc, const char * const argv[] )
"It is not safe to use lzd for any real work.\n"
"\nUsage: %s < file.lz > file\n", argv[0] );
std::printf( "Lzd decompresses from standard input to standard output.\n"
- "\nCopyright (C) 2015 Antonio Diaz Diaz.\n"
+ "\nCopyright (C) 2016 Antonio Diaz Diaz.\n"
"This is free software: you are free to change and redistribute it.\n"
"There is NO WARRANTY, to the extent permitted by law.\n"
"Report bugs to lzip-bug@nongnu.org\n"