summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Daniel Baumann <mail@daniel-baumann.ch> 2015-11-07 10:09:34 +0000
committer Daniel Baumann <mail@daniel-baumann.ch> 2015-11-07 10:09:34 +0000
commit 72164cf0d95398b5f82d96b9be1f931e06f8627c (patch)
tree 928bd21186c7108845bca89226f712e5f089f9d1
parent Adding upstream version 1.17. (diff)
download lzip-72164cf0d95398b5f82d96b9be1f931e06f8627c.tar.xz
lzip-72164cf0d95398b5f82d96b9be1f931e06f8627c.zip
Adding upstream version 1.18~pre1. upstream/1.18_pre1
Signed-off-by: Daniel Baumann <mail@daniel-baumann.ch>
-rw-r--r-- ChangeLog 10
-rw-r--r-- NEWS 16
-rwxr-xr-x configure 2
-rw-r--r-- decoder.cc 4
-rw-r--r-- decoder.h 39
-rw-r--r-- doc/lzip.1 5
-rw-r--r-- doc/lzip.info 157
-rw-r--r-- doc/lzip.texi 137
-rw-r--r-- encoder.cc 14
-rw-r--r-- encoder.h 5
-rw-r--r-- lzip.h 4
-rw-r--r-- main.cc 68
-rwxr-xr-x testsuite/check.sh 22
13 files changed, 341 insertions, 142 deletions
diff --git a/ChangeLog b/ChangeLog
index e3ebebe..113206d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2015-08-13 Antonio Diaz Diaz <antonio@gnu.org>
+
+ * Version 1.18-pre1 released.
+ * main.cc: Added new option '-a, --trailing-error'.
+ * Decompression time has been reduced by 2%.
+ * lzip.texi: Added chapter 'Trailing data'.
+ * testsuite/check.sh: Don't check error messages.
+
2015-07-12 Antonio Diaz Diaz <antonio@gnu.org>
* Version 1.17 released.
@@ -64,7 +72,7 @@
* Compression time of option '-0' has been reduced by 2%.
* main.cc (decompress): Print only one status line for each
multi-member file when only one '-v' is specified.
- * main.cc (decompress): Print up to 6 bytes of trailing garbage
+ * main.cc (decompress): Print up to 6 bytes of trailing data
when '-vvvv' is specified.
* main.cc (open_instream): Do not show the message
" and '--stdout' was not specified" for directories, etc.
diff --git a/NEWS b/NEWS
index 0a1cbf2..86b75f3 100644
--- a/NEWS
+++ b/NEWS
@@ -1,10 +1,12 @@
-Changes in version 1.17:
+Changes in version 1.18:
-The compression code has been reorganized to ease the porting of the
-fast encoder to clzip and lzlib.
+The option "-a, --trailing-error", which makes lzip exit with error
+status 2 if any remaining input is detected after decompressing the last
+member, has been added.
-The new chapter "Quality assurance" has been added to the manual.
+Decompression time has been reduced by 2%.
-The targets "install-compress", "install-strip-compress",
-"install-info-compress" and "install-man-compress" have been added to
-the Makefile.
+The new chapter "Trailing data" has been added to the manual.
+
+Fixed a harmless check failure on Windows caused by the failed
+comparison of a message in text mode.
diff --git a/configure b/configure
index 9845c11..ebf6fb9 100755
--- a/configure
+++ b/configure
@@ -6,7 +6,7 @@
# to copy, distribute and modify it.
pkgname=lzip
-pkgversion=1.17
+pkgversion=1.18-pre1
progname=lzip
srctrigger=doc/${pkgname}.texi
diff --git a/decoder.cc b/decoder.cc
index 113479a..f773e57 100644
--- a/decoder.cc
+++ b/decoder.cc
@@ -42,7 +42,7 @@ void Pretty_print::operator()( const char * const msg ) const
{
first_post = false;
std::fprintf( stderr, " %s: ", name_.c_str() );
- for( unsigned i = 0; i < longest_name - name_.size(); ++i )
+ for( unsigned i = name_.size(); i < longest_name; ++i )
std::fputc( ' ', stderr );
if( !msg ) std::fflush( stderr );
}
@@ -110,7 +110,7 @@ void LZ_decoder::flush_data()
crc32.update_buf( crc_, buffer + stream_pos, size );
if( outfd >= 0 && writeblock( outfd, buffer + stream_pos, size ) != size )
throw Error( "Write error" );
- if( pos >= buffer_size ) { partial_data_pos += pos; pos = 0; }
+ if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; }
stream_pos = pos;
}
}
diff --git a/decoder.h b/decoder.h
index 98d42ce..f0d2de6 100644
--- a/decoder.h
+++ b/decoder.h
@@ -212,10 +212,9 @@ class LZ_decoder
unsigned long long partial_data_pos;
Range_decoder & rdec;
const unsigned dictionary_size;
- const int buffer_size;
uint8_t * const buffer; // output buffer
- int pos; // current pos in buffer
- int stream_pos; // first byte not yet written to file
+ unsigned pos; // current pos in buffer
+ unsigned stream_pos; // first byte not yet written to file
uint32_t crc_;
const int outfd; // output file descriptor
const int member_version;
@@ -225,37 +224,42 @@ class LZ_decoder
uint8_t peek_prev() const
{
- const int i = ( ( pos > 0 ) ? pos : buffer_size ) - 1;
+ const unsigned i = ( ( pos > 0 ) ? pos : dictionary_size ) - 1;
return buffer[i];
}
- uint8_t peek( const int distance ) const
+ uint8_t peek( const unsigned distance ) const
{
- int i = pos - distance - 1;
- if( i < 0 ) i += buffer_size;
+ unsigned i = pos - distance - 1;
+ if( pos <= distance ) i += dictionary_size;
return buffer[i];
}
void put_byte( const uint8_t b )
{
buffer[pos] = b;
- if( ++pos >= buffer_size ) flush_data();
+ if( ++pos >= dictionary_size ) flush_data();
}
- void copy_block( const int distance, int len )
+ void copy_block( const unsigned distance, unsigned len )
{
- int i = pos - distance - 1;
- if( i < 0 ) i += buffer_size;
- if( len < buffer_size - std::max( pos, i ) && len <= std::abs( pos - i ) )
+ unsigned i = pos - distance - 1;
+ bool fast;
+ if( pos <= distance )
+ { i += dictionary_size;
+ fast = ( len <= dictionary_size - i && len <= i - pos ); }
+ else
+ fast = ( len < dictionary_size - pos && len <= pos - i );
+ if( fast ) // no wrap, no overlap
{
- std::memcpy( buffer + pos, buffer + i, len ); // no wrap, no overlap
+ std::memcpy( buffer + pos, buffer + i, len );
pos += len;
}
else for( ; len > 0; --len )
{
buffer[pos] = buffer[i];
- if( ++pos >= buffer_size ) flush_data();
- if( ++i >= buffer_size ) i = 0;
+ if( ++pos >= dictionary_size ) flush_data();
+ if( ++i >= dictionary_size ) i = 0;
}
}
@@ -268,14 +272,13 @@ public:
partial_data_pos( 0 ),
rdec( rde ),
dictionary_size( header.dictionary_size() ),
- buffer_size( std::max( 65536U, dictionary_size ) ),
- buffer( new uint8_t[buffer_size] ),
+ buffer( new uint8_t[dictionary_size] ),
pos( 0 ),
stream_pos( 0 ),
crc_( 0xFFFFFFFFU ),
outfd( ofd ),
member_version( header.version() )
- { buffer[buffer_size-1] = 0; } // prev_byte of first byte
+ { buffer[dictionary_size-1] = 0; } // prev_byte of first byte
~LZ_decoder() { delete[] buffer; }
diff --git a/doc/lzip.1 b/doc/lzip.1
index b6acca6..aa0e5ff 100644
--- a/doc/lzip.1
+++ b/doc/lzip.1
@@ -1,5 +1,5 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1.
-.TH LZIP "1" "July 2015" "lzip 1.17" "User Commands"
+.TH LZIP "1" "August 2015" "lzip 1.18-pre1" "User Commands"
.SH NAME
lzip \- reduces the size of files
.SH SYNOPSIS
@@ -15,6 +15,9 @@ display this help and exit
\fB\-V\fR, \fB\-\-version\fR
output version information and exit
.TP
+\fB\-a\fR, \fB\-\-trailing\-error\fR
+exit with error status if trailing data
+.TP
\fB\-b\fR, \fB\-\-member\-size=\fR<bytes>
set member size limit in bytes
.TP
diff --git a/doc/lzip.info b/doc/lzip.info
index f0aa011..71d8f8e 100644
--- a/doc/lzip.info
+++ b/doc/lzip.info
@@ -11,7 +11,7 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir)
Lzip Manual
***********
-This manual is for Lzip (version 1.17, 12 July 2015).
+This manual is for Lzip (version 1.18-pre1, 13 August 2015).
* Menu:
@@ -21,6 +21,7 @@ This manual is for Lzip (version 1.17, 12 July 2015).
* File format:: Detailed format of the compressed file
* Algorithm:: How lzip compresses the data
* Stream format:: Format of the LZMA stream in lzip files
+* Trailing data:: Extra data appended to the file
* Examples:: A small tutorial with examples
* Problems:: Reporting bugs
* Reference source code:: Source code illustrating stream format
@@ -51,7 +52,7 @@ availability:
recovery means. The lziprecover program can repair bit-flip errors
(one of the most common forms of data corruption) in lzip files,
and provides data recovery capabilities, including error-checked
- merging of damaged copies of a file. *note Data safety:
+ merging of damaged copies of a file. *Note Data safety:
(lziprecover)Data safety.
* The lzip format is as simple as possible (but not simpler). The
@@ -156,6 +157,13 @@ The format for running lzip is:
'--version'
Print the version number of lzip on the standard output and exit.
+'-a'
+'--trailing-error'
+ Exit with error status 2 if any remaining input is detected after
+ decompressing the last member. Such remaining input is usually
+ trailing garbage that can be safely ignored. *Note
+ concat-example::.
+
'-b BYTES'
'--member-size=BYTES'
Set the member size limit to BYTES. A small member size may
@@ -171,7 +179,8 @@ The format for running lzip is:
'-d'
'--decompress'
- Decompress.
+ Decompress the specified file(s). If a file fails to decompress,
+ lzip exits immediately without decompressing the rest of the files.
'-f'
'--force'
@@ -235,7 +244,8 @@ The format for running lzip is:
Check integrity of the specified file(s), but don't decompress
them. This really performs a trial decompression and throws away
the result. Use it together with '-v' to see information about
- the file.
+ the file(s). If a file fails the test, lzip continues checking the
+ rest of the files.
'-v'
'--verbose'
@@ -245,7 +255,7 @@ The format for running lzip is:
When decompressing or testing, further -v's (up to 4) increase the
verbosity level, showing status, compression ratio, dictionary
size, trailer contents (CRC, data size, member size), and up to 6
- bytes of trailing garbage (if any).
+ bytes of trailing data (if any).
'-0 .. -9'
Set the compression parameters (dictionary size and match length
@@ -302,9 +312,10 @@ File: lzip.info, Node: Quality assurance, Next: File format, Prev: Invoking l
3 Design, development and testing of lzip
*****************************************
-There are two ways of constructing a software design. One way is to make
-it so simple that there are obviously no deficiencies and the other is
-to make it so complicated that there are no obvious deficiencies.
+There are two ways of constructing a software design: One way is to make
+it so simple that there are obviously no deficiencies and the other way
+is to make it so complicated that there are no obvious deficiencies. The
+first method is far more difficult.
-- C.A.R. Hoare
Lzip has been designed, written and tested with great care to be the
@@ -479,7 +490,7 @@ additional information before, between, or after them.
All multibyte values are stored in little endian order.
-'ID string'
+'ID string (the "magic" bytes)'
A four byte string, identifying the lzip format, with the value
"LZIP" (0x4C, 0x5A, 0x49, 0x50).
@@ -582,7 +593,7 @@ range encoding), Igor Pavlov (for putting all the above together in
LZMA), and Julian Seward (for bzip2's CLI).

-File: lzip.info, Node: Stream format, Next: Examples, Prev: Algorithm, Up: Top
+File: lzip.info, Node: Stream format, Next: Trailing data, Prev: Algorithm, Up: Top
6 Format of the LZMA stream in lzip files
*****************************************
@@ -619,7 +630,7 @@ code of a real decoder seems the only appropriate reference to use.
LZMA-302eos streams using as reference the source code of "lzd", an
educational decompressor for lzip files which can be downloaded from
the lzip download directory. The source code of lzd is included in
-appendix A. *note Reference source code::
+appendix A. *Note Reference source code::.
6.1 What is coded
@@ -656,16 +667,38 @@ Bit sequence Description
1 + 1 + 8 bits lengths from 18 to 273
- The coding of distances is a little more complicated. LZMA divides
-the interval between any two powers of 2 into 2 halves, named slots. As
-possible distances range from 0 to (2^32 - 1), there are 64 slots (0 to
-63). The slot number is context-coded in 6 bits. 'direct_bits' are the
-remaining bits (from 0 to 30) needed to form a complete distance, and
-are calculated as (slot >> 1) - 1. If a distance needs 6 or more
-direct_bits, the last 4 bits are coded separately. The last piece
-(direct_bits for distances 4 to 127 or the last 4 bits for distances >=
-128) is context-coded in reverse order (from LSB to MSB). For distances
->= 128, the 'direct_bits - 4' part is coded with fixed 0.5 probability.
+ The coding of distances is a little more complicated, so I'll begin
+explaining a simpler version of the encoding.
+
+ Imagine you need to code a number from 0 to 2^32 - 1, and you want
+to do it in a way that produces shorter codes for the smaller numbers.
+You may first send the position of the most significant bit that is set
+to 1, which you may find by making a bit scan from the left (from the
+MSB). A position of 0 means that the number is 0 (no bit is set), 1
+means the LSB is the first bit set (the number is 1), and 32 means the
+MSB is set (the number is >= 0x80000000). Lets call this bit position a
+"slot". Then, if slot is > 1, you send the remaining slot - 1 bits.
+Lets call these bits "direct_bits" because they are coded directly by
+value instead of indirectly by position.
+
+ The inconvenient of this simple method is that it needs 6 bits to
+code the slot, but it just uses 33 of the 64 possible values, wasting
+almost half of the codes.
+
+ The intelligent trick of LZMA is that it encodes the position of the
+most significant bit set, along with the value of the next bit, in the
+same 6 bits that would take to encode the position alone. This seems to
+need 66 slots (2 * position + next_bit), but for slots 0 and 1 there is
+no next bit, so the number of needed slots is 64 (0 to 63).
+
+ The slot number is context-coded in 6 bits. 'direct_bits' is the
+amount of remaining bits (from 0 to 30) needed to form a complete
+distance, and is calculated as (slot >> 1) - 1. If a distance needs 6 or
+more direct_bits, the last 4 bits are coded separately. The last piece
+(all the direct_bits for distances 4 to 127 or the last 4 bits for
+distances >= 128) is context-coded in reverse order (from LSB to MSB).
+For distances >= 128, the 'direct_bits - 4' part is coded with fixed
+0.5 probability.
Bit sequence Description
--------------------------------------------------------------------------
@@ -795,9 +828,42 @@ with the appropriate contexts to decode the different coding sequences
Stream" marker is decoded.

-File: lzip.info, Node: Examples, Next: Problems, Prev: Stream format, Up: Top
+File: lzip.info, Node: Trailing data, Next: Examples, Prev: Stream format, Up: Top
+
+7 Extra data appended to the file
+*********************************
+
+Sometimes extra data is found appended to a lzip file after the last
+member. Such trailing data may be:
+
+ * Padding added to make the file size a multiple of some block size,
+ for example when writing to a tape.
+
+ * Garbage added by some not totally successful copy operation.
+
+ * Useful data added by the user; a cryptographically secure hash, a
+ description of file contents, etc.
+
+ * Malicious data added to the file in order to make its total size
+ and hash value (for a chosen hash) coincide with those of another
+ file.
+
+ * In very rare cases, trailing data could be the corrupt header of
+ another member. In multi-member or concatenated files the
+ probability of corruption happening in the magic bytes is 5 times
+ smaller than the probability of getting a false positive caused by
+ the corruption of the integrity information itself. Therefore it
+ can be considered to be below the noise level.
+
+ Trailing data can be safely ignored in most cases. In some cases,
+like user-added data, it is expected to be ignored. In those cases
+where a file containing trailing data must be rejected, the option
+'--trailing-error' can be used. *Note --trailing-error::.
+
+
+File: lzip.info, Node: Examples, Next: Problems, Prev: Trailing data, Up: Top
-7 A small tutorial with examples
+8 A small tutorial with examples
********************************
WARNING! Even if lzip is bug-free, other causes may result in a corrupt
@@ -838,30 +904,39 @@ Example 5: Compress a whole floppy in /dev/fd0 and send the output to
lzip -c /dev/fd0 > file.lz
-Example 6: Decompress 'file.lz' partially until 10 KiB of decompressed
+Example 6: The right way of concatenating compressed files. *Note
+Trailing data::.
+
+ Don't do this
+ cat file1.lz file2.lz file3.lz | lzip -d
+ Do this instead
+ lzip -cd file1.lz file2.lz file3.lz
+
+
+Example 7: Decompress 'file.lz' partially until 10 KiB of decompressed
data are produced.
lzip -cd file.lz | dd bs=1024 count=10
-Example 7: Decompress 'file.lz' partially from decompressed byte 10000
+Example 8: Decompress 'file.lz' partially from decompressed byte 10000
to decompressed byte 15000 (5000 bytes are produced).
lzip -cd file.lz | dd bs=1000 skip=10 count=5
-Example 8: Create a multivolume compressed tar archive with a volume
+Example 9: Create a multivolume compressed tar archive with a volume
size of 1440 KiB.
tar -c some_directory | lzip -S 1440KiB -o volume_name
-Example 9: Extract a multivolume compressed tar archive.
+Example 10: Extract a multivolume compressed tar archive.
lzip -cd volume_name*.lz | tar -xf -
-Example 10: Create a multivolume compressed backup of a large database
+Example 11: Create a multivolume compressed backup of a large database
file with a volume size of 650 MB, where each volume is a multi-member
file with a member size of 32 MiB.
@@ -870,7 +945,7 @@ file with a member size of 32 MiB.

File: lzip.info, Node: Problems, Next: Reference source code, Prev: Examples, Up: Top
-8 Reporting bugs
+9 Reporting bugs
****************
There are probably bugs in lzip. There are certainly errors and
@@ -1362,6 +1437,7 @@ Concept index
* options: Invoking lzip. (line 6)
* quality assurance: Quality assurance. (line 6)
* reference source code: Reference source code. (line 6)
+* trailing data: Trailing data. (line 6)
* usage: Invoking lzip. (line 6)
* version: Invoking lzip. (line 6)
@@ -1369,16 +1445,19 @@ Concept index

Tag Table:
Node: Top208
-Node: Introduction1087
-Node: Invoking lzip6060
-Node: Quality assurance11658
-Node: File format18171
-Node: Algorithm20556
-Node: Stream format23382
-Node: Examples32812
-Node: Problems34769
-Node: Reference source code35299
-Node: Concept index48952
+Node: Introduction1153
+Node: Invoking lzip6126
+Ref: --trailing-error6536
+Node: Quality assurance12171
+Node: File format18728
+Node: Algorithm21133
+Node: Stream format23959
+Node: Trailing data34502
+Node: Examples35873
+Ref: concat-example37048
+Node: Problems38049
+Node: Reference source code38579
+Node: Concept index52232

End Tag Table
diff --git a/doc/lzip.texi b/doc/lzip.texi
index 69f44ae..845cb42 100644
--- a/doc/lzip.texi
+++ b/doc/lzip.texi
@@ -6,8 +6,8 @@
@finalout
@c %**end of header
-@set UPDATED 12 July 2015
-@set VERSION 1.17
+@set UPDATED 13 August 2015
+@set VERSION 1.18-pre1
@dircategory Data Compression
@direntry
@@ -41,6 +41,7 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}).
* File format:: Detailed format of the compressed file
* Algorithm:: How lzip compresses the data
* Stream format:: Format of the LZMA stream in lzip files
+* Trailing data:: Extra data appended to the file
* Examples:: A small tutorial with examples
* Problems:: Reporting bugs
* Reference source code:: Source code illustrating stream format
@@ -76,7 +77,7 @@ program can repair bit-flip errors (one of the most common forms of data
corruption) in lzip files, and provides data recovery capabilities,
including error-checked merging of damaged copies of a file.
@ifnothtml
-@ref{Data safety,,,lziprecover}.
+@xref{Data safety,,,lziprecover}.
@end ifnothtml
@item
@@ -190,6 +191,13 @@ Print an informative help message describing the options and exit.
@itemx --version
Print the version number of lzip on the standard output and exit.
+@anchor{--trailing-error}
+@item -a
+@itemx --trailing-error
+Exit with error status 2 if any remaining input is detected after
+decompressing the last member. Such remaining input is usually trailing
+garbage that can be safely ignored. @xref{concat-example}.
+
@item -b @var{bytes}
@itemx --member-size=@var{bytes}
Set the member size limit to @var{bytes}. A small member size may
@@ -204,7 +212,8 @@ uncompressed data as possible when decompressing a corrupt file.
@item -d
@itemx --decompress
-Decompress.
+Decompress the specified file(s). If a file fails to decompress, lzip
+exits immediately without decompressing the rest of the files.
@item -f
@itemx --force
@@ -263,7 +272,8 @@ EiB.
@itemx --test
Check integrity of the specified file(s), but don't decompress them.
This really performs a trial decompression and throws away the result.
-Use it together with @samp{-v} to see information about the file.
+Use it together with @samp{-v} to see information about the file(s). If
+a file fails the test, lzip continues checking the rest of the files.
@item -v
@itemx --verbose
@@ -273,7 +283,7 @@ second @samp{-v} shows the progress of compression.@*
When decompressing or testing, further -v's (up to 4) increase the
verbosity level, showing status, compression ratio, dictionary size,
trailer contents (CRC, data size, member size), and up to 6 bytes of
-trailing garbage (if any).
+trailing data (if any).
@item -0 .. -9
Set the compression parameters (dictionary size and match length limit)
@@ -334,9 +344,10 @@ caused lzip to panic.
@chapter Design, development and testing of lzip
@cindex quality assurance
-There are two ways of constructing a software design. One way is to make
-it so simple that there are obviously no deficiencies and the other is
-to make it so complicated that there are no obvious deficiencies.@*
+There are two ways of constructing a software design: One way is to make
+it so simple that there are obviously no deficiencies and the other way
+is to make it so complicated that there are no obvious deficiencies. The
+first method is far more difficult.@*
--- C.A.R. Hoare
Lzip has been designed, written and tested with great care to be the
@@ -521,7 +532,7 @@ Each member has the following structure:
All multibyte values are stored in little endian order.
@table @samp
-@item ID string
+@item ID string (the "magic" bytes)
A four byte string, identifying the lzip format, with the value "LZIP"
(0x4C, 0x5A, 0x49, 0x50).
@@ -659,7 +670,7 @@ What follows is a description of the decoding algorithm for LZMA-302eos
streams using as reference the source code of "lzd", an educational
decompressor for lzip files which can be downloaded from the lzip
download directory. The source code of lzd is included in appendix A.
-@ref{Reference source code}
+@xref{Reference source code}.
@sp 1
@section What is coded
@@ -697,17 +708,38 @@ Lengths (the @samp{len} in the table above) are coded as follows:
@end multitable
@sp 1
-The coding of distances is a little more complicated. LZMA divides the
-interval between any two powers of 2 into 2 halves, named slots. As
-possible distances range from 0 to (2^32 - 1), there are 64 slots (0 to
-63). The slot number is context-coded in 6 bits. @samp{direct_bits} are
-the remaining bits (from 0 to 30) needed to form a complete distance,
-and are calculated as (slot >> 1) - 1. If a distance needs 6 or more
-direct_bits, the last 4 bits are coded separately. The last piece
-(direct_bits for distances 4 to 127 or the last 4 bits for distances >=
-128) is context-coded in reverse order (from LSB to MSB). For distances
->= 128, the @samp{direct_bits - 4} part is coded with fixed 0.5
-probability.
+The coding of distances is a little more complicated, so I'll begin
+by explaining a simpler version of the encoding.
+
+Imagine you need to code a number from 0 to 2^32 - 1, and you want to do
+it in a way that produces shorter codes for the smaller numbers. You may
+first send the position of the most significant bit that is set to 1,
+which you may find by making a bit scan from the left (from the MSB). A
+position of 0 means that the number is 0 (no bit is set), 1 means the
+LSB is the first bit set (the number is 1), and 32 means the MSB is set
+(the number is >= 0x80000000). Let's call this bit position a "slot".
+Then, if slot is > 1, you send the remaining slot - 1 bits. Let's call
+these bits "direct_bits" because they are coded directly by value
+instead of indirectly by position.
+
+The drawback of this simple method is that it needs 6 bits to code
+the slot, but it just uses 33 of the 64 possible values, wasting almost
+half of the codes.
+
+The intelligent trick of LZMA is that it encodes the position of the
+most significant bit set, along with the value of the next bit, in the
+same 6 bits that it would take to encode the position alone. This seems
+to need 66 slots (2 * position + next_bit), but for slots 0 and 1 there
+is no next bit, so the number of needed slots is 64 (0 to 63).
+
+The slot number is context-coded in 6 bits. @samp{direct_bits} is the
+amount of remaining bits (from 0 to 30) needed to form a complete
+distance, and is calculated as (slot >> 1) - 1. If a distance needs 6 or
+more direct_bits, the last 4 bits are coded separately. The last piece
+(all the direct_bits for distances 4 to 127 or the last 4 bits for
+distances >= 128) is context-coded in reverse order (from LSB to MSB).
+For distances >= 128, the @samp{direct_bits - 4} part is coded with
+fixed 0.5 probability.
@multitable @columnfractions .5 .5
@headitem Bit sequence @tab Description
@@ -845,6 +877,44 @@ sequences (matches, repeated matches, and literal bytes), until the "End
Of Stream" marker is decoded.
+@node Trailing data
+@chapter Extra data appended to the file
+@cindex trailing data
+
+Sometimes extra data is found appended to a lzip file after the last
+member. Such trailing data may be:
+
+@itemize @bullet
+@item
+Padding added to make the file size a multiple of some block size, for
+example when writing to a tape.
+
+@item
+Garbage added by some not totally successful copy operation.
+
+@item
+Useful data added by the user; a cryptographically secure hash, a
+description of file contents, etc.
+
+@item
+Malicious data added to the file in order to make its total size and
+hash value (for a chosen hash) coincide with those of another file.
+
+@item
+In very rare cases, trailing data could be the corrupt header of another
+member. In multi-member or concatenated files the probability of
+corruption happening in the magic bytes is 5 times smaller than the
+probability of getting a false positive caused by the corruption of the
+integrity information itself. Therefore it can be considered to be below
+the noise level.
+@end itemize
+
+Trailing data can be safely ignored in most cases. In some cases, like
+user-added data, it is expected to be ignored. In those cases where a
+file containing trailing data must be rejected, the option
+@samp{--trailing-error} can be used. @xref{--trailing-error}.
+
+
@node Examples
@chapter A small tutorial with examples
@cindex examples
@@ -903,8 +973,21 @@ lzip -c /dev/fd0 > file.lz
@end example
@sp 1
+@anchor{concat-example}
+@noindent
+Example 6: The right way of concatenating compressed files.
+@xref{Trailing data}.
+
+@example
+Don't do this
+ cat file1.lz file2.lz file3.lz | lzip -d
+Do this instead
+ lzip -cd file1.lz file2.lz file3.lz
+@end example
+
+@sp 1
@noindent
-Example 6: Decompress @samp{file.lz} partially until 10 KiB of
+Example 7: Decompress @samp{file.lz} partially until 10 KiB of
decompressed data are produced.
@example
@@ -913,7 +996,7 @@ lzip -cd file.lz | dd bs=1024 count=10
@sp 1
@noindent
-Example 7: Decompress @samp{file.lz} partially from decompressed byte
+Example 8: Decompress @samp{file.lz} partially from decompressed byte
10000 to decompressed byte 15000 (5000 bytes are produced).
@example
@@ -922,7 +1005,7 @@ lzip -cd file.lz | dd bs=1000 skip=10 count=5
@sp 1
@noindent
-Example 8: Create a multivolume compressed tar archive with a volume
+Example 9: Create a multivolume compressed tar archive with a volume
size of 1440 KiB.
@example
@@ -931,7 +1014,7 @@ tar -c some_directory | lzip -S 1440KiB -o volume_name
@sp 1
@noindent
-Example 9: Extract a multivolume compressed tar archive.
+Example 10: Extract a multivolume compressed tar archive.
@example
lzip -cd volume_name*.lz | tar -xf -
@@ -939,7 +1022,7 @@ lzip -cd volume_name*.lz | tar -xf -
@sp 1
@noindent
-Example 10: Create a multivolume compressed backup of a large database
+Example 11: Create a multivolume compressed backup of a large database
file with a volume size of 650 MB, where each volume is a multi-member
file with a member size of 32 MiB.
diff --git a/encoder.cc b/encoder.cc
index 51c0069..3b24c44 100644
--- a/encoder.cc
+++ b/encoder.cc
@@ -194,16 +194,16 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
}
if( replens[rep_index] >= match_len_limit )
{
- trials[0].dis = rep_index;
trials[0].price = replens[rep_index];
+ trials[0].dis = rep_index;
move_and_update( replens[rep_index] );
return replens[rep_index];
}
if( main_len >= match_len_limit )
{
- trials[0].dis = pairs[num_pairs-1].dis + num_rep_distances;
trials[0].price = main_len;
+ trials[0].dis = pairs[num_pairs-1].dis + num_rep_distances;
move_and_update( main_len );
return main_len;
}
@@ -213,13 +213,12 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
const uint8_t cur_byte = peek( 0 );
const uint8_t match_byte = peek( reps[0] + 1 );
- trials[0].state = state;
- trials[1].dis = -1; // literal
trials[1].price = price0( bm_match[state()][pos_state] );
if( state.is_char() )
trials[1].price += price_literal( prev_byte, cur_byte );
else
trials[1].price += price_matched( prev_byte, cur_byte, match_byte );
+ trials[1].dis = -1; // literal
const int match_price = price1( bm_match[state()][pos_state] );
const int rep_match_price = match_price + price1( bm_rep[state()] );
@@ -231,16 +230,15 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
if( num_trials < min_match_len )
{
- trials[0].dis = trials[1].dis;
trials[0].price = 1;
+ trials[0].dis = trials[1].dis;
move_pos();
return 1;
}
+ trials[0].state = state;
for( int i = 0; i < num_rep_distances; ++i )
trials[0].reps[i] = reps[i];
- trials[1].prev_index = 0;
- trials[1].prev_index2 = single_step_trial;
for( int len = min_match_len; len <= num_trials; ++len )
trials[len].price = infinite_price;
@@ -537,8 +535,8 @@ bool LZ_encoder::encode_member( const unsigned long long member_size )
for( int i = 0; ahead > 0; )
{
const int pos_state = ( data_position() - ahead ) & pos_state_mask;
- const int dis = trials[i].dis;
const int len = trials[i].price;
+ const int dis = trials[i].dis;
bool bit = ( dis < 0 );
renc.encode_bit( bm_match[state()][pos_state], !bit );
diff --git a/encoder.h b/encoder.h
index 9579a85..8bb7258 100644
--- a/encoder.h
+++ b/encoder.h
@@ -275,7 +275,10 @@ public:
rep_len_prices( rep_len_model, match_len_limit ),
pending_num_pairs( 0 ),
num_dis_slots( 2 * real_bits( dictionary_size - 1 ) )
- {}
+ {
+ trials[1].prev_index = 0;
+ trials[1].prev_index2 = single_step_trial;
+ }
void reset()
{
diff --git a/lzip.h b/lzip.h
index 9c16ef2..291e3e3 100644
--- a/lzip.h
+++ b/lzip.h
@@ -115,9 +115,11 @@ class Pretty_print
mutable bool first_post;
public:
- explicit Pretty_print( const std::vector< std::string > & filenames )
+ Pretty_print( const std::vector< std::string > & filenames,
+ const int verbosity )
: stdin_name( "(stdin)" ), longest_name( 0 ), first_post( false )
{
+ if( verbosity <= 0 ) return;
const unsigned stdin_name_len = std::strlen( stdin_name );
for( unsigned i = 0; i < filenames.size(); ++i )
{
diff --git a/main.cc b/main.cc
index ac07852..97e27b6 100644
--- a/main.cc
+++ b/main.cc
@@ -24,6 +24,7 @@
#define _FILE_OFFSET_BITS 64
#include <algorithm>
+#include <cctype>
#include <cerrno>
#include <climits>
#include <csignal>
@@ -104,6 +105,7 @@ void show_help()
std::printf( "\nOptions:\n"
" -h, --help display this help and exit\n"
" -V, --version output version information and exit\n"
+ " -a, --trailing-error exit with error status if trailing data\n"
" -b, --member-size=<bytes> set member size limit in bytes\n"
" -c, --stdout send output to standard output\n"
" -d, --decompress decompress\n"
@@ -275,7 +277,7 @@ int open_instream( const char * const name, struct stat * const in_statsp,
const bool can_read = ( i == 0 &&
( S_ISBLK( mode ) || S_ISCHR( mode ) ||
S_ISFIFO( mode ) || S_ISSOCK( mode ) ) );
- const bool no_ofile = to_stdout || program_mode == m_test;
+ const bool no_ofile = ( to_stdout || program_mode == m_test );
if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || !no_ofile ) ) )
{
if( verbosity >= 0 )
@@ -501,36 +503,42 @@ unsigned char xdigit( const int value )
}
-void show_trailing_garbage( const uint8_t * const data, const int size,
- const Pretty_print & pp, const bool all )
+bool show_trailing_data( const uint8_t * const data, const int size,
+ const Pretty_print & pp, const bool all,
+ const bool ignore_garbage )
{
- std::string garbage_msg;
- if( !all ) garbage_msg = "first bytes of ";
- garbage_msg += "trailing garbage found = ";
- bool text = true;
- for( int i = 0; i < size; ++i )
- if( !std::isprint( data[i] ) ) { text = false; break; }
- if( text )
- {
- garbage_msg += '\'';
- garbage_msg.append( (const char *)data, size );
- garbage_msg += '\'';
- }
- else
+ if( verbosity >= 4 || !ignore_garbage )
{
+ std::string msg;
+ if( !all ) msg = "first bytes of ";
+ msg += "trailing data = ";
+ bool text = true;
for( int i = 0; i < size; ++i )
+ if( !std::isprint( data[i] ) ) { text = false; break; }
+ if( text )
+ {
+ msg += '\'';
+ msg.append( (const char *)data, size );
+ msg += '\'';
+ }
+ else
{
- if( i > 0 ) garbage_msg += ' ';
- garbage_msg += xdigit( data[i] >> 4 );
- garbage_msg += xdigit( data[i] & 0x0F );
+ for( int i = 0; i < size; ++i )
+ {
+ if( i > 0 ) msg += ' ';
+ msg += xdigit( data[i] >> 4 );
+ msg += xdigit( data[i] & 0x0F );
+ }
}
+ pp( msg.c_str() );
+ if( !ignore_garbage ) show_error( "Trailing data not allowed." );
}
- garbage_msg += '.';
- pp( garbage_msg.c_str() );
+ return ignore_garbage;
}
-int decompress( const int infd, const Pretty_print & pp, const bool testing )
+int decompress( const int infd, const Pretty_print & pp,
+ const bool ignore_garbage, const bool testing )
{
int retval = 0;
@@ -546,16 +554,17 @@ int decompress( const int infd, const Pretty_print & pp, const bool testing )
{
if( first_member )
{ pp( "File ends unexpectedly at member header." ); retval = 2; }
- else if( verbosity >= 4 && size > 0 )
- show_trailing_garbage( header.data, size, pp, true );
+ else if( size > 0 && !show_trailing_data( header.data, size, pp,
+ true, ignore_garbage ) )
+ retval = 2;
break;
}
if( !header.verify_magic() )
{
if( first_member )
{ pp( "Bad magic number (file not in lzip format)." ); retval = 2; }
- else if( verbosity >= 4 )
- show_trailing_garbage( header.data, size, pp, false );
+ else if( !show_trailing_data( header.data, size, pp, false, ignore_garbage ) )
+ retval = 2;
break;
}
if( !header.verify_version() )
@@ -699,6 +708,7 @@ int main( const int argc, const char * const argv[] )
int infd = -1;
Mode program_mode = m_compress;
bool force = false;
+ bool ignore_garbage = true;
bool keep_input_files = false;
bool recompress = false;
bool to_stdout = false;
@@ -717,6 +727,7 @@ int main( const int argc, const char * const argv[] )
{ '7', 0, Arg_parser::no },
{ '8', 0, Arg_parser::no },
{ '9', "best", Arg_parser::no },
+ { 'a', "trailing-error", Arg_parser::no },
{ 'b', "member-size", Arg_parser::yes },
{ 'c', "stdout", Arg_parser::no },
{ 'd', "decompress", Arg_parser::no },
@@ -751,6 +762,7 @@ int main( const int argc, const char * const argv[] )
case '5': case '6': case '7': case '8': case '9':
zero = ( code == '0' );
encoder_options = option_mapping[code-'0']; break;
+ case 'a': ignore_garbage = false; break;
case 'b': member_size = getnum( arg.c_str(), 100000, max_member_size );
break;
case 'c': to_stdout = true; break;
@@ -801,7 +813,7 @@ int main( const int argc, const char * const argv[] )
( filenames_given || default_output_filename.size() ) )
set_signals();
- Pretty_print pp( filenames );
+ Pretty_print pp( filenames, verbosity );
int retval = 0;
for( unsigned i = 0; i < filenames.size(); ++i )
@@ -869,7 +881,7 @@ int main( const int argc, const char * const argv[] )
tmp = compress( member_size, volume_size, infd, encoder_options, pp,
in_statsp, zero );
else
- tmp = decompress( infd, pp, program_mode == m_test );
+ tmp = decompress( infd, pp, ignore_garbage, program_mode == m_test );
if( tmp > retval ) retval = tmp;
if( tmp && program_mode != m_test ) cleanup_and_fail( retval );
diff --git a/testsuite/check.sh b/testsuite/check.sh
index ba77d0a..ba86c74 100755
--- a/testsuite/check.sh
+++ b/testsuite/check.sh
@@ -39,13 +39,10 @@ if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
"${LZIP}" -cqs513MiB in > /dev/null
if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
-printf " in: Bad magic number (file not in lzip format).\n" > msg
-"${LZIP}" -t in 2> out
-if [ $? = 2 ] && cmp out msg ; then printf . ; else printf - ; fail=1 ; fi
-printf " (stdin): Bad magic number (file not in lzip format).\n" > msg
-"${LZIP}" -t < in 2> out
-if [ $? = 2 ] && cmp out msg ; then printf . ; else printf - ; fail=1 ; fi
-rm -f out msg
+"${LZIP}" -tq in
+if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+"${LZIP}" -tq < in
+if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
"${LZIP}" -cdq in
if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
"${LZIP}" -cdq < in
@@ -84,12 +81,21 @@ cmp in2 copy2 || fail=1
printf .
printf "garbage" >> copy2.lz || framework_failure
+rm -f copy2
+"${LZIP}" -atq copy2.lz
+if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+"${LZIP}" -atq < copy2.lz
+if [ $? = 2 ] ; then printf . ; else printf - ; fail=1 ; fi
+"${LZIP}" -adkq copy2.lz
+if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi
+"${LZIP}" -adkq -o copy2 < copy2.lz
+if [ $? = 2 ] && [ ! -e copy2 ] ; then printf . ; else printf - ; fail=1 ; fi
printf "to be overwritten" > copy2 || framework_failure
"${LZIP}" -df copy2.lz || fail=1
cmp in2 copy2 || fail=1
printf .
-"${LZIP}" -cfq "${in_lz}" > out
+"${LZIP}" -cfq "${in_lz}" > /dev/null
if [ $? = 1 ] ; then printf . ; else printf - ; fail=1 ; fi
"${LZIP}" -cF "${in_lz}" > out || fail=1
"${LZIP}" -cd out | "${LZIP}" -d > copy || fail=1