diff options
-rw-r--r-- | COPYING | 3 | ||||
-rw-r--r-- | ChangeLog | 46 | ||||
-rw-r--r-- | Makefile.in | 10 | ||||
-rw-r--r-- | NEWS | 27 | ||||
-rw-r--r-- | README | 41 | ||||
-rw-r--r-- | carg_parser.c | 60 | ||||
-rw-r--r-- | carg_parser.h | 43 | ||||
-rwxr-xr-x | configure | 4 | ||||
-rw-r--r-- | decoder.c | 63 | ||||
-rw-r--r-- | decoder.h | 95 | ||||
-rw-r--r-- | doc/clzip.1 | 35 | ||||
-rw-r--r-- | doc/clzip.info | 753 | ||||
-rw-r--r-- | doc/clzip.texi | 751 | ||||
-rw-r--r-- | encoder.c | 23 | ||||
-rw-r--r-- | encoder.h | 81 | ||||
-rw-r--r-- | encoder_base.c | 16 | ||||
-rw-r--r-- | encoder_base.h | 101 | ||||
-rw-r--r-- | fast_encoder.c | 4 | ||||
-rw-r--r-- | fast_encoder.h | 20 | ||||
-rw-r--r-- | list.c | 21 | ||||
-rw-r--r-- | lzip.h | 40 | ||||
-rw-r--r-- | lzip_index.c | 78 | ||||
-rw-r--r-- | lzip_index.h | 64 | ||||
-rw-r--r-- | main.c | 240 | ||||
-rwxr-xr-x | testsuite/check.sh | 163 | ||||
-rw-r--r-- | testsuite/fox6.lz | bin | 480 -> 0 bytes | |||
-rw-r--r-- | testsuite/fox6_mark.lz | bin | 480 -> 0 bytes | |||
-rw-r--r-- | testsuite/fox_nz.lz | bin | 0 -> 80 bytes | |||
-rw-r--r-- | testsuite/test.txt | 6 | ||||
-rw-r--r-- | testsuite/test.txt.lz | bin | 7376 -> 7341 bytes | |||
-rw-r--r-- | testsuite/test_em.txt.lz | bin | 14024 -> 0 bytes |
31 files changed, 1415 insertions, 1373 deletions
@@ -1,8 +1,7 @@ GNU GENERAL PUBLIC LICENSE Version 2, June 1991 - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (C) 1989, 1991 Free Software Foundation, Inc. <http://fsf.org/> Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. @@ -1,3 +1,13 @@ +2024-11-23 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.15-rc1 released. + * Remove options '--empty-error' and '--marking-error'. + * main.c (Pp_free): New function. + * decoder.c (LZd_decode_member): Remove support for Sync Flush marker. + * clzip.texi: New chapter 'Syntax of command-line arguments'. + * check.sh: Use 'cp' instead of 'cat'. + * testsuite: Add fox_nz.lz. Remove fox6.lz,fox6_mark.lz,test_em.txt.lz. + 2024-01-22 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.14 released. @@ -27,12 +37,12 @@ Make '-o' behave like '-c', but writing to file instead of stdout. Make '-c' and '-o' check whether the output is a terminal only once. Do not open output if input is a terminal. + Set a valid invocation_name even if argc == 0. * Replace 'decompressed', 'compressed' with 'out', 'in' in output. * lzip_index.c: Improve messages for corruption in last header. - * main.c: Set a valid invocation_name even if argc == 0. * Document extraction from tar.lz in manual, '--help', and man page. * clzip.texi (Introduction): Mention plzip and tarlz as alternatives. - * clzip.texi: Several fixes and improvements. + Several fixes and improvements. * testsuite: Add 9 new test files. 2019-01-03 Antonio Diaz Diaz <antonio@gnu.org> @@ -42,7 +52,7 @@ * lzip.h (Lzip_trailer): New function 'Lt_verify_consistency'. * lzip_index.c: Detect some kinds of corrupt trailers. * main.c (main): Check return value of close( infd ). - * main.c: Compile on DOS with DJGPP. + Compile on DOS with DJGPP. (Reported by Robert Riebisch). * clzip.texi: Improve descriptions of '-0..-9', '-m', and '-s'. 
* configure: Accept appending to CFLAGS; 'CFLAGS+=OPTIONS'. * INSTALL: Document use of CFLAGS+='-D __USE_MINGW_ANSI_STDIO'. @@ -53,13 +63,13 @@ * New option '--loose-trailing'. * Improve corrupt header detection to HD=3. * main.c: Show corrupt or truncated header in multimember file. - * main.c (main): Option '-S, --volume-size' now keeps input files. + (main): Make option '-S, --volume-size' keep input files. + (main): Show final diagnostic when testing multiple files. + (set_c_outname): Do not add a second '.lz' to the arg of '-o'. * encoder_base.*: Adjust dictionary size for each member. * Replace 'bits/byte' with inverse compression ratio in output. * Show progress of decompression at verbosity level 2 (-vv). * Show progress of (de)compression only if stderr is a terminal. - * main.c: Show final diagnostic when testing multiple files. - * main.c: Do not add a second .lz extension to the arg of -o. * decoder.c (LZd_verify_trailer): Show stored sizes also in hex. Show dictionary size at verbosity level 4 (-vvvv). * clzip.texi: New chapter 'Meaning of clzip's output'. @@ -73,7 +83,7 @@ * Compression time of options -1 to -9 has been reduced by 1%. * Decompression time has been reduced by 7%. * main.c: Continue testing if any input file is a terminal. - * main.c: Show trailing data in both hexadecimal and ASCII. + Show trailing data in both hexadecimal and ASCII. * lzip_index.c: Improve detection of bad dict and trailing data. * lzip.h: Unify messages for bad magic, trailing data, etc. * clzip.texi: Add missing chapters from lzip.texi. @@ -82,16 +92,14 @@ * Version 1.8 released. * New option '-a, --trailing-error'. - * main.c (decompress): Print up to 6 bytes of trailing data when - '-vvvv' is specified. - * decoder.c (LZd_verify_trailer): Remove test of final code. * main.c (main): Delete '--output' file if infd is a terminal. - * main.c (main): Don't use stdin more than once. + (main): Don't use stdin more than once. 
+ (decompress): Print 6 bytes of trailing data at verbosity level 4. + * decoder.c (LZd_verify_trailer): Remove test of final code. * clzip.texi: New chapter 'Trailing data'. * configure: Avoid warning on some shells when testing for gcc. * Makefile.in: Detect the existence of install-info. - * check.sh: A POSIX shell is required to run the tests. - * check.sh: Don't check error messages. + * check.sh: Require a POSIX shell. Don't check error messages. 2015-07-07 Antonio Diaz Diaz <antonio@gnu.org> @@ -123,8 +131,8 @@ * Compression time has been reduced by 10%. * Decompression time has been reduced by 8%. * Makefile.in: New targets 'install-as-lzip' and 'install-bin'. - * main.c: Use 'setmode' instead of '_setmode' on Windows and OS/2. * main.c: Define 'strtoull' to 'strtoul' on Windows. + (main): Use 'setmode' instead of '_setmode' on Windows and OS/2. 2012-02-25 Antonio Diaz Diaz <ant_diaz@teleline.es> @@ -141,29 +149,29 @@ * Version 1.2 released. * New option '-F, --recompress'. - * main.c (decompress): Print only one status line for each - multimember file when only one '-v' is specified. * encoder.h (Lee_update_prices): Update high length symbol prices independently of the value of 'pos_state'. This gives better compression for large values of '--match-length' without being slower. * encoder.h, encoder.c: Optimize pair price calculations, reducing compression time for large values of '--match-length' by up to 6%. + * main.c (decompress): Print only one status line for each + multimember file when only one '-v' is specified. 2011-01-11 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.1 released. * Code has been converted to 'C89 + long long' from C99. - * main.c: Fix warning about fchown return value being ignored. * decoder.c: '-tvvvv' now shows compression ratio. * main.c: Match length limit set by options -1 to -8 has been reduced to extend range of use towards gzip. Lower numbers now compress less but faster. 
(-1 now takes 43% less time for only 20% larger compressed size). Exit with status 1 if any output file exists and is skipped. - * Compression ratio of option '-9' has been slightly increased. - * main.c (open_instream): Don't show the message + Fix warning about fchown's return value being ignored. + (open_instream): Don't show the message " and '--stdout' was not specified" for directories, etc. + * Compression ratio of option '-9' has been slightly increased. * New examples have been added to the manual. 2010-04-05 Antonio Diaz Diaz <ant_diaz@teleline.es> diff --git a/Makefile.in b/Makefile.in index 55e2bcb..0696854 100644 --- a/Makefile.in +++ b/Makefile.in @@ -2,8 +2,8 @@ DISTNAME = $(pkgname)-$(pkgversion) INSTALL = install INSTALL_PROGRAM = $(INSTALL) -m 755 -INSTALL_DATA = $(INSTALL) -m 644 INSTALL_DIR = $(INSTALL) -d -m 755 +INSTALL_DATA = $(INSTALL) -m 644 SHELL = /bin/sh CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 @@ -31,7 +31,8 @@ main.o : main.c # prevent 'make' from trying to remake source files $(VPATH)/configure $(VPATH)/Makefile.in $(VPATH)/doc/$(pkgname).texi : ; -%.h %.c : ; +MAKEFLAGS += -r +.SUFFIXES : $(objs) : Makefile carg_parser.o : carg_parser.h @@ -130,10 +131,7 @@ dist : doc $(DISTNAME)/testsuite/test.txt \ $(DISTNAME)/testsuite/fox.lz \ $(DISTNAME)/testsuite/fox_*.lz \ - $(DISTNAME)/testsuite/fox6.lz \ - $(DISTNAME)/testsuite/fox6_mark.lz \ - $(DISTNAME)/testsuite/test.txt.lz \ - $(DISTNAME)/testsuite/test_em.txt.lz + $(DISTNAME)/testsuite/test.txt.lz rm -f $(DISTNAME) lzip -v -9 $(DISTNAME).tar @@ -1,24 +1,11 @@ -Changes in version 1.14: +Changes in version 1.15: -The option '--empty-error', which forces exit status 2 if any empty member -is found, has been added. +clzip now exits with error status 2 if any empty member is found in a +multimember file. -The option '--marking-error', which forces exit status 2 if the first LZMA -byte is non-zero in any member, has been added. 
+clzip now exits with error status 2 if the first byte of the LZMA stream is +not 0. -File diagnostics have been reformatted as 'PROGRAM: FILE: MESSAGE'. +Options '--empty-error' and '--marking-error' have been removed. -Diagnostics caused by invalid arguments to command-line options now show the -argument and the name of the option. - -The option '-o, --output' now preserves dates, permissions, and ownership of -the file when (de)compressing exactly one file. - -The option '-o, --output' now creates missing intermediate directories when -writing to a file. - -The variable MAKEINFO has been added to configure and Makefile.in. - -It has been documented in INSTALL that when choosing a C standard, the POSIX -features need to be enabled explicitly: - ./configure CFLAGS+='--std=c99 -D_XOPEN_SOURCE=500' +The chapter 'Syntax of command-line arguments' has been added to the manual. @@ -1,20 +1,21 @@ +See the file INSTALL for compilation and installation instructions. + Description -Clzip is a C language version of lzip, compatible with lzip 1.4 or newer. As -clzip is written in C, it may be easier to integrate in applications like -package managers, embedded devices, or systems lacking a C++ compiler. +Clzip is a C language version of lzip intended for systems lacking a C++ +compiler. Lzip is a lossless data compressor with a user interface similar to the one -of gzip or bzip2. Lzip uses a simplified form of the 'Lempel-Ziv-Markov -chain-Algorithm' (LZMA) stream format to maximize interoperability. The -maximum dictionary size is 512 MiB so that any lzip file can be decompressed -on 32-bit machines. Lzip provides accurate and robust 3-factor integrity -checking. Lzip can compress about as fast as gzip (lzip -0) or compress most -files more than bzip2 (lzip -9). Decompression speed is intermediate between -gzip and bzip2. Lzip is better than gzip and bzip2 from a data recovery -perspective. 
Lzip has been designed, written, and tested with great care to -replace gzip and bzip2 as the standard general-purpose compressed format for -Unix-like systems. +of gzip or bzip2. Lzip uses a simplified form of LZMA (Lempel-Ziv-Markov +chain-Algorithm) designed to achieve complete interoperability between +implementations. The maximum dictionary size is 512 MiB so that any lzip +file can be decompressed on 32-bit machines. Lzip provides accurate and +robust 3-factor integrity checking. 'lzip -0' compresses about as fast as +gzip, while 'lzip -9' compresses most files more than bzip2. Decompression +speed is intermediate between gzip and bzip2. Lzip provides better data +recovery capabilities than gzip and bzip2. Lzip has been designed, written, +and tested with great care to replace gzip and bzip2 as general-purpose +compressed format for Unix-like systems. For compressing/decompressing large files on multiprocessor machines plzip can be much faster than lzip at the cost of a slightly reduced compression @@ -95,7 +96,6 @@ also supported. Clzip can produce multimember files, and lziprecover can safely recover the undamaged members in case of file damage. Clzip can also split the compressed output in volumes of a given size, even when reading from standard input. -This allows the direct creation of multivolume compressed tar archives. Clzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are large, @@ -105,16 +105,16 @@ In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a concrete algorithm; it is more like "any algorithm using the LZMA coding scheme". For example, the option '-0' of lzip uses the scheme in almost the simplest way possible; issuing the longest match it can find, or a literal -byte if it can't find a match. 
Inversely, a much more elaborated way of -finding coding sequences of minimum size than the one currently used by lzip -could be developed, and the resulting sequence could also be coded using the -LZMA coding scheme. +byte if it can't find a match. Inversely, a more elaborate way of finding +coding sequences of minimum size than the one currently used by lzip could +be developed, and the resulting sequence could also be coded using the LZMA +coding scheme. Clzip currently implements two variants of the LZMA algorithm: fast (used by option '-0') and normal (used by all other compression levels). The high compression of LZMA comes from combining two basic, well-proven -compression ideas: sliding dictionaries (LZ77) and markov models (the thing +compression ideas: sliding dictionaries (LZ77) and Markov models (the thing used by every compression algorithm that uses a range encoder or similar order-0 entropy coder as its last stage) with segregation of contexts according to what the bits are used for. @@ -125,6 +125,9 @@ definition of Markov chains), G.N.N. Martin (for the definition of range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). +Clzip uses Arg_parser for command-line argument parsing: +http://www.nongnu.org/arg-parser/arg_parser.html + LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never have been compressed. Decompressed is used to refer to data which have undergone the process of decompression. 
diff --git a/carg_parser.c b/carg_parser.c index edb4eb9..9400342 100644 --- a/carg_parser.c +++ b/carg_parser.c @@ -32,15 +32,15 @@ static void * ap_resize_buffer( void * buf, const int min_size ) } -static char push_back_record( struct Arg_parser * const ap, const int code, +static char push_back_record( Arg_parser * const ap, const int code, const char * const long_name, const char * const argument ) { - struct ap_Record * p; + ap_Record * p; void * tmp = ap_resize_buffer( ap->data, - ( ap->data_size + 1 ) * sizeof (struct ap_Record) ); + ( ap->data_size + 1 ) * sizeof (ap_Record) ); if( !tmp ) return 0; - ap->data = (struct ap_Record *)tmp; + ap->data = (ap_Record *)tmp; p = &(ap->data[ap->data_size]); p->code = code; if( long_name ) @@ -71,7 +71,7 @@ static char push_back_record( struct Arg_parser * const ap, const int code, } -static char add_error( struct Arg_parser * const ap, const char * const msg ) +static char add_error( Arg_parser * const ap, const char * const msg ) { const int len = strlen( msg ); void * tmp = ap_resize_buffer( ap->error, ap->error_size + len + 1 ); @@ -83,7 +83,7 @@ static char add_error( struct Arg_parser * const ap, const char * const msg ) } -static void free_data( struct Arg_parser * const ap ) +static void free_data( Arg_parser * const ap ) { int i; for( i = 0; i < ap->data_size; ++i ) @@ -94,10 +94,9 @@ static void free_data( struct Arg_parser * const ap ) /* Return 0 only if out of memory. 
*/ -static char parse_long_option( struct Arg_parser * const ap, +static char parse_long_option( Arg_parser * const ap, const char * const opt, const char * const arg, - const struct ap_Option options[], - int * const argindp ) + const ap_Option options[], int * const argindp ) { unsigned len; int index = -1, i; @@ -148,21 +147,21 @@ static char parse_long_option( struct Arg_parser * const ap, add_error( ap, "' requires an argument" ); return 1; } - return push_back_record( ap, options[index].code, - options[index].long_name, &opt[len+3] ); + return push_back_record( ap, options[index].code, options[index].long_name, + &opt[len+3] ); /* argument may be empty */ } - if( options[index].has_arg == ap_yes ) + if( options[index].has_arg == ap_yes || options[index].has_arg == ap_yme ) { - if( !arg || !arg[0] ) + if( !arg || ( options[index].has_arg == ap_yes && !arg[0] ) ) { add_error( ap, "option '--" ); add_error( ap, options[index].long_name ); add_error( ap, "' requires an argument" ); return 1; } ++*argindp; - return push_back_record( ap, options[index].code, - options[index].long_name, arg ); + return push_back_record( ap, options[index].code, options[index].long_name, + arg ); /* argument may be empty */ } return push_back_record( ap, options[index].code, @@ -171,10 +170,9 @@ static char parse_long_option( struct Arg_parser * const ap, /* Return 0 only if out of memory. 
*/ -static char parse_short_option( struct Arg_parser * const ap, +static char parse_short_option( Arg_parser * const ap, const char * const opt, const char * const arg, - const struct ap_Option options[], - int * const argindp ) + const ap_Option options[], int * const argindp ) { int cind = 1; /* character index in opt */ @@ -204,15 +202,15 @@ static char parse_short_option( struct Arg_parser * const ap, if( !push_back_record( ap, c, 0, &opt[cind] ) ) return 0; ++*argindp; cind = 0; } - else if( options[index].has_arg == ap_yes ) + else if( options[index].has_arg == ap_yes || options[index].has_arg == ap_yme ) { - if( !arg || !arg[0] ) + if( !arg || ( options[index].has_arg == ap_yes && !arg[0] ) ) { add_error( ap, "option requires an argument -- '" ); add_error( ap, code_str ); add_error( ap, "'" ); return 1; } - ++*argindp; cind = 0; + ++*argindp; cind = 0; /* argument may be empty */ if( !push_back_record( ap, c, 0, arg ) ) return 0; } else if( !push_back_record( ap, c, 0, 0 ) ) return 0; @@ -221,9 +219,9 @@ static char parse_short_option( struct Arg_parser * const ap, } -char ap_init( struct Arg_parser * const ap, +char ap_init( Arg_parser * const ap, const int argc, const char * const argv[], - const struct ap_Option options[], const char in_order ) + const ap_Option options[], const char in_order ) { const char ** non_options = 0; /* skipped non-options */ int non_options_size = 0; /* number of skipped non-options */ @@ -282,7 +280,7 @@ out: if( non_options ) free( non_options ); } -void ap_free( struct Arg_parser * const ap ) +void ap_free( Arg_parser * const ap ) { free_data( ap ); if( ap->error ) { free( ap->error ); ap->error = 0; } @@ -290,29 +288,25 @@ void ap_free( struct Arg_parser * const ap ) } -const char * ap_error( const struct Arg_parser * const ap ) - { return ap->error; } - - -int ap_arguments( const struct Arg_parser * const ap ) - { return ap->data_size; } +const char * ap_error( const Arg_parser * const ap ) { return ap->error; } +int 
ap_arguments( const Arg_parser * const ap ) { return ap->data_size; } -int ap_code( const struct Arg_parser * const ap, const int i ) +int ap_code( const Arg_parser * const ap, const int i ) { if( i < 0 || i >= ap_arguments( ap ) ) return 0; return ap->data[i].code; } -const char * ap_parsed_name( const struct Arg_parser * const ap, const int i ) +const char * ap_parsed_name( const Arg_parser * const ap, const int i ) { if( i < 0 || i >= ap_arguments( ap ) || !ap->data[i].parsed_name ) return ""; return ap->data[i].parsed_name; } -const char * ap_argument( const struct Arg_parser * const ap, const int i ) +const char * ap_argument( const Arg_parser * const ap, const int i ) { if( i < 0 || i >= ap_arguments( ap ) || !ap->data[i].argument ) return ""; return ap->data[i].argument; diff --git a/carg_parser.h b/carg_parser.h index 69ce271..294756c 100644 --- a/carg_parser.h +++ b/carg_parser.h @@ -37,60 +37,65 @@ The argument '--' terminates all options; any following arguments are treated as non-option arguments, even if they begin with a hyphen. - The syntax for optional option arguments is '-<short_option><argument>' - (without whitespace), or '--<long_option>=<argument>'. + The syntax of options with an optional argument is + '-<short_option><argument>' (without whitespace), or + '--<long_option>=<argument>'. + + The syntax of options with an empty argument is '-<short_option> ""', + '--<long_option> ""', or '--<long_option>=""'. 
*/ #ifdef __cplusplus extern "C" { #endif -enum ap_Has_arg { ap_no, ap_yes, ap_maybe }; +/* ap_yme = yes but maybe empty */ +typedef enum ap_Has_arg { ap_no, ap_yes, ap_maybe, ap_yme } ap_Has_arg; -struct ap_Option +typedef struct ap_Option { int code; /* Short option letter or code ( code != 0 ) */ const char * long_name; /* Long option name (maybe null) */ - enum ap_Has_arg has_arg; - }; + ap_Has_arg has_arg; + } ap_Option; -struct ap_Record +typedef struct ap_Record { int code; char * parsed_name; char * argument; - }; + } ap_Record; -struct Arg_parser +typedef struct Arg_parser { - struct ap_Record * data; + ap_Record * data; char * error; int data_size; int error_size; - }; + } Arg_parser; -char ap_init( struct Arg_parser * const ap, +char ap_init( Arg_parser * const ap, const int argc, const char * const argv[], - const struct ap_Option options[], const char in_order ); + const ap_Option options[], const char in_order ); -void ap_free( struct Arg_parser * const ap ); +void ap_free( Arg_parser * const ap ); -const char * ap_error( const struct Arg_parser * const ap ); +const char * ap_error( const Arg_parser * const ap ); /* The number of arguments parsed. May be different from argc. */ -int ap_arguments( const struct Arg_parser * const ap ); +int ap_arguments( const Arg_parser * const ap ); /* If ap_code( i ) is 0, ap_argument( i ) is a non-option. Else ap_argument( i ) is the option's argument (or empty). */ -int ap_code( const struct Arg_parser * const ap, const int i ); +int ap_code( const Arg_parser * const ap, const int i ); /* Full name of the option parsed (short or long). */ -const char * ap_parsed_name( const struct Arg_parser * const ap, const int i ); +const char * ap_parsed_name( const Arg_parser * const ap, const int i ); -const char * ap_argument( const struct Arg_parser * const ap, const int i ); +const char * ap_argument( const Arg_parser * const ap, const int i ); #ifdef __cplusplus } @@ -6,7 +6,7 @@ # to copy, distribute, and modify it. 
pkgname=clzip -pkgversion=1.14 +pkgversion=1.15-rc1 progname=clzip srctrigger=doc/${pkgname}.texi @@ -109,7 +109,7 @@ while [ $# != 0 ] ; do exit 1 ;; esac - # Check if the option took a separate argument + # Check whether the option took a separate argument if [ "${arg2}" = yes ] ; then if [ $# != 0 ] ; then args="${args} \"$1\"" ; shift else echo "configure: Missing argument to '${option}'" 1>&2 @@ -66,14 +66,14 @@ int writeblock( const int fd, const uint8_t * const buf, const int size ) } -bool Rd_read_block( struct Range_decoder * const rdec ) +bool Rd_read_block( Range_decoder * const rdec ) { if( !rdec->at_stream_end ) { rdec->stream_pos = readblock( rdec->infd, rdec->buffer, rd_buffer_size ); if( rdec->stream_pos != rd_buffer_size && errno ) { show_error( "Read error", errno, false ); cleanup_and_fail( 1 ); } - rdec->at_stream_end = ( rdec->stream_pos < rd_buffer_size ); + rdec->at_stream_end = rdec->stream_pos < rd_buffer_size; rdec->partial_member_pos += rdec->pos; rdec->pos = 0; show_dprogress( 0, 0, 0, 0 ); @@ -82,7 +82,7 @@ bool Rd_read_block( struct Range_decoder * const rdec ) } -void LZd_flush_data( struct LZ_decoder * const d ) +void LZd_flush_data( LZ_decoder * const d ) { if( d->pos > d->stream_pos ) { @@ -90,7 +90,7 @@ void LZd_flush_data( struct LZ_decoder * const d ) CRC32_update_buf( &d->crc, d->buffer + d->stream_pos, size ); if( d->outfd >= 0 && writeblock( d->outfd, d->buffer + d->stream_pos, size ) != size ) - { show_error( "Write error", errno, false ); cleanup_and_fail( 1 ); } + { show_error( write_error_msg, errno, false ); cleanup_and_fail( 1 ); } if( d->pos >= d->dictionary_size ) { d->partial_data_pos += d->pos; d->pos = 0; d->pos_wrapped = true; } d->stream_pos = d->pos; @@ -98,9 +98,7 @@ void LZd_flush_data( struct LZ_decoder * const d ) } -static int LZd_check_trailer( struct LZ_decoder * const d, - struct Pretty_print * const pp, - const bool ignore_empty ) +static bool LZd_check_trailer( LZ_decoder * const d, Pretty_print * 
const pp ) { Lzip_trailer trailer; int size = Rd_read_data( d->rdec, trailer, Lt_size ); @@ -145,8 +143,7 @@ static int LZd_check_trailer( struct LZ_decoder * const d, fprintf( stderr, "Member size mismatch; stored %llu (0x%llX), computed %llu (0x%llX)\n", tm_size, tm_size, member_size, member_size ); } } - if( error ) return 3; - if( !ignore_empty && data_size == 0 ) return 5; + if( error ) return false; if( verbosity >= 2 ) { if( verbosity >= 4 ) show_header( d->dictionary_size ); @@ -161,18 +158,16 @@ static int LZd_check_trailer( struct LZ_decoder * const d, if( verbosity >= 3 ) fprintf( stderr, "%9llu out, %8llu in. ", data_size, member_size ); } - return 0; + return true; } /* Return value: 0 = OK, 1 = decoder error, 2 = unexpected EOF, 3 = trailer error, 4 = unknown marker found, - 5 = empty member found, 6 = marked member found. */ -int LZd_decode_member( struct LZ_decoder * const d, - const struct Cl_options * const cl_opts, - struct Pretty_print * const pp ) + 5 = nonzero first LZMA byte found. 
*/ +int LZd_decode_member( LZ_decoder * const d, Pretty_print * const pp ) { - struct Range_decoder * const rdec = d->rdec; + Range_decoder * const rdec = d->rdec; Bit_model bm_literal[1<<literal_context_bits][0x300]; Bit_model bm_match[states][pos_states]; Bit_model bm_rep[states]; @@ -183,8 +178,8 @@ int LZd_decode_member( struct LZ_decoder * const d, Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; - struct Len_model match_len_model; - struct Len_model rep_len_model; + Len_model match_len_model; + Len_model rep_len_model; unsigned rep0 = 0; /* rep[0-3] latest four distances */ unsigned rep1 = 0; /* used for efficient coding of */ unsigned rep2 = 0; /* repeated distances */ @@ -204,7 +199,7 @@ int LZd_decode_member( struct LZ_decoder * const d, Lm_init( &match_len_model ); Lm_init( &rep_len_model ); - if( !Rd_load( rdec, cl_opts->ignore_marking ) ) return 6; + if( !Rd_load( rdec ) ) return 5; while( !Rd_finished( rdec ) ) { const int pos_state = LZd_data_position( d ) & pos_state_mask; @@ -249,39 +244,33 @@ int LZd_decode_member( struct LZ_decoder * const d, } else /* match */ { + rep3 = rep2; rep2 = rep1; rep1 = rep0; len = Rd_decode_len( rdec, &match_len_model, pos_state ); - unsigned distance = Rd_decode_tree6( rdec, bm_dis_slot[get_len_state(len)] ); - if( distance >= start_dis_model ) + rep0 = Rd_decode_tree6( rdec, bm_dis_slot[get_len_state(len)] ); + if( rep0 >= start_dis_model ) { - const unsigned dis_slot = distance; + const unsigned dis_slot = rep0; const int direct_bits = ( dis_slot >> 1 ) - 1; - distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) - distance += Rd_decode_tree_reversed( rdec, - bm_dis + ( distance - dis_slot ), direct_bits ); + rep0 += Rd_decode_tree_reversed( rdec, bm_dis + ( rep0 - dis_slot ), + direct_bits ); else { - distance += - Rd_decode( rdec, direct_bits - 
dis_align_bits ) << dis_align_bits; - distance += Rd_decode_tree_reversed4( rdec, bm_align ); - if( distance == 0xFFFFFFFFU ) /* marker found */ + rep0 += Rd_decode( rdec, direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += Rd_decode_tree_reversed4( rdec, bm_align ); + if( rep0 == 0xFFFFFFFFU ) /* marker found */ { Rd_normalize( rdec ); LZd_flush_data( d ); if( len == min_match_len ) /* End Of Stream marker */ - return LZd_check_trailer( d, pp, cl_opts->ignore_empty ); - if( len == min_match_len + 1 ) /* Sync Flush marker */ - { Rd_load( rdec, true ); continue; } - if( verbosity >= 0 ) - { - Pp_show_msg( pp, 0 ); - fprintf( stderr, "Unsupported marker code '%d'\n", len ); - } + { if( LZd_check_trailer( d, pp ) ) return 0; else return 3; } + if( verbosity >= 0 ) { Pp_show_msg( pp, 0 ); + fprintf( stderr, "Unsupported marker code '%d'\n", len ); } return 4; } } } - rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; state = St_set_match( state ); if( rep0 >= d->dictionary_size || ( rep0 >= d->pos && !d->pos_wrapped ) ) { LZd_flush_data( d ); return 1; } @@ -29,9 +29,9 @@ struct Range_decoder bool at_stream_end; }; -bool Rd_read_block( struct Range_decoder * const rdec ); +bool Rd_read_block( Range_decoder * const rdec ); -static inline bool Rd_init( struct Range_decoder * const rdec, const int ifd ) +static inline bool Rd_init( Range_decoder * const rdec, const int ifd ) { rdec->partial_member_pos = 0; rdec->buffer = (uint8_t *)malloc( rd_buffer_size ); @@ -45,27 +45,27 @@ static inline bool Rd_init( struct Range_decoder * const rdec, const int ifd ) return true; } -static inline void Rd_free( struct Range_decoder * const rdec ) +static inline void Rd_free( Range_decoder * const rdec ) { free( rdec->buffer ); } -static inline bool Rd_finished( struct Range_decoder * const rdec ) +static inline bool Rd_finished( Range_decoder * const rdec ) { return rdec->pos >= rdec->stream_pos && !Rd_read_block( rdec ); } static inline unsigned long long 
-Rd_member_position( const struct Range_decoder * const rdec ) +Rd_member_position( const Range_decoder * const rdec ) { return rdec->partial_member_pos + rdec->pos; } -static inline void Rd_reset_member_position( struct Range_decoder * const rdec ) +static inline void Rd_reset_member_position( Range_decoder * const rdec ) { rdec->partial_member_pos = 0; rdec->partial_member_pos -= rdec->pos; } -static inline uint8_t Rd_get_byte( struct Range_decoder * const rdec ) +static inline uint8_t Rd_get_byte( Range_decoder * const rdec ) { /* 0xFF avoids decoder error if member is truncated at EOS marker */ if( Rd_finished( rdec ) ) return 0xFF; return rdec->buffer[rdec->pos++]; } -static inline int Rd_read_data( struct Range_decoder * const rdec, +static inline int Rd_read_data( Range_decoder * const rdec, uint8_t * const outbuf, const int size ) { int sz = 0; @@ -79,25 +79,24 @@ static inline int Rd_read_data( struct Range_decoder * const rdec, return sz; } -static inline bool Rd_load( struct Range_decoder * const rdec, - const bool ignore_marking ) +static inline bool Rd_load( Range_decoder * const rdec ) { - int i; rdec->code = 0; rdec->range = 0xFFFFFFFFU; - /* check and discard first byte of the LZMA stream */ - if( Rd_get_byte( rdec ) != 0 && !ignore_marking ) return false; - for( i = 0; i < 4; ++i ) rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); + /* check first byte of the LZMA stream */ + if( Rd_get_byte( rdec ) != 0 ) return false; + int i; for( i = 0; i < 4; ++i ) + rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); return true; } -static inline void Rd_normalize( struct Range_decoder * const rdec ) +static inline void Rd_normalize( Range_decoder * const rdec ) { if( rdec->range <= 0x00FFFFFFU ) { rdec->range <<= 8; rdec->code = (rdec->code << 8) | Rd_get_byte( rdec ); } } -static inline unsigned Rd_decode( struct Range_decoder * const rdec, +static inline unsigned Rd_decode( Range_decoder * const rdec, const int num_bits ) { unsigned symbol = 0; @@ 
-108,14 +107,14 @@ static inline unsigned Rd_decode( struct Range_decoder * const rdec, rdec->range >>= 1; /* symbol <<= 1; */ /* if( rdec->code >= rdec->range ) { rdec->code -= rdec->range; symbol |= 1; } */ - const bool bit = ( rdec->code >= rdec->range ); + const bool bit = rdec->code >= rdec->range; symbol <<= 1; symbol += bit; rdec->code -= rdec->range & ( 0U - bit ); } return symbol; } -static inline unsigned Rd_decode_bit( struct Range_decoder * const rdec, +static inline unsigned Rd_decode_bit( Range_decoder * const rdec, Bit_model * const probability ) { Rd_normalize( rdec ); @@ -135,7 +134,7 @@ static inline unsigned Rd_decode_bit( struct Range_decoder * const rdec, } } -static inline void Rd_decode_symbol_bit( struct Range_decoder * const rdec, +static inline void Rd_decode_symbol_bit( Range_decoder * const rdec, Bit_model * const probability, unsigned * symbol ) { Rd_normalize( rdec ); @@ -155,7 +154,7 @@ static inline void Rd_decode_symbol_bit( struct Range_decoder * const rdec, } } -static inline void Rd_decode_symbol_bit_reversed( struct Range_decoder * const rdec, +static inline void Rd_decode_symbol_bit_reversed( Range_decoder * const rdec, Bit_model * const probability, unsigned * model, unsigned * symbol, const int i ) { @@ -177,7 +176,7 @@ static inline void Rd_decode_symbol_bit_reversed( struct Range_decoder * const r } } -static inline unsigned Rd_decode_tree6( struct Range_decoder * const rdec, +static inline unsigned Rd_decode_tree6( Range_decoder * const rdec, Bit_model bm[] ) { unsigned symbol = 1; @@ -190,7 +189,7 @@ static inline unsigned Rd_decode_tree6( struct Range_decoder * const rdec, return symbol & 0x3F; } -static inline unsigned Rd_decode_tree8( struct Range_decoder * const rdec, +static inline unsigned Rd_decode_tree8( Range_decoder * const rdec, Bit_model bm[] ) { unsigned symbol = 1; @@ -205,9 +204,8 @@ static inline unsigned Rd_decode_tree8( struct Range_decoder * const rdec, return symbol & 0xFF; } -static inline unsigned 
-Rd_decode_tree_reversed( struct Range_decoder * const rdec, - Bit_model bm[], const int num_bits ) +static inline unsigned Rd_decode_tree_reversed( Range_decoder * const rdec, + Bit_model bm[], const int num_bits ) { unsigned model = 1; unsigned symbol = 0; @@ -218,7 +216,7 @@ Rd_decode_tree_reversed( struct Range_decoder * const rdec, } static inline unsigned -Rd_decode_tree_reversed4( struct Range_decoder * const rdec, Bit_model bm[] ) +Rd_decode_tree_reversed4( Range_decoder * const rdec, Bit_model bm[] ) { unsigned model = 1; unsigned symbol = 0; @@ -229,7 +227,7 @@ Rd_decode_tree_reversed4( struct Range_decoder * const rdec, Bit_model bm[] ) return symbol; } -static inline unsigned Rd_decode_matched( struct Range_decoder * const rdec, +static inline unsigned Rd_decode_matched( Range_decoder * const rdec, Bit_model bm[], unsigned match_byte ) { unsigned symbol = 1; @@ -244,9 +242,8 @@ static inline unsigned Rd_decode_matched( struct Range_decoder * const rdec, } } -static inline unsigned Rd_decode_len( struct Range_decoder * const rdec, - struct Len_model * const lm, - const int pos_state ) +static inline unsigned Rd_decode_len( Range_decoder * const rdec, + Len_model * const lm, const int pos_state ) { Bit_model * bm; unsigned mask, offset, symbol = 1; @@ -269,10 +266,10 @@ len3: } -struct LZ_decoder +typedef struct LZ_decoder { unsigned long long partial_data_pos; - struct Range_decoder * rdec; + Range_decoder * rdec; unsigned dictionary_size; uint8_t * buffer; /* output buffer */ unsigned pos; /* current pos in buffer */ @@ -280,14 +277,14 @@ struct LZ_decoder uint32_t crc; int outfd; /* output file descriptor */ bool pos_wrapped; - }; + } LZ_decoder; -void LZd_flush_data( struct LZ_decoder * const d ); +void LZd_flush_data( LZ_decoder * const d ); -static inline uint8_t LZd_peek_prev( const struct LZ_decoder * const d ) +static inline uint8_t LZd_peek_prev( const LZ_decoder * const d ) { return d->buffer[((d->pos > 0) ? 
d->pos : d->dictionary_size)-1]; } -static inline uint8_t LZd_peek( const struct LZ_decoder * const d, +static inline uint8_t LZd_peek( const LZ_decoder * const d, const unsigned distance ) { const unsigned i = ( ( d->pos > distance ) ? 0 : d->dictionary_size ) + @@ -295,27 +292,27 @@ static inline uint8_t LZd_peek( const struct LZ_decoder * const d, return d->buffer[i]; } -static inline void LZd_put_byte( struct LZ_decoder * const d, const uint8_t b ) +static inline void LZd_put_byte( LZ_decoder * const d, const uint8_t b ) { d->buffer[d->pos] = b; if( ++d->pos >= d->dictionary_size ) LZd_flush_data( d ); } -static inline void LZd_copy_block( struct LZ_decoder * const d, +static inline void LZd_copy_block( LZ_decoder * const d, const unsigned distance, unsigned len ) { unsigned lpos = d->pos, i = lpos - distance - 1; bool fast, fast2; if( lpos > distance ) { - fast = ( len < d->dictionary_size - lpos ); - fast2 = ( fast && len <= lpos - i ); + fast = len < d->dictionary_size - lpos; + fast2 = fast && len <= lpos - i; } else { i += d->dictionary_size; - fast = ( len < d->dictionary_size - i ); /* (i == pos) may happen */ - fast2 = ( fast && len <= i - lpos ); + fast = len < d->dictionary_size - i; /* (i == pos) may happen */ + fast2 = fast && len <= i - lpos; } if( fast ) /* no wrap */ { @@ -333,8 +330,7 @@ static inline void LZd_copy_block( struct LZ_decoder * const d, } } -static inline bool LZd_init( struct LZ_decoder * const d, - struct Range_decoder * const rde, +static inline bool LZd_init( LZ_decoder * const d, Range_decoder * const rde, const unsigned dict_size, const int ofd ) { d->partial_data_pos = 0; @@ -352,16 +348,13 @@ static inline bool LZd_init( struct LZ_decoder * const d, return true; } -static inline void LZd_free( struct LZ_decoder * const d ) +static inline void LZd_free( LZ_decoder * const d ) { free( d->buffer ); } -static inline unsigned LZd_crc( const struct LZ_decoder * const d ) +static inline unsigned LZd_crc( const LZ_decoder * const d 
) { return d->crc ^ 0xFFFFFFFFU; } -static inline unsigned long long -LZd_data_position( const struct LZ_decoder * const d ) +static inline unsigned long long LZd_data_position( const LZ_decoder * const d ) { return d->partial_data_pos + d->pos; } -int LZd_decode_member( struct LZ_decoder * const d, - const struct Cl_options * const cl_opts, - struct Pretty_print * const pp ); +int LZd_decode_member( LZ_decoder * const d, Pretty_print * const pp ); diff --git a/doc/clzip.1 b/doc/clzip.1 index 46f69e9..64d5366 100644 --- a/doc/clzip.1 +++ b/doc/clzip.1 @@ -1,26 +1,25 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.49.2. -.TH CLZIP "1" "January 2024" "clzip 1.14" "User Commands" +.TH CLZIP "1" "November 2024" "clzip 1.15-rc1" "User Commands" .SH NAME clzip \- reduces the size of files .SH SYNOPSIS .B clzip [\fI\,options\/\fR] [\fI\,files\/\fR] .SH DESCRIPTION -Clzip is a C language version of lzip, compatible with lzip 1.4 or newer. As -clzip is written in C, it may be easier to integrate in applications like -package managers, embedded devices, or systems lacking a C++ compiler. +Clzip is a C language version of lzip intended for systems lacking a C++ +compiler. .PP Lzip is a lossless data compressor with a user interface similar to the one -of gzip or bzip2. Lzip uses a simplified form of the 'Lempel\-Ziv\-Markov -chain\-Algorithm' (LZMA) stream format to maximize interoperability. The -maximum dictionary size is 512 MiB so that any lzip file can be decompressed -on 32\-bit machines. Lzip provides accurate and robust 3\-factor integrity -checking. Lzip can compress about as fast as gzip (lzip \fB\-0\fR) or compress most -files more than bzip2 (lzip \fB\-9\fR). Decompression speed is intermediate between -gzip and bzip2. Lzip is better than gzip and bzip2 from a data recovery -perspective. Lzip has been designed, written, and tested with great care to -replace gzip and bzip2 as the standard general\-purpose compressed format for -Unix\-like systems. 
+of gzip or bzip2. Lzip uses a simplified form of LZMA (Lempel\-Ziv\-Markov +chain\-Algorithm) designed to achieve complete interoperability between +implementations. The maximum dictionary size is 512 MiB so that any lzip +file can be decompressed on 32\-bit machines. Lzip provides accurate and +robust 3\-factor integrity checking. 'lzip \fB\-0\fR' compresses about as fast as +gzip, while 'lzip \fB\-9\fR' compresses most files more than bzip2. Decompression +speed is intermediate between gzip and bzip2. Lzip provides better data +recovery capabilities than gzip and bzip2. Lzip has been designed, written, +and tested with great care to replace gzip and bzip2 as general\-purpose +compressed format for Unix\-like systems. .SH OPTIONS .TP \fB\-h\fR, \fB\-\-help\fR @@ -33,7 +32,7 @@ output version information and exit exit with error status if trailing data .TP \fB\-b\fR, \fB\-\-member\-size=\fR<bytes> -set member size limit in bytes +set member size limit of multimember files .TP \fB\-c\fR, \fB\-\-stdout\fR write to standard output, keep input files @@ -83,12 +82,6 @@ alias for \fB\-0\fR \fB\-\-best\fR alias for \fB\-9\fR .TP -\fB\-\-empty\-error\fR -exit with error status if empty member in file -.TP -\fB\-\-marking\-error\fR -exit with error status if 1st LZMA byte not 0 -.TP \fB\-\-loose\-trailing\fR allow trailing data seeming corrupt header .PP diff --git a/doc/clzip.info b/doc/clzip.info index 2d83e3c..adcc658 100644 --- a/doc/clzip.info +++ b/doc/clzip.info @@ -11,17 +11,18 @@ File: clzip.info, Node: Top, Next: Introduction, Up: (dir) Clzip Manual ************ -This manual is for Clzip (version 1.14, 22 January 2024). +This manual is for Clzip (version 1.15-rc1, 23 November 2024). 
* Menu: * Introduction:: Purpose and features of clzip * Output:: Meaning of clzip's output * Invoking clzip:: Command-line interface -* Quality assurance:: Design, development, and testing of lzip -* Algorithm:: How clzip compresses the data +* Argument syntax:: By convention, options start with a hyphen * File format:: Detailed format of the compressed file * Stream format:: Format of the LZMA stream in lzip files +* Quality assurance:: Design, development, and testing of lzip +* Algorithm:: How clzip compresses the data * Trailing data:: Extra data appended to the file * Examples:: A small tutorial with examples * Problems:: Reporting bugs @@ -40,21 +41,20 @@ File: clzip.info, Node: Introduction, Next: Output, Prev: Top, Up: Top 1 Introduction ************** -Clzip is a C language version of lzip, compatible with lzip 1.4 or newer. -As clzip is written in C, it may be easier to integrate in applications like -package managers, embedded devices, or systems lacking a C++ compiler. +Clzip is a C language version of lzip intended for systems lacking a C++ +compiler. Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Lzip uses a simplified form of the 'Lempel-Ziv-Markov -chain-Algorithm' (LZMA) stream format to maximize interoperability. The -maximum dictionary size is 512 MiB so that any lzip file can be decompressed -on 32-bit machines. Lzip provides accurate and robust 3-factor integrity -checking. Lzip can compress about as fast as gzip (lzip -0) or compress most -files more than bzip2 (lzip -9). Decompression speed is intermediate between -gzip and bzip2. Lzip is better than gzip and bzip2 from a data recovery -perspective. Lzip has been designed, written, and tested with great care to -replace gzip and bzip2 as the standard general-purpose compressed format for -Unix-like systems. +one of gzip or bzip2. 
Lzip uses a simplified form of LZMA (Lempel-Ziv-Markov +chain-Algorithm) designed to achieve complete interoperability between +implementations. The maximum dictionary size is 512 MiB so that any lzip +file can be decompressed on 32-bit machines. Lzip provides accurate and +robust 3-factor integrity checking. 'lzip -0' compresses about as fast as +gzip, while 'lzip -9' compresses most files more than bzip2. Decompression +speed is intermediate between gzip and bzip2. Lzip provides better data +recovery capabilities than gzip and bzip2. Lzip has been designed, written, +and tested with great care to replace gzip and bzip2 as general-purpose +compressed format for Unix-like systems. For compressing/decompressing large files on multiprocessor machines plzip can be much faster than lzip at the cost of a slightly reduced @@ -92,7 +92,7 @@ byte near the beginning is a thing of the past. The member trailer stores the 32-bit CRC of the original data, the size of the original data, and the size of the member. These values, together -with the "End Of Stream" marker, provide a 3-factor integrity checking which +with the 'End Of Stream' marker, provide a 3-factor integrity checking which guarantees that the decompressed version of the data is identical to the original. This guards against corruption of the compressed data, and against undetected bugs in clzip (hopefully very unlikely). The chances of data @@ -105,9 +105,9 @@ makes it safer than compressors returning ambiguous warning values (like gzip) when it is used as a back end for other programs like tar or zutils. Clzip automatically uses for each file the largest dictionary size that -does not exceed neither the file size nor the limit given. Keep in mind -that the decompression memory requirement is affected at compression time -by the choice of dictionary size limit. +exceeds neither the file size nor the limit given.
The dictionary +size used for decompression is the same dictionary size used for +compression. The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary size @@ -146,8 +146,7 @@ also supported. Clzip can produce multimember files, and lziprecover can safely recover the undamaged members in case of file damage. Clzip can also split the compressed output in volumes of a given size, even when reading from -standard input. This allows the direct creation of multivolume compressed -tar archives. +standard input. Clzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are large, @@ -201,7 +200,7 @@ have been compressed. Decompressed is used to refer to data which have undergone the process of decompression. -File: clzip.info, Node: Invoking clzip, Next: Quality assurance, Prev: Output, Up: Top +File: clzip.info, Node: Invoking clzip, Next: Argument syntax, Prev: Output, Up: Top 3 Invoking clzip **************** @@ -216,8 +215,7 @@ means standard input. It can be mixed with other FILES and is read just once, the first time it appears in the command line. Remember to prepend './' to any file name beginning with a hyphen, or use '--'. - clzip supports the following options: *Note Argument syntax: -(arg_parser)Argument syntax. +clzip supports the following options: *Note Argument syntax::. '-h' '--help' @@ -236,11 +234,12 @@ once, the first time it appears in the command line. Remember to prepend '-b BYTES' '--member-size=BYTES' - When compressing, set the member size limit to BYTES. It is advisable - to keep members smaller than RAM size so that they can be repaired with - lziprecover in case of corruption. A small member size may degrade - compression ratio, so use it only when needed. Valid values range from - 100 kB to 2 PiB. Defaults to 2 PiB. + When compressing, set the member size limit to BYTES. 
If BYTES is + smaller than the compressed size, a multimember file is produced. It is + advisable to keep members smaller than RAM size so that they can be + repaired with lziprecover in case of corruption. A small member size + may degrade compression ratio, so use it only when needed. Valid + values range from 100 kB to 2 PiB. Defaults to 2 PiB. '-c' '--stdout' @@ -261,7 +260,8 @@ once, the first time it appears in the command line. Remember to prepend status 1. If a file fails to decompress, or is a terminal, clzip exits immediately with error status 2 without decompressing the rest of the files. A terminal is considered an uncompressed file, and therefore - invalid. + invalid. A multimember file with one or more empty members is accepted + if redirected to standard input. '-f' '--force' @@ -285,7 +285,8 @@ once, the first time it appears in the command line. Remember to prepend '-v', the dictionary size, the number of members in the file, and the amount of trailing data (if any) are also printed. With '-vv', the positions and sizes of each member in multimember files are also - printed. + printed. A multimember file with one or more empty members is accepted + if redirected to standard input. If any file is damaged, does not exist, can't be opened, or is not regular, the final exit status is > 0. '-lq' can be used to check @@ -360,7 +361,8 @@ once, the first time it appears in the command line. Remember to prepend fails the test, does not exist, can't be opened, or is a terminal, clzip continues testing the rest of the files. A final diagnostic is shown at verbosity level 1 or higher if any file fails the test when - testing multiple files. + testing multiple files. A multimember file with one or more empty + members is accepted if redirected to standard input. '-v' '--verbose' @@ -370,8 +372,8 @@ once, the first time it appears in the command line. 
Remember to prepend When decompressing or testing, further -v's (up to 4) increase the verbosity level, showing status, compression ratio, dictionary size, trailer contents (CRC, data size, member size), and up to 6 bytes of - trailing data (if any) both in hexadecimal and as a string of printable - ASCII characters. + trailing data (if any) both in hexadecimal and as a string of + printable ASCII characters. Two or more '-v' options show the progress of (de)compression. '-0 .. -9' @@ -391,6 +393,7 @@ once, the first time it appears in the command line. Remember to prepend '-s64MiB -m273' Level Dictionary size (-s) Match length limit (-m) + ------------------------------------------------------ -0 64 KiB 16 bytes -1 1 MiB 5 bytes -2 1.5 MiB 6 bytes @@ -406,21 +409,11 @@ once, the first time it appears in the command line. Remember to prepend '--best' Aliases for GNU gzip compatibility. -'--empty-error' - Exit with error status 2 if any empty member is found in the input - files. - -'--marking-error' - Exit with error status 2 if the first LZMA byte is non-zero in any - member of the input files. This may be caused by data corruption or by - deliberate insertion of tracking information in the file. Use - 'lziprecover --clear-marking' to clear any such non-zero bytes. - '--loose-trailing' When decompressing, testing, or listing, allow trailing data whose first bytes are so similar to the magic bytes of a lzip header that they can be confused with a corrupt header. Use this option if a file - triggers a "corrupt header" error and the cause is not indeed a + triggers a 'corrupt header' error and the cause is not indeed a corrupt header. @@ -431,6 +424,7 @@ and may be followed by a multiplier and an optional 'B' for "byte". 
Table of SI and binary prefixes (unit multipliers): Prefix Value | Prefix Value +---------------------------------------------------------------------- k kilobyte (10^3 = 1000) | Ki kibibyte (2^10 = 1024) M megabyte (10^6) | Mi mebibyte (2^20) G gigabyte (10^9) | Gi gibibyte (2^30) @@ -449,278 +443,58 @@ corrupt or invalid input file, 3 for an internal consistency error (e.g., bug) which caused clzip to panic. -File: clzip.info, Node: Quality assurance, Next: Algorithm, Prev: Invoking clzip, Up: Top - -4 Design, development, and testing of lzip -****************************************** - -There are two ways of constructing a software design: One way is to make it -so simple that there are obviously no deficiencies and the other way is to -make it so complicated that there are no obvious deficiencies. The first -method is far more difficult. --- C.A.R. Hoare - - Lzip has been designed, written, and tested with great care to replace -gzip and bzip2 as the standard general-purpose compressed format for -Unix-like systems. This chapter describes the lessons learned from these -previous formats, and their application to the design of lzip. The lzip -format specification has been reviewed carefully and is believed to be free -from design errors. - - -4.1 Format design -================= - -When gzip was designed in 1992, computers and operating systems were much -less capable than they are today. The designers of gzip tried to work around -some of those limitations, like 8.3 file names, with additional fields in -the file format. - - Today those limitations have mostly disappeared, and the format of gzip -has proved to be unnecessarily complicated. It includes fields that were -never used, others that have lost their usefulness, and finally others that -have become too limited. - - Bzip2 was designed 5 years later, and its format is simpler than the one -of gzip. 
- - Probably the worst defect of the gzip format from the point of view of -data safety is the variable size of its header. If the byte at offset 3 -(flags) of a gzip member gets corrupted, it may become difficult to recover -the data, even if the compressed blocks are intact, because it can't be -known with certainty where the compressed blocks begin. - - By contrast, the header of a lzip member has a fixed length of 6. The -LZMA stream in a lzip member always starts at offset 6, making it trivial to -recover the data even if the whole header becomes corrupt. - - Bzip2 also provides a header of fixed length and marks the begin and end -of each compressed block with six magic bytes, making it possible to find -the compressed blocks even in case of file damage. But bzip2 does not store -the size of each compressed block, as lzip does. - - Lziprecover is able to provide unique data recovery capabilities because -the lzip format is extraordinarily safe. The simple and safe design of the -file format complements the embedded error detection provided by the LZMA -data stream. Any distance larger than the dictionary size acts as a -forbidden symbol, allowing the decompressor to detect the approximate -position of errors, and leaving very little work for the check sequence -(CRC and data sizes) in the detection of errors. Lzip is usually able to -detect all possible bit flips in the compressed data without resorting to -the check sequence. It would be difficult to write an automatic recovery -tool like lziprecover for the gzip format. And, as far as I know, it has -never been written. - - Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the -decompressed data because it provides optimal accuracy in the detection of -errors up to a compressed size of about 16 GiB, a size larger than that of -most files. 
In the case of lzip, the additional detection capability of the -decompressor reduces the probability of undetected errors several million -times more, resulting in a combined integrity checking optimally accurate -for any member size produced by lzip. Preliminary results suggest that the -lzip format is safe enough to be used in critical safety avionics systems. - - The lzip format is designed for long-term archiving. Therefore it -excludes any unneeded features that may interfere with the future -extraction of the decompressed data. - - -4.1.1 Gzip format (mis)features not present in lzip ---------------------------------------------------- - -'Multiple algorithms' - Gzip provides a CM (Compression Method) field that has never been used - because it is a bad idea to begin with. New compression methods may - require additional fields, making it impossible to implement new - methods and, at the same time, keep the same format. This field does - not solve the problem of format proliferation; it just makes the - problem less obvious. - -'Optional fields in header' - Unless special precautions are taken, optional fields are generally a - bad idea because they produce a header of variable size. The gzip - header has 2 fields that, in addition to being optional, are - zero-terminated. This means that if any byte inside the field gets - zeroed, or if the terminating zero gets altered, gzip won't be able to - find neither the header CRC nor the compressed blocks. - -'Optional CRC for the header' - Using an optional CRC for the header is not only a bad idea, it is an - error; it circumvents the Hamming distance (HD) of the CRC and may - prevent the extraction of perfectly good data. For example, if the CRC - is used and the bit enabling it is reset by a bit flip, then the - header seems to be intact (in spite of being corrupt) while the - compressed blocks seem to be totally unrecoverable (in spite of being - intact). Very misleading indeed. 
- -'Metadata' - The gzip format stores some metadata, like the modification time of the - original file or the operating system on which compression took place. - This complicates reproducible compression (obtaining identical - compressed output from identical input). - - -4.1.2 Lzip format improvements over gzip and bzip2 --------------------------------------------------- - -'64-bit size field' - Probably the most frequently reported shortcoming of the gzip format - is that it only stores the least significant 32 bits of the - uncompressed size. The size of any file larger or equal than 4 GiB - gets truncated. - - Bzip2 does not store the uncompressed size of the file. - - The lzip format provides a 64-bit field for the uncompressed size. - Additionally, lzip produces multimember output automatically when the - size is too large for a single member, allowing for an unlimited - uncompressed size. - -'Distributed index' - The lzip format provides a distributed index that, among other things, - helps plzip to decompress several times faster than pigz and helps - lziprecover do its job. Neither the gzip format nor the bzip2 format - do provide an index. - - A distributed index is safer and more scalable than a monolithic - index. The monolithic index introduces a single point of failure in - the compressed file and may limit the number of members or the total - uncompressed size. - - -4.2 Quality of implementation -============================= - -Our civilization depends critically on software; it had better be quality -software. --- Bjarne Stroustrup - -'Accurate and robust error detection' - The lzip format provides 3-factor integrity checking, and the - decompressors report mismatches in each factor separately. This method - detects most false positives for corruption. 
If just one byte in one - factor fails but the other two factors match the data, it probably - means that the data are intact and the corruption just affects the - mismatching factor (CRC, data size, or member size) in the member - trailer. - -'Multiple implementations' - Just like the lzip format provides 3-factor protection against - undetected data corruption, the development methodology of the lzip - family of compressors provides 3-factor protection against undetected - programming errors. - - Three related but independent compressor implementations, lzip, clzip, - and minilzip/lzlib, are developed concurrently. Every stable release - of any of them is tested to check that it produces identical output to - the other two. This guarantees that all three implement the same - algorithm, and makes it unlikely that any of them may contain serious - undiscovered errors. In fact, no errors have been discovered in lzip - since 2009. - - Additionally, the three implementations have been extensively tested - with unzcrash, valgrind, and 'american fuzzy lop' without finding a - single vulnerability or false negative. *Note Unzcrash: - (lziprecover)Unzcrash. - -'Dictionary size' - Lzip automatically adapts the dictionary size to the size of each file. - In addition to reducing the amount of memory required for - decompression, this feature also minimizes the probability of being - affected by RAM errors during compression. - -'Exit status' - Returning a warning status of 2 is a design flaw of compress that - leaked into the design of gzip. Both bzip2 and lzip are free from this - flaw. - - - -File: clzip.info, Node: Algorithm, Next: File format, Prev: Quality assurance, Up: Top - -5 Algorithm -*********** - -In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a -concrete algorithm; it is more like "any algorithm using the LZMA coding -scheme". 
LZMA compression consists in describing the uncompressed data as a -succession of coding sequences from the set shown in Section 'What is -coded' (*note what-is-coded::), and then encoding them using a range -encoder. For example, the option '-0' of clzip uses the scheme in almost -the simplest way possible; issuing the longest match it can find, or a -literal byte if it can't find a match. Inversely, a much more elaborated way -of finding coding sequences of minimum size than the one currently used by -clzip could be developed, and the resulting sequence could also be coded -using the LZMA coding scheme. +File: clzip.info, Node: Argument syntax, Next: File format, Prev: Invoking clzip, Up: Top - Clzip currently implements two variants of the LZMA algorithm: fast -(used by option '-0') and normal (used by all other compression levels). +4 Syntax of command-line arguments +********************************** - The high compression of LZMA comes from combining two basic, well-proven -compression ideas: sliding dictionaries (LZ77) and markov models (the thing -used by every compression algorithm that uses a range encoder or similar -order-0 entropy coder as its last stage) with segregation of contexts -according to what the bits are used for. +POSIX recommends these conventions for command-line arguments. - Clzip is a two stage compressor. The first stage is a Lempel-Ziv coder, -which reduces redundancy by translating chunks of data to their -corresponding distance-length pairs. The second stage is a range encoder -that uses a different probability model for each type of data: distances, -lengths, literal bytes, etc. + * A command-line argument is an option if it begins with a hyphen ('-'). - Here is how it works, step by step: + * Option names are single alphanumeric characters. - 1) The member header is written to the output stream. - - 2) The first byte is coded literally, because there are no previous -bytes to which the match finder can refer to. 
+ * Certain options require an argument. - 3) The main encoder advances to the next byte in the input data and -calls the match finder. - - 4) The match finder fills an array with the minimum distances before the -current byte where a match of a given length can be found. - - 5) Go back to step 3 until a sequence (formed of pairs, repeated -distances, and literal bytes) of minimum price has been formed. Where the -price represents the number of output bits produced. - - 6) The range encoder encodes the sequence produced by the main encoder -and sends the bytes produced to the output stream. + * An option and its argument may or may not appear as separate tokens. + (In other words, the whitespace separating them is optional). Thus, + '-o foo' and '-ofoo' are equivalent. - 7) Go back to step 3 until the input data are finished or until the -member or volume size limits are reached. + * One or more options without arguments, followed by at most one option + that takes an argument, may follow a hyphen in a single token. Thus, + '-abc' is equivalent to '-a -b -c'. - 8) The range encoder is flushed. + * Options typically precede other non-option arguments. - 9) The member trailer is written to the output stream. + * The argument '--' terminates all options; any following arguments are + treated as non-option arguments, even if they begin with a hyphen. - 10) If there are more data to compress, go back to step 1. + * A token consisting of a single hyphen character is interpreted as an + ordinary non-option argument. By convention, it is used to specify + standard input, standard output, or a file named '-'. +GNU adds "long options" to these conventions: - During compression, clzip reads data in large blocks (one dictionary -size at a time). Therefore it may block for up to tens of seconds any -process feeding data to it through a pipe. This is normal. 
The blocking -intervals get longer with higher compression levels because dictionary size -increases (and compression speed decreases) with compression level. + * A long option consists of two hyphens ('--') followed by a name made + of alphanumeric characters and hyphens. Option names are typically one + to three words long, with hyphens to separate words. Abbreviations can + be used for the long option names as long as the abbreviations are + unique. -The ideas embodied in clzip are due to (at least) the following people: -Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrei Markov (for the -definition of Markov chains), G.N.N. Martin (for the definition of range -encoding), Igor Pavlov (for putting all the above together in LZMA), and -Julian Seward (for bzip2's CLI). + * A long option and its argument may or may not appear as separate + tokens. In the latter case they must be separated by an equal sign '='. + Thus, '--foo bar' and '--foo=bar' are equivalent. -File: clzip.info, Node: File format, Next: Stream format, Prev: Algorithm, Up: Top +File: clzip.info, Node: File format, Next: Stream format, Prev: Argument syntax, Up: Top -6 File format +5 File format ************* Perfection is reached, not when there is no longer anything to add, but when there is no longer anything to take away. -- Antoine de Saint-Exupery - In the diagram below, a box like this: +---+ @@ -735,12 +509,12 @@ when there is no longer anything to take away. represents a variable number of bytes. - - A lzip file consists of one or more independent "members" (compressed -data sets). The members simply appear one after another in the file, with no +A lzip file consists of one or more independent "members" (compressed data +sets). The members simply appear one after another in the file, with no additional information before, between, or after them. Each member can encode in compressed form up to 16 EiB - 1 byte of uncompressed data. The -size of a multimember file is unlimited. 
+size of a multimember file is unlimited. Empty members (data size = 0) are +not allowed in multimember files. Each member has the following structure: @@ -768,7 +542,7 @@ size of a multimember file is unlimited. Valid values for dictionary size range from 4 KiB to 512 MiB. 'LZMA stream' - The LZMA stream, finished by an "End Of Stream" marker. Uses default + The LZMA stream, terminated by an 'End Of Stream' marker. Uses default values for encoder properties. *Note Stream format::, for a complete description. @@ -785,14 +559,13 @@ size of a multimember file is unlimited. files. Lzip limits the member size to 2 PiB to prevent the data size field from overflowing. - -File: clzip.info, Node: Stream format, Next: Trailing data, Prev: File format, Up: Top +File: clzip.info, Node: Stream format, Next: Quality assurance, Prev: File format, Up: Top -7 Format of the LZMA stream in lzip files +6 Format of the LZMA stream in lzip files ***************************************** -The LZMA algorithm has three parameters, called "special LZMA properties", +The LZMA algorithm has three parameters, called 'special LZMA properties', to adjust it for some kinds of binary data. These parameters are: 'literal_context_bits' (with a default value of 3), 'literal_pos_state_bits' (with a default value of 0), and 'pos_state_bits' @@ -801,14 +574,17 @@ uses the default values for these parameters. In particular 'literal_pos_state_bits' has been optimized away and does not even appear in the code. - Lzip finishes the LZMA stream with an "End Of Stream" (EOS) marker (the + The first byte of the LZMA stream is set to zero to help tools like grep +recognize lzip files as binary files. + + The LZMA stream is terminated by an 'End Of Stream' (EOS) marker (the distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the 'member size' field in the member trailer allows the checking of stream integrity. The EOS marker is the only LZMA marker allowed in lzip files. 
The LZMA stream in lzip files always has these two features (default properties and EOS marker) and is referred to in this document as LZMA-302eos. This simplified and marker-terminated form of the LZMA stream format has been -chosen to maximize interoperability and safety. +chosen to achieve complete interoperability and robust safety. The second stage of LZMA is a range encoder that uses a different probability model for each type of symbol: distances, lengths, literal @@ -825,13 +601,12 @@ the range decoder that need to be described accurately, the source code of a real decompressor seems the only appropriate reference to use. What follows is a description of the decoding algorithm for LZMA-302eos -streams using as reference the source code of "lzd", an educational +streams using as reference the source code of lzd, an educational decompressor for lzip files, included in appendix A. *Note Reference source code::. Lzd is written in C++11 and can be downloaded from the lzip download directory. - -7.1 What is coded +6.1 What is coded ================= The LZMA stream includes literals, matches, and repeated matches (matches @@ -905,7 +680,7 @@ slot + direct_bits distances from 4 to 127 slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1 -7.2 The coding contexts +6.2 The coding contexts ======================= These contexts ('Bit_model' in the source), are integers or arrays of @@ -995,8 +770,7 @@ corresponding bit in 'match_byte'. After the first difference is found, the rest of the byte is decoded using the normal bit tree context. (See 'decode_matched' in the source). - -7.3 The range decoder +6.3 The range decoder ===================== The LZMA stream is consumed one byte at a time by the range decoder. (See @@ -1009,29 +783,284 @@ of decoded bits, depending on how well these bits agree with their context. decoded) and 'code' (representing the current point within 'range'). 'range' is initialized to 2^32 - 1, and 'code' is initialized to 0. 
- The range encoder produces a first 0 byte that must be ignored by the -range decoder. (See the 'Range_decoder' constructor in the source). - - -7.4 Decoding and checking the LZMA stream +6.4 Decoding and checking the LZMA stream ========================================= After decoding the member header and obtaining the dictionary size, the range decoder is initialized and then the LZMA decoder enters a loop (see 'decode_member' in the source) where it invokes the range decoder with the appropriate contexts to decode the different coding sequences (matches, -repeated matches, and literal bytes), until the "End Of Stream" marker is +repeated matches, and literal bytes), until the 'End Of Stream' marker is decoded. - Once the "End Of Stream" marker has been decoded, the decompressor reads + Once the 'End Of Stream' marker has been decoded, the decompressor reads and decodes the member trailer, and checks that the three integrity factors stored there (CRC, data size, and member size) match those computed from the data. -File: clzip.info, Node: Trailing data, Next: Examples, Prev: Stream format, Up: Top +File: clzip.info, Node: Quality assurance, Next: Algorithm, Prev: Stream format, Up: Top -8 Extra data appended to the file +7 Design, development, and testing of lzip +****************************************** + +There are two ways of constructing a software design: One way is to make it +so simple that there are obviously no deficiencies and the other way is to +make it so complicated that there are no obvious deficiencies. The first +method is far more difficult. +-- C.A.R. Hoare + + Lzip has been designed, written, and tested with great care to replace +gzip and bzip2 as general-purpose compressed format for Unix-like systems. +This chapter describes the lessons learned from these previous formats, and +their application to the design of lzip. The lzip format specification has +been reviewed carefully and is believed to be free from design errors. 
+ +7.1 Format design +================= + +When gzip was designed in 1992, computers and operating systems were much +less capable than they are today. The designers of gzip tried to work around +some of those limitations, like 8.3 file names, with additional fields in +the file format. + + Today those limitations have mostly disappeared, and the format of gzip +has proved to be unnecessarily complicated. It includes fields that were +never used, others that have lost their usefulness, and finally others that +have become too limited. + + Bzip2 was designed 5 years later, and its format is simpler than the one +of gzip. + + Probably the worst defect of the gzip format from the point of view of +data safety is the variable size of its header. If the byte at offset 3 +(flags) of a gzip member gets corrupted, it may become difficult to recover +the data, even if the compressed blocks are intact, because it can't be +known with certainty where the compressed blocks begin. + + By contrast, the header of a lzip member has a fixed length of 6. The +LZMA stream in a lzip member always starts at offset 6, making it trivial to +recover the data even if the whole header becomes corrupt. + + Bzip2 also provides a header of fixed length and marks the begin and end +of each compressed block with six magic bytes, making it possible to find +the compressed blocks even in case of file damage. But bzip2 does not store +the size of each compressed block, as lzip does. + + Lziprecover is able to provide unique data recovery capabilities because +the lzip format is extraordinarily safe. The simple and safe design of the +file format complements the embedded error detection provided by the LZMA +data stream. Any distance larger than the dictionary size acts as a +forbidden symbol, allowing the decompressor to detect the approximate +position of errors, and leaving very little work for the check sequence +(CRC and data sizes) in the detection of errors. 
Lzip is usually able to +detect all possible bit flips in the compressed data without resorting to +the check sequence. It would be difficult to write an automatic recovery +tool like lziprecover for the gzip format. And, as far as I know, it has +never been written. + + Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the +decompressed data because it provides optimal accuracy in the detection of +errors up to a compressed size of about 16 GiB, a size larger than that of +most files. In the case of lzip, the additional detection capability of the +decompressor reduces the probability of undetected errors several million +times more, resulting in a combined integrity checking optimally accurate +for any member size produced by lzip. Preliminary results suggest that the +lzip format is safe enough to be used in critical safety avionics systems. + + The lzip format is designed for long-term archiving. Therefore it +excludes any unneeded features that may interfere with the future +extraction of the decompressed data. + +7.1.1 Gzip format (mis)features not present in lzip +--------------------------------------------------- + +'Multiple algorithms' + Gzip provides a CM (Compression Method) field that has never been used + because it is a bad idea to begin with. New compression methods may + require additional fields, making it impossible to implement new + methods and, at the same time, keep the same format. This field does + not solve the problem of format proliferation; it just makes the + problem less obvious. + +'Optional fields in header' + Unless special precautions are taken, optional fields are generally a + bad idea because they produce a header of variable size. The gzip + header has 2 fields that, in addition to being optional, are + zero-terminated. This means that if any byte inside the field gets + zeroed, or if the terminating zero gets altered, gzip won't be able to + find neither the header CRC nor the compressed blocks. 
+ +'Optional CRC for the header' + Using an optional CRC for the header is not only a bad idea, it is an + error; it circumvents the Hamming distance (HD) of the CRC and may + prevent the extraction of perfectly good data. For example, if the CRC + is used and the bit enabling it is reset by a bit flip, then the + header seems to be intact (in spite of being corrupt) while the + compressed blocks seem to be totally unrecoverable (in spite of being + intact). Very misleading indeed. + +'Metadata' + The gzip format stores some metadata, like the modification time of the + original file or the operating system on which compression took place. + This complicates reproducible compression (obtaining identical + compressed output from identical input). + + +7.1.2 Lzip format improvements over gzip and bzip2 +-------------------------------------------------- + +'64-bit size field' + Probably the most frequently reported shortcoming of the gzip format + is that it only stores the least significant 32 bits of the + uncompressed size. The size of any file larger or equal than 4 GiB + gets truncated. + + Bzip2 does not store the uncompressed size of the file. + + The lzip format provides a 64-bit field for the uncompressed size. + Additionally, lzip produces multimember output automatically when the + size is too large for a single member, allowing for an unlimited + uncompressed size. + +'Distributed index' + The lzip format provides a distributed index that, among other things, + helps plzip to decompress several times faster than pigz and helps + lziprecover do its job. Neither the gzip format nor the bzip2 format + do provide an index. + + A distributed index is safer and more scalable than a monolithic + index. The monolithic index introduces a single point of failure in + the compressed file and may limit the number of members or the total + uncompressed size. 
+ + +7.2 Quality of implementation +============================= + +Our civilization depends critically on software; it had better be quality +software. +-- Bjarne Stroustrup + +'Accurate and robust error detection' + The lzip format provides 3-factor integrity checking, and the + decompressors report mismatches in each factor separately. This method + detects most false positives for corruption. If just one byte in one + factor fails but the other two factors match the data, it probably + means that the data are intact and the corruption just affects the + mismatching factor (CRC, data size, or member size) in the member + trailer. + +'Multiple implementations' + Just like the lzip format provides 3-factor protection against + undetected data corruption, the development methodology of the lzip + family of compressors provides 3-factor protection against undetected + programming errors. + + Three related but independent compressor implementations, lzip, clzip, + and minilzip/lzlib, are developed concurrently. Every stable release + of any of them is tested to check that it produces identical output to + the other two. This guarantees that all three implement the same + algorithm, and makes it unlikely that any of them may contain serious + undiscovered errors. In fact, no errors have been discovered in lzip + since 2009. + + Additionally, the three implementations have been extensively tested + with unzcrash, valgrind, and 'american fuzzy lop' without finding a + single vulnerability or false negative. *Note Unzcrash: + (lziprecover)Unzcrash. + +'Dictionary size' + Lzip automatically adapts the dictionary size to the size of each file. + In addition to reducing the amount of memory required for + decompression, this feature also minimizes the probability of being + affected by RAM errors during compression. + +'Exit status' + Returning a warning status of 2 is a design flaw of compress that + leaked into the design of gzip. 
Both bzip2 and lzip are free from this + flaw. + + + +File: clzip.info, Node: Algorithm, Next: Trailing data, Prev: Quality assurance, Up: Top + +8 Algorithm +*********** + +In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a +concrete algorithm; it is more like "any algorithm using the LZMA coding +scheme". LZMA compression consists in describing the uncompressed data as a +succession of coding sequences from the set shown in Section 'What is +coded' (*note what-is-coded::), and then encoding them using a range +encoder. For example, the option '-0' of lzip uses the scheme in almost the +simplest way possible; issuing the longest match it can find, or a literal +byte if it can't find a match. Inversely, a more elaborate way of finding +coding sequences of minimum size than the one currently used by lzip could +be developed, and the resulting sequence could also be coded using the LZMA +coding scheme. + + Clzip currently implements two variants of the LZMA algorithm: fast +(used by option '-0') and normal (used by all other compression levels). + + The high compression of LZMA comes from combining two basic, well-proven +compression ideas: sliding dictionaries (LZ77) and Markov models (the thing +used by every compression algorithm that uses a range encoder or similar +order-0 entropy coder as its last stage) with segregation of contexts +according to what the bits are used for. + + Clzip is a two stage compressor. The first stage is a Lempel-Ziv coder, +which reduces redundancy by translating chunks of data to their +corresponding distance-length pairs. The second stage is a range encoder +that uses a different probability model for each type of data: distances, +lengths, literal bytes, etc. + + Here is how it works, step by step: + + 1) The member header is written to the output stream. + + 2) The first byte is coded literally, because there are no previous +bytes to which the match finder can refer to. 
+ + 3) The main encoder advances to the next byte in the input data and +calls the match finder. + + 4) The match finder fills an array with the minimum distances before the +current byte where a match of a given length can be found. + + 5) Go back to step 3 until a sequence (formed of pairs, repeated +distances, and literal bytes) of minimum price has been formed. Where the +price represents the number of output bits produced. + + 6) The range encoder encodes the sequence produced by the main encoder +and sends the bytes produced to the output stream. + + 7) Go back to step 3 until the input data are finished or until the +member or volume size limits are reached. + + 8) The range encoder is flushed. + + 9) The member trailer is written to the output stream. + + 10) If there are more data to compress, go back to step 1. + + + During compression, clzip reads data in large blocks (one dictionary +size at a time). Therefore it may block for up to tens of seconds any +process feeding data to it through a pipe. This is normal. The blocking +intervals get longer with higher compression levels because dictionary size +increases (and compression speed decreases) with compression level. + +The ideas embodied in clzip are due to (at least) the following people: +Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrei Markov (for the +definition of Markov chains), G.N.N. Martin (for the definition of range +encoding), Igor Pavlov (for putting all the above together in LZMA), and +Julian Seward (for bzip2's CLI). + + +File: clzip.info, Node: Trailing data, Next: Examples, Prev: Algorithm, Up: Top + +9 Extra data appended to the file ********************************* Sometimes extra data are found appended to a lzip file after the last @@ -1041,7 +1070,7 @@ member. Such trailing data may be: example when writing to a tape. It is safe to append any amount of padding zero bytes to a lzip file. 
- * Useful data added by the user; an "End Of File" string (to check that + * Useful data added by the user; an 'End Of File' string (to check that the file has not been truncated), a cryptographically secure hash, a description of file contents, etc. It is safe to append any amount of text to a lzip file as long as none of the first four bytes of the @@ -1077,8 +1106,8 @@ where a file containing trailing data must be rejected, the option File: clzip.info, Node: Examples, Next: Problems, Prev: Trailing data, Up: Top -9 A small tutorial with examples -******************************** +10 A small tutorial with examples +********************************* WARNING! Even if clzip is bug-free, other causes may result in a corrupt compressed file (bugs in the system libraries, memory errors, etc). @@ -1090,38 +1119,32 @@ comparing the compressed file with the original because the corruption happens before clzip compresses the RAM contents, resulting in a valid compressed file containing wrong data. - Example 1: Extract all the files from archive 'foo.tar.lz'. tar -xf foo.tar.lz or clzip -cd foo.tar.lz | tar -xf - - Example 2: Replace a regular file with its compressed version 'file.lz' and show the compression ratio. clzip -v file - Example 3: Like example 2 but the created 'file.lz' is multimember with a member size of 1 MiB. The compression ratio is not shown. clzip -b 1MiB file - Example 4: Restore a regular file from its compressed version 'file.lz'. If the operation is successful, 'file.lz' is removed. clzip -d file.lz - Example 5: Check the integrity of the compressed file 'file.lz' and show status. clzip -tv file.lz - Example 6: The right way of concatenating the decompressed output of two or more compressed files. *Note Trailing data::. @@ -1130,19 +1153,16 @@ more compressed files. *Note Trailing data::. Do this instead clzip -cd file1.lz file2.lz file3.lz - Example 7: Decompress 'file.lz' partially until 10 KiB of decompressed data are produced. 
clzip -cd file.lz | dd bs=1024 count=10 - Example 8: Decompress 'file.lz' partially from decompressed byte at offset 10000 to decompressed byte at offset 14999 (5000 bytes are produced). clzip -cd file.lz | dd bs=1000 skip=10 count=5 - Example 9: Compress a whole device in /dev/sdc and send the output to 'file.lz'. @@ -1150,18 +1170,15 @@ Example 9: Compress a whole device in /dev/sdc and send the output to or clzip /dev/sdc -o file.lz - Example 10: Create a multivolume compressed tar archive with a volume size of 1440 KiB. tar -c some_directory | clzip -S 1440KiB -o volume_name - - Example 11: Extract a multivolume compressed tar archive. clzip -cd volume_name*.lz | tar -xf - - Example 12: Create a multivolume compressed backup of a large database file with a volume size of 650 MB, where each volume is a multimember file with a member size of 32 MiB. @@ -1171,7 +1188,7 @@ a member size of 32 MiB. File: clzip.info, Node: Problems, Next: Reference source code, Prev: Examples, Up: Top -10 Reporting bugs +11 Reporting bugs ***************** There are probably bugs in clzip. 
There are certainly errors and omissions @@ -1340,7 +1357,8 @@ public: Range_decoder() : member_pos( header_size ), code( 0 ), range( 0xFFFFFFFFU ) { - get_byte(); // discard first byte of the LZMA stream + if( get_byte() != 0 ) // check first LZMA byte + { std::fputs( "Nonzero first LZMA byte.\n", stderr ); std::exit( 2 ); } for( int i = 0; i < 4; ++i ) code = ( code << 8 ) | get_byte(); } @@ -1583,8 +1601,7 @@ bool LZ_decoder::decode_member() // Return false if error direct_bits ); else { - rep0 += - rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode( direct_bits-dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); if( rep0 == 0xFFFFFFFFU ) // marker found { @@ -1616,8 +1633,8 @@ int main( const int argc, const char * const argv[] ) "Lzd decompresses from standard input to standard output.\n" "\nCopyright (C) 2024 Antonio Diaz Diaz.\n" "License 2-clause BSD.\n" - "This is free software: you are free to change and redistribute it.\n" - "There is NO WARRANTY, to the extent permitted by law.\n" + "This is free software: you are free to change and redistribute " + "it.\nThere is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n", PROGVERSION, argv[0] ); @@ -1629,6 +1646,7 @@ int main( const int argc, const char * const argv[] ) setmode( STDOUT_FILENO, O_BINARY ); #endif + bool empty = false, multi = false; for( bool first_member = true; ; first_member = false ) { Lzip_header header; // check header @@ -1643,12 +1661,12 @@ int main( const int argc, const char * const argv[] ) unsigned dict_size = 1 << ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) - { std::fputs( "Invalid dictionary size in member header.\n", stderr ); - return 2; } + { std::fputs( "Invalid dictionary size in member 
header.\n", + stderr ); return 2; } LZ_decoder decoder( dict_size ); // decode LZMA stream if( !decoder.decode_member() ) - { std::fputs( "Data error\n", stderr ); return 2; } + { std::fputs( "Data error.\n", stderr ); return 2; } Lzip_trailer trailer; // check trailer for( int i = 0; i < trailer_size; ++i ) trailer[i] = decoder.get_byte(); @@ -1656,25 +1674,28 @@ int main( const int argc, const char * const argv[] ) unsigned crc = 0; for( int i = 3; i >= 0; --i ) crc = ( crc << 8 ) + trailer[i]; if( crc != decoder.crc() ) - { std::fputs( "CRC mismatch\n", stderr ); retval = 2; } + { std::fputs( "CRC mismatch.\n", stderr ); retval = 2; } unsigned long long data_size = 0; for( int i = 11; i >= 4; --i ) data_size = ( data_size << 8 ) + trailer[i]; if( data_size != decoder.data_position() ) - { std::fputs( "Data size mismatch\n", stderr ); retval = 2; } + { std::fputs( "Data size mismatch.\n", stderr ); retval = 2; } + multi = !first_member; if( data_size == 0 ) empty = true; unsigned long long member_size = 0; for( int i = 19; i >= 12; --i ) member_size = ( member_size << 8 ) + trailer[i]; if( member_size != decoder.member_position() ) - { std::fputs( "Member size mismatch\n", stderr ); retval = 2; } + { std::fputs( "Member size mismatch.\n", stderr ); retval = 2; } if( retval ) return retval; } if( std::fclose( stdout ) != 0 ) { std::fprintf( stderr, "Error closing stdout: %s\n", std::strerror( errno ) ); return 1; } + if( empty && multi ) + { std::fputs( "Empty member not allowed.\n", stderr ); return 2; } return 0; } @@ -1688,6 +1709,7 @@ Concept index * Menu: * algorithm: Algorithm. (line 6) +* argument syntax: Argument syntax. (line 6) * bugs: Problems. (line 6) * examples: Examples. (line 6) * file format: File format. 
(line 6) @@ -1707,22 +1729,23 @@ Concept index Tag Table: Node: Top205 -Node: Introduction1207 -Node: Output7331 -Node: Invoking clzip8934 -Ref: --trailing-error9812 -Node: Quality assurance19918 -Node: Algorithm28733 -Node: File format32141 -Ref: coded-dict-size33571 -Node: Stream format34802 -Ref: what-is-coded37198 -Node: Trailing data46072 -Node: Examples48410 -Ref: concat-example49860 -Node: Problems51090 -Node: Reference source code51626 -Node: Concept index66672 +Node: Introduction1282 +Node: Output7168 +Node: Invoking clzip8771 +Ref: --trailing-error9617 +Node: Argument syntax19833 +Node: File format21597 +Ref: coded-dict-size23096 +Node: Stream format24328 +Ref: what-is-coded26853 +Node: Quality assurance35583 +Node: Algorithm44382 +Node: Trailing data47784 +Node: Examples50118 +Ref: concat-example51564 +Node: Problems52788 +Node: Reference source code53324 +Node: Concept index68636 End Tag Table diff --git a/doc/clzip.texi b/doc/clzip.texi index c98e026..ce4d9ac 100644 --- a/doc/clzip.texi +++ b/doc/clzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 22 January 2024 -@set VERSION 1.14 +@set UPDATED 23 November 2024 +@set VERSION 1.15-rc1 @dircategory Compression @direntry @@ -39,10 +39,11 @@ This manual is for Clzip (version @value{VERSION}, @value{UPDATED}). * Introduction:: Purpose and features of clzip * Output:: Meaning of clzip's output * Invoking clzip:: Command-line interface -* Quality assurance:: Design, development, and testing of lzip -* Algorithm:: How clzip compresses the data +* Argument syntax:: By convention, options start with a hyphen * File format:: Detailed format of the compressed file * Stream format:: Format of the LZMA stream in lzip files +* Quality assurance:: Design, development, and testing of lzip +* Algorithm:: How clzip compresses the data * Trailing data:: Extra data appended to the file * Examples:: A small tutorial with examples * Problems:: Reporting bugs @@ -63,22 +64,20 @@ distribute, and modify it. 
@cindex introduction @uref{http://www.nongnu.org/lzip/clzip.html,,Clzip} -is a C language version of lzip, compatible with @w{lzip 1.4} or newer. -As clzip is written in C, it may be easier to integrate in applications like -package managers, embedded devices, or systems lacking a C++ compiler. +is a C language version of lzip intended for systems lacking a C++ compiler. @uref{http://www.nongnu.org/lzip/lzip.html,,Lzip} is a lossless data compressor with a user interface similar to the one -of gzip or bzip2. Lzip uses a simplified form of the 'Lempel-Ziv-Markov -chain-Algorithm' (LZMA) stream format to maximize interoperability. The -maximum dictionary size is 512 MiB so that any lzip file can be decompressed -on 32-bit machines. Lzip provides accurate and robust 3-factor integrity -checking. Lzip can compress about as fast as gzip @w{(lzip -0)} or compress most -files more than bzip2 @w{(lzip -9)}. Decompression speed is intermediate between -gzip and bzip2. Lzip is better than gzip and bzip2 from a data recovery -perspective. Lzip has been designed, written, and tested with great care to -replace gzip and bzip2 as the standard general-purpose compressed format for -Unix-like systems. +of gzip or bzip2. Lzip uses a simplified form of LZMA (Lempel-Ziv-Markov +chain-Algorithm) designed to achieve complete interoperability between +implementations. The maximum dictionary size is 512 MiB so that any lzip +file can be decompressed on 32-bit machines. Lzip provides accurate and +robust 3-factor integrity checking. @w{@samp{lzip -0}} compresses about as fast as +gzip, while @w{@samp{lzip -9}} compresses most files more than bzip2. Decompression +speed is intermediate between gzip and bzip2. Lzip provides better data +recovery capabilities than gzip and bzip2. Lzip has been designed, written, +and tested with great care to replace gzip and bzip2 as general-purpose +compressed format for Unix-like systems. 
For compressing/decompressing large files on multiprocessor machines @uref{http://www.nongnu.org/lzip/manual/plzip_manual.html,,plzip} can be @@ -130,7 +129,7 @@ the beginning is a thing of the past. The member trailer stores the 32-bit CRC of the original data, the size of the original data, and the size of the member. These values, together with -the "End Of Stream" marker, provide a 3-factor integrity checking which +the 'End Of Stream' marker, provide a 3-factor integrity checking which guarantees that the decompressed version of the data is identical to the original. This guards against corruption of the compressed data, and against undetected bugs in clzip (hopefully very unlikely). The chances of data @@ -143,9 +142,8 @@ makes it safer than compressors returning ambiguous warning values (like gzip) when it is used as a back end for other programs like tar or zutils. Clzip automatically uses for each file the largest dictionary size that does -not exceed neither the file size nor the limit given. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. +not exceed neither the file size nor the limit given. The dictionary size +used for decompression is the same dictionary size used for compression. The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary size @@ -186,7 +184,6 @@ also supported. Clzip can produce multimember files, and lziprecover can safely recover the undamaged members in case of file damage. Clzip can also split the compressed output in volumes of a given size, even when reading from standard input. -This allows the direct creation of multivolume compressed tar archives. Clzip is able to compress and decompress streams of unlimited size by automatically creating multimember output. The members so created are large, @@ -263,11 +260,8 @@ argument means standard input. 
It can be mixed with other @var{files} and is read just once, the first time it appears in the command line. Remember to prepend @file{./} to any file name beginning with a hyphen, or use @samp{--}. -clzip supports the following -@uref{http://www.nongnu.org/arg-parser/manual/arg_parser_manual.html#Argument-syntax,,options}: -@ifnothtml -@xref{Argument syntax,,,arg_parser}. -@end ifnothtml +@noindent +clzip supports the following options: @xref{Argument syntax}. @table @code @item -h @@ -288,9 +282,10 @@ garbage that can be safely ignored. @xref{concat-example}. @item -b @var{bytes} @itemx --member-size=@var{bytes} -When compressing, set the member size limit to @var{bytes}. It is advisable -to keep members smaller than RAM size so that they can be repaired with -lziprecover in case of corruption. A small member size may degrade +When compressing, set the member size limit to @var{bytes}. If @var{bytes} +is smaller than the compressed size, a multimember file is produced. It is +advisable to keep members smaller than RAM size so that they can be repaired +with lziprecover in case of corruption. A small member size may degrade compression ratio, so use it only when needed. Valid values range from @w{100 kB} to @w{2 PiB}. Defaults to @w{2 PiB}. @@ -312,7 +307,8 @@ already exists and @option{--force} has not been specified, clzip continues decompressing the rest of the files and exits with error status 1. If a file fails to decompress, or is a terminal, clzip exits immediately with error status 2 without decompressing the rest of the files. A terminal is -considered an uncompressed file, and therefore invalid. +considered an uncompressed file, and therefore invalid. A multimember file +with one or more empty members is accepted if redirected to standard input. @item -f @itemx --force @@ -321,7 +317,7 @@ Force overwrite of output files. 
@item -F @itemx --recompress When compressing, force re-compression of files whose name already has -the @samp{.lz} or @samp{.tlz} suffix. +the @file{.lz} or @file{.tlz} suffix. @item -k @itemx --keep @@ -335,7 +331,8 @@ even for multimember files. If more than one file is given, a final line containing the cumulative sizes is printed. With @option{-v}, the dictionary size, the number of members in the file, and the amount of trailing data (if any) are also printed. With @option{-vv}, the positions and sizes of each -member in multimember files are also printed. +member in multimember files are also printed. A multimember file with one or +more empty members is accepted if redirected to standard input. If any file is damaged, does not exist, can't be opened, or is not regular, the final exit status is @w{> 0}. @option{-lq} can be used to check quickly @@ -362,15 +359,15 @@ to @option{-c}. @option{-o} has no effect when testing or listing. In order to keep backward compatibility with clzip versions prior to 1.12, when compressing from standard input and no other file names are given, the -extension @samp{.lz} is appended to @var{file} unless it already ends in -@samp{.lz} or @samp{.tlz}. This feature will be removed in a future version +extension @file{.lz} is appended to @var{file} unless it already ends in +@file{.lz} or @file{.tlz}. This feature will be removed in a future version of clzip. Meanwhile, redirection may be used instead of @option{-o} to write -the compressed output to a file without the extension @samp{.lz} in its +the compressed output to a file without the extension @file{.lz} in its name: @w{@samp{clzip < file > foo}}. When compressing and splitting the output in volumes, @var{file} is used as -a prefix, and several files named @samp{@var{file}00001.lz}, -@samp{@var{file}00002.lz}, etc, are created. In this case, only one input +a prefix, and several files named @file{@var{file}00001.lz}, +@file{@var{file}00002.lz}, etc, are created. 
In this case, only one input file is allowed. @item -q @@ -396,7 +393,7 @@ is affected at compression time by the choice of dictionary size limit. @itemx --volume-size=@var{bytes} When compressing, and @option{-c} has not been also specified, split the compressed output into several volume files with names -@samp{original_name00001.lz}, @samp{original_name00002.lz}, etc, and set the +@file{original_name00001.lz}, @file{original_name00002.lz}, etc, and set the volume size limit to @var{bytes}. Input files are kept unchanged. Each volume is a complete, maybe multimember, lzip file. A small volume size may degrade compression ratio, so use it only when needed. Valid values range @@ -410,18 +407,17 @@ together with @option{-v} to see information about the files. If a file fails the test, does not exist, can't be opened, or is a terminal, clzip continues testing the rest of the files. A final diagnostic is shown at verbosity level 1 or higher if any file fails the test when testing multiple -files. +files. A multimember file with one or more empty members is accepted if +redirected to standard input. @item -v @itemx --verbose Verbose mode.@* -When compressing, show the compression ratio and size for each file -processed.@* -When decompressing or testing, further -v's (up to 4) increase the -verbosity level, showing status, compression ratio, dictionary size, -trailer contents (CRC, data size, member size), and up to 6 bytes of -trailing data (if any) both in hexadecimal and as a string of printable -ASCII characters.@* +When compressing, show the compression ratio and size for each file processed.@* +When decompressing or testing, further -v's (up to 4) increase the verbosity +level, showing status, compression ratio, dictionary size, trailer contents +(CRC, data size, member size), and up to 6 bytes of trailing data (if any) +both in hexadecimal and as a string of printable ASCII characters.@* Two or more @option{-v} options show the progress of (de)compression. 
@item -0 .. -9 @@ -441,7 +437,7 @@ given, the last setting is used. For example @w{@option{-9 -s64MiB}} is equivalent to @w{@option{-s64MiB -m273}} @multitable {Level} {Dictionary size (-s)} {Match length limit (-m)} -@item Level @tab Dictionary size (-s) @tab Match length limit (-m) +@headitem Level @tab Dictionary size (-s) @tab Match length limit (-m) @item -0 @tab 64 KiB @tab 16 bytes @item -1 @tab 1 MiB @tab 5 bytes @item -2 @tab 1.5 MiB @tab 6 bytes @@ -458,20 +454,11 @@ equivalent to @w{@option{-s64MiB -m273}} @itemx --best Aliases for GNU gzip compatibility. -@item --empty-error -Exit with error status 2 if any empty member is found in the input files. - -@item --marking-error -Exit with error status 2 if the first LZMA byte is non-zero in any member of -the input files. This may be caused by data corruption or by deliberate -insertion of tracking information in the file. Use -@w{@samp{lziprecover --clear-marking}} to clear any such non-zero bytes. - @item --loose-trailing When decompressing, testing, or listing, allow trailing data whose first bytes are so similar to the magic bytes of a lzip header that they can be confused with a corrupt header. Use this option if a file triggers a -"corrupt header" error and the cause is not indeed a corrupt header. +'corrupt header' error and the cause is not indeed a corrupt header. @end table @@ -482,7 +469,7 @@ and may be followed by a multiplier and an optional @samp{B} for "byte". 
Table of SI and binary prefixes (unit multipliers): @multitable {Prefix} {kilobyte (10^3 = 1000)} {|} {Prefix} {kibibyte (2^10 = 1024)} -@item Prefix @tab Value @tab | @tab Prefix @tab Value +@headitem Prefix @tab Value @tab | @tab Prefix @tab Value @item k @tab kilobyte (10^3 = 1000) @tab | @tab Ki @tab kibibyte (2^10 = 1024) @item M @tab megabyte (10^6) @tab | @tab Mi @tab mebibyte (2^20) @item G @tab gigabyte (10^9) @tab | @tab Gi @tab gibibyte (2^30) @@ -502,273 +489,51 @@ indicate a corrupt or invalid input file, 3 for an internal consistency error (e.g., bug) which caused clzip to panic. -@node Quality assurance -@chapter Design, development, and testing of lzip -@cindex quality assurance - -There are two ways of constructing a software design: One way is to make it -so simple that there are obviously no deficiencies and the other way is to -make it so complicated that there are no obvious deficiencies. The first -method is far more difficult.@* ---- C.A.R. Hoare - -Lzip has been designed, written, and tested with great care to replace gzip -and bzip2 as the standard general-purpose compressed format for Unix-like -systems. This chapter describes the lessons learned from these previous -formats, and their application to the design of lzip. The lzip format -specification has been reviewed carefully and is believed to be free from -design errors. - -@sp 1 -@section Format design - -When gzip was designed in 1992, computers and operating systems were much -less capable than they are today. The designers of gzip tried to work around -some of those limitations, like 8.3 file names, with additional fields in -the file format. - -Today those limitations have mostly disappeared, and the format of gzip has -proved to be unnecessarily complicated. It includes fields that were never -used, others that have lost their usefulness, and finally others that have -become too limited. - -Bzip2 was designed 5 years later, and its format is simpler than the one of -gzip. 
- -Probably the worst defect of the gzip format from the point of view of data -safety is the variable size of its header. If the byte at offset 3 (flags) -of a gzip member gets corrupted, it may become difficult to recover the -data, even if the compressed blocks are intact, because it can't be known -with certainty where the compressed blocks begin. - -By contrast, the header of a lzip member has a fixed length of 6. The LZMA -stream in a lzip member always starts at offset 6, making it trivial to -recover the data even if the whole header becomes corrupt. - -Bzip2 also provides a header of fixed length and marks the begin and end of -each compressed block with six magic bytes, making it possible to find the -compressed blocks even in case of file damage. But bzip2 does not store the -size of each compressed block, as lzip does. - -Lziprecover is able to provide unique data recovery capabilities because the -lzip format is extraordinarily safe. The simple and safe design of the file -format complements the embedded error detection provided by the LZMA data -stream. Any distance larger than the dictionary size acts as a forbidden -symbol, allowing the decompressor to detect the approximate position of -errors, and leaving very little work for the check sequence (CRC and data -sizes) in the detection of errors. Lzip is usually able to detect all -possible bit flips in the compressed data without resorting to the check -sequence. It would be difficult to write an automatic recovery tool like -lziprecover for the gzip format. And, as far as I know, it has never been -written. - -Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the -decompressed data because it provides optimal accuracy in the detection of -errors up to a compressed size of about @w{16 GiB}, a size larger than that -of most files. 
In the case of lzip, the additional detection capability of -the decompressor reduces the probability of undetected errors several -million times more, resulting in a combined integrity checking optimally -accurate for any member size produced by lzip. Preliminary results suggest -that the lzip format is safe enough to be used in critical safety avionics -systems. - -The lzip format is designed for long-term archiving. Therefore it excludes -any unneeded features that may interfere with the future extraction of the -decompressed data. - -@sp 1 -@subsection Gzip format (mis)features not present in lzip - -@table @samp -@item Multiple algorithms - -Gzip provides a CM (Compression Method) field that has never been used -because it is a bad idea to begin with. New compression methods may require -additional fields, making it impossible to implement new methods and, at the -same time, keep the same format. This field does not solve the problem of -format proliferation; it just makes the problem less obvious. - -@item Optional fields in header +@node Argument syntax +@chapter Syntax of command-line arguments +@cindex argument syntax -Unless special precautions are taken, optional fields are generally a bad -idea because they produce a header of variable size. The gzip header has 2 -fields that, in addition to being optional, are zero-terminated. This means -that if any byte inside the field gets zeroed, or if the terminating zero -gets altered, gzip won't be able to find neither the header CRC nor the -compressed blocks. +POSIX recommends these conventions for command-line arguments. -@item Optional CRC for the header - -Using an optional CRC for the header is not only a bad idea, it is an error; -it circumvents the Hamming distance (HD) of the CRC and may prevent the -extraction of perfectly good data. 
For example, if the CRC is used and the -bit enabling it is reset by a bit flip, then the header seems to be intact -(in spite of being corrupt) while the compressed blocks seem to be totally -unrecoverable (in spite of being intact). Very misleading indeed. - -@item Metadata - -The gzip format stores some metadata, like the modification time of the -original file or the operating system on which compression took place. This -complicates reproducible compression (obtaining identical compressed output -from identical input). - -@end table - -@subsection Lzip format improvements over gzip and bzip2 - -@table @samp -@item 64-bit size field - -Probably the most frequently reported shortcoming of the gzip format is that -it only stores the least significant 32 bits of the uncompressed size. The -size of any file larger or equal than @w{4 GiB} gets truncated. - -Bzip2 does not store the uncompressed size of the file. - -The lzip format provides a 64-bit field for the uncompressed size. -Additionally, lzip produces multimember output automatically when the size -is too large for a single member, allowing for an unlimited uncompressed -size. - -@item Distributed index - -The lzip format provides a distributed index that, among other things, helps -plzip to decompress several times faster than pigz and helps lziprecover do -its job. Neither the gzip format nor the bzip2 format do provide an index. - -A distributed index is safer and more scalable than a monolithic index. The -monolithic index introduces a single point of failure in the compressed file -and may limit the number of members or the total uncompressed size. - -@end table - -@section Quality of implementation - -Our civilization depends critically on software; it had better be quality -software.@* ---- Bjarne Stroustrup - -@table @samp -@item Accurate and robust error detection - -The lzip format provides 3-factor integrity checking, and the decompressors -report mismatches in each factor separately. 
This method detects most false -positives for corruption. If just one byte in one factor fails but the other -two factors match the data, it probably means that the data are intact and -the corruption just affects the mismatching factor (CRC, data size, or -member size) in the member trailer. - -@item Multiple implementations - -Just like the lzip format provides 3-factor protection against undetected -data corruption, the development methodology of the lzip family of -compressors provides 3-factor protection against undetected programming -errors. - -Three related but independent compressor implementations, lzip, clzip, and -minilzip/lzlib, are developed concurrently. Every stable release of any of -them is tested to check that it produces identical output to the other two. -This guarantees that all three implement the same algorithm, and makes it -unlikely that any of them may contain serious undiscovered errors. In fact, -no errors have been discovered in lzip since 2009. - -Additionally, the three implementations have been extensively tested with -@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Unzcrash,,unzcrash}, -valgrind, and @samp{american fuzzy lop} without finding a single -vulnerability or false negative. -@ifnothtml -@xref{Unzcrash,,,lziprecover}. -@end ifnothtml - -@item Dictionary size - -Lzip automatically adapts the dictionary size to the size of each file. -In addition to reducing the amount of memory required for decompression, -this feature also minimizes the probability of being affected by RAM errors -during compression. @c key4_mask - -@item Exit status - -Returning a warning status of 2 is a design flaw of compress that leaked -into the design of gzip. Both bzip2 and lzip are free from this flaw. - -@end table - - -@node Algorithm -@chapter Algorithm -@cindex algorithm - -In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a -concrete algorithm; it is more like "any algorithm using the LZMA coding -scheme". 
LZMA compression consists in describing the uncompressed data as a -succession of coding sequences from the set shown in Section @samp{What is -coded} (@pxref{what-is-coded}), and then encoding them using a range -encoder. For example, the option @option{-0} of clzip uses the scheme in almost -the simplest way possible; issuing the longest match it can find, or a -literal byte if it can't find a match. Inversely, a much more elaborated way -of finding coding sequences of minimum size than the one currently used by -clzip could be developed, and the resulting sequence could also be coded -using the LZMA coding scheme. - -Clzip currently implements two variants of the LZMA algorithm: fast -(used by option @option{-0}) and normal (used by all other compression levels). - -The high compression of LZMA comes from combining two basic, well-proven -compression ideas: sliding dictionaries (LZ77) and markov models (the thing -used by every compression algorithm that uses a range encoder or similar -order-0 entropy coder as its last stage) with segregation of contexts -according to what the bits are used for. - -Clzip is a two stage compressor. The first stage is a Lempel-Ziv coder, -which reduces redundancy by translating chunks of data to their -corresponding distance-length pairs. The second stage is a range encoder -that uses a different probability model for each type of data: -distances, lengths, literal bytes, etc. - -Here is how it works, step by step: - -1) The member header is written to the output stream. - -2) The first byte is coded literally, because there are no previous -bytes to which the match finder can refer to. - -3) The main encoder advances to the next byte in the input data and -calls the match finder. - -4) The match finder fills an array with the minimum distances before the -current byte where a match of a given length can be found. +@itemize @bullet +@item A command-line argument is an option if it begins with a hyphen +(@samp{-}). 
-5) Go back to step 3 until a sequence (formed of pairs, repeated -distances, and literal bytes) of minimum price has been formed. Where the -price represents the number of output bits produced. +@item Option names are single alphanumeric characters. -6) The range encoder encodes the sequence produced by the main encoder -and sends the bytes produced to the output stream. +@item Certain options require an argument. -7) Go back to step 3 until the input data are finished or until the -member or volume size limits are reached. +@item An option and its argument may or may not appear as separate tokens. +(In other words, the whitespace separating them is optional). +Thus, @w{@option{-o foo}} and @option{-ofoo} are equivalent. -8) The range encoder is flushed. +@item One or more options without arguments, followed by at most one option +that takes an argument, may follow a hyphen in a single token. +Thus, @option{-abc} is equivalent to @w{@option{-a -b -c}}. -9) The member trailer is written to the output stream. +@item Options typically precede other non-option arguments. -10) If there are more data to compress, go back to step 1. +@item The argument @samp{--} terminates all options; any following arguments +are treated as non-option arguments, even if they begin with a hyphen. -@sp 1 -During compression, clzip reads data in large blocks (one dictionary size at -a time). Therefore it may block for up to tens of seconds any process -feeding data to it through a pipe. This is normal. The blocking intervals -get longer with higher compression levels because dictionary size increases -(and compression speed decreases) with compression level. +@item A token consisting of a single hyphen character is interpreted as an +ordinary non-option argument. By convention, it is used to specify standard +input, standard output, or a file named @samp{-}. 
+@end itemize @noindent -The ideas embodied in clzip are due to (at least) the following people: -Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrei Markov (for the -definition of Markov chains), G.N.N. Martin (for the definition of range -encoding), Igor Pavlov (for putting all the above together in LZMA), and -Julian Seward (for bzip2's CLI). +GNU adds @dfn{long options} to these conventions: + +@itemize @bullet +@item A long option consists of two hyphens (@samp{--}) followed by a name +made of alphanumeric characters and hyphens. Option names are typically one +to three words long, with hyphens to separate words. Abbreviations can be +used for the long option names as long as the abbreviations are unique. + +@item A long option and its argument may or may not appear as separate +tokens. In the latter case they must be separated by an equal sign @samp{=}. +Thus, @w{@option{--foo bar}} and @option{--foo=bar} are equivalent. +@end itemize @node File format @@ -779,7 +544,6 @@ Perfection is reached, not when there is no longer anything to add, but when there is no longer anything to take away.@* --- Antoine de Saint-Exupery -@sp 1 In the diagram below, a box like this: @verbatim @@ -798,12 +562,13 @@ represents one byte; a box like this: represents a variable number of bytes. -@sp 1 +@noindent A lzip file consists of one or more independent "members" (compressed data sets). The members simply appear one after another in the file, with no additional information before, between, or after them. Each member can encode in compressed form up to @w{16 EiB - 1 byte} of uncompressed data. -The size of a multimember file is unlimited. +The size of a multimember file is unlimited. Empty members (data size = 0) +are not allowed in multimember files. Each member has the following structure: @@ -834,7 +599,7 @@ Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@* Valid values for dictionary size range from 4 KiB to 512 MiB. 
@item LZMA stream -The LZMA stream, finished by an "End Of Stream" marker. Uses default values +The LZMA stream, terminated by an 'End Of Stream' marker. Uses default values for encoder properties. @xref{Stream format}, for a complete description. @item CRC32 (4 bytes) @@ -849,7 +614,6 @@ as a distributed index, improves the checking of stream integrity, and facilitates the safe recovery of undamaged members from multimember files. Lzip limits the member size to @w{2 PiB} to prevent the data size field from overflowing. - @end table @@ -857,8 +621,8 @@ overflowing. @chapter Format of the LZMA stream in lzip files @cindex format of the LZMA stream -The LZMA algorithm has three parameters, called "special LZMA -properties", to adjust it for some kinds of binary data. These +The LZMA algorithm has three parameters, called 'special LZMA +properties', to adjust it for some kinds of binary data. These parameters are: @samp{literal_context_bits} (with a default value of 3), @samp{literal_pos_state_bits} (with a default value of 0), and @samp{pos_state_bits} (with a default value of 2). As a general purpose @@ -866,14 +630,17 @@ compressor, lzip only uses the default values for these parameters. In particular @samp{literal_pos_state_bits} has been optimized away and does not even appear in the code. -Lzip finishes the LZMA stream with an "End Of Stream" (EOS) marker (the +The first byte of the LZMA stream is set to zero to help tools like grep +recognize lzip files as binary files. + +The LZMA stream is terminated by an 'End Of Stream' (EOS) marker (the distance-length pair @w{0xFFFFFFFFU, 2}), which in conjunction with the @samp{member size} field in the member trailer allows the checking of stream integrity. The EOS marker is the only LZMA marker allowed in lzip files. The LZMA stream in lzip files always has these two features (default properties and EOS marker) and is referred to in this document as LZMA-302eos. 
This simplified and marker-terminated form of the LZMA stream format has been -chosen to maximize interoperability and safety. +chosen to achieve complete interoperability and robust safety. The second stage of LZMA is a range encoder that uses a different probability model for each type of symbol: distances, lengths, literal @@ -890,12 +657,11 @@ about the range decoder that need to be described accurately, the source code of a real decompressor seems the only appropriate reference to use. What follows is a description of the decoding algorithm for LZMA-302eos -streams using as reference the source code of "lzd", an educational +streams using as reference the source code of lzd, an educational decompressor for lzip files, included in appendix A. @xref{Reference source code}. Lzd is written in C++11 and can be downloaded from the lzip download directory. -@sp 1 @section What is coded @anchor{what-is-coded} @@ -1071,7 +837,6 @@ decoded that is different from its corresponding bit in byte is decoded using the normal bit tree context. (See @samp{decode_matched} in the source). -@sp 1 @section The range decoder The LZMA stream is consumed one byte at a time by the range decoder. @@ -1085,25 +850,287 @@ not yet decoded) and @samp{code} (representing the current point within @samp{range}). @samp{range} is initialized to @w{2^32 - 1}, and @samp{code} is initialized to 0. -The range encoder produces a first 0 byte that must be ignored by the -range decoder. (See the @samp{Range_decoder} constructor in the source). - -@sp 1 @section Decoding and checking the LZMA stream After decoding the member header and obtaining the dictionary size, the range decoder is initialized and then the LZMA decoder enters a loop (see @samp{decode_member} in the source) where it invokes the range decoder with the appropriate contexts to decode the different coding -sequences (matches, repeated matches, and literal bytes), until the "End -Of Stream" marker is decoded. 
+sequences (matches, repeated matches, and literal bytes), until the 'End +Of Stream' marker is decoded. -Once the "End Of Stream" marker has been decoded, the decompressor reads and +Once the 'End Of Stream' marker has been decoded, the decompressor reads and decodes the member trailer, and checks that the three integrity factors stored there (CRC, data size, and member size) match those computed from the data. +@node Quality assurance +@chapter Design, development, and testing of lzip +@cindex quality assurance + +There are two ways of constructing a software design: One way is to make it +so simple that there are obviously no deficiencies and the other way is to +make it so complicated that there are no obvious deficiencies. The first +method is far more difficult.@* +--- C.A.R. Hoare + +Lzip has been designed, written, and tested with great care to replace gzip +and bzip2 as general-purpose compressed format for Unix-like systems. This +chapter describes the lessons learned from these previous formats, and their +application to the design of lzip. The lzip format specification has been +reviewed carefully and is believed to be free from design errors. + +@section Format design + +When gzip was designed in 1992, computers and operating systems were much +less capable than they are today. The designers of gzip tried to work around +some of those limitations, like 8.3 file names, with additional fields in +the file format. + +Today those limitations have mostly disappeared, and the format of gzip has +proved to be unnecessarily complicated. It includes fields that were never +used, others that have lost their usefulness, and finally others that have +become too limited. + +Bzip2 was designed 5 years later, and its format is simpler than the one of +gzip. + +Probably the worst defect of the gzip format from the point of view of data +safety is the variable size of its header. 
If the byte at offset 3 (flags) +of a gzip member gets corrupted, it may become difficult to recover the +data, even if the compressed blocks are intact, because it can't be known +with certainty where the compressed blocks begin. + +By contrast, the header of a lzip member has a fixed length of 6. The LZMA +stream in a lzip member always starts at offset 6, making it trivial to +recover the data even if the whole header becomes corrupt. + +Bzip2 also provides a header of fixed length and marks the begin and end of +each compressed block with six magic bytes, making it possible to find the +compressed blocks even in case of file damage. But bzip2 does not store the +size of each compressed block, as lzip does. + +Lziprecover is able to provide unique data recovery capabilities because the +lzip format is extraordinarily safe. The simple and safe design of the file +format complements the embedded error detection provided by the LZMA data +stream. Any distance larger than the dictionary size acts as a forbidden +symbol, allowing the decompressor to detect the approximate position of +errors, and leaving very little work for the check sequence (CRC and data +sizes) in the detection of errors. Lzip is usually able to detect all +possible bit flips in the compressed data without resorting to the check +sequence. It would be difficult to write an automatic recovery tool like +lziprecover for the gzip format. And, as far as I know, it has never been +written. + +Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the +decompressed data because it provides optimal accuracy in the detection of +errors up to a compressed size of about @w{16 GiB}, a size larger than that +of most files. In the case of lzip, the additional detection capability of +the decompressor reduces the probability of undetected errors several +million times more, resulting in a combined integrity checking optimally +accurate for any member size produced by lzip. 
Preliminary results suggest +that the lzip format is safe enough to be used in critical safety avionics +systems. + +The lzip format is designed for long-term archiving. Therefore it excludes +any unneeded features that may interfere with the future extraction of the +decompressed data. + +@subsection Gzip format (mis)features not present in lzip + +@table @samp +@item Multiple algorithms + +Gzip provides a CM (Compression Method) field that has never been used +because it is a bad idea to begin with. New compression methods may require +additional fields, making it impossible to implement new methods and, at the +same time, keep the same format. This field does not solve the problem of +format proliferation; it just makes the problem less obvious. + +@item Optional fields in header + +Unless special precautions are taken, optional fields are generally a bad +idea because they produce a header of variable size. The gzip header has 2 +fields that, in addition to being optional, are zero-terminated. This means +that if any byte inside the field gets zeroed, or if the terminating zero +gets altered, gzip won't be able to find either the header CRC or the +compressed blocks. + +@item Optional CRC for the header + +Using an optional CRC for the header is not only a bad idea, it is an error; +it circumvents the Hamming distance (HD) of the CRC and may prevent the +extraction of perfectly good data. For example, if the CRC is used and the +bit enabling it is reset by a bit flip, then the header seems to be intact +(in spite of being corrupt) while the compressed blocks seem to be totally +unrecoverable (in spite of being intact). Very misleading indeed. + +@item Metadata + +The gzip format stores some metadata, like the modification time of the +original file or the operating system on which compression took place. This +complicates reproducible compression (obtaining identical compressed output +from identical input). 
+ +@end table + +@subsection Lzip format improvements over gzip and bzip2 + +@table @samp +@item 64-bit size field + +Probably the most frequently reported shortcoming of the gzip format is that +it only stores the least significant 32 bits of the uncompressed size. The +size of any file larger than or equal to @w{4 GiB} gets truncated. + +Bzip2 does not store the uncompressed size of the file. + +The lzip format provides a 64-bit field for the uncompressed size. +Additionally, lzip produces multimember output automatically when the size +is too large for a single member, allowing for an unlimited uncompressed +size. + +@item Distributed index + +The lzip format provides a distributed index that, among other things, helps +plzip to decompress several times faster than pigz and helps lziprecover do +its job. Neither the gzip format nor the bzip2 format provides an index. + +A distributed index is safer and more scalable than a monolithic index. The +monolithic index introduces a single point of failure in the compressed file +and may limit the number of members or the total uncompressed size. + +@end table + +@section Quality of implementation + +Our civilization depends critically on software; it had better be quality +software.@* +--- Bjarne Stroustrup + +@table @samp +@item Accurate and robust error detection + +The lzip format provides 3-factor integrity checking, and the decompressors +report mismatches in each factor separately. This method detects most false +positives for corruption. If just one byte in one factor fails but the other +two factors match the data, it probably means that the data are intact and +the corruption just affects the mismatching factor (CRC, data size, or +member size) in the member trailer. 
+ +@item Multiple implementations + +Just like the lzip format provides 3-factor protection against undetected +data corruption, the development methodology of the lzip family of +compressors provides 3-factor protection against undetected programming +errors. + +Three related but independent compressor implementations, lzip, clzip, and +minilzip/lzlib, are developed concurrently. Every stable release of any of +them is tested to check that it produces identical output to the other two. +This guarantees that all three implement the same algorithm, and makes it +unlikely that any of them may contain serious undiscovered errors. In fact, +no errors have been discovered in lzip since 2009. + +Additionally, the three implementations have been extensively tested with +@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Unzcrash,,unzcrash}, +valgrind, and @samp{american fuzzy lop} without finding a single +vulnerability or false negative. +@ifnothtml +@xref{Unzcrash,,,lziprecover}. +@end ifnothtml + +@item Dictionary size + +Lzip automatically adapts the dictionary size to the size of each file. +In addition to reducing the amount of memory required for decompression, +this feature also minimizes the probability of being affected by RAM errors +during compression. @c key4_mask + +@item Exit status + +Returning a warning status of 2 is a design flaw of compress that leaked +into the design of gzip. Both bzip2 and lzip are free from this flaw. + +@end table + + +@node Algorithm +@chapter Algorithm +@cindex algorithm + +In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a +concrete algorithm; it is more like "any algorithm using the LZMA coding +scheme". LZMA compression consists in describing the uncompressed data as a +succession of coding sequences from the set shown in Section @samp{What is +coded} (@pxref{what-is-coded}), and then encoding them using a range +encoder. 
For example, the option @option{-0} of lzip uses the scheme in +almost the simplest way possible; issuing the longest match it can find, or +a literal byte if it can't find a match. Inversely, a more elaborate way of +finding coding sequences of minimum size than the one currently used by lzip +could be developed, and the resulting sequence could also be coded using the +LZMA coding scheme. + +Clzip currently implements two variants of the LZMA algorithm: fast +(used by option @option{-0}) and normal (used by all other compression levels). + +The high compression of LZMA comes from combining two basic, well-proven +compression ideas: sliding dictionaries (LZ77) and Markov models (the thing +used by every compression algorithm that uses a range encoder or similar +order-0 entropy coder as its last stage) with segregation of contexts +according to what the bits are used for. + +Clzip is a two-stage compressor. The first stage is a Lempel-Ziv coder, +which reduces redundancy by translating chunks of data to their +corresponding distance-length pairs. The second stage is a range encoder +that uses a different probability model for each type of data: +distances, lengths, literal bytes, etc. + +Here is how it works, step by step: + +1) The member header is written to the output stream. + +2) The first byte is coded literally, because there are no previous +bytes to which the match finder can refer. + +3) The main encoder advances to the next byte in the input data and +calls the match finder. + +4) The match finder fills an array with the minimum distances before the +current byte where a match of a given length can be found. + +5) Go back to step 3 until a sequence (formed of pairs, repeated +distances, and literal bytes) of minimum price has been formed, where the +price represents the number of output bits produced. + +6) The range encoder encodes the sequence produced by the main encoder +and sends the bytes produced to the output stream. 
+ +7) Go back to step 3 until the input data are finished or until the +member or volume size limits are reached. + +8) The range encoder is flushed. + +9) The member trailer is written to the output stream. + +10) If there are more data to compress, go back to step 1. + +@sp 1 +During compression, clzip reads data in large blocks (one dictionary size at +a time). Therefore it may block for up to tens of seconds any process +feeding data to it through a pipe. This is normal. The blocking intervals +get longer with higher compression levels because dictionary size increases +(and compression speed decreases) with compression level. + +@noindent +The ideas embodied in clzip are due to (at least) the following people: +Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrei Markov (for the +definition of Markov chains), G.N.N. Martin (for the definition of range +encoding), Igor Pavlov (for putting all the above together in LZMA), and +Julian Seward (for bzip2's CLI). + + @node Trailing data @chapter Extra data appended to the file @cindex trailing data @@ -1118,7 +1145,7 @@ example when writing to a tape. It is safe to append any amount of padding zero bytes to a lzip file. @item -Useful data added by the user; an "End Of File" string (to check that the +Useful data added by the user; an 'End Of File' string (to check that the file has not been truncated), a cryptographically secure hash, a description of file contents, etc. It is safe to append any amount of text to a lzip file as long as none of the first four bytes of the text matches the @@ -1170,9 +1197,8 @@ compression can only be detected by comparing the compressed file with the original because the corruption happens before clzip compresses the RAM contents, resulting in a valid compressed file containing wrong data. -@sp 1 @noindent -Example 1: Extract all the files from archive @samp{foo.tar.lz}. +Example 1: Extract all the files from archive @file{foo.tar.lz}. 
@example tar -xf foo.tar.lz @@ -1180,43 +1206,38 @@ or clzip -cd foo.tar.lz | tar -xf - @end example -@sp 1 @noindent -Example 2: Replace a regular file with its compressed version @samp{file.lz} +Example 2: Replace a regular file with its compressed version @file{file.lz} and show the compression ratio. @example clzip -v file @end example -@sp 1 @noindent -Example 3: Like example 2 but the created @samp{file.lz} is multimember with +Example 3: Like example 2 but the created @file{file.lz} is multimember with a member size of @w{1 MiB}. The compression ratio is not shown. @example clzip -b 1MiB file @end example -@sp 1 @noindent Example 4: Restore a regular file from its compressed version -@samp{file.lz}. If the operation is successful, @samp{file.lz} is removed. +@file{file.lz}. If the operation is successful, @file{file.lz} is removed. @example clzip -d file.lz @end example -@sp 1 @noindent -Example 5: Check the integrity of the compressed file @samp{file.lz} and +Example 5: Check the integrity of the compressed file @file{file.lz} and show status. @example clzip -tv file.lz @end example -@sp 1 @anchor{concat-example} @noindent Example 6: The right way of concatenating the decompressed output of two or @@ -1229,28 +1250,25 @@ Do this instead clzip -cd file1.lz file2.lz file3.lz @end example -@sp 1 @noindent -Example 7: Decompress @samp{file.lz} partially until @w{10 KiB} of +Example 7: Decompress @file{file.lz} partially until @w{10 KiB} of decompressed data are produced. @example clzip -cd file.lz | dd bs=1024 count=10 @end example -@sp 1 @noindent -Example 8: Decompress @samp{file.lz} partially from decompressed byte at +Example 8: Decompress @file{file.lz} partially from decompressed byte at offset 10000 to decompressed byte at offset 14999 (5000 bytes are produced). @example clzip -cd file.lz | dd bs=1000 skip=10 count=5 @end example -@sp 1 @noindent Example 9: Compress a whole device in /dev/sdc and send the output to -@samp{file.lz}. +@file{file.lz}. 
@example clzip -c /dev/sdc > file.lz @@ -1258,7 +1276,6 @@ or clzip /dev/sdc -o file.lz @end example -@sp 1 @noindent Example 10: Create a multivolume compressed tar archive with a volume size of @w{1440 KiB}. @@ -1267,7 +1284,6 @@ of @w{1440 KiB}. tar -c some_directory | clzip -S 1440KiB -o volume_name - @end example -@sp 1 @noindent Example 11: Extract a multivolume compressed tar archive. @@ -1275,7 +1291,6 @@ Example 11: Extract a multivolume compressed tar archive. clzip -cd volume_name*.lz | tar -xf - @end example -@sp 1 @noindent Example 12: Create a multivolume compressed backup of a large database file with a volume size of @w{650 MB}, where each volume is a multimember file @@ -1457,7 +1472,8 @@ public: Range_decoder() : member_pos( header_size ), code( 0 ), range( 0xFFFFFFFFU ) { - get_byte(); // discard first byte of the LZMA stream + if( get_byte() != 0 ) // check first LZMA byte + { std::fputs( "Nonzero first LZMA byte.\n", stderr ); std::exit( 2 ); } for( int i = 0; i < 4; ++i ) code = ( code << 8 ) | get_byte(); } @@ -1700,8 +1716,7 @@ bool LZ_decoder::decode_member() // Return false if error direct_bits ); else { - rep0 += - rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode( direct_bits-dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); if( rep0 == 0xFFFFFFFFU ) // marker found { @@ -1733,8 +1748,8 @@ int main( const int argc, const char * const argv[] ) "Lzd decompresses from standard input to standard output.\n" "\nCopyright (C) 2024 Antonio Diaz Diaz.\n" "License 2-clause BSD.\n" - "This is free software: you are free to change and redistribute it.\n" - "There is NO WARRANTY, to the extent permitted by law.\n" + "This is free software: you are free to change and redistribute " + "it.\nThere is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n", PROGVERSION, argv[0] ); @@ 
-1746,6 +1761,7 @@ int main( const int argc, const char * const argv[] ) setmode( STDOUT_FILENO, O_BINARY ); #endif + bool empty = false, multi = false; for( bool first_member = true; ; first_member = false ) { Lzip_header header; // check header @@ -1760,12 +1776,12 @@ int main( const int argc, const char * const argv[] ) unsigned dict_size = 1 << ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) - { std::fputs( "Invalid dictionary size in member header.\n", stderr ); - return 2; } + { std::fputs( "Invalid dictionary size in member header.\n", + stderr ); return 2; } LZ_decoder decoder( dict_size ); // decode LZMA stream if( !decoder.decode_member() ) - { std::fputs( "Data error\n", stderr ); return 2; } + { std::fputs( "Data error.\n", stderr ); return 2; } Lzip_trailer trailer; // check trailer for( int i = 0; i < trailer_size; ++i ) trailer[i] = decoder.get_byte(); @@ -1773,25 +1789,28 @@ int main( const int argc, const char * const argv[] ) unsigned crc = 0; for( int i = 3; i >= 0; --i ) crc = ( crc << 8 ) + trailer[i]; if( crc != decoder.crc() ) - { std::fputs( "CRC mismatch\n", stderr ); retval = 2; } + { std::fputs( "CRC mismatch.\n", stderr ); retval = 2; } unsigned long long data_size = 0; for( int i = 11; i >= 4; --i ) data_size = ( data_size << 8 ) + trailer[i]; if( data_size != decoder.data_position() ) - { std::fputs( "Data size mismatch\n", stderr ); retval = 2; } + { std::fputs( "Data size mismatch.\n", stderr ); retval = 2; } + multi = !first_member; if( data_size == 0 ) empty = true; unsigned long long member_size = 0; for( int i = 19; i >= 12; --i ) member_size = ( member_size << 8 ) + trailer[i]; if( member_size != decoder.member_position() ) - { std::fputs( "Member size mismatch\n", stderr ); retval = 2; } + { std::fputs( "Member size mismatch.\n", stderr ); retval = 2; } if( retval ) return retval; } if( std::fclose( stdout ) != 0 ) { 
std::fprintf( stderr, "Error closing stdout: %s\n", std::strerror( errno ) ); return 1; } + if( empty && multi ) + { std::fputs( "Empty member not allowed.\n", stderr ); return 2; } return 0; } @end verbatim @@ -31,7 +31,7 @@ CRC32 crc32; -int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ) +int LZe_get_match_pairs( LZ_encoder * const e, Pair * pairs ) { int len_limit = e->match_len_limit; if( len_limit > Mb_available_bytes( &e->eb.mb ) ) @@ -133,7 +133,7 @@ int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ) } -static void LZe_update_distance_prices( struct LZ_encoder * const e ) +static void LZe_update_distance_prices( LZ_encoder * const e ) { int dis, len_state; for( dis = start_dis_model; dis < modeled_distances; ++dis ) @@ -172,7 +172,7 @@ static void LZe_update_distance_prices( struct LZ_encoder * const e ) ( trials[0].dis4 == -1 ) means literal. A match/rep longer or equal than match_len_limit finishes the sequence. */ -static int LZe_sequence_optimizer( struct LZ_encoder * const e, +static int LZe_sequence_optimizer( LZ_encoder * const e, const int reps[num_rep_distances], const State state ) { @@ -291,7 +291,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, } /* give final values to current trial */ - struct Trial * cur_trial = &e->trials[cur]; + Trial * cur_trial = &e->trials[cur]; State cur_state; { const int dis4 = cur_trial->dis4; @@ -336,7 +336,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, next_price += LZeb_price_matched( &e->eb, prev_byte, cur_byte, match_byte ); /* try last updates to next trial */ - struct Trial * next_trial = &e->trials[cur+1]; + Trial * next_trial = &e->trials[cur+1]; Tr_update( next_trial, next_price, -1, cur ); /* literal */ @@ -346,8 +346,7 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, if( match_byte == cur_byte && next_trial->dis4 != 0 && next_trial->prev_index2 == single_step_trial ) { - const int price = 
rep_match_price + - LZeb_price_shortrep( &e->eb, cur_state, pos_state ); + const int price = rep_match_price + LZeb_price_shortrep( &e->eb, cur_state, pos_state ); if( price <= next_trial->price ) { next_trial->price = price; @@ -478,12 +477,12 @@ static int LZe_sequence_optimizer( struct LZ_encoder * const e, } -bool LZe_encode_member( struct LZ_encoder * const e, +bool LZe_encode_member( LZ_encoder * const e, const unsigned long long member_size ) { const unsigned long long member_size_limit = member_size - Lt_size - max_marker_size; - const bool best = ( e->match_len_limit > 12 ); + const bool best = e->match_len_limit > 12; const int dis_price_count = best ? 1 : 512; const int align_price_count = best ? 1 : dis_align_size; const int price_count = ( e->match_len_limit > 36 ) ? 1013 : 4093; @@ -537,7 +536,7 @@ bool LZe_encode_member( struct LZ_encoder * const e, const int len = e->trials[i].price; int dis = e->trials[i].dis4; - bool bit = ( dis < 0 ); + bool bit = dis < 0; Re_encode_bit( &e->eb.renc, &e->eb.bm_match[state][pos_state], !bit ); if( bit ) /* literal byte */ { @@ -556,11 +555,11 @@ bool LZe_encode_member( struct LZ_encoder * const e, { CRC32_update_buf( &e->eb.crc, Mb_ptr_to_current_pos( &e->eb.mb ) - ahead, len ); mtf_reps( dis, reps ); - bit = ( dis < num_rep_distances ); + bit = dis < num_rep_distances; Re_encode_bit( &e->eb.renc, &e->eb.bm_rep[state], bit ); if( bit ) /* repeated match */ { - bit = ( dis == 0 ); + bit = dis == 0; Re_encode_bit( &e->eb.renc, &e->eb.bm_rep0[state], !bit ); if( bit ) Re_encode_bit( &e->eb.renc, &e->eb.bm_len[state][pos_state], len > 1 ); @@ -15,16 +15,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -struct Len_prices +typedef struct Len_prices { - const struct Len_model * lm; + const Len_model * lm; int len_symbols; int count; int prices[pos_states][max_len_symbols]; int counters[pos_states]; /* may decrement below 0 */ - }; + } Len_prices; -static inline void Lp_update_low_mid_prices( struct Len_prices * const lp, +static inline void Lp_update_low_mid_prices( Len_prices * const lp, const int pos_state ) { int * const pps = lp->prices[pos_state]; @@ -39,7 +39,7 @@ static inline void Lp_update_low_mid_prices( struct Len_prices * const lp, price_symbol3( lp->lm->bm_mid[pos_state], len - len_low_symbols ); } -static inline void Lp_update_high_prices( struct Len_prices * const lp ) +static inline void Lp_update_high_prices( Len_prices * const lp ) { const int tmp = price1( lp->lm->choice1 ) + price1( lp->lm->choice2 ); int len; @@ -50,11 +50,10 @@ static inline void Lp_update_high_prices( struct Len_prices * const lp ) price_symbol8( lp->lm->bm_high, len - len_low_symbols - len_mid_symbols ); } -static inline void Lp_reset( struct Len_prices * const lp ) +static inline void Lp_reset( Len_prices * const lp ) { int i; for( i = 0; i < pos_states; ++i ) lp->counters[i] = 0; } -static inline void Lp_init( struct Len_prices * const lp, - const struct Len_model * const lm, +static inline void Lp_init( Len_prices * const lp, const Len_model * const lm, const int match_len_limit ) { lp->lm = lm; @@ -63,11 +62,11 @@ static inline void Lp_init( struct Len_prices * const lp, Lp_reset( lp ); } -static inline void Lp_decrement_counter( struct Len_prices * const lp, +static inline void Lp_decrement_counter( Len_prices * const lp, const int pos_state ) { --lp->counters[pos_state]; } -static inline void Lp_update_prices( struct Len_prices * const lp ) +static inline void Lp_update_prices( Len_prices * const lp ) { int pos_state; bool high_pending = false; @@ -79,23 +78,23 @@ static inline void Lp_update_prices( struct Len_prices * const lp ) Lp_update_high_prices( lp ); } 
-static inline int Lp_price( const struct Len_prices * const lp, +static inline int Lp_price( const Len_prices * const lp, const int len, const int pos_state ) { return lp->prices[pos_state][len - min_match_len]; } -struct Pair /* distance-length pair */ +typedef struct Pair /* distance-length pair */ { int dis; int len; - }; + } Pair; enum { infinite_price = 0x0FFFFFFF, max_num_trials = 1 << 13, single_step_trial = -2, dual_step_trial = -1 }; -struct Trial +typedef struct Trial { State state; int price; /* dual use var; cumulative price, match length */ @@ -105,9 +104,9 @@ struct Trial /* -1 literal + rep0 */ /* >= 0 ( rep or match ) + literal + rep0 */ int reps[num_rep_distances]; - }; + } Trial; -static inline void Tr_update( struct Trial * const trial, const int pr, +static inline void Tr_update( Trial * const trial, const int pr, const int distance4, const int p_i ) { if( pr < trial->price ) @@ -115,7 +114,7 @@ static inline void Tr_update( struct Trial * const trial, const int pr, trial->prev_index2 = single_step_trial; } } -static inline void Tr_update2( struct Trial * const trial, const int pr, +static inline void Tr_update2( Trial * const trial, const int pr, const int p_i ) { if( pr < trial->price ) @@ -123,7 +122,7 @@ static inline void Tr_update2( struct Trial * const trial, const int pr, trial->prev_index2 = dual_step_trial; } } -static inline void Tr_update3( struct Trial * const trial, const int pr, +static inline void Tr_update3( Trial * const trial, const int pr, const int distance4, const int p_i, const int p_i2 ) { @@ -133,25 +132,24 @@ static inline void Tr_update3( struct Trial * const trial, const int pr, } -struct LZ_encoder +typedef struct LZ_encoder { - struct LZ_encoder_base eb; + LZ_encoder_base eb; int cycles; int match_len_limit; - struct Len_prices match_len_prices; - struct Len_prices rep_len_prices; + Len_prices match_len_prices; + Len_prices rep_len_prices; int pending_num_pairs; - struct Pair pairs[max_match_len+1]; - struct Trial 
trials[max_num_trials]; + Pair pairs[max_match_len+1]; + Trial trials[max_num_trials]; int dis_slot_prices[len_states][2*max_dictionary_bits]; int dis_prices[len_states][modeled_distances]; int align_prices[dis_align_size]; int num_dis_slots; - }; + } LZ_encoder; -static inline bool Mb_dec_pos( struct Matchfinder_base * const mb, - const int ahead ) +static inline bool Mb_dec_pos( Matchfinder_base * const mb, const int ahead ) { if( ahead < 0 || mb->pos < ahead ) return false; mb->pos -= ahead; @@ -160,7 +158,7 @@ static inline bool Mb_dec_pos( struct Matchfinder_base * const mb, return true; } -int LZe_get_match_pairs( struct LZ_encoder * const e, struct Pair * pairs ); +int LZe_get_match_pairs( LZ_encoder * const e, Pair * pairs ); /* move-to-front dis in/into reps; do nothing if( dis4 <= 0 ) */ static inline void mtf_reps( const int dis4, int reps[num_rep_distances] ) @@ -178,13 +176,13 @@ static inline void mtf_reps( const int dis4, int reps[num_rep_distances] ) } } -static inline int LZeb_price_shortrep( const struct LZ_encoder_base * const eb, +static inline int LZeb_price_shortrep( const LZ_encoder_base * const eb, const State state, const int pos_state ) { return price0( eb->bm_rep0[state] ) + price0( eb->bm_len[state][pos_state] ); } -static inline int LZeb_price_rep( const struct LZ_encoder_base * const eb, +static inline int LZeb_price_rep( const LZ_encoder_base * const eb, const int rep, const State state, const int pos_state ) { @@ -201,7 +199,7 @@ static inline int LZeb_price_rep( const struct LZ_encoder_base * const eb, return price; } -static inline int LZe_price_rep0_len( const struct LZ_encoder * const e, +static inline int LZe_price_rep0_len( const LZ_encoder * const e, const int len, const State state, const int pos_state ) { @@ -209,9 +207,8 @@ static inline int LZe_price_rep0_len( const struct LZ_encoder * const e, Lp_price( &e->rep_len_prices, len, pos_state ); } -static inline int LZe_price_pair( const struct LZ_encoder * const e, - const 
int dis, const int len, - const int pos_state ) +static inline int LZe_price_pair( const LZ_encoder * const e, const int dis, + const int len, const int pos_state ) { const int price = Lp_price( &e->match_len_prices, len, pos_state ); const int len_state = get_len_state( len ); @@ -222,7 +219,7 @@ static inline int LZe_price_pair( const struct LZ_encoder * const e, e->align_prices[dis & (dis_align_size - 1)]; } -static inline int LZe_read_match_distances( struct LZ_encoder * const e ) +static inline int LZe_read_match_distances( LZ_encoder * const e ) { const int num_pairs = LZe_get_match_pairs( e, e->pairs ); if( num_pairs > 0 ) @@ -235,7 +232,7 @@ static inline int LZe_read_match_distances( struct LZ_encoder * const e ) return num_pairs; } -static inline void LZe_move_and_update( struct LZ_encoder * const e, int n ) +static inline void LZe_move_and_update( LZ_encoder * const e, int n ) { while( true ) { @@ -245,13 +242,13 @@ static inline void LZe_move_and_update( struct LZ_encoder * const e, int n ) } } -static inline void LZe_backward( struct LZ_encoder * const e, int cur ) +static inline void LZe_backward( LZ_encoder * const e, int cur ) { int dis4 = e->trials[cur].dis4; while( cur > 0 ) { const int prev_index = e->trials[cur].prev_index; - struct Trial * const prev_trial = &e->trials[prev_index]; + Trial * const prev_trial = &e->trials[prev_index]; if( e->trials[cur].prev_index2 != single_step_trial ) { @@ -260,7 +257,7 @@ static inline void LZe_backward( struct LZ_encoder * const e, int cur ) prev_trial->prev_index2 = single_step_trial; if( e->trials[cur].prev_index2 >= 0 ) { - struct Trial * const prev_trial2 = &e->trials[prev_index-1]; + Trial * const prev_trial2 = &e->trials[prev_index-1]; prev_trial2->dis4 = dis4; dis4 = 0; /* rep0 */ prev_trial2->prev_index = e->trials[cur].prev_index2; prev_trial2->prev_index2 = single_step_trial; @@ -275,7 +272,7 @@ static inline void LZe_backward( struct LZ_encoder * const e, int cur ) enum { num_prev_positions3 = 1 
<< 16, num_prev_positions2 = 1 << 10 }; -static inline bool LZe_init( struct LZ_encoder * const e, +static inline bool LZe_init( LZ_encoder * const e, const int dict_size, const int len_limit, const int ifd, const int outfd ) { @@ -300,7 +297,7 @@ static inline bool LZe_init( struct LZ_encoder * const e, return true; } -static inline void LZe_reset( struct LZ_encoder * const e ) +static inline void LZe_reset( LZ_encoder * const e ) { LZeb_reset( &e->eb ); Lp_reset( &e->match_len_prices ); @@ -308,5 +305,5 @@ static inline void LZe_reset( struct LZ_encoder * const e ) e->pending_num_pairs = 0; } -bool LZe_encode_member( struct LZ_encoder * const e, +bool LZe_encode_member( LZ_encoder * const e, const unsigned long long member_size ); diff --git a/encoder_base.c b/encoder_base.c index 5f40f9b..cfa14d5 100644 --- a/encoder_base.c +++ b/encoder_base.c @@ -31,7 +31,7 @@ Dis_slots dis_slots; Prob_prices prob_prices; -bool Mb_read_block( struct Matchfinder_base * const mb ) +bool Mb_read_block( Matchfinder_base * const mb ) { if( !mb->at_stream_end && mb->stream_pos < mb->buffer_size ) { @@ -46,7 +46,7 @@ bool Mb_read_block( struct Matchfinder_base * const mb ) } -void Mb_normalize_pos( struct Matchfinder_base * const mb ) +void Mb_normalize_pos( Matchfinder_base * const mb ) { if( mb->pos > mb->stream_pos ) internal_error( "pos > stream_pos in Mb_normalize_pos." 
); @@ -69,7 +69,7 @@ void Mb_normalize_pos( struct Matchfinder_base * const mb ) } -bool Mb_init( struct Matchfinder_base * const mb, const int before_size, +bool Mb_init( Matchfinder_base * const mb, const int before_size, const int dict_size, const int after_size, const int dict_factor, const int num_prev_positions23, const int pos_array_factor, const int ifd ) @@ -123,7 +123,7 @@ bool Mb_init( struct Matchfinder_base * const mb, const int before_size, } -void Mb_reset( struct Matchfinder_base * const mb ) +void Mb_reset( Matchfinder_base * const mb ) { int i; if( mb->stream_pos > mb->pos ) @@ -147,13 +147,13 @@ void Mb_reset( struct Matchfinder_base * const mb ) } -void Re_flush_data( struct Range_encoder * const renc ) +void Re_flush_data( Range_encoder * const renc ) { if( renc->pos > 0 ) { if( renc->outfd >= 0 && writeblock( renc->outfd, renc->buffer, renc->pos ) != renc->pos ) - { show_error( "Write error", errno, false ); cleanup_and_fail( 1 ); } + { show_error( write_error_msg, errno, false ); cleanup_and_fail( 1 ); } renc->partial_member_pos += renc->pos; renc->pos = 0; show_cprogress( 0, 0, 0, 0 ); @@ -162,7 +162,7 @@ void Re_flush_data( struct Range_encoder * const renc ) /* End Of Stream marker => (dis == 0xFFFFFFFFU, len == min_match_len) */ -void LZeb_full_flush( struct LZ_encoder_base * const eb, const State state ) +void LZeb_full_flush( LZ_encoder_base * const eb, const State state ) { const int pos_state = Mb_data_position( &eb->mb ) & pos_state_mask; Re_encode_bit( &eb->renc, &eb->bm_match[state][pos_state], 1 ); @@ -178,7 +178,7 @@ void LZeb_full_flush( struct LZ_encoder_base * const eb, const State state ) } -void LZeb_reset( struct LZ_encoder_base * const eb ) +void LZeb_reset( LZ_encoder_base * const eb ) { Mb_reset( &eb->mb ); eb->crc = 0xFFFFFFFFU; diff --git a/encoder_base.h b/encoder_base.h index c947904..0a1bd5d 100644 --- a/encoder_base.h +++ b/encoder_base.h @@ -174,36 +174,36 @@ struct Matchfinder_base bool at_stream_end; /* 
stream_pos shows real end of file */ }; -bool Mb_read_block( struct Matchfinder_base * const mb ); -void Mb_normalize_pos( struct Matchfinder_base * const mb ); +bool Mb_read_block( Matchfinder_base * const mb ); +void Mb_normalize_pos( Matchfinder_base * const mb ); -bool Mb_init( struct Matchfinder_base * const mb, const int before_size, +bool Mb_init( Matchfinder_base * const mb, const int before_size, const int dict_size, const int after_size, const int dict_factor, const int num_prev_positions23, const int pos_array_factor, const int ifd ); -static inline void Mb_free( struct Matchfinder_base * const mb ) +static inline void Mb_free( Matchfinder_base * const mb ) { free( mb->prev_positions ); free( mb->buffer ); } -static inline uint8_t Mb_peek( const struct Matchfinder_base * const mb, +static inline uint8_t Mb_peek( const Matchfinder_base * const mb, const int distance ) { return mb->buffer[mb->pos-distance]; } -static inline int Mb_available_bytes( const struct Matchfinder_base * const mb ) +static inline int Mb_available_bytes( const Matchfinder_base * const mb ) { return mb->stream_pos - mb->pos; } static inline unsigned long long -Mb_data_position( const struct Matchfinder_base * const mb ) +Mb_data_position( const Matchfinder_base * const mb ) { return mb->partial_data_pos + mb->pos; } -static inline bool Mb_data_finished( const struct Matchfinder_base * const mb ) +static inline bool Mb_data_finished( const Matchfinder_base * const mb ) { return mb->at_stream_end && mb->pos >= mb->stream_pos; } static inline const uint8_t * -Mb_ptr_to_current_pos( const struct Matchfinder_base * const mb ) +Mb_ptr_to_current_pos( const Matchfinder_base * const mb ) { return mb->buffer + mb->pos; } -static inline int Mb_true_match_len( const struct Matchfinder_base * const mb, +static inline int Mb_true_match_len( const Matchfinder_base * const mb, const int index, const int distance ) { const uint8_t * const data = mb->buffer + mb->pos; @@ -213,18 +213,18 @@ static 
inline int Mb_true_match_len( const struct Matchfinder_base * const mb, return i; } -static inline void Mb_move_pos( struct Matchfinder_base * const mb ) +static inline void Mb_move_pos( Matchfinder_base * const mb ) { if( ++mb->cyclic_pos > mb->dictionary_size ) mb->cyclic_pos = 0; if( ++mb->pos >= mb->pos_limit ) Mb_normalize_pos( mb ); } -void Mb_reset( struct Matchfinder_base * const mb ); +void Mb_reset( Matchfinder_base * const mb ); enum { re_buffer_size = 65536 }; -struct Range_encoder +typedef struct Range_encoder { uint64_t low; unsigned long long partial_member_pos; @@ -235,22 +235,21 @@ struct Range_encoder int outfd; /* output file descriptor */ uint8_t cache; Lzip_header header; - }; + } Range_encoder; -void Re_flush_data( struct Range_encoder * const renc ); +void Re_flush_data( Range_encoder * const renc ); -static inline void Re_put_byte( struct Range_encoder * const renc, - const uint8_t b ) +static inline void Re_put_byte( Range_encoder * const renc, const uint8_t b ) { renc->buffer[renc->pos] = b; if( ++renc->pos >= re_buffer_size ) Re_flush_data( renc ); } -static inline void Re_shift_low( struct Range_encoder * const renc ) +static inline void Re_shift_low( Range_encoder * const renc ) { if( renc->low >> 24 != 0xFF ) { - const bool carry = ( renc->low > 0xFFFFFFFFU ); + const bool carry = renc->low > 0xFFFFFFFFU; Re_put_byte( renc, renc->cache + carry ); for( ; renc->ff_count > 0; --renc->ff_count ) Re_put_byte( renc, 0xFF + carry ); @@ -260,7 +259,7 @@ static inline void Re_shift_low( struct Range_encoder * const renc ) renc->low = ( renc->low & 0x00FFFFFFU ) << 8; } -static inline void Re_reset( struct Range_encoder * const renc, +static inline void Re_reset( Range_encoder * const renc, const unsigned dictionary_size ) { renc->low = 0; @@ -273,7 +272,7 @@ static inline void Re_reset( struct Range_encoder * const renc, int i; for( i = 0; i < Lh_size; ++i ) Re_put_byte( renc, renc->header[i] ); } -static inline bool Re_init( struct 
Range_encoder * const renc, +static inline bool Re_init( Range_encoder * const renc, const unsigned dictionary_size, const int ofd ) { renc->buffer = (uint8_t *)malloc( re_buffer_size ); @@ -284,17 +283,17 @@ static inline bool Re_init( struct Range_encoder * const renc, return true; } -static inline void Re_free( struct Range_encoder * const renc ) +static inline void Re_free( Range_encoder * const renc ) { free( renc->buffer ); } static inline unsigned long long -Re_member_position( const struct Range_encoder * const renc ) +Re_member_position( const Range_encoder * const renc ) { return renc->partial_member_pos + renc->pos + renc->ff_count; } -static inline void Re_flush( struct Range_encoder * const renc ) +static inline void Re_flush( Range_encoder * const renc ) { int i; for( i = 0; i < 5; ++i ) Re_shift_low( renc ); } -static inline void Re_encode( struct Range_encoder * const renc, +static inline void Re_encode( Range_encoder * const renc, const int symbol, const int num_bits ) { unsigned mask; @@ -306,7 +305,7 @@ static inline void Re_encode( struct Range_encoder * const renc, } } -static inline void Re_encode_bit( struct Range_encoder * const renc, +static inline void Re_encode_bit( Range_encoder * const renc, Bit_model * const probability, const bool bit ) { const uint32_t bound = ( renc->range >> bit_model_total_bits ) * *probability; @@ -324,7 +323,7 @@ static inline void Re_encode_bit( struct Range_encoder * const renc, if( renc->range <= 0x00FFFFFFU ) { renc->range <<= 8; Re_shift_low( renc ); } } -static inline void Re_encode_tree3( struct Range_encoder * const renc, +static inline void Re_encode_tree3( Range_encoder * const renc, Bit_model bm[], const int symbol ) { bool bit = ( symbol >> 2 ) & 1; @@ -335,7 +334,7 @@ static inline void Re_encode_tree3( struct Range_encoder * const renc, Re_encode_bit( renc, &bm[model], symbol & 1 ); } -static inline void Re_encode_tree6( struct Range_encoder * const renc, +static inline void Re_encode_tree6( 
Range_encoder * const renc, Bit_model bm[], const unsigned symbol ) { bool bit = ( symbol >> 5 ) & 1; @@ -352,7 +351,7 @@ static inline void Re_encode_tree6( struct Range_encoder * const renc, Re_encode_bit( renc, &bm[model], symbol & 1 ); } -static inline void Re_encode_tree8( struct Range_encoder * const renc, +static inline void Re_encode_tree8( Range_encoder * const renc, Bit_model bm[], const int symbol ) { int model = 1; @@ -365,7 +364,7 @@ static inline void Re_encode_tree8( struct Range_encoder * const renc, } } -static inline void Re_encode_tree_reversed( struct Range_encoder * const renc, +static inline void Re_encode_tree_reversed( Range_encoder * const renc, Bit_model bm[], int symbol, const int num_bits ) { int model = 1; @@ -379,7 +378,7 @@ static inline void Re_encode_tree_reversed( struct Range_encoder * const renc, } } -static inline void Re_encode_matched( struct Range_encoder * const renc, +static inline void Re_encode_matched( Range_encoder * const renc, Bit_model bm[], unsigned symbol, unsigned match_byte ) { @@ -395,17 +394,17 @@ static inline void Re_encode_matched( struct Range_encoder * const renc, } } -static inline void Re_encode_len( struct Range_encoder * const renc, - struct Len_model * const lm, +static inline void Re_encode_len( Range_encoder * const renc, + Len_model * const lm, int symbol, const int pos_state ) { - bool bit = ( ( symbol -= min_match_len ) >= len_low_symbols ); + bool bit = ( symbol -= min_match_len ) >= len_low_symbols; Re_encode_bit( renc, &lm->choice1, bit ); if( !bit ) Re_encode_tree3( renc, lm->bm_low[pos_state], symbol ); else { - bit = ( ( symbol -= len_low_symbols ) >= len_mid_symbols ); + bit = ( symbol -= len_low_symbols ) >= len_mid_symbols; Re_encode_bit( renc, &lm->choice2, bit ); if( !bit ) Re_encode_tree3( renc, lm->bm_mid[pos_state], symbol ); @@ -418,9 +417,9 @@ static inline void Re_encode_len( struct Range_encoder * const renc, enum { max_marker_size = 16, num_rep_distances = 4 }; /* must be 4 */ 
-struct LZ_encoder_base +typedef struct LZ_encoder_base { - struct Matchfinder_base mb; + Matchfinder_base mb; uint32_t crc; Bit_model bm_literal[1<<literal_context_bits][0x300]; @@ -433,14 +432,14 @@ struct LZ_encoder_base Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; - struct Len_model match_len_model; - struct Len_model rep_len_model; - struct Range_encoder renc; - }; + Len_model match_len_model; + Len_model rep_len_model; + Range_encoder renc; + } LZ_encoder_base; -void LZeb_reset( struct LZ_encoder_base * const eb ); +void LZeb_reset( LZ_encoder_base * const eb ); -static inline bool LZeb_init( struct LZ_encoder_base * const eb, +static inline bool LZeb_init( LZ_encoder_base * const eb, const int before_size, const int dict_size, const int after_size, const int dict_factor, const int num_prev_positions23, @@ -454,31 +453,31 @@ static inline bool LZeb_init( struct LZ_encoder_base * const eb, return true; } -static inline void LZeb_free( struct LZ_encoder_base * const eb ) +static inline void LZeb_free( LZ_encoder_base * const eb ) { Re_free( &eb->renc ); Mb_free( &eb->mb ); } -static inline unsigned LZeb_crc( const struct LZ_encoder_base * const eb ) +static inline unsigned LZeb_crc( const LZ_encoder_base * const eb ) { return eb->crc ^ 0xFFFFFFFFU; } -static inline int LZeb_price_literal( const struct LZ_encoder_base * const eb, +static inline int LZeb_price_literal( const LZ_encoder_base * const eb, const uint8_t prev_byte, const uint8_t symbol ) { return price_symbol8( eb->bm_literal[get_lit_state(prev_byte)], symbol ); } -static inline int LZeb_price_matched( const struct LZ_encoder_base * const eb, +static inline int LZeb_price_matched( const LZ_encoder_base * const eb, const uint8_t prev_byte, const uint8_t symbol, const uint8_t match_byte ) { return price_matched( eb->bm_literal[get_lit_state(prev_byte)], symbol, match_byte ); } -static inline void 
LZeb_encode_literal( struct LZ_encoder_base * const eb, +static inline void LZeb_encode_literal( LZ_encoder_base * const eb, const uint8_t prev_byte, const uint8_t symbol ) { Re_encode_tree8( &eb->renc, eb->bm_literal[get_lit_state(prev_byte)], symbol ); } -static inline void LZeb_encode_matched( struct LZ_encoder_base * const eb, +static inline void LZeb_encode_matched( LZ_encoder_base * const eb, const uint8_t prev_byte, const uint8_t symbol, const uint8_t match_byte ) { Re_encode_matched( &eb->renc, eb->bm_literal[get_lit_state(prev_byte)], symbol, match_byte ); } -static inline void LZeb_encode_pair( struct LZ_encoder_base * const eb, +static inline void LZeb_encode_pair( LZ_encoder_base * const eb, const unsigned dis, const int len, const int pos_state ) { @@ -504,4 +503,4 @@ static inline void LZeb_encode_pair( struct LZ_encoder_base * const eb, } } -void LZeb_full_flush( struct LZ_encoder_base * const eb, const State state ); +void LZeb_full_flush( LZ_encoder_base * const eb, const State state ); diff --git a/fast_encoder.c b/fast_encoder.c index bab87ca..f55ca2c 100644 --- a/fast_encoder.c +++ b/fast_encoder.c @@ -28,7 +28,7 @@ #include "fast_encoder.h" -int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance ) +int FLZe_longest_match_len( FLZ_encoder * const fe, int * const distance ) { enum { len_limit = 16 }; const int available = min( Mb_available_bytes( &fe->eb.mb ), max_match_len ); @@ -69,7 +69,7 @@ int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance } -bool FLZe_encode_member( struct FLZ_encoder * const fe, +bool FLZe_encode_member( FLZ_encoder * const fe, const unsigned long long member_size ) { const unsigned long long member_size_limit = diff --git a/fast_encoder.h b/fast_encoder.h index e4e4000..773780d 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -15,13 +15,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -struct FLZ_encoder +typedef struct FLZ_encoder { - struct LZ_encoder_base eb; + LZ_encoder_base eb; unsigned key4; /* key made from latest 4 bytes */ - }; + } FLZ_encoder; -static inline void FLZe_reset_key4( struct FLZ_encoder * const fe ) +static inline void FLZe_reset_key4( FLZ_encoder * const fe ) { int i; fe->key4 = 0; @@ -29,11 +29,11 @@ static inline void FLZe_reset_key4( struct FLZ_encoder * const fe ) fe->key4 = ( fe->key4 << 4 ) ^ fe->eb.mb.buffer[i]; } -int FLZe_longest_match_len( struct FLZ_encoder * const fe, int * const distance ); +int FLZe_longest_match_len( FLZ_encoder * const fe, int * const distance ); -static inline void FLZe_update_and_move( struct FLZ_encoder * const fe, int n ) +static inline void FLZe_update_and_move( FLZ_encoder * const fe, int n ) { - struct Matchfinder_base * const mb = &fe->eb.mb; + Matchfinder_base * const mb = &fe->eb.mb; while( --n >= 0 ) { if( Mb_available_bytes( mb ) >= 4 ) @@ -46,7 +46,7 @@ static inline void FLZe_update_and_move( struct FLZ_encoder * const fe, int n ) } } -static inline bool FLZe_init( struct FLZ_encoder * const fe, +static inline bool FLZe_init( FLZ_encoder * const fe, const int ifd, const int outfd ) { enum { before_size = 0, @@ -61,8 +61,8 @@ static inline bool FLZe_init( struct FLZ_encoder * const fe, num_prev_positions23, pos_array_factor, ifd, outfd ); } -static inline void FLZe_reset( struct FLZ_encoder * const fe ) +static inline void FLZe_reset( FLZ_encoder * const fe ) { LZeb_reset( &fe->eb ); } -bool FLZe_encode_member( struct FLZ_encoder * const fe, +bool FLZe_encode_member( FLZ_encoder * const fe, const unsigned long long member_size ); @@ -17,6 +17,7 @@ #define _FILE_OFFSET_BITS 64 +#include <errno.h> #include <stdbool.h> #include <stdint.h> #include <stdio.h> @@ -43,7 +44,7 @@ static void list_line( const unsigned long long uncomp_size, int list_files( const char * const filenames[], const int num_filenames, - const struct Cl_options * const cl_opts ) + const Cl_options * const 
cl_opts ) { unsigned long long total_comp = 0, total_uncomp = 0; int files = 0, retval = 0; @@ -53,7 +54,7 @@ int list_files( const char * const filenames[], const int num_filenames, for( i = 0; i < num_filenames; ++i ) { - const bool from_stdin = ( strcmp( filenames[i], "-" ) == 0 ); + const bool from_stdin = strcmp( filenames[i], "-" ) == 0; if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } const char * const input_filename = from_stdin ? "(stdin)" : filenames[i]; struct stat in_stats; /* not used */ @@ -61,7 +62,7 @@ int list_files( const char * const filenames[], const int num_filenames, open_instream( input_filename, &in_stats, false, true ); if( infd < 0 ) { set_retval( &retval, 1 ); continue; } - struct Lzip_index lzip_index; + Lzip_index lzip_index; Li_init( &lzip_index, infd, cl_opts ); close( infd ); if( lzip_index.retval != 0 ) @@ -70,6 +71,8 @@ int list_files( const char * const filenames[], const int num_filenames, set_retval( &retval, lzip_index.retval ); Li_free( &lzip_index ); continue; } + const bool multi_empty = !from_stdin && Li_multi_empty( &lzip_index ); + if( multi_empty ) set_retval( &retval, 2 ); if( verbosity < 0 ) { Li_free( &lzip_index ); continue; } const unsigned long long udata_size = Li_udata_size( &lzip_index ); const unsigned long long cdata_size = Li_cdata_size( &lzip_index ); @@ -81,6 +84,8 @@ int list_files( const char * const filenames[], const int num_filenames, if( verbosity >= 1 ) fputs( " dict memb trail ", stdout ); fputs( " uncompressed compressed saved name\n", stdout ); } + if( multi_empty ) + { fflush( stdout ); show_file_error( input_filename, empty_msg, 0 ); } if( verbosity >= 1 ) printf( "%s %5ld %6lld ", format_ds( lzip_index.dictionary_size ), members, Li_file_size( &lzip_index ) - cdata_size ); @@ -92,8 +97,8 @@ int list_files( const char * const filenames[], const int num_filenames, fputs( " member data_pos data_size member_pos member_size\n", stdout ); for( i = 0; i < members; ++i ) { - 
const struct Block * db = Li_dblock( &lzip_index, i ); - const struct Block * mb = Li_mblock( &lzip_index, i ); + const Block * db = Li_dblock( &lzip_index, i ); + const Block * mb = Li_mblock( &lzip_index, i ); printf( "%6ld %14llu %14llu %14llu %14llu\n", i + 1, db->pos, db->size, mb->pos, mb->size ); } @@ -101,12 +106,16 @@ int list_files( const char * const filenames[], const int num_filenames, } fflush( stdout ); Li_free( &lzip_index ); + if( ferror( stdout ) ) break; } - if( verbosity >= 0 && files > 1 ) + if( verbosity >= 0 && files > 1 && !ferror( stdout ) ) { if( verbosity >= 1 ) fputs( " ", stdout ); list_line( total_uncomp, total_comp, "(totals)" ); fflush( stdout ); } + if( verbosity >= 0 && ( ferror( stdout ) || fclose( stdout ) != 0 ) ) + { show_file_error( "(stdout)", write_error_msg, errno ); + set_retval( &retval, 1 ); } return retval; } @@ -92,16 +92,16 @@ static inline void Bm_init( Bit_model * const probability ) static inline void Bm_array_init( Bit_model bm[], const int size ) { int i; for( i = 0; i < size; ++i ) Bm_init( &bm[i] ); } -struct Len_model +typedef struct Len_model { Bit_model choice1; Bit_model choice2; Bit_model bm_low[pos_states][len_low_symbols]; Bit_model bm_mid[pos_states][len_mid_symbols]; Bit_model bm_high[len_high_symbols]; - }; + } Len_model; -static inline void Lm_init( struct Len_model * const lm ) +static inline void Lm_init( Len_model * const lm ) { Bm_init( &lm->choice1 ); Bm_init( &lm->choice2 ); @@ -276,17 +276,14 @@ static inline bool Lt_check_consistency( const Lzip_trailer data ) } -struct Cl_options /* command-line options */ +typedef struct Cl_options /* command-line options */ { - bool ignore_empty; - bool ignore_marking; bool ignore_trailing; bool loose_trailing; - }; + } Cl_options; -static inline void Cl_options_init( struct Cl_options * cl_opts ) - { cl_opts->ignore_empty = true; cl_opts->ignore_marking = true; - cl_opts->ignore_trailing = true; cl_opts->loose_trailing = false; } +static inline void 
Cl_options_init( Cl_options * cl_opts ) + { cl_opts->ignore_trailing = true; cl_opts->loose_trailing = false; } static inline void set_retval( int * retval, const int new_val ) @@ -296,9 +293,10 @@ static const char * const bad_magic_msg = "Bad magic number (file not in lzip fo static const char * const bad_dict_msg = "Invalid dictionary size in member header."; static const char * const corrupt_mm_msg = "Corrupt header in multimember file."; static const char * const empty_msg = "Empty member not allowed."; -static const char * const marking_msg = "Marking data not allowed."; -static const char * const trailing_msg = "Trailing data not allowed."; static const char * const mem_msg = "Not enough memory."; +static const char * const nonzero_msg = "Nonzero first LZMA byte."; +static const char * const trailing_msg = "Trailing data not allowed."; +static const char * const write_error_msg = "Write error"; /* defined in decoder.c */ int readblock( const int fd, uint8_t * const buf, const int size ); @@ -306,14 +304,16 @@ int writeblock( const int fd, const uint8_t * const buf, const int size ); /* defined in list.c */ int list_files( const char * const filenames[], const int num_filenames, - const struct Cl_options * const cl_opts ); + const Cl_options * const cl_opts ); /* defined in main.c */ struct stat; -struct Pretty_print; +typedef struct Matchfinder_base Matchfinder_base; +typedef struct Pretty_print Pretty_print; +typedef struct Range_decoder Range_decoder; extern int verbosity; void * resize_buffer( void * buf, const unsigned min_size ); -void Pp_show_msg( struct Pretty_print * const pp, const char * const msg ); +void Pp_show_msg( Pretty_print * const pp, const char * const msg ); const char * bad_version( const unsigned version ); const char * format_ds( const unsigned dictionary_size ); void show_header( const unsigned dictionary_size ); @@ -324,13 +324,11 @@ void show_error( const char * const msg, const int errcode, const bool help ); void show_file_error( 
const char * const filename, const char * const msg, const int errcode ); void internal_error( const char * const msg ); -struct Matchfinder_base; void show_cprogress( const unsigned long long cfile_size, const unsigned long long partial_size, - const struct Matchfinder_base * const m, - struct Pretty_print * const p ); -struct Range_decoder; + const Matchfinder_base * const m, + Pretty_print * const p ); void show_dprogress( const unsigned long long cfile_size, const unsigned long long partial_size, - const struct Range_decoder * const d, - struct Pretty_print * const p ); + const Range_decoder * const d, + Pretty_print * const p ); diff --git a/lzip_index.c b/lzip_index.c index b7d594c..cbdb0fc 100644 --- a/lzip_index.c +++ b/lzip_index.c @@ -38,7 +38,7 @@ static int seek_read( const int fd, uint8_t * const buf, const int size, } -static bool add_error( struct Lzip_index * const li, const char * const msg ) +static bool add_error( Lzip_index * const li, const char * const msg ) { const int len = strlen( msg ); void * tmp = resize_buffer( li->error, li->error_size + len + 1 ); @@ -50,16 +50,15 @@ static bool add_error( struct Lzip_index * const li, const char * const msg ) } -static bool push_back_member( struct Lzip_index * const li, - const long long dp, const long long ds, - const long long mp, const long long ms, - const unsigned dict_size ) +static bool push_back_member( Lzip_index * const li, const long long dp, + const long long ds, const long long mp, + const long long ms, const unsigned dict_size ) { - struct Member * p; + Member * p; void * tmp = resize_buffer( li->member_vector, ( li->members + 1 ) * sizeof li->member_vector[0] ); if( !tmp ) { add_error( li, mem_msg ); li->retval = 1; return false; } - li->member_vector = (struct Member *)tmp; + li->member_vector = (Member *)tmp; p = &(li->member_vector[li->members]); init_member( p, dp, ds, mp, ms, dict_size ); ++li->members; @@ -67,7 +66,7 @@ static bool push_back_member( struct Lzip_index * const li, 
} -static void Li_free_member_vector( struct Lzip_index * const li ) +static void Li_free_member_vector( Lzip_index * const li ) { if( li->member_vector ) { free( li->member_vector ); li->member_vector = 0; } @@ -75,9 +74,9 @@ static void Li_free_member_vector( struct Lzip_index * const li ) } -static void Li_reverse_member_vector( struct Lzip_index * const li ) +static void Li_reverse_member_vector( Lzip_index * const li ) { - struct Member tmp; + Member tmp; long i; for( i = 0; i < li->members / 2; ++i ) { @@ -88,8 +87,7 @@ static void Li_reverse_member_vector( struct Lzip_index * const li ) } -static bool Li_check_header( struct Lzip_index * const li, - const Lzip_header header ) +static bool Li_check_header( Lzip_index * const li, const Lzip_header header ) { if( !Lh_check_magic( header ) ) { add_error( li, bad_magic_msg ); li->retval = 2; return false; } @@ -101,15 +99,14 @@ static bool Li_check_header( struct Lzip_index * const li, return true; } -static void Li_set_errno_error( struct Lzip_index * const li, - const char * const msg ) +static void Li_set_errno_error( Lzip_index * const li, const char * const msg ) { add_error( li, msg ); add_error( li, strerror( errno ) ); li->retval = 1; } -static void Li_set_num_error( struct Lzip_index * const li, - const char * const msg, unsigned long long num ) +static void Li_set_num_error( Lzip_index * const li, const char * const msg, + unsigned long long num ) { char buf[80]; snprintf( buf, sizeof buf, "%s%llu", msg, num ); @@ -118,22 +115,19 @@ static void Li_set_num_error( struct Lzip_index * const li, } -static bool Li_read_header( struct Lzip_index * const li, const int fd, - Lzip_header header, const long long pos, const bool ignore_marking ) +static bool Li_read_header( Lzip_index * const li, const int fd, + Lzip_header header, const long long pos ) { if( seek_read( fd, header, Lh_size, pos ) != Lh_size ) { Li_set_errno_error( li, "Error reading member header: " ); return false; } - uint8_t byte; - if( 
!ignore_marking && readblock( fd, &byte, 1 ) == 1 && byte != 0 ) - { add_error( li, marking_msg ); li->retval = 2; return false; } return true; } /* If successful, push last member and set pos to member header. */ -static bool Li_skip_trailing_data( struct Lzip_index * const li, const int fd, +static bool Li_skip_trailing_data( Lzip_index * const li, const int fd, unsigned long long * const pos, - const struct Cl_options * const cl_opts ) + const Cl_options * const cl_opts ) { if( *pos < min_member_size ) return false; enum { block_size = 16384, @@ -162,8 +156,8 @@ static bool Li_skip_trailing_data( struct Lzip_index * const li, const int fd, if( member_size > ipos + i || !Lt_check_consistency( *trailer ) ) continue; Lzip_header header; - if( !Li_read_header( li, fd, header, ipos + i - member_size, - cl_opts->ignore_marking ) ) return false; + if( !Li_read_header( li, fd, header, ipos + i - member_size ) ) + return false; if( !Lh_check( header ) ) continue; const Lzip_header * header2 = (const Lzip_header *)( buffer + i ); const bool full_h2 = bsize - i >= Lh_size; @@ -178,15 +172,12 @@ static bool Li_skip_trailing_data( struct Lzip_index * const li, const int fd, { add_error( li, corrupt_mm_msg ); li->retval = 2; return false; } if( !cl_opts->ignore_trailing ) { add_error( li, trailing_msg ); li->retval = 2; return false; } - const unsigned long long data_size = Lt_get_data_size( *trailer ); - if( !cl_opts->ignore_empty && data_size == 0 ) - { add_error( li, empty_msg ); li->retval = 2; return false; } *pos = ipos + i - member_size; /* good member */ const unsigned dictionary_size = Lh_get_dictionary_size( header ); if( li->dictionary_size < dictionary_size ) li->dictionary_size = dictionary_size; - return push_back_member( li, 0, data_size, *pos, member_size, - dictionary_size ); + return push_back_member( li, 0, Lt_get_data_size( *trailer ), *pos, + member_size, dictionary_size ); } if( ipos == 0 ) { Li_set_num_error( li, "Bad trailer at pos ", *pos - Lt_size ); 
@@ -200,8 +191,8 @@ static bool Li_skip_trailing_data( struct Lzip_index * const li, const int fd, } -bool Li_init( struct Lzip_index * const li, const int infd, - const struct Cl_options * const cl_opts ) +bool Li_init( Lzip_index * const li, const int infd, + const Cl_options * const cl_opts ) { li->member_vector = 0; li->error = 0; @@ -212,6 +203,10 @@ bool Li_init( struct Lzip_index * const li, const int infd, li->dictionary_size = 0; if( li->insize < 0 ) { Li_set_errno_error( li, "Input file is not seekable: " ); return false; } + Lzip_header header; + if( li->insize >= Lh_size && + ( !Li_read_header( li, infd, header, 0 ) || + !Li_check_header( li, header ) ) ) return false; if( li->insize < min_member_size ) { add_error( li, "Input file is too short." ); li->retval = 2; return false; } @@ -219,10 +214,6 @@ bool Li_init( struct Lzip_index * const li, const int infd, { add_error( li, "Input file is too long (2^63 bytes or more)." ); li->retval = 2; return false; } - Lzip_header header; - if( !Li_read_header( li, infd, header, 0, cl_opts->ignore_marking ) || - !Li_check_header( li, header ) ) return false; - unsigned long long pos = li->insize; /* always points to a header or to EOF */ while( pos >= min_member_size ) { @@ -237,8 +228,7 @@ bool Li_init( struct Lzip_index * const li, const int infd, return false; } Li_set_num_error( li, "Bad trailer at pos ", pos - Lt_size ); break; } - if( !Li_read_header( li, infd, header, pos - member_size, - cl_opts->ignore_marking ) ) break; + if( !Li_read_header( li, infd, header, pos - member_size ) ) break; if( !Lh_check( header ) ) /* bad header */ { if( li->members <= 0 ) @@ -246,15 +236,12 @@ bool Li_init( struct Lzip_index * const li, const int infd, return false; } Li_set_num_error( li, "Bad header at pos ", pos - member_size ); break; } - const unsigned long long data_size = Lt_get_data_size( trailer ); - if( !cl_opts->ignore_empty && data_size == 0 ) - { add_error( li, empty_msg ); li->retval = 2; break; } pos -= 
member_size; /* good member */ const unsigned dictionary_size = Lh_get_dictionary_size( header ); if( li->dictionary_size < dictionary_size ) li->dictionary_size = dictionary_size; - if( !push_back_member( li, 0, data_size, pos, member_size, - dictionary_size ) ) return false; + if( !push_back_member( li, 0, Lt_get_data_size( trailer ), pos, + member_size, dictionary_size ) ) return false; } if( pos != 0 || li->members <= 0 || li->retval != 0 ) { @@ -264,8 +251,7 @@ bool Li_init( struct Lzip_index * const li, const int infd, return false; } Li_reverse_member_vector( li ); - long i; - for( i = 0; ; ++i ) + long i; for( i = 0; ; ++i ) { const long long end = block_end( li->member_vector[i].dblock ); if( end < 0 || end > INT64_MAX ) @@ -281,7 +267,7 @@ bool Li_init( struct Lzip_index * const li, const int infd, } -void Li_free( struct Lzip_index * const li ) +void Li_free( Lzip_index * const li ) { Li_free_member_vector( li ); if( li->error ) { free( li->error ); li->error = 0; } diff --git a/lzip_index.h b/lzip_index.h index e273eaf..480356f 100644 --- a/lzip_index.h +++ b/lzip_index.h @@ -20,72 +20,80 @@ #endif -struct Block +typedef struct Block { long long pos, size; /* pos >= 0, size >= 0, pos + size <= INT64_MAX */ - }; + } Block; -static inline void init_block( struct Block * const b, +static inline void init_block( Block * const b, const long long p, const long long s ) { b->pos = p; b->size = s; } -static inline long long block_end( const struct Block b ) - { return b.pos + b.size; } +static inline long long block_end( const Block b ) { return b.pos + b.size; } -struct Member +typedef struct Member { - struct Block dblock, mblock; /* data block, member block */ + Block dblock, mblock; /* data block, member block */ unsigned dictionary_size; - }; + } Member; -static inline void init_member( struct Member * const m, - const long long dpos, const long long dsize, - const long long mpos, const long long msize, - const unsigned dict_size ) - { init_block( 
&m->dblock, dpos, dsize ); init_block( &m->mblock, mpos, msize ); - m->dictionary_size = dict_size; } +static inline void init_member( Member * const m, const long long dpos, + const long long dsize, const long long mpos, + const long long msize, const unsigned dict_size ) + { init_block( &m->dblock, dpos, dsize ); + init_block( &m->mblock, mpos, msize ); m->dictionary_size = dict_size; } -struct Lzip_index +typedef struct Lzip_index { - struct Member * member_vector; + Member * member_vector; char * error; long long insize; long members; int error_size; int retval; unsigned dictionary_size; /* largest dictionary size in the file */ - }; + } Lzip_index; -bool Li_init( struct Lzip_index * const li, const int infd, - const struct Cl_options * const cl_opts ); +bool Li_init( Lzip_index * const li, const int infd, + const Cl_options * const cl_opts ); -void Li_free( struct Lzip_index * const li ); +void Li_free( Lzip_index * const li ); -static inline long long Li_udata_size( const struct Lzip_index * const li ) +/* multimember file with empty member(s) */ +static inline bool Li_multi_empty( Lzip_index * const li ) + { + long i; + if( li->members > 1 ) + for( i = 0; i < li->members; ++i ) + if( li->member_vector[i].dblock.size == 0 ) return true; + return false; + } + +static inline long long Li_udata_size( const Lzip_index * const li ) { if( li->members <= 0 ) return 0; return block_end( li->member_vector[li->members-1].dblock ); } -static inline long long Li_cdata_size( const struct Lzip_index * const li ) +static inline long long Li_cdata_size( const Lzip_index * const li ) { if( li->members <= 0 ) return 0; return block_end( li->member_vector[li->members-1].mblock ); } /* total size including trailing data (if any) */ -static inline long long Li_file_size( const struct Lzip_index * const li ) +static inline long long Li_file_size( const Lzip_index * const li ) { if( li->insize >= 0 ) return li->insize; else return 0; } -static inline const struct Block * Li_dblock( 
const struct Lzip_index * const li, - const long i ) +static inline const Block * Li_dblock( const Lzip_index * const li, + const long i ) { return &li->member_vector[i].dblock; } -static inline const struct Block * Li_mblock( const struct Lzip_index * const li, - const long i ) +static inline const Block * Li_mblock( const Lzip_index * const li, + const long i ) { return &li->member_vector[i].mblock; } -static inline unsigned Li_dictionary_size( const struct Lzip_index * const li, +static inline unsigned Li_dictionary_size( const Lzip_index * const li, const long i ) { return li->member_vector[i].dictionary_size; } @@ -26,7 +26,7 @@ #include <ctype.h> #include <errno.h> #include <fcntl.h> -#include <limits.h> /* SSIZE_MAX */ +#include <limits.h> /* CHAR_BIT, SSIZE_MAX */ #include <signal.h> #include <stdbool.h> #include <stdint.h> /* SIZE_MAX */ @@ -39,8 +39,10 @@ #if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__ #include <io.h> #if defined __MSVCRT__ +#include <direct.h> #define fchmod(x,y) 0 #define fchown(x,y,z) 0 +#define mkdir(name,mode) _mkdir(name) #define strtoull strtoul #define SIGHUP SIGTERM #define S_ISSOCK(x) 0 @@ -88,13 +90,13 @@ static const struct { const char * from; const char * to; } known_extensions[] = { ".tlz", ".tar" }, { 0, 0 } }; -struct Lzma_options +typedef struct Lzma_options { int dictionary_size; /* 4 KiB .. 512 MiB */ int match_len_limit; /* 5 .. 273 */ - }; + } Lzma_options; -enum Mode { m_compress, m_decompress, m_list, m_test }; +typedef enum Mode { m_compress, m_decompress, m_list, m_test } Mode; /* Variables used in signal handler context. They are not declared volatile because the handler never returns. */ @@ -105,26 +107,25 @@ static bool delete_output_on_interrupt = false; static void show_help( void ) { - printf( "Clzip is a C language version of lzip, compatible with lzip 1.4 or newer. 
As\n" - "clzip is written in C, it may be easier to integrate in applications like\n" - "package managers, embedded devices, or systems lacking a C++ compiler.\n" + printf( "Clzip is a C language version of lzip intended for systems lacking a C++\n" + "compiler.\n" "\nLzip is a lossless data compressor with a user interface similar to the one\n" - "of gzip or bzip2. Lzip uses a simplified form of the 'Lempel-Ziv-Markov\n" - "chain-Algorithm' (LZMA) stream format to maximize interoperability. The\n" - "maximum dictionary size is 512 MiB so that any lzip file can be decompressed\n" - "on 32-bit machines. Lzip provides accurate and robust 3-factor integrity\n" - "checking. Lzip can compress about as fast as gzip (lzip -0) or compress most\n" - "files more than bzip2 (lzip -9). Decompression speed is intermediate between\n" - "gzip and bzip2. Lzip is better than gzip and bzip2 from a data recovery\n" - "perspective. Lzip has been designed, written, and tested with great care to\n" - "replace gzip and bzip2 as the standard general-purpose compressed format for\n" - "Unix-like systems.\n" + "of gzip or bzip2. Lzip uses a simplified form of LZMA (Lempel-Ziv-Markov\n" + "chain-Algorithm) designed to achieve complete interoperability between\n" + "implementations. The maximum dictionary size is 512 MiB so that any lzip\n" + "file can be decompressed on 32-bit machines. Lzip provides accurate and\n" + "robust 3-factor integrity checking. 'lzip -0' compresses about as fast as\n" + "gzip, while 'lzip -9' compresses most files more than bzip2. Decompression\n" + "speed is intermediate between gzip and bzip2. Lzip provides better data\n" + "recovery capabilities than gzip and bzip2. 
Lzip has been designed, written,\n" + "and tested with great care to replace gzip and bzip2 as general-purpose\n" + "compressed format for Unix-like systems.\n" "\nUsage: %s [options] [files]\n", invocation_name ); printf( "\nOptions:\n" " -h, --help display this help and exit\n" " -V, --version output version information and exit\n" " -a, --trailing-error exit with error status if trailing data\n" - " -b, --member-size=<bytes> set member size limit in bytes\n" + " -b, --member-size=<bytes> set member size limit of multimember files\n" " -c, --stdout write to standard output, keep input files\n" " -d, --decompress decompress, test compressed file integrity\n" " -f, --force overwrite existing output files\n" @@ -141,8 +142,6 @@ static void show_help( void ) " -0 .. -9 set compression level [default 6]\n" " --fast alias for -0\n" " --best alias for -9\n" - " --empty-error exit with error status if empty member in file\n" - " --marking-error exit with error status if 1st LZMA byte not 0\n" " --loose-trailing allow trailing data seeming corrupt header\n" "\nIf no file names are given, or if a file is '-', clzip compresses or\n" "decompresses from standard input to standard output.\n" @@ -199,7 +198,7 @@ struct Pretty_print bool first_post; }; -static void Pp_init( struct Pretty_print * const pp, +static void Pp_init( Pretty_print * const pp, const char * const filenames[], const int num_filenames ) { pp->name = 0; @@ -220,8 +219,10 @@ static void Pp_init( struct Pretty_print * const pp, if( pp->longest_name == 0 ) pp->longest_name = stdin_name_len; } -static void Pp_set_name( struct Pretty_print * const pp, - const char * const filename ) +void Pp_free( Pretty_print * const pp ) + { if( pp->padded_name ) { free( pp->padded_name ); pp->padded_name = 0; } } + +static void Pp_set_name( Pretty_print * const pp, const char * const filename ) { unsigned name_len, padded_name_len, i = 0; @@ -239,10 +240,10 @@ static void Pp_set_name( struct Pretty_print * const pp, 
pp->first_post = true; } -static void Pp_reset( struct Pretty_print * const pp ) +static void Pp_reset( Pretty_print * const pp ) { if( pp->name && pp->name[0] ) pp->first_post = true; } -void Pp_show_msg( struct Pretty_print * const pp, const char * const msg ) +void Pp_show_msg( Pretty_print * const pp, const char * const msg ) { if( verbosity < 0 ) return; if( pp->first_post ) @@ -272,7 +273,7 @@ const char * format_ds( const unsigned dictionary_size ) const char * p = ""; const char * np = " "; unsigned num = dictionary_size; - bool exact = ( num % factor == 0 ); + bool exact = num % factor == 0; int i; for( i = 0; i < n && ( num > 9999 || ( exact && num >= factor ) ); ++i ) { num /= factor; if( num % factor != 0 ) exact = false; @@ -288,7 +289,7 @@ void show_header( const unsigned dictionary_size ) } -/* separate numbers of 5 or more digits in groups of 3 digits using '_' */ +/* separate numbers of 6 or more digits in groups of 3 digits using '_' */ static const char * format_num3( unsigned long long num ) { enum { buffers = 8, bufsize = 4 * sizeof num, n = 10 }; @@ -300,7 +301,7 @@ static const char * format_num3( unsigned long long num ) char * const buf = buffer[current++]; current %= buffers; char * p = buf + bufsize - 1; /* fill the buffer backwards */ *p = 0; /* terminator */ - if( num > 1024 ) + if( num > 9999 ) { char prefix = 0; /* try binary first, then si */ for( i = 0; i < n && num != 0 && num % 1024 == 0; ++i ) @@ -311,7 +312,7 @@ static const char * format_num3( unsigned long long num ) { num /= 1000; prefix = si_prefix[i]; } if( prefix ) *(--p) = prefix; } - const bool split = num >= 10000; + const bool split = num >= 100000; for( i = 0; ; ) { @@ -346,7 +347,7 @@ static unsigned long long getnum( const char * const arg, if( !errno && tail[0] ) { - const unsigned factor = ( tail[1] == 'i' ) ? 1024 : 1000; + const unsigned factor = (tail[1] == 'i') ? 
1024 : 1000; int exponent = 0; /* 0 = bad multiplier */ int i; switch( tail[0] ) @@ -396,7 +397,7 @@ static int get_dict_size( const char * const arg, const char * const option_name } -static void set_mode( enum Mode * const program_modep, const enum Mode new_mode ) +static void set_mode( Mode * const program_modep, const Mode new_mode ) { if( *program_modep != m_compress && *program_modep != new_mode ) { @@ -473,9 +474,9 @@ int open_instream( const char * const name, struct stat * const in_statsp, { const int i = fstat( infd, in_statsp ); const mode_t mode = in_statsp->st_mode; - const bool can_read = ( i == 0 && !reg_only && - ( S_ISBLK( mode ) || S_ISCHR( mode ) || - S_ISFIFO( mode ) || S_ISSOCK( mode ) ) ); + const bool can_read = i == 0 && !reg_only && + ( S_ISBLK( mode ) || S_ISCHR( mode ) || + S_ISFIFO( mode ) || S_ISSOCK( mode ) ); if( i != 0 || ( !S_ISREG( mode ) && ( !can_read || one_to_one ) ) ) { if( verbosity >= 0 ) @@ -491,13 +492,13 @@ int open_instream( const char * const name, struct stat * const in_statsp, static int open_instream2( const char * const name, struct stat * const in_statsp, - const enum Mode program_mode, const int eindex, + const Mode program_mode, const int eindex, const bool one_to_one, const bool recompress ) { if( program_mode == m_compress && !recompress && eindex >= 0 ) { if( verbosity >= 0 ) - fprintf( stderr, "%s: %s: Input file already has '%s' suffix.\n", + fprintf( stderr, "%s: %s: Input file already has '%s' suffix, ignored.\n", program_name, name, known_extensions[eindex].from ); return -1; } @@ -519,7 +520,7 @@ static bool make_dirs( const char * const name ) while( i < dirsize && name[i] != '/' ) ++i; if( first < i ) { - char partial[i+1]; memcpy( partial, name, i ); partial[i] = 0; + char partial[i+1]; memcpy( partial, name, i ); partial[i] = 0; /* vla */ const mode_t mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; struct stat st; if( stat( partial, &st ) == 0 ) @@ -590,7 +591,7 @@ static void signal_handler( 
int sig ) static bool check_tty_in( const char * const input_filename, const int infd, - const enum Mode program_mode, int * const retval ) + const Mode program_mode, int * const retval ) { if( ( program_mode == m_decompress || program_mode == m_test ) && isatty( infd ) ) /* for example /dev/tty */ @@ -602,7 +603,7 @@ static bool check_tty_in( const char * const input_filename, const int infd, return true; } -static bool check_tty_out( const enum Mode program_mode ) +static bool check_tty_out( const Mode program_mode ) { if( program_mode == m_compress && isatty( outfd ) ) { show_file_error( output_filename[0] ? @@ -661,30 +662,26 @@ static bool next_filename( void ) } -struct Poly_encoder - { - struct LZ_encoder_base * eb; - struct LZ_encoder * e; - struct FLZ_encoder * fe; - }; - - static int compress( const unsigned long long cfile_size, const unsigned long long member_size, const unsigned long long volume_size, const int infd, - const struct Lzma_options * const encoder_options, - struct Pretty_print * const pp, + const Lzma_options * const encoder_options, + Pretty_print * const pp, const struct stat * const in_statsp, const bool zero ) { - int retval = 0; - struct Poly_encoder encoder = { 0, 0, 0 }; /* polymorphic encoder */ + struct + { + LZ_encoder_base * eb; + LZ_encoder * e; + FLZ_encoder * fe; + } encoder = { 0, 0, 0 }; /* polymorphic encoder */ if( verbosity >= 1 ) Pp_show_msg( pp, 0 ); { bool error = false; if( zero ) { - encoder.fe = (struct FLZ_encoder *)malloc( sizeof *encoder.fe ); + encoder.fe = (FLZ_encoder *)malloc( sizeof *encoder.fe ); if( !encoder.fe || !FLZe_init( encoder.fe, infd, outfd ) ) error = true; else encoder.eb = &encoder.fe->eb; } @@ -694,7 +691,7 @@ static int compress( const unsigned long long cfile_size, if( Lh_set_dictionary_size( header, encoder_options->dictionary_size ) && encoder_options->match_len_limit >= min_match_len_limit && encoder_options->match_len_limit <= max_match_len ) - encoder.e = (struct LZ_encoder *)malloc( 
sizeof *encoder.e ); + encoder.e = (LZ_encoder *)malloc( sizeof *encoder.e ); else internal_error( "invalid argument to encoder." ); if( !encoder.e || !LZe_init( encoder.e, Lh_get_dictionary_size( header ), encoder_options->match_len_limit, infd, outfd ) ) @@ -709,9 +706,10 @@ static int compress( const unsigned long long cfile_size, } unsigned long long in_size = 0, out_size = 0, partial_volume_size = 0; + int retval = 0; while( true ) /* encode one member per iteration */ { - const unsigned long long size = ( volume_size > 0 ) ? + const unsigned long long size = (volume_size > 0) ? min( member_size, volume_size - partial_volume_size ) : member_size; show_cprogress( cfile_size, in_size, &encoder.eb->mb, pp ); /* init */ if( ( zero && !FLZe_encode_member( encoder.fe, size ) ) || @@ -765,7 +763,7 @@ static unsigned char xdigit( const unsigned value ) /* hex digit for 'value' */ static bool show_trailing_data( const uint8_t * const data, const int size, - struct Pretty_print * const pp, const bool all, + Pretty_print * const pp, const bool all, const int ignore_trailing ) /* -1 = show */ { if( verbosity >= 4 || ignore_trailing <= 0 ) @@ -793,15 +791,16 @@ static bool show_trailing_data( const uint8_t * const data, const int size, static int decompress( const unsigned long long cfile_size, const int infd, - const struct Cl_options * const cl_opts, - struct Pretty_print * const pp, const bool testing ) + const Cl_options * const cl_opts, Pretty_print * const pp, + const bool from_stdin, const bool testing ) { unsigned long long partial_file_pos = 0; - struct Range_decoder rdec; + Range_decoder rdec; int retval = 0; bool first_member; if( !Rd_init( &rdec, infd ) ) { show_error( mem_msg, 0, false ); cleanup_and_fail( 1 ); } + bool empty = false, multi = false; for( first_member = true; ; first_member = false ) { @@ -841,11 +840,11 @@ static int decompress( const unsigned long long cfile_size, const int infd, if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) 
Pp_show_msg( pp, 0 ); - struct LZ_decoder decoder; + LZ_decoder decoder; if( !LZd_init( &decoder, &rdec, dictionary_size, outfd ) ) { Pp_show_msg( pp, mem_msg ); retval = 1; break; } show_dprogress( cfile_size, partial_file_pos, &rdec, pp ); /* init */ - const int result = LZd_decode_member( &decoder, cl_opts, pp ); + const int result = LZd_decode_member( &decoder, pp ); partial_file_pos += Rd_member_position( &rdec ); LZd_free( &decoder ); if( result != 0 ) @@ -857,16 +856,19 @@ static int decompress( const unsigned long long cfile_size, const int infd, "File ends unexpectedly" : "Decoder error", partial_file_pos ); } - else if( result == 5 ) Pp_show_msg( pp, empty_msg ); - else if( result == 6 ) Pp_show_msg( pp, marking_msg ); + else if( result == 5 ) Pp_show_msg( pp, nonzero_msg ); retval = 2; break; } + if( !from_stdin ) { multi = !first_member; + if( LZd_data_position( &decoder ) == 0 ) empty = true; } if( verbosity >= 2 ) { fputs( testing ? "ok\n" : "done\n", stderr ); Pp_reset( pp ); } } Rd_free( &rdec ); if( verbosity == 1 && retval == 0 ) fputs( testing ? 
"ok\n" : "done\n", stderr ); + if( empty && multi && retval == 0 ) + { show_file_error( pp->name, empty_msg, 0 ); retval = 2; } return retval; } @@ -904,13 +906,13 @@ void internal_error( const char * const msg ) void show_cprogress( const unsigned long long cfile_size, const unsigned long long partial_size, - const struct Matchfinder_base * const m, - struct Pretty_print * const p ) + const Matchfinder_base * const m, + Pretty_print * const p ) { static unsigned long long csize = 0; /* file_size / 100 */ static unsigned long long psize = 0; - static const struct Matchfinder_base * mb = 0; - static struct Pretty_print * pp = 0; + static const Matchfinder_base * mb = 0; + static Pretty_print * pp = 0; static bool enabled = true; if( !enabled ) return; @@ -933,13 +935,13 @@ void show_cprogress( const unsigned long long cfile_size, void show_dprogress( const unsigned long long cfile_size, const unsigned long long partial_size, - const struct Range_decoder * const d, - struct Pretty_print * const p ) + const Range_decoder * const d, + Pretty_print * const p ) { static unsigned long long csize = 0; /* file_size / 100 */ static unsigned long long psize = 0; - static const struct Range_decoder * rdec = 0; - static struct Pretty_print * pp = 0; + static const Range_decoder * rdec = 0; + static Pretty_print * pp = 0; static int counter = 0; static bool enabled = true; @@ -966,7 +968,7 @@ int main( const int argc, const char * const argv[] ) { /* Mapping from gzip/bzip2 style 0..9 compression levels to the corresponding LZMA compression parameters. 
*/ - const struct Lzma_options option_mapping[] = + const Lzma_options option_mapping[] = { { 1 << 16, 16 }, /* -0 */ { 1 << 20, 5 }, /* -1 */ @@ -978,15 +980,14 @@ int main( const int argc, const char * const argv[] ) { 1 << 24, 68 }, /* -7 */ { 3 << 23, 132 }, /* -8 */ { 1 << 25, 273 } }; /* -9 */ - struct Lzma_options encoder_options = option_mapping[6]; /* default = "-6" */ + Lzma_options encoder_options = option_mapping[6]; /* default = "-6" */ const unsigned long long max_member_size = 0x0008000000000000ULL; /* 2 PiB */ const unsigned long long max_volume_size = 0x4000000000000000ULL; /* 4 EiB */ unsigned long long member_size = max_member_size; unsigned long long volume_size = 0; const char * default_output_filename = ""; - enum Mode program_mode = m_compress; - int i; - struct Cl_options cl_opts; /* command-line options */ + Mode program_mode = m_compress; + Cl_options cl_opts; /* command-line options */ Cl_options_init( &cl_opts ); bool force = false; bool keep_input_files = false; @@ -995,46 +996,44 @@ int main( const int argc, const char * const argv[] ) bool zero = false; if( argc > 0 ) invocation_name = argv[0]; - enum { opt_eer = 256, opt_lt, opt_mer }; - const struct ap_Option options[] = + enum { opt_lt = 256 }; + const ap_Option options[] = { - { '0', "fast", ap_no }, - { '1', 0, ap_no }, - { '2', 0, ap_no }, - { '3', 0, ap_no }, - { '4', 0, ap_no }, - { '5', 0, ap_no }, - { '6', 0, ap_no }, - { '7', 0, ap_no }, - { '8', 0, ap_no }, - { '9', "best", ap_no }, - { 'a', "trailing-error", ap_no }, - { 'b', "member-size", ap_yes }, - { 'c', "stdout", ap_no }, - { 'd', "decompress", ap_no }, - { 'f', "force", ap_no }, - { 'F', "recompress", ap_no }, - { 'h', "help", ap_no }, - { 'k', "keep", ap_no }, - { 'l', "list", ap_no }, - { 'm', "match-length", ap_yes }, - { 'n', "threads", ap_yes }, - { 'o', "output", ap_yes }, - { 'q', "quiet", ap_no }, - { 's', "dictionary-size", ap_yes }, - { 'S', "volume-size", ap_yes }, - { 't', "test", ap_no }, - { 'v', 
"verbose", ap_no }, - { 'V', "version", ap_no }, - { opt_eer, "empty-error", ap_no }, - { opt_lt, "loose-trailing", ap_no }, - { opt_mer, "marking-error", ap_no }, - { 0, 0, ap_no } }; + { '0', "fast", ap_no }, + { '1', 0, ap_no }, + { '2', 0, ap_no }, + { '3', 0, ap_no }, + { '4', 0, ap_no }, + { '5', 0, ap_no }, + { '6', 0, ap_no }, + { '7', 0, ap_no }, + { '8', 0, ap_no }, + { '9', "best", ap_no }, + { 'a', "trailing-error", ap_no }, + { 'b', "member-size", ap_yes }, + { 'c', "stdout", ap_no }, + { 'd', "decompress", ap_no }, + { 'f', "force", ap_no }, + { 'F', "recompress", ap_no }, + { 'h', "help", ap_no }, + { 'k', "keep", ap_no }, + { 'l', "list", ap_no }, + { 'm', "match-length", ap_yes }, + { 'n', "threads", ap_yes }, + { 'o', "output", ap_yes }, + { 'q', "quiet", ap_no }, + { 's', "dictionary-size", ap_yes }, + { 'S', "volume-size", ap_yes }, + { 't', "test", ap_no }, + { 'v', "verbose", ap_no }, + { 'V', "version", ap_no }, + { opt_lt, "loose-trailing", ap_no }, + { 0, 0, ap_no } }; CRC32_init(); /* static because valgrind complains and memory management in C sucks */ - static struct Arg_parser parser; + static Arg_parser parser; if( !ap_init( &parser, argc, argv, options, 0 ) ) { show_error( mem_msg, 0, false ); return 1; } if( ap_error( &parser ) ) /* bad option */ @@ -1049,9 +1048,8 @@ int main( const int argc, const char * const argv[] ) const char * const arg = ap_argument( &parser, argind ); switch( code ) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - zero = ( code == '0' ); + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': zero = code == '0'; encoder_options = option_mapping[code-'0']; break; case 'a': cl_opts.ignore_trailing = false; break; case 'b': member_size = getnum( arg, pn, 100000, max_member_size ); break; @@ -1065,7 +1063,7 @@ int main( const int argc, const char * const argv[] ) case 'm': encoder_options.match_len_limit = 
getnum( arg, pn, min_match_len_limit, max_match_len ); zero = false; break; - case 'n': break; + case 'n': break; /* ignored */ case 'o': if( strcmp( arg, "-" ) == 0 ) to_stdout = true; else { default_output_filename = arg; } break; case 'q': verbosity = -1; break; @@ -1075,9 +1073,7 @@ int main( const int argc, const char * const argv[] ) case 't': set_mode( &program_mode, m_test ); break; case 'v': if( verbosity < 4 ) ++verbosity; break; case 'V': show_version(); return 0; - case opt_eer: cl_opts.ignore_empty = false; break; - case opt_lt: cl_opts.loose_trailing = true; break; - case opt_mer: cl_opts.ignore_marking = false; break; + case opt_lt: cl_opts.loose_trailing = true; break; default: internal_error( "uncaught option." ); } } /* end process options */ @@ -1092,6 +1088,7 @@ int main( const int argc, const char * const argv[] ) filenames = resize_buffer( filenames, num_filenames * sizeof filenames[0] ); filenames[0] = "-"; + int i; bool filenames_given = false; for( i = 0; argind + i < ap_arguments( &parser ); ++i ) { @@ -1126,7 +1123,7 @@ int main( const int argc, const char * const argv[] ) if( !to_stdout && program_mode != m_test && ( filenames_given || to_file ) ) set_signals( signal_handler ); - static struct Pretty_print pp; + static Pretty_print pp; Pp_init( &pp, filenames, num_filenames ); int failed_tests = 0; @@ -1138,9 +1135,10 @@ int main( const int argc, const char * const argv[] ) { const char * input_filename = ""; int infd; + const bool from_stdin = strcmp( filenames[i], "-" ) == 0; Pp_set_name( &pp, filenames[i] ); - if( strcmp( filenames[i], "-" ) == 0 ) + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; infd = STDIN_FILENO; @@ -1189,7 +1187,8 @@ int main( const int argc, const char * const argv[] ) tmp = compress( cfile_size, member_size, volume_size, infd, &encoder_options, &pp, in_statsp, zero ); else - tmp = decompress( cfile_size, infd, &cl_opts, &pp, program_mode == m_test ); + tmp = decompress( cfile_size, infd, 
&cl_opts, &pp, from_stdin, + program_mode == m_test ); if( close( infd ) != 0 ) { show_file_error( pp.name, "Error closing input file", errno ); set_retval( &tmp, 1 ); } @@ -1217,6 +1216,7 @@ int main( const int argc, const char * const argv[] ) program_name, failed_tests, ( failed_tests == 1 ) ? "file" : "files" ); free( output_filename ); + Pp_free( &pp ); free( filenames ); ap_free( &parser ); return retval; diff --git a/testsuite/check.sh b/testsuite/check.sh index 100deae..394bfcb 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -28,12 +28,10 @@ if [ -d tmp ] ; then rm -rf tmp ; fi mkdir tmp cd "${objdir}"/tmp || framework_failure -cat "${testdir}"/test.txt > in || framework_failure +cp "${testdir}"/test.txt in || framework_failure in_lz="${testdir}"/test.txt.lz -in_em="${testdir}"/test_em.txt.lz fox_lz="${testdir}"/fox.lz -fox6_lz="${testdir}"/fox6.lz -f6mk_lz="${testdir}"/fox6_mark.lz +fnz_lz="${testdir}"/fox_nz.lz fail=0 test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } @@ -70,7 +68,7 @@ done "${LZIP}" -q -o out.lz nx_file [ $? = 1 ] || test_failed $LINENO [ ! -e out.lz ] || test_failed $LINENO -"${LZIP}" -qf -S100k -o out in in +"${LZIP}" -qf -S100k -o out in in # only one file with -o and -S [ $? = 1 ] || test_failed $LINENO { [ ! -e out ] && [ ! -e out.lz ] ; } || test_failed $LINENO # these are for code coverage @@ -106,37 +104,25 @@ printf "LZIP\001+.............................." | "${LZIP}" -t 2> /dev/null printf "\ntesting decompression..." 
-for i in "${in_lz}" "${in_em}" ; do - "${LZIP}" -lq "$i" || test_failed $LINENO "$i" - "${LZIP}" -t "$i" || test_failed $LINENO "$i" - "${LZIP}" -d "$i" -o out || test_failed $LINENO "$i" - cmp in out || test_failed $LINENO "$i" - "${LZIP}" -cd "$i" > out || test_failed $LINENO "$i" - cmp in out || test_failed $LINENO "$i" - "${LZIP}" -d "$i" -o - > out || test_failed $LINENO "$i" - cmp in out || test_failed $LINENO "$i" - "${LZIP}" -d < "$i" > out || test_failed $LINENO "$i" - cmp in out || test_failed $LINENO "$i" - rm -f out || framework_failure -done - -lines=`"${LZIP}" -tvv "${in_em}" 2>&1 | wc -l` || test_failed $LINENO -[ "${lines}" -eq 8 ] || test_failed $LINENO "${lines}" -"${LZIP}" -tq "${in_em}" --empty-error -[ $? = 2 ] || test_failed $LINENO - -lines=`"${LZIP}" -lvv "${in_em}" | wc -l` || test_failed $LINENO -[ "${lines}" -eq 11 ] || test_failed $LINENO "${lines}" -"${LZIP}" -lq "${in_em}" --empty-error -[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -l "${in_lz}" > /dev/null || test_failed $LINENO +"${LZIP}" -t "${in_lz}" || test_failed $LINENO +"${LZIP}" -d "${in_lz}" -o out || test_failed $LINENO +cmp in out || test_failed $LINENO +"${LZIP}" -cd "${in_lz}" > out || test_failed $LINENO +cmp in out || test_failed $LINENO +"${LZIP}" -d "${in_lz}" -o - > out || test_failed $LINENO +cmp in out || test_failed $LINENO +"${LZIP}" -d < "${in_lz}" > out || test_failed $LINENO +cmp in out || test_failed $LINENO +rm -f out || framework_failure -cat "${in_lz}" > out.lz || framework_failure +cp "${in_lz}" out.lz || framework_failure "${LZIP}" -dk out.lz || test_failed $LINENO cmp in out || test_failed $LINENO rm -f out || framework_failure "${LZIP}" -cd "${fox_lz}" > fox || test_failed $LINENO -cat fox > copy || framework_failure -cat "${in_lz}" > copy.lz || framework_failure +cp fox copy || framework_failure +cp "${in_lz}" copy.lz || framework_failure "${LZIP}" -d copy.lz out.lz 2> /dev/null # skip copy, decompress out [ $? = 1 ] || test_failed $LINENO [ ! 
-e out.lz ] || test_failed $LINENO @@ -147,7 +133,7 @@ cmp in out || test_failed $LINENO cmp in copy || test_failed $LINENO rm -f copy out || framework_failure -cat "${in_lz}" > out.lz || framework_failure +cp "${in_lz}" out.lz || framework_failure "${LZIP}" -d -S100k out.lz || test_failed $LINENO # ignore -S [ ! -e out.lz ] || test_failed $LINENO cmp in out || test_failed $LINENO @@ -155,7 +141,6 @@ cmp in out || test_failed $LINENO printf "to be overwritten" > out || framework_failure "${LZIP}" -df -o out < "${in_lz}" || test_failed $LINENO cmp in out || test_failed $LINENO -rm -f out || framework_failure "${LZIP}" -d -o ./- "${in_lz}" || test_failed $LINENO cmp in ./- || test_failed $LINENO rm -f ./- || framework_failure @@ -163,12 +148,12 @@ rm -f ./- || framework_failure cmp in ./- || test_failed $LINENO rm -f ./- || framework_failure -cat "${in_lz}" > anyothername || framework_failure +cp "${in_lz}" anyothername || framework_failure "${LZIP}" -dv - anyothername - < "${in_lz}" > out 2> /dev/null || test_failed $LINENO cmp in out || test_failed $LINENO cmp in anyothername.out || test_failed $LINENO -rm -f out anyothername.out || framework_failure +rm -f anyothername.out || framework_failure "${LZIP}" -lq in "${in_lz}" [ $? = 2 ] || test_failed $LINENO @@ -185,7 +170,7 @@ cat out in | cmp in - || test_failed $LINENO # out must be empty [ $? 
= 1 ] || test_failed $LINENO cmp in out || test_failed $LINENO rm -f out || framework_failure -cat "${in_lz}" > out.lz || framework_failure +cp "${in_lz}" out.lz || framework_failure for i in 1 2 3 4 5 6 7 ; do printf "g" >> out.lz || framework_failure "${LZIP}" -alvv out.lz "${in_lz}" > /dev/null 2>&1 @@ -206,7 +191,7 @@ cmp in out || test_failed $LINENO rm -f out || framework_failure cat in in > in2 || framework_failure -"${LZIP}" -lq "${in_lz}" "${in_lz}" || test_failed $LINENO +"${LZIP}" -l "${in_lz}" "${in_lz}" > /dev/null || test_failed $LINENO "${LZIP}" -t "${in_lz}" "${in_lz}" || test_failed $LINENO "${LZIP}" -cd "${in_lz}" "${in_lz}" -o out > out2 || test_failed $LINENO [ ! -e out ] || test_failed $LINENO # override -o @@ -217,6 +202,11 @@ cmp in2 out2 || test_failed $LINENO rm -f out2 || framework_failure cat "${in_lz}" "${in_lz}" > out2.lz || framework_failure +lines=`"${LZIP}" -tvv out2.lz 2>&1 | wc -l` || test_failed $LINENO +[ "${lines}" -eq 2 ] || test_failed $LINENO "${lines}" +lines=`"${LZIP}" -lvv out2.lz | wc -l` || test_failed $LINENO +[ "${lines}" -eq 5 ] || test_failed $LINENO "${lines}" + printf "\ngarbage" >> out2.lz || framework_failure "${LZIP}" -tvvvv out2.lz 2> /dev/null || test_failed $LINENO "${LZIP}" -alq out2.lz @@ -236,15 +226,6 @@ printf "to be overwritten" > out2 || framework_failure cmp in2 out2 || test_failed $LINENO rm -f out2 || framework_failure -"${LZIP}" -cd "${fox6_lz}" > out || test_failed $LINENO -"${LZIP}" -cd "${f6mk_lz}" > copy || test_failed $LINENO -cmp copy out || test_failed $LINENO -rm -f copy out || framework_failure -"${LZIP}" -lq "${f6mk_lz}" --marking-error -[ $? = 2 ] || test_failed $LINENO -"${LZIP}" -tq "${f6mk_lz}" --marking-error -[ $? = 2 ] || test_failed $LINENO - "${LZIP}" -d "${fox_lz}" -o a/b/c/fox || test_failed $LINENO cmp fox a/b/c/fox || test_failed $LINENO rm -rf a || framework_failure @@ -255,6 +236,21 @@ rm -rf a || framework_failure [ $? = 1 ] || test_failed $LINENO [ ! 
-e a ] || test_failed $LINENO +touch empty em || framework_failure +"${LZIP}" -0 em || test_failed $LINENO +"${LZIP}" -l em.lz > /dev/null || test_failed $LINENO +"${LZIP}" -dk em.lz || test_failed $LINENO +cmp empty em || test_failed $LINENO +cat em.lz em.lz | "${LZIP}" -t || test_failed $LINENO +cat em.lz em.lz | "${LZIP}" -d > em || test_failed $LINENO +cmp empty em || test_failed $LINENO +cat em.lz "${in_lz}" | "${LZIP}" -t || test_failed $LINENO +cat em.lz "${in_lz}" | "${LZIP}" -d > out || test_failed $LINENO +cmp in out || test_failed $LINENO +cat "${in_lz}" em.lz | "${LZIP}" -t || test_failed $LINENO +cat "${in_lz}" em.lz | "${LZIP}" -d > out || test_failed $LINENO +cmp in out || test_failed $LINENO + printf "\ntesting compression..." "${LZIP}" -c -0 in in in -S100k -o out3.lz > copy2.lz || test_failed $LINENO @@ -263,7 +259,7 @@ printf "\ntesting compression..." "${LZIP}" -d copy2.lz -o out2 || test_failed $LINENO [ -e copy2.lz ] || test_failed $LINENO cmp in2 out2 || test_failed $LINENO -rm -f in2 out2 copy2.lz || framework_failure +rm -f copy2.lz || framework_failure "${LZIP}" -cf "${in_lz}" > lzlz 2> /dev/null # /dev/null is a tty on OS/2 [ $? = 1 ] || test_failed $LINENO @@ -331,7 +327,7 @@ rm -f in8 || framework_failure "${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO "${LZIP}" -cd out00001.lz out00002.lz | cmp in8.lz - || test_failed $LINENO [ ! -e out00003.lz ] || test_failed $LINENO -rm -f out00001.lz || framework_failure +rm -f out00001.lz out00002.lz || framework_failure "${LZIP}" -1 -S100k -o a/b/c/out < in8.lz || test_failed $LINENO "${LZIP}" -t a/b/c/out00001.lz a/b/c/out00002.lz || test_failed $LINENO "${LZIP}" -cd a/b/c/out00001.lz a/b/c/out00002.lz | cmp in8.lz - || @@ -357,11 +353,44 @@ rm -rf a || framework_failure printf "\ntesting bad input..." 
+cat em.lz em.lz > ee.lz || framework_failure +"${LZIP}" -l < ee.lz > /dev/null || test_failed $LINENO +"${LZIP}" -t < ee.lz || test_failed $LINENO +"${LZIP}" -d < ee.lz > em || test_failed $LINENO +cmp empty em || test_failed $LINENO +"${LZIP}" -lq ee.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq ee.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -dq ee.lz +[ $? = 2 ] || test_failed $LINENO +[ ! -e ee ] || test_failed $LINENO +"${LZIP}" -cdq ee.lz > em +[ $? = 2 ] || test_failed $LINENO +cmp empty em || test_failed $LINENO +rm -f empty em || framework_failure +cat "${in_lz}" em.lz "${in_lz}" > inein.lz || framework_failure +"${LZIP}" -l < inein.lz > /dev/null || test_failed $LINENO +"${LZIP}" -t < inein.lz || test_failed $LINENO +"${LZIP}" -d < inein.lz > out2 || test_failed $LINENO +cmp in2 out2 || test_failed $LINENO +"${LZIP}" -lq inein.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -tq inein.lz +[ $? = 2 ] || test_failed $LINENO +"${LZIP}" -dq inein.lz +[ $? = 2 ] || test_failed $LINENO +[ ! -e inein ] || test_failed $LINENO +"${LZIP}" -cdq inein.lz > out2 +[ $? = 2 ] || test_failed $LINENO +cmp in2 out2 || test_failed $LINENO +rm -f in2 out2 inein.lz em.lz || framework_failure + headers='LZIp LZiP LZip LzIP LzIp LziP lZIP lZIp lZiP lzIP' -body='\001\014\000\203\377\373\377\377\300\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000$\000\000\000\000\000\000\000' -cat "${in_lz}" > int.lz || framework_failure +body='\001\014\000\000\101\376\367\377\377\340\000\200\000\215\357\002\322\001\000\000\000\000\000\000\000\045\000\000\000\000\000\000\000' +cp "${in_lz}" int.lz || framework_failure printf "LZIP${body}" >> int.lz || framework_failure -if "${LZIP}" -tq int.lz ; then +if "${LZIP}" -t int.lz ; then for header in ${headers} ; do printf "${header}${body}" > int.lz || framework_failure "${LZIP}" -lq int.lz # first member @@ -380,7 +409,7 @@ if "${LZIP}" -tq int.lz ; then [ $? 
= 2 ] || test_failed $LINENO ${header} "${LZIP}" -cdq --loose-trailing int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} - cat "${in_lz}" > int.lz || framework_failure + cp "${in_lz}" int.lz || framework_failure printf "${header}${body}" >> int.lz || framework_failure "${LZIP}" -lq int.lz # trailing data [ $? = 2 ] || test_failed $LINENO ${header} @@ -390,7 +419,7 @@ if "${LZIP}" -tq int.lz ; then [ $? = 2 ] || test_failed $LINENO ${header} "${LZIP}" -cdq int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing int.lz || + "${LZIP}" -l --loose-trailing int.lz > /dev/null || test_failed $LINENO ${header} "${LZIP}" -t --loose-trailing int.lz || test_failed $LINENO ${header} @@ -408,10 +437,14 @@ if "${LZIP}" -tq int.lz ; then [ $? = 2 ] || test_failed $LINENO ${header} done else - printf "\nwarning: skipping header test: 'printf' does not work on your system." + printf "warning: skipping header test: 'printf' does not work on your system." fi rm -f int.lz || framework_failure +"${LZIP}" -l "${fnz_lz}" > /dev/null || test_failed $LINENO +"${LZIP}" -tq "${fnz_lz}" +[ $? = 2 ] || test_failed $LINENO + for i in fox_v2.lz fox_s11.lz fox_de20.lz \ fox_bcrc.lz fox_crc0.lz fox_das46.lz fox_mes81.lz ; do "${LZIP}" -tq "${testdir}"/$i @@ -423,13 +456,13 @@ for i in fox_bcrc.lz fox_crc0.lz fox_das46.lz fox_mes81.lz ; do [ $? 
= 2 ] || test_failed $LINENO $i cmp fox out || test_failed $LINENO $i done -rm -f fox out || framework_failure +rm -f fox || framework_failure cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure -if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && - [ -e trunc.lz ] && cmp in2.lz trunc.lz > /dev/null 2>&1 ; then - for i in 6 20 14734 14753 14754 14755 14756 14757 14758 ; do +if dd if=in3.lz of=trunc.lz bs=14682 count=1 2> /dev/null && + [ -e trunc.lz ] && cmp in2.lz trunc.lz ; then + for i in 6 20 14664 14683 14684 14685 14686 14687 14688 ; do dd if=in3.lz of=trunc.lz bs=$i count=1 2> /dev/null "${LZIP}" -lq trunc.lz [ $? = 2 ] || test_failed $LINENO $i @@ -443,11 +476,11 @@ if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && [ $? = 2 ] || test_failed $LINENO $i done else - printf "\nwarning: skipping truncation test: 'dd' does not work on your system." + printf "warning: skipping truncation test: 'dd' does not work on your system." fi rm -f in2.lz in3.lz trunc.lz || framework_failure -cat "${in_lz}" > ingin.lz || framework_failure +cp "${in_lz}" ingin.lz || framework_failure printf "g" >> ingin.lz || framework_failure cat "${in_lz}" >> ingin.lz || framework_failure "${LZIP}" -lq ingin.lz @@ -456,17 +489,21 @@ cat "${in_lz}" >> ingin.lz || framework_failure [ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq < ingin.lz [ $? = 2 ] || test_failed $LINENO -"${LZIP}" -acdq ingin.lz > /dev/null +"${LZIP}" -acdq ingin.lz > out [ $? = 2 ] || test_failed $LINENO -"${LZIP}" -adq < ingin.lz > /dev/null +cmp in out || test_failed $LINENO +"${LZIP}" -adq < ingin.lz > out [ $? 
= 2 ] || test_failed $LINENO +cmp in out || test_failed $LINENO "${LZIP}" -t ingin.lz || test_failed $LINENO "${LZIP}" -t < ingin.lz || test_failed $LINENO +"${LZIP}" -dk ingin.lz || test_failed $LINENO +cmp in ingin || test_failed $LINENO "${LZIP}" -cd ingin.lz > out || test_failed $LINENO cmp in out || test_failed $LINENO "${LZIP}" -d < ingin.lz > out || test_failed $LINENO cmp in out || test_failed $LINENO -rm -f out ingin.lz || framework_failure +rm -f out ingin ingin.lz || framework_failure echo if [ ${fail} = 0 ] ; then diff --git a/testsuite/fox6.lz b/testsuite/fox6.lz Binary files differdeleted file mode 100644 index 8401b99..0000000 --- a/testsuite/fox6.lz +++ /dev/null diff --git a/testsuite/fox6_mark.lz b/testsuite/fox6_mark.lz Binary files differdeleted file mode 100644 index 32b2ac0..0000000 --- a/testsuite/fox6_mark.lz +++ /dev/null diff --git a/testsuite/fox_nz.lz b/testsuite/fox_nz.lz Binary files differnew file mode 100644 index 0000000..44a4b58 --- /dev/null +++ b/testsuite/fox_nz.lz diff --git a/testsuite/test.txt b/testsuite/test.txt index 9196a3a..423f0c0 100644 --- a/testsuite/test.txt +++ b/testsuite/test.txt @@ -1,8 +1,7 @@ GNU GENERAL PUBLIC LICENSE Version 2, June 1991 - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (C) 1989, 1991 Free Software Foundation, Inc. <http://fsf.org/> Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. @@ -339,8 +338,7 @@ Public License instead of this License. GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
diff --git a/testsuite/test.txt.lz b/testsuite/test.txt.lz Binary files differindex 22cea6e..5dc169f 100644 --- a/testsuite/test.txt.lz +++ b/testsuite/test.txt.lz diff --git a/testsuite/test_em.txt.lz b/testsuite/test_em.txt.lz Binary files differdeleted file mode 100644 index 7e96250..0000000 --- a/testsuite/test_em.txt.lz +++ /dev/null |