diff options
-rw-r--r-- | ChangeLog | 74 | ||||
-rw-r--r-- | INSTALL | 14 | ||||
-rw-r--r-- | Makefile.in | 6 | ||||
-rw-r--r-- | NEWS | 47 | ||||
-rw-r--r-- | README | 52 | ||||
-rw-r--r-- | arg_parser.cc | 2 | ||||
-rw-r--r-- | arg_parser.h | 2 | ||||
-rwxr-xr-x | configure | 16 | ||||
-rw-r--r-- | decoder.cc | 126 | ||||
-rw-r--r-- | decoder.h | 2 | ||||
-rw-r--r-- | doc/lzip.1 | 14 | ||||
-rw-r--r-- | doc/lzip.info | 294 | ||||
-rw-r--r-- | doc/lzip.texi | 227 | ||||
-rw-r--r-- | encoder.cc | 6 | ||||
-rw-r--r-- | encoder.h | 2 | ||||
-rw-r--r-- | encoder_base.cc | 19 | ||||
-rw-r--r-- | encoder_base.h | 6 | ||||
-rw-r--r-- | fast_encoder.cc | 6 | ||||
-rw-r--r-- | fast_encoder.h | 2 | ||||
-rw-r--r-- | list.cc | 32 | ||||
-rw-r--r-- | lzip.h | 35 | ||||
-rw-r--r-- | lzip_index.cc (renamed from file_index.cc) | 73 | ||||
-rw-r--r-- | lzip_index.h (renamed from file_index.h) | 10 | ||||
-rw-r--r-- | main.cc | 104 | ||||
-rwxr-xr-x | testsuite/check.sh | 140 |
25 files changed, 707 insertions, 604 deletions
@@ -1,7 +1,19 @@ +2019-01-03 Antonio Diaz Diaz <antonio@gnu.org> + + * Version 1.21 released. + * File_* renamed to Lzip_*. + * lzip.h (Lzip_trailer): New function 'verify_consistency'. + * lzip_index.cc: Detect some kinds of corrupt trailers. + * main.cc (main): Check return value of close( infd ). + * main.cc: Compile on DOS with DJGPP. + * lzip.texi: Improved description of '-0..-9', '-m' and '-s'. + * configure: Accept appending to CXXFLAGS, 'CXXFLAGS+=OPTIONS'. + * INSTALL: Document use of CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO'. + 2018-02-11 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.20 released. - * main.cc: Added new option '--loose-trailing'. + * Added new option '--loose-trailing'. * Improved corrupt header detection to HD=3. * main.cc: Show corrupt or truncated header in multimember file. * main.cc (main): Option '-S, --volume-size' now keeps input files. @@ -25,13 +37,13 @@ * main.cc: Continue testing if any input file is a terminal. * main.cc: Show trailing data in both hexadecimal and ASCII. * encoder.cc (Matchfinder_base): Verify size passed to new. - * file_index.cc: Improve detection of bad dict and trailing data. + * lzip_index.cc: Improve detection of bad dict and trailing data. * lzip.h: Unified messages for bad magic, trailing data, etc. 2016-05-14 Antonio Diaz Diaz <antonio@gnu.org> * Version 1.18 released. - * main.cc: Added new option '-a, --trailing-error'. + * Added new option '-a, --trailing-error'. * Decompression time has been reduced by 2%. * decoder.cc (verify_trailer): Removed test of final code. * main.cc (main): Delete '--output' file if infd is a terminal. @@ -98,7 +110,7 @@ 2011-04-30 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.12 released. - * main.cc: Added new option '-F, --recompress'. + * Added new option '-F, --recompress'. * encoder.h (update_prices): Update high length symbol prices independently of the value of 'pos_state'. 
This gives better compression for large values of '--match-length' without being @@ -122,47 +134,47 @@ 2010-09-16 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.11 released. - * Added new option '-0' which produces a compression speed and - ratio comparable to those of 'gzip -9'. + * Added new option '-0' which produces a compression speed and ratio + comparable to those of 'gzip -9'. * fast_encoder.h fast_encoder.cc: New files. * main.cc: Match length limit set by options -1 to -8 has been reduced to extend range of use towards gzip. Lower numbers now - compress less but faster. (-1 now takes 43% less time for only - 20% larger compressed size). + compress less but faster. (-1 now takes 43% less time for only 20% + larger compressed size). * Compression ratio of option '-9' has been slightly increased. * lziprecover.cc: Added new option '-m, --merge' which tries to produce a correct file merging the good parts of two or more damaged copies. - * lziprecover.cc: Added new option '-R, --repair' for repairing - a 1-byte error in single-member files. - * decoder.cc (decode_member): Detect file errors earlier to - improve efficiency of lziprecover's new repair capability. + * lziprecover.cc: Added new option '-R, --repair' for repairing a + 1-byte error in single-member files. + * decoder.cc (decode_member): Detect file errors earlier to improve + efficiency of lziprecover's new repair capability. This change also prevents (harmless) access to uninitialized memory when decompressing a corrupt file. * lziprecover.cc: Added new option '-f, --force'. * lziprecover.cc: Added new option '-o, --output'. - * lziprecover.cc: Added new option '-s, --split' to select the - until now only operation of splitting multimember files. - * lziprecover.cc: If no operation is specified, warn the user - and do nothing. + * lziprecover.cc: Added new option '-s, --split' to select the until + now only operation of splitting multimember files. 
+ * lziprecover.cc: If no operation is specified, warn the user and do + nothing. * main.cc: Fixed warning about fchown's return value being ignored. * decoder.cc: '-tvvvv' now also shows compression ratio. * main.cc: Set stdin/stdout in binary mode on MSVC and OS2. * lzip.texinfo: Added new examples. * testsuite: 'test1' renamed to 'test.txt'. Added new tests. * Matchfinder types HC4 (4 bytes hash-chain) and HT4 (4 bytes - hash-table) have been tested and found no better than the - current BT4. + hash-table) have been tested and found no better than the current + BT4. 2010-04-05 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.10 released. * decoder.h: Input_buffer integrated in Range_decoder. - * main.cc: File specified with option '-o' is now created with - mode 0666 if umask allows it, deleted if interrupted by user. + * main.cc: File specified with option '-o' is now created with mode + 0666 if umask allows it, deleted if interrupted by user. * main.cc: New constant 'o_binary'. - * main.cc: Dictionary size for options -2, -3, -4 and -8 has - been changed to improve linearity of compressed sizes. + * main.cc: Dictionary size for options -2, -3, -4 and -8 has been + changed to improve linearity of compressed sizes. * lzip.h: Fixed warnings produced by over-optimization (-O3). * Makefile.in: Added quotes to directory names. @@ -192,8 +204,8 @@ * Added decompression support for Sync Flush marker. * Added support for .tbz extension to lzdiff and lzgrep. * Added man pages for lzdiff, lzgrep and lziprecover. - * encoder.cc (Matchfinder): Reduce memory use to 9x if input file - is smaller than dictionary size limit. + * encoder.cc (Matchfinder): Reduce memory use to 9x if input file is + smaller than dictionary size limit. * decoder.cc: Added extra flush calls to improve partial decompression of corrupt files. * '--test' no longer needs '/dev/null'. @@ -207,7 +219,7 @@ 2009-04-12 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.5 released. 
- * lzip.h: Coded dictionary size implemented in File_header. + * lzip.h: Coded dictionary size implemented in Lzip_header. * Fixed some includes that prevented compilation with GCC 4.4. * 'member_size' and 'volume_size' are now accurate limits. * Compression speed has been improved. @@ -233,8 +245,8 @@ * Version 1.3 released. * This version automatically chooses the smallest possible - dictionary size for each file during compression, saving - memory during decompression. + dictionary size for each file during compression, saving memory + during decompression. * Implemented decompression of version 1 files. * check.sh: Replaced 'diff -q' with 'cmp'. @@ -293,8 +305,8 @@ * Version 0.1 released. -Copyright (C) 2008-2018 Antonio Diaz Diaz. +Copyright (C) 2008-2019 Antonio Diaz Diaz. -This file is a collection of facts, and thus it is not copyrightable, -but just in case, you have unlimited permission to copy, distribute and -modify it. +This file is a collection of facts, and thus it is not copyrightable, but +just in case, you have unlimited permission to copy, distribute and modify +it. @@ -1,10 +1,14 @@ Requirements ------------ You will need a C++ compiler. -I use gcc 5.3.0 and 4.1.2, but the code should compile with any -standards compliant compiler. +I use gcc 5.3.0 and 4.1.2, but the code should compile with any standards +compliant compiler. Gcc is available at http://gcc.gnu.org. +The operating system must allow signal handlers read access to objects with +static storage duration so that the cleanup handler for Control-C can delete +the partial output file. + Procedure --------- @@ -23,6 +27,10 @@ the main archive. cd lzip[version] ./configure + If you are compiling on MinGW, use: + + ./configure CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO' + 3. Run make. make @@ -58,7 +66,7 @@ After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2008-2018 Antonio Diaz Diaz. +Copyright (C) 2008-2019 Antonio Diaz Diaz. 
This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/Makefile.in b/Makefile.in index 9db3f63..20e9f16 100644 --- a/Makefile.in +++ b/Makefile.in @@ -7,7 +7,7 @@ INSTALL_DIR = $(INSTALL) -d -m 755 SHELL = /bin/sh CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1 -objs = arg_parser.o file_index.o list.o encoder_base.o encoder.o \ +objs = arg_parser.o lzip_index.o list.o encoder_base.o encoder.o \ fast_encoder.o decoder.o main.o @@ -34,8 +34,8 @@ decoder.o : lzip.h decoder.h encoder_base.o : lzip.h encoder_base.h encoder.o : lzip.h encoder_base.h encoder.h fast_encoder.o : lzip.h encoder_base.h fast_encoder.h -file_index.o : lzip.h file_index.h -list.o : lzip.h file_index.h +list.o : lzip.h lzip_index.h +lzip_index.o : lzip.h lzip_index.h main.o : arg_parser.h lzip.h decoder.h encoder_base.h encoder.h fast_encoder.h @@ -1,42 +1,17 @@ -Changes in version 1.20: +Changes in version 1.21: -The option '--loose-trailing', has been added. +Detection of forbidden combinations of characters in trailing data has been +improved. -The test used by lzip to discriminate trailing data from a corrupt -header in multimember or concatenated files has been improved to a -Hamming distance (HD) of 3, and the 3 bit flips must happen in different -magic bytes for the test to fail. As a consequence some kinds of files -no longer can be appended to a lzip file as trailing data unless the -'--loose-trailing' option is used when decompressing. -Lziprecover can be used to remove conflicting trailing data from a file. +Errors are now also checked when closing the input file. -The contents of a corrupt or truncated header found in a multimember -file are now shown, after the error message, in the same format as -trailing data. +Lzip now compiles on DOS with DJGPP. (Patch from Robert Riebisch). -Option '-S, --volume-size' now keeps input files unchanged. 
+The descriptions of '-0..-9', '-m' and '-s' in the manual have been +improved. -When creating multimember files or splitting the output in volumes, the -dictionary size is now adjusted for each member individually. +The configure script now accepts appending options to CXXFLAGS using the +syntax 'CXXFLAGS+=OPTIONS'. -The 'bits/byte' ratio has been replaced with the inverse compression -ratio in the output. - -The progress of decompression is now shown at verbosity level 2 (-vv) or -higher. - -Progress of (de)compression is only shown if stderr is a terminal. - -A final diagnostic is now shown at verbosity level 1 (-v) or higher if -any file fails the test when testing multiple files. - -A second '.lz' extension is no longer added to the argument of '-o' if -it already ends in '.lz' or '.tlz'. - -In case of (de)compressed size mismatch, the stored size is now also -shown in hexadecimal to ease visual comparison. - -The dictionary size is now shown at verbosity level 4 (-vvvv) when -decompressing or testing. - -The new chapter "Meaning of lzip's output" has been added to the manual. +It has been documented in INSTALL the use of +CXXFLAGS+='-D __USE_MINGW_ANSI_STDIO' when compiling on MinGW. @@ -1,27 +1,28 @@ Description Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0), +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0) or compress most files more than bzip2 (lzip -9). Decompression speed is intermediate between gzip and bzip2. Lzip is better than gzip and bzip2 -from a data recovery perspective. +from a data recovery perspective. Lzip has been designed, written and +tested with great care to replace gzip and bzip2 as the standard +general-purpose compressed format for unix-like systems. 
-The lzip file format is designed for data sharing and long-term -archiving, taking into account both data integrity and decoder -availability: +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: * The lzip format provides very safe integrity checking and some data - recovery means. The lziprecover program can repair bit-flip errors + recovery means. The lziprecover program can repair bit flip errors (one of the most common forms of data corruption) in lzip files, and provides data recovery capabilities, including error-checked merging of damaged copies of a file. * The lzip format is as simple as possible (but not simpler). The - lzip manual provides the source code of a simple decompressor along - with a detailed explanation of how it works, so that with the only - help of the lzip manual it would be possible for a digital - archaeologist to extract the data from a lzip file long after - quantum computers eventually render LZMA obsolete. + lzip manual provides the source code of a simple decompressor + along with a detailed explanation of how it works, so that with + the only help of the lzip manual it would be possible for a + digital archaeologist to extract the data from a lzip file long + after quantum computers eventually render LZMA obsolete. * Additionally the lzip reference implementation is copylefted, which guarantees that it will remain free forever. @@ -33,13 +34,12 @@ corrupt byte near the beginning is a thing of the past. Lzip uses the same well-defined exit status values used by bzip2, which makes it safer than compressors returning ambiguous warning values (like -gzip) when it is used as a back end for other programs like tar or -zutils. +gzip) when it is used as a back end for other programs like tar or zutils. -Lzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. 
Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. +Lzip will automatically use for each file the largest dictionary size +that exceeds neither the file size nor the limit given. Keep in +mind that the decompression memory requirement is affected at +compression time by the choice of dictionary size limit. The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary size @@ -59,22 +59,22 @@ anyothername becomes anyothername.out (De)compressing a file is much like copying or moving it; therefore lzip preserves the access and modification dates, permissions, and, when -possible, ownership of the file just as "cp -p" does. (If the user ID or +possible, ownership of the file just as 'cp -p' does. (If the user ID or the group ID can't be duplicated, the file permission bits S_ISUID and S_ISGID are cleared). Lzip is able to read from some types of non regular files if the -"--stdout" option is specified. +'--stdout' option is specified. If no file names are specified, lzip compresses (or decompresses) from standard input to standard output. In this case, lzip will decline to write compressed output to a terminal, as this would be entirely incomprehensible and therefore pointless. -Lzip will correctly decompress a file which is the concatenation of two -or more compressed files. The result is the concatenation of the -corresponding decompressed files. Integrity testing of concatenated -compressed files is also supported. +Lzip will correctly decompress a file which is the concatenation of two or +more compressed files. The result is the concatenation of the corresponding +decompressed files. Integrity testing of concatenated compressed files is +also supported. Lzip can produce multimember files, and lziprecover can safely recover the undamaged members in case of file damage.
Lzip can also split the @@ -110,8 +110,12 @@ the definition of Markov chains), G.N.N. Martin (for the definition of range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). +LANGUAGE NOTE: Uncompressed = not compressed = plain data; it may never +have been compressed. Decompressed is used to refer to data which have +undergone the process of decompression. -Copyright (C) 2008-2018 Antonio Diaz Diaz. + +Copyright (C) 2008-2019 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/arg_parser.cc b/arg_parser.cc index 008ebc8..ea32fde 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2018 Antonio Diaz Diaz. + Copyright (C) 2006-2019 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided diff --git a/arg_parser.h b/arg_parser.h index f015881..ceb9933 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2018 Antonio Diaz Diaz. + Copyright (C) 2006-2019 Antonio Diaz Diaz. This library is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -1,12 +1,12 @@ #! /bin/sh # configure script for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2018 Antonio Diaz Diaz. +# Copyright (C) 2008-2019 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. 
pkgname=lzip -pkgversion=1.20 +pkgversion=1.21 progname=lzip srctrigger=doc/${pkgname}.texi @@ -70,6 +70,7 @@ while [ $# != 0 ] ; do echo " CXX=COMPILER C++ compiler to use [${CXX}]" echo " CPPFLAGS=OPTIONS command line options for the preprocessor [${CPPFLAGS}]" echo " CXXFLAGS=OPTIONS command line options for the C++ compiler [${CXXFLAGS}]" + echo " CXXFLAGS+=OPTIONS append options to the current value of CXXFLAGS" echo " LDFLAGS=OPTIONS command line options for the linker [${LDFLAGS}]" echo exit 0 ;; @@ -93,10 +94,11 @@ while [ $# != 0 ] ; do --mandir=*) mandir=${optarg} ;; --no-create) no_create=yes ;; - CXX=*) CXX=${optarg} ;; - CPPFLAGS=*) CPPFLAGS=${optarg} ;; - CXXFLAGS=*) CXXFLAGS=${optarg} ;; - LDFLAGS=*) LDFLAGS=${optarg} ;; + CXX=*) CXX=${optarg} ;; + CPPFLAGS=*) CPPFLAGS=${optarg} ;; + CXXFLAGS=*) CXXFLAGS=${optarg} ;; + CXXFLAGS+=*) CXXFLAGS="${CXXFLAGS} ${optarg}" ;; + LDFLAGS=*) LDFLAGS=${optarg} ;; --*) echo "configure: WARNING: unrecognized option: '${option}'" 1>&2 ;; @@ -168,7 +170,7 @@ echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF # Makefile for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2018 Antonio Diaz Diaz. +# Copyright (C) 2008-2019 Antonio Diaz Diaz. # This file was generated automatically by configure. Don't edit. # # This Makefile is free software: you have unlimited permission @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -100,13 +100,13 @@ void LZ_decoder::flush_data() bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const { - File_trailer trailer; - int size = rdec.read_data( trailer.data, File_trailer::size ); + Lzip_trailer trailer; + int size = rdec.read_data( trailer.data, Lzip_trailer::size ); const unsigned long long data_size = data_position(); const unsigned long long member_size = rdec.member_position(); bool error = false; - if( size < File_trailer::size ) + if( size < Lzip_trailer::size ) { error = true; if( verbosity >= 0 ) @@ -115,7 +115,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const std::fprintf( stderr, "Trailer truncated at trailer position %d;" " some checks may fail.\n", size ); } - while( size < File_trailer::size ) trailer.data[size++] = 0; + while( size < Lzip_trailer::size ) trailer.data[size++] = 0; } const unsigned td_crc = trailer.data_crc(); @@ -199,86 +199,86 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { + // literal byte Bit_model * const bm = bm_literal[get_lit_state(peek_prev())]; if( state.is_char_set_char() ) put_byte( rdec.decode_tree8( bm ) ); else put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + continue; } - else // match or repeated match + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit { - int len; - if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit { - if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit - { - if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } - } + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 
4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; else { - unsigned distance; - if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit - distance = rep1; + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; else - { - if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit - distance = rep2; - else - { distance = rep3; rep3 = rep2; } - rep2 = rep1; - } - rep1 = rep0; - rep0 = distance; + { distance = rep3; rep3 = rep2; } + rep2 = rep1; } - state.set_rep(); - len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + rep1 = rep0; + rep0 = distance; } - else // match + state.set_rep(); + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + unsigned distance = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); + if( distance >= start_dis_model ) { - len = min_match_len + rdec.decode_len( match_len_model, pos_state ); - unsigned distance = rdec.decode_tree6( bm_dis_slot[get_len_state(len)] ); - if( distance >= start_dis_model ) + const unsigned dis_slot = distance; + const int direct_bits = ( dis_slot >> 1 ) - 1; + distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + distance += rdec.decode_tree_reversed( + bm_dis + ( distance - dis_slot ), direct_bits ); + else { - const unsigned dis_slot = distance; - const int direct_bits = ( dis_slot >> 1 ) - 1; - distance = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - if( dis_slot < end_dis_model ) - distance += rdec.decode_tree_reversed( - bm_dis + ( distance - dis_slot ), direct_bits ); - else + distance += + rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + distance += rdec.decode_tree_reversed4( bm_align ); + if( distance == 0xFFFFFFFFU ) // marker found { - distance += - rdec.decode( 
direct_bits - dis_align_bits ) << dis_align_bits; - distance += rdec.decode_tree_reversed4( bm_align ); - if( distance == 0xFFFFFFFFU ) // marker found + rdec.normalize(); + flush_data(); + if( len == min_match_len ) // End Of Stream marker + { + if( verify_trailer( pp ) ) return 0; else return 3; + } + if( len == min_match_len + 1 ) // Sync Flush marker + { + rdec.load(); continue; + } + if( verbosity >= 0 ) { - rdec.normalize(); - flush_data(); - if( len == min_match_len ) // End Of Stream marker - { - if( verify_trailer( pp ) ) return 0; else return 3; - } - if( len == min_match_len + 1 ) // Sync Flush marker - { - rdec.load(); continue; - } - if( verbosity >= 0 ) - { - pp(); - std::fprintf( stderr, "Unsupported marker code '%d'\n", len ); - } - return 4; + pp(); + std::fprintf( stderr, "Unsupported marker code '%d'\n", len ); } + return 4; } } - rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; - state.set_match(); - if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) - { flush_data(); return 1; } } - copy_block( rep0, len ); + rep3 = rep2; rep2 = rep1; rep1 = rep0; rep0 = distance; + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return 1; } } + copy_block( rep0, len ); } flush_data(); return 2; @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1,12 +1,18 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH LZIP "1" "February 2018" "lzip 1.20" "User Commands" +.TH LZIP "1" "January 2019" "lzip 1.21" "User Commands" .SH NAME lzip \- reduces the size of files .SH SYNOPSIS .B lzip [\fI\,options\/\fR] [\fI\,files\/\fR] .SH DESCRIPTION -Lzip \- LZMA lossless data compressor. 
+Lzip is a lossless data compressor with a user interface similar to the +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip \fB\-0\fR) +or compress most files more than bzip2 (lzip \fB\-9\fR). Decompression speed is +intermediate between gzip and bzip2. Lzip is better than gzip and bzip2 +from a data recovery perspective. Lzip has been designed, written and +tested with great care to replace gzip and bzip2 as the standard +general\-purpose compressed format for unix\-like systems. .SH OPTIONS .TP \fB\-h\fR, \fB\-\-help\fR @@ -52,7 +58,7 @@ suppress all messages set dictionary size limit in bytes [8 MiB] .TP \fB\-S\fR, \fB\-\-volume\-size=\fR<bytes> -set volume size limit in bytes, implies \fB\-k\fR +set volume size limit in bytes .TP \fB\-t\fR, \fB\-\-test\fR test compressed file integrity @@ -93,7 +99,7 @@ Report bugs to lzip\-bug@nongnu.org .br Lzip home page: http://www.nongnu.org/lzip/lzip.html .SH COPYRIGHT -Copyright \(co 2018 Antonio Diaz Diaz. +Copyright \(co 2019 Antonio Diaz Diaz. License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html> .br This is free software: you are free to change and redistribute it. diff --git a/doc/lzip.info b/doc/lzip.info index 8591b5b..36368c4 100644 --- a/doc/lzip.info +++ b/doc/lzip.info @@ -11,7 +11,7 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir) Lzip Manual *********** -This manual is for Lzip (version 1.20, 11 February 2018). +This manual is for Lzip (version 1.21, 3 January 2019). * Menu: @@ -29,7 +29,7 @@ This manual is for Lzip (version 1.20, 11 February 2018). * Concept index:: Index of concepts - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. 
@@ -41,10 +41,10 @@ File: lzip.info, Node: Introduction, Next: Output, Prev: Top, Up: Top ************** Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. Lzip can compress about as fast as gzip -(lzip -0), or compress most files more than bzip2 (lzip -9). -Decompression speed is intermediate between gzip and bzip2. Lzip is -better than gzip and bzip2 from a data recovery perspective. +one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0) +or compress most files more than bzip2 (lzip -9). Decompression speed +is intermediate between gzip and bzip2. Lzip is better than gzip and +bzip2 from a data recovery perspective. The lzip file format is designed for data sharing and long-term archiving, taking into account both data integrity and decoder @@ -88,10 +88,10 @@ which makes it safer than compressors returning ambiguous warning values (like gzip) when it is used as a back end for other programs like tar or zutils. - Lzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. + Lzip will automatically use for each file the largest dictionary size +that does not exceed neither the file size nor the limit given. Keep in +mind that the decompression memory requirement is affected at +compression time by the choice of dictionary size limit. The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary @@ -111,7 +111,7 @@ anyothername becomes anyothername.out (De)compressing a file is much like copying or moving it; therefore lzip preserves the access and modification dates, permissions, and, when -possible, ownership of the file just as "cp -p" does. (If the user ID or +possible, ownership of the file just as 'cp -p' does. 
(If the user ID or the group ID can't be duplicated, the file permission bits S_ISUID and S_ISGID are cleared). @@ -209,6 +209,7 @@ command line. '-V' '--version' Print the version number of lzip on the standard output and exit. + This version number should be included in all bug reports. '-a' '--trailing-error' @@ -293,12 +294,14 @@ command line. '-s BYTES' '--dictionary-size=BYTES' When compressing, set the dictionary size limit in bytes. Lzip - will use the smallest possible dictionary size for each file - without exceeding this limit. Valid values range from 4 KiB to - 512 MiB. Values 12 to 29 are interpreted as powers of two, meaning - 2^12 to 2^29 bytes. Note that dictionary sizes are quantized. If - the specified size does not match one of the valid sizes, it will - be rounded upwards by adding up to (BYTES / 8) to it. + will use for each file the largest dictionary size that does not + exceed neither the file size nor this limit. Valid values range + from 4 KiB to 512 MiB. Values 12 to 29 are interpreted as powers + of two, meaning 2^12 to 2^29 bytes. Dictionary sizes are quantized + so that they can be coded in just one byte (*note + coded-dict-size::). If the specified size does not match one of + the valid sizes, it will be rounded upwards by adding up to + (BYTES / 8) to it. For maximum compression you should use a dictionary size limit as large as possible, but keep in mind that the decompression memory @@ -337,27 +340,32 @@ command line. Two or more '-v' options show the progress of (de)compression. '-0 .. -9' - Set the compression parameters (dictionary size and match length - limit) as shown in the table below. The default compression level - is '-6'. Note that '-9' can be much slower than '-0'. These - options have no effect when decompressing, testing or listing. + Compression level. Set the compression parameters (dictionary size + and match length limit) as shown in the table below. 
The default + compression level is '-6', equivalent to '-s8MiB -m36'. Note that + '-9' can be much slower than '-0'. These options have no effect + when decompressing, testing or listing. The bidimensional parameter space of LZMA can't be mapped to a linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the '--dictionary-size' and '--match-length' options directly to achieve optimal performance. - Level Dictionary size Match length limit - -0 64 KiB 16 bytes - -1 1 MiB 5 bytes - -2 1.5 MiB 6 bytes - -3 2 MiB 8 bytes - -4 3 MiB 12 bytes - -5 4 MiB 20 bytes - -6 8 MiB 36 bytes - -7 16 MiB 68 bytes - -8 24 MiB 132 bytes - -9 32 MiB 273 bytes + If several compression levels or '-s' or '-m' options are given, + the last setting is used. For example '-9 -s64MiB' is equivalent + to '-s64MiB -m273' + + Level Dictionary size (-s) Match length limit (-m) + -0 64 KiB 16 bytes + -1 1 MiB 5 bytes + -2 1.5 MiB 6 bytes + -3 2 MiB 8 bytes + -4 3 MiB 12 bytes + -5 4 MiB 20 bytes + -6 8 MiB 36 bytes + -7 16 MiB 68 bytes + -8 24 MiB 132 bytes + -9 32 MiB 273 bytes '--fast' '--best' @@ -404,10 +412,10 @@ is to make it so complicated that there are no obvious deficiencies. The first method is far more difficult. -- C.A.R. Hoare - Lzip has been designed, written and tested with great care to be the -standard general-purpose compressor for unix-like systems. This chapter -describes the lessons learned from previous compressors (gzip and -bzip2), and their application to the design of lzip. + Lzip has been designed, written and tested with great care to replace +gzip and bzip2 as the standard general-purpose compressed format for +unix-like systems. This chapter describes the lessons learned from +these previous formats, and their application to the design of lzip. 4.1 Format design @@ -450,17 +458,20 @@ error detection. 
Any distance larger than the dictionary size acts as a forbidden symbol, allowing the decompressor to detect the approximate position of errors, and leaving very little work for the check sequence (CRC and data sizes) in the detection of errors. Lzip is usually able -to detect all posible bit flips in the compressed data without +to detect all possible bit flips in the compressed data without resorting to the check sequence. It would be difficult to write an automatic recovery tool like lziprecover for the gzip format. And, as far as I know, it has never been written. Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the -decompressed data because it provides more accurate error detection than -CRC64 up to a compressed size of about 16 GiB, a size larger than that -of most files. In the case of lzip, the additional detection capability -of the decompressor reduces the probability of undetected errors more -than a million times beyond what the CRC32 alone provides. +decompressed data because it provides optimal accuracy in the detection +of errors up to a compressed size of about 16 GiB, a size larger than +that of most files. In the case of lzip, the additional detection +capability of the decompressor reduces the probability of undetected +errors about four million times more, resulting in a combined integrity +checking optimally accurate for any member size produced by lzip. +Preliminary results suggest that the lzip format is safe enough to be +used in critical safety avionics systems. The lzip format is designed for long-term archiving. Therefore it excludes any unneeded features that may interfere with the future @@ -515,7 +526,7 @@ extraction of the decompressed data. Bzip2 does not store the uncompressed size of the file. The lzip format provides a 64-bit field for the uncompressed size. 
- Additionaly, lzip produces multimember output automatically when + Additionally, lzip produces multimember output automatically when the size is too large for a single member, allowing for an unlimited uncompressed size. @@ -563,9 +574,9 @@ extraction of the decompressed data. (lziprecover)Unzcrash. 'Dictionary size' - Lzip automatically uses the smallest possible dictionary size for - each file. In addition to reducing the amount of memory required - for decompression, this feature also minimizes the probability of + Lzip automatically adapts the dictionary size to the size of each + file. In addition to reducing the amount of memory required for + decompression, this feature also minimizes the probability of being affected by RAM errors during compression. 'Exit status' @@ -619,11 +630,11 @@ additional information before, between, or after them. 'DS (coded dictionary size, 1 byte)' The dictionary size is calculated by taking a power of 2 (the base - size) and substracting from it a fraction between 0/16 and 7/16 of + size) and subtracting from it a fraction between 0/16 and 7/16 of the base size. Bits 4-0 contain the base 2 logarithm of the base size (12 to 29). - Bits 7-5 contain the numerator of the fraction (0 to 7) to - substract from the base size to obtain the dictionary size. + Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract + from the base size to obtain the dictionary size. Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB Valid values for dictionary size range from 4 KiB to 512 MiB. @@ -762,7 +773,7 @@ reusing a recently used distance). 
There are 7 different coding sequences: Bit sequence Name Description ---------------------------------------------------------------------------- +------------------------------------------------------------------------ 0 + byte literal literal byte 1 + 0 + len + dis match distance-length pair 1 + 1 + 0 + 0 shortrep 1 byte match at latest used distance @@ -782,7 +793,7 @@ order, from MSB to LSB, except where noted otherwise. Lengths (the 'len' in the table above) are coded as follows: Bit sequence Description --------------------------------------------------------------------------- +------------------------------------------------------------------------ 0 + 3 bits lengths from 2 to 9 1 + 0 + 3 bits lengths from 10 to 17 1 + 1 + 8 bits lengths from 18 to 273 @@ -823,7 +834,7 @@ order (from LSB to MSB). For distances >= 128, the 'direct_bits - 4' part is coded with fixed 0.5 probability. Bit sequence Description --------------------------------------------------------------------------- +------------------------------------------------------------------------ slot distances from 0 to 3 slot + direct_bits distances from 4 to 127 slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1 @@ -859,7 +870,7 @@ byte. 'rep' is any one of 'rep0', 'rep1', 'rep2' or 'rep3'. 
The types of previous sequences corresponding to each state are: State Types of previous sequences --------------------------------------------------------- +------------------------------------------------------ 0 literal, literal, literal 1 match, literal, literal 2 rep or (!literal, shortrep), literal, literal @@ -876,24 +887,24 @@ State Types of previous sequences The contexts for decoding the type of coding sequence are: -Name Indices Used when ---------------------------------------------------------------------------- -bm_match state, pos_state sequence start -bm_rep state after sequence 1 -bm_rep0 state after sequence 11 -bm_rep1 state after sequence 111 -bm_rep2 state after sequence 1111 -bm_len state, pos_state after sequence 110 +Name Indices Used when +----------------------------------------------------------------------- +bm_match state, pos_state sequence start +bm_rep state after sequence 1 +bm_rep0 state after sequence 11 +bm_rep1 state after sequence 111 +bm_rep2 state after sequence 1111 +bm_len state, pos_state after sequence 110 The contexts for decoding distances are: -Name Indices Used when ---------------------------------------------------------------------------- -bm_dis_slot len_state, bit tree distance start -bm_dis reverse bit tree after slots 4 to 13 -bm_align reverse bit tree for distances >= 128, after - fixed probability bits +Name Indices Used when +------------------------------------------------------------------------ +bm_dis_slot len_state, bit tree distance start +bm_dis reverse bit tree after slots 4 to 13 +bm_align reverse bit tree for distances >= 128, after fixed + probability bits There are two separate sets of contexts for lengths ('Len_model' in @@ -901,7 +912,7 @@ the source). One for normal matches, the other for repeated matches. 
The contexts in each Len_model are (see 'decode_len' in the source): Name Indices Used when ---------------------------------------------------------------------------- +------------------------------------------------------------------------ choice1 none length start choice2 none after sequence 1 bm_low pos_state, bit tree after sequence 0 @@ -1008,7 +1019,11 @@ compressed file (bugs in the system libraries, memory errors, etc). Therefore, if the data you are going to compress are important, give the '--keep' option to lzip and don't remove the original file until you verify the compressed file with a command like -'lzip -cd file.lz | cmp file -'. +'lzip -cd file.lz | cmp file -'. Most RAM errors happening during +compression can only be detected by comparing the compressed file with +the original because the corruption happens before lzip compresses the +RAM contents, resulting in a valid compressed file containing wrong +data. Example 1: Replace a regular file with its compressed version 'file.lz' @@ -1101,7 +1116,7 @@ Appendix A Reference source code ******************************** /* Lzd - Educational decompressor for the lzip format - Copyright (C) 2013-2018 Antonio Diaz Diaz. + Copyright (C) 2013-2019 Antonio Diaz Diaz. This program is free software. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -1131,7 +1146,7 @@ Appendix A Reference source code #include <cstring> #include <stdint.h> #include <unistd.h> -#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) +#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__) #include <fcntl.h> #include <io.h> #endif @@ -1232,9 +1247,9 @@ public: const CRC32 crc32; -typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size +typedef uint8_t Lzip_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size -typedef uint8_t File_trailer[20]; +typedef uint8_t Lzip_trailer[20]; // 0-3 CRC32 of the uncompressed data // 4-11 size of the uncompressed data // 12-19 member size including header and trailer @@ -1428,6 +1443,7 @@ bool LZ_decoder::decode_member() // Returns false if error const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { + // literal byte const uint8_t prev_byte = peek( 0 ); const int literal_state = prev_byte >> ( 8 - literal_context_bits ); Bit_model * const bm = bm_literal[literal_state]; @@ -1436,67 +1452,66 @@ bool LZ_decoder::decode_member() // Returns false if error else put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); state.set_char(); + continue; } - else // match or repeated match + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit { - int len; - if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit { - if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit - { - if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } - } + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( 
rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; else { - unsigned distance; - if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit - distance = rep1; + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; else - { - if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit - distance = rep2; - else - { distance = rep3; rep3 = rep2; } - rep2 = rep1; - } - rep1 = rep0; - rep0 = distance; + { distance = rep3; rep3 = rep2; } + rep2 = rep1; } - state.set_rep(); - len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + rep1 = rep0; + rep0 = distance; } - else // match + state.set_rep(); + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + const int len_state = std::min( len - min_match_len, len_states - 1 ); + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) { - rep3 = rep2; rep2 = rep1; rep1 = rep0; - len = min_match_len + rdec.decode_len( match_len_model, pos_state ); - const int len_state = std::min( len - min_match_len, len_states - 1 ); - rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); - if( rep0 >= start_dis_model ) + const unsigned dis_slot = rep0; + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), + direct_bits ); + else { - const unsigned dis_slot = rep0; - const int direct_bits = ( dis_slot >> 1 ) - 1; - rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - if( dis_slot < end_dis_model ) - rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), - direct_bits ); - else + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU 
) // marker found { - rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; - rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); - if( rep0 == 0xFFFFFFFFU ) // marker found - { - flush_data(); - return ( len == min_match_len ); // End Of Stream marker - } + flush_data(); + return ( len == min_match_len ); // End Of Stream marker } } - state.set_match(); - if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) - { flush_data(); return false; } } - for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return false; } } + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); } flush_data(); return false; @@ -1514,7 +1529,7 @@ int main( const int argc, const char * const argv[] ) "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" - "\nCopyright (C) 2018 Antonio Diaz Diaz.\n" + "\nCopyright (C) 2019 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" @@ -1522,14 +1537,14 @@ int main( const int argc, const char * const argv[] ) return 0; } -#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) - setmode( fileno( stdin ), O_BINARY ); - setmode( fileno( stdout ), O_BINARY ); +#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__) + setmode( STDIN_FILENO, O_BINARY ); + setmode( STDOUT_FILENO, O_BINARY ); #endif for( bool first_member = true; ; first_member = false ) { - File_header header; // verify header + Lzip_header header; // verify header for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { @@ -1548,7 +1563,7 @@ int main( const int argc, const char * const argv[] ) if( 
!decoder.decode_member() ) { std::fputs( "Data error\n", stderr ); return 2; } - File_trailer trailer; // verify trailer + Lzip_trailer trailer; // verify trailer for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); unsigned crc = 0; for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } @@ -1593,20 +1608,21 @@ Concept index Tag Table: Node: Top208 -Node: Introduction1202 -Node: Output6228 -Node: Invoking lzip7740 -Ref: --trailing-error8298 -Node: Quality assurance15946 -Node: File format24354 -Node: Algorithm26758 -Node: Stream format29584 -Node: Trailing data40324 -Node: Examples42600 -Ref: concat-example43773 -Node: Problems44811 -Node: Reference source code45343 -Node: Concept index59659 +Node: Introduction1200 +Node: Output6244 +Node: Invoking lzip7756 +Ref: --trailing-error8378 +Node: Quality assurance16391 +Node: File format24994 +Ref: coded-dict-size26286 +Node: Algorithm27396 +Node: Stream format30222 +Node: Trailing data40873 +Node: Examples43149 +Ref: concat-example44571 +Node: Problems45609 +Node: Reference source code46141 +Node: Concept index60353 End Tag Table diff --git a/doc/lzip.texi b/doc/lzip.texi index d2efdc9..142e5a0 100644 --- a/doc/lzip.texi +++ b/doc/lzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 11 February 2018 -@set VERSION 1.20 +@set UPDATED 3 January 2019 +@set VERSION 1.21 @dircategory Data Compression @direntry @@ -50,7 +50,7 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}). @end menu @sp 1 -Copyright @copyright{} 2008-2018 Antonio Diaz Diaz. +Copyright @copyright{} 2008-2019 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -60,15 +60,15 @@ to copy, distribute and modify it. @chapter Introduction @cindex introduction -Lzip is a lossless data compressor with a user interface similar to the -one of gzip or bzip2. 
Lzip can compress about as fast as gzip -@w{(lzip -0)}, or compress most files more than bzip2 @w{(lzip -9)}. -Decompression speed is intermediate between gzip and bzip2. Lzip is -better than gzip and bzip2 from a data recovery perspective. +@uref{http://www.nongnu.org/lzip/lzip.html,,Lzip} is a lossless data +compressor with a user interface similar to the one of gzip or bzip2. Lzip +can compress about as fast as gzip @w{(lzip -0)} or compress most files more +than bzip2 @w{(lzip -9)}. Decompression speed is intermediate between gzip +and bzip2. Lzip is better than gzip and bzip2 from a data recovery +perspective. -The lzip file format is designed for data sharing and long-term -archiving, taking into account both data integrity and decoder -availability: +The lzip file format is designed for data sharing and long-term archiving, +taking into account both data integrity and decoder availability: @itemize @bullet @item @@ -113,13 +113,12 @@ uncompressed data. Lzip uses the same well-defined exit status values used by bzip2, which makes it safer than compressors returning ambiguous warning values (like -gzip) when it is used as a back end for other programs like tar or -zutils. +gzip) when it is used as a back end for other programs like tar or zutils. -Lzip will automatically use the smallest possible dictionary size for -each file without exceeding the given limit. Keep in mind that the -decompression memory requirement is affected at compression time by the -choice of dictionary size limit. +Lzip will automatically use for each file the largest dictionary size +that exceeds neither the file size nor the limit given. Keep in +mind that the decompression memory requirement is affected at +compression time by the choice of dictionary size limit. 
The amount of memory required for compression is about 1 or 2 times the dictionary size limit (1 if input file size is less than dictionary size @@ -141,7 +140,7 @@ file from that of the compressed file as follows: (De)compressing a file is much like copying or moving it; therefore lzip preserves the access and modification dates, permissions, and, when -possible, ownership of the file just as "cp -p" does. (If the user ID or +possible, ownership of the file just as @samp{cp -p} does. (If the user ID or the group ID can't be duplicated, the file permission bits S_ISUID and S_ISGID are cleared). @@ -247,6 +246,7 @@ Print an informative help message describing the options and exit. @item -V @itemx --version Print the version number of lzip on the standard output and exit. +This version number should be included in all bug reports. @anchor{--trailing-error} @item -a @@ -328,12 +328,13 @@ Quiet operation. Suppress all messages. @item -s @var{bytes} @itemx --dictionary-size=@var{bytes} When compressing, set the dictionary size limit in bytes. Lzip will use -the smallest possible dictionary size for each file without exceeding -this limit. Valid values range from @w{4 KiB} to @w{512 MiB}. Values 12 -to 29 are interpreted as powers of two, meaning 2^12 to 2^29 bytes. Note -that dictionary sizes are quantized. If the specified size does not -match one of the valid sizes, it will be rounded upwards by adding up to -@w{(@var{bytes} / 8)} to it. +for each file the largest dictionary size that exceeds neither +the file size nor this limit. Valid values range from @w{4 KiB} to +@w{512 MiB}. Values 12 to 29 are interpreted as powers of two, meaning +2^12 to 2^29 bytes. Dictionary sizes are quantized so that they can be +coded in just one byte (@pxref{coded-dict-size}). If the specified size +does not match one of the valid sizes, it will be rounded upwards by +adding up to @w{(@var{bytes} / 8)} to it. 
For maximum compression you should use a dictionary size limit as large as possible, but keep in mind that the decompression memory requirement @@ -371,18 +372,23 @@ ASCII characters.@* Two or more @samp{-v} options show the progress of (de)compression. @item -0 .. -9 -Set the compression parameters (dictionary size and match length limit) -as shown in the table below. The default compression level is @samp{-6}. -Note that @samp{-9} can be much slower than @samp{-0}. These options -have no effect when decompressing, testing or listing. +Compression level. Set the compression parameters (dictionary size and +match length limit) as shown in the table below. The default compression +level is @samp{-6}, equivalent to @w{@samp{-s8MiB -m36}}. Note that +@samp{-9} can be much slower than @samp{-0}. These options have no +effect when decompressing, testing or listing. The bidimensional parameter space of LZMA can't be mapped to a linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the @samp{--dictionary-size} and @samp{--match-length} options directly to achieve optimal performance. -@multitable {Level} {Dictionary size} {Match length limit} -@item Level @tab Dictionary size @tab Match length limit +If several compression levels or @samp{-s} or @samp{-m} options are +given, the last setting is used. For example @w{@samp{-9 -s64MiB}} is +equivalent to @w{@samp{-s64MiB -m273}}. + +@multitable {Level} {Dictionary size (-s)} {Match length limit (-m)} +@item Level @tab Dictionary size (-s) @tab Match length limit (-m) @item -0 @tab 64 KiB @tab 16 bytes @item -1 @tab 1 MiB @tab 5 bytes @item -2 @tab 1.5 MiB @tab 6 bytes @@ -441,10 +447,10 @@ is to make it so complicated that there are no obvious deficiencies. The first method is far more difficult.@* --- C.A.R. Hoare -Lzip has been designed, written and tested with great care to be the -standard general-purpose compressor for unix-like systems. 
This chapter -describes the lessons learned from previous compressors (gzip and -bzip2), and their application to the design of lzip. +Lzip has been designed, written and tested with great care to replace +gzip and bzip2 as the standard general-purpose compressed format for +unix-like systems. This chapter describes the lessons learned from +these previous formats, and their application to the design of lzip. @sp 1 @section Format design @@ -484,18 +490,21 @@ is extraordinarily safe. It provides embedded error detection. Any distance larger than the dictionary size acts as a forbidden symbol, allowing the decompressor to detect the approximate position of errors, and leaving very little work for the check sequence (CRC and data sizes) -in the detection of errors. Lzip is usually able to detect all posible +in the detection of errors. Lzip is usually able to detect all possible bit flips in the compressed data without resorting to the check sequence. It would be difficult to write an automatic recovery tool like lziprecover for the gzip format. And, as far as I know, it has never been written. Lzip, like gzip and bzip2, uses a CRC32 to check the integrity of the -decompressed data because it provides more accurate error detection than -CRC64 up to a compressed size of about @w{16 GiB}, a size larger than -that of most files. In the case of lzip, the additional detection +decompressed data because it provides optimal accuracy in the detection +of errors up to a compressed size of about @w{16 GiB}, a size larger +than that of most files. In the case of lzip, the additional detection capability of the decompressor reduces the probability of undetected -errors more than a million times beyond what the CRC32 alone provides. +errors about four million times more, resulting in a combined integrity +checking optimally accurate for any member size produced by lzip. 
+Preliminary results suggest that the lzip format is safe enough to be +used in safety-critical avionics systems. The lzip format is designed for long-term archiving. Therefore it excludes any unneeded features that may interfere with the future @@ -554,7 +563,7 @@ size. The size of any file larger than @w{4 GiB} gets truncated. Bzip2 does not store the uncompressed size of the file. The lzip format provides a 64-bit field for the uncompressed size. -Additionaly, lzip produces multimember output automatically when the +Additionally, lzip produces multimember output automatically when the size is too large for a single member, allowing for an unlimited uncompressed size. @@ -609,10 +618,10 @@ vulnerability or false negative. @item Dictionary size -Lzip automatically uses the smallest possible dictionary size for each -file. In addition to reducing the amount of memory required for -decompression, this feature also minimizes the probability of being -affected by RAM errors during compression. +Lzip automatically adapts the dictionary size to the size of each file. +In addition to reducing the amount of memory required for decompression, +this feature also minimizes the probability of being affected by RAM +errors during compression. @c key4_mask @item Exit status @@ -669,12 +678,13 @@ A four byte string, identifying the lzip format, with the value "LZIP" @item VN (version number, 1 byte) Just in case something needs to be modified in the future. 1 for now. 
+@anchor{coded-dict-size} @item DS (coded dictionary size, 1 byte) The dictionary size is calculated by taking a power of 2 (the base size) -and substracting from it a fraction between 0/16 and 7/16 of the base +and subtracting from it a fraction between 0/16 and 7/16 of the base size.@* Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@* -Bits 7-5 contain the numerator of the fraction (0 to 7) to substract +Bits 7-5 contain the numerator of the fraction (0 to 7) to subtract from the base size to obtain the dictionary size.@* Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@* Valid values for dictionary size range from 4 KiB to 512 MiB. @@ -934,7 +944,7 @@ are: @sp 1 The contexts for decoding the type of coding sequence are: -@multitable @columnfractions .2 .4 .4 +@multitable @columnfractions .2 .35 .45 @headitem Name @tab Indices @tab Used when @item bm_match @tab state, pos_state @tab sequence start @item bm_rep @tab state @tab after sequence 1 @@ -947,7 +957,7 @@ The contexts for decoding the type of coding sequence are: @sp 1 The contexts for decoding distances are: -@multitable @columnfractions .2 .4 .4 +@multitable @columnfractions .2 .3 .5 @headitem Name @tab Indices @tab Used when @item bm_dis_slot @tab len_state, bit tree @tab distance start @item bm_dis @tab reverse bit tree @tab after slots 4 to 13 @@ -1068,9 +1078,12 @@ where a file containing trailing data must be rejected, the option WARNING! Even if lzip is bug-free, other causes may result in a corrupt compressed file (bugs in the system libraries, memory errors, etc). Therefore, if the data you are going to compress are important, give the -@samp{--keep} option to lzip and don't remove the original file until -you verify the compressed file with a command like -@w{@samp{lzip -cd file.lz | cmp file -}}. +@samp{--keep} option to lzip and don't remove the original file until you +verify the compressed file with a command like +@w{@samp{lzip -cd file.lz | cmp file -}}. 
Most RAM errors happening during +compression can only be detected by comparing the compressed file with the +original because the corruption happens before lzip compresses the RAM +contents, resulting in a valid compressed file containing wrong data. @sp 1 @noindent @@ -1198,7 +1211,7 @@ find by running @w{@code{lzip --version}}. @verbatim /* Lzd - Educational decompressor for the lzip format - Copyright (C) 2013-2018 Antonio Diaz Diaz. + Copyright (C) 2013-2019 Antonio Diaz Diaz. This program is free software. Redistribution and use in source and binary forms, with or without modification, are permitted provided @@ -1228,7 +1241,7 @@ find by running @w{@code{lzip --version}}. #include <cstring> #include <stdint.h> #include <unistd.h> -#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) +#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__) #include <fcntl.h> #include <io.h> #endif @@ -1329,9 +1342,9 @@ public: const CRC32 crc32; -typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size +typedef uint8_t Lzip_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size -typedef uint8_t File_trailer[20]; +typedef uint8_t Lzip_trailer[20]; // 0-3 CRC32 of the uncompressed data // 4-11 size of the uncompressed data // 12-19 member size including header and trailer @@ -1525,6 +1538,7 @@ bool LZ_decoder::decode_member() // Returns false if error const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { + // literal byte const uint8_t prev_byte = peek( 0 ); const int literal_state = prev_byte >> ( 8 - literal_context_bits ); Bit_model * const bm = bm_literal[literal_state]; @@ -1533,67 +1547,66 @@ bool LZ_decoder::decode_member() // Returns false if error else put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); state.set_char(); + continue; } - else // match or repeated match + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 
2nd bit { - int len; - if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit { - if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit - { - if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } - } + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; else { - unsigned distance; - if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit - distance = rep1; + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; else - { - if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit - distance = rep2; - else - { distance = rep3; rep3 = rep2; } - rep2 = rep1; - } - rep1 = rep0; - rep0 = distance; + { distance = rep3; rep3 = rep2; } + rep2 = rep1; } - state.set_rep(); - len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + rep1 = rep0; + rep0 = distance; } - else // match + state.set_rep(); + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + const int len_state = std::min( len - min_match_len, len_states - 1 ); + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) { - rep3 = rep2; rep2 = rep1; rep1 = rep0; - len = min_match_len + rdec.decode_len( match_len_model, pos_state ); - const int len_state = std::min( len - min_match_len, len_states - 1 ); - rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); - if( rep0 >= start_dis_model ) + const unsigned dis_slot = rep0; + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < 
end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), + direct_bits ); + else { - const unsigned dis_slot = rep0; - const int direct_bits = ( dis_slot >> 1 ) - 1; - rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - if( dis_slot < end_dis_model ) - rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), - direct_bits ); - else + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // marker found { - rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; - rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); - if( rep0 == 0xFFFFFFFFU ) // marker found - { - flush_data(); - return ( len == min_match_len ); // End Of Stream marker - } + flush_data(); + return ( len == min_match_len ); // End Of Stream marker } } - state.set_match(); - if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) - { flush_data(); return false; } } - for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return false; } } + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); } flush_data(); return false; @@ -1611,7 +1624,7 @@ int main( const int argc, const char * const argv[] ) "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" - "\nCopyright (C) 2018 Antonio Diaz Diaz.\n" + "\nCopyright (C) 2019 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" @@ -1619,14 +1632,14 @@ int main( const int argc, const char * const argv[] ) return 0; } -#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER) - setmode( fileno( stdin ), O_BINARY ); - 
setmode( fileno( stdout ), O_BINARY ); +#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__) + setmode( STDIN_FILENO, O_BINARY ); + setmode( STDOUT_FILENO, O_BINARY ); #endif for( bool first_member = true; ; first_member = false ) { - File_header header; // verify header + Lzip_header header; // verify header for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { @@ -1645,7 +1658,7 @@ int main( const int argc, const char * const argv[] ) if( !decoder.decode_member() ) { std::fputs( "Data error\n", stderr ); return 2; } - File_trailer trailer; // verify trailer + Lzip_trailer trailer; // verify trailer for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); unsigned crc = 0; for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -479,7 +479,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], bool LZ_encoder::encode_member( const unsigned long long member_size ) { const unsigned long long member_size_limit = - member_size - File_trailer::size - max_marker_size; + member_size - Lzip_trailer::size - max_marker_size; const bool best = ( match_len_limit > 12 ); const int dis_price_count = best ? 1 : 512; const int align_price_count = best ? 
1 : dis_align_size; @@ -491,7 +491,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) State state; for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0; - if( data_position() != 0 || renc.member_position() != File_header::size ) + if( data_position() != 0 || renc.member_position() != Lzip_header::size ) return false; // can be called only once if( !data_finished() ) // encode first byte @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/encoder_base.cc b/encoder_base.cc index 4e2b765..c2133bd 100644 --- a/encoder_base.cc +++ b/encoder_base.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -50,10 +50,11 @@ bool Matchfinder_base::read_block() void Matchfinder_base::normalize_pos() { if( pos > stream_pos ) - internal_error( "pos > stream_pos in Matchfinder_base::normalize_pos." ); + internal_error( "pos > stream_pos in normalize_pos." 
); if( !at_stream_end ) { - const int offset = pos - before_size - dictionary_size; + // offset is int32_t for the std::min below + const int32_t offset = pos - before_size - dictionary_size; const int size = stream_pos - offset; std::memmove( buffer, buffer + offset, size ); partial_data_pos += offset; @@ -104,7 +105,7 @@ Matchfinder_base::Matchfinder_base( const int before_size_, unsigned size = 1 << std::max( 16, real_bits( dictionary_size - 1 ) - 2 ); if( dictionary_size > 1 << 26 ) // 64 MiB size >>= 1; - key4_mask = size - 1; + key4_mask = size - 1; // increases with dictionary size size += num_prev_positions23; num_prev_positions = size; @@ -163,11 +164,11 @@ void LZ_encoder_base::full_flush( const State state ) renc.encode_bit( bm_rep[state()], 0 ); encode_pair( 0xFFFFFFFFU, min_match_len, pos_state ); renc.flush(); - File_trailer trailer; + Lzip_trailer trailer; trailer.data_crc( crc() ); trailer.data_size( data_position() ); - trailer.member_size( renc.member_position() + File_trailer::size ); - for( int i = 0; i < File_trailer::size; ++i ) + trailer.member_size( renc.member_position() + Lzip_trailer::size ); + for( int i = 0; i < Lzip_trailer::size; ++i ) renc.put_byte( trailer.data[i] ); renc.flush_data(); } @@ -177,14 +178,14 @@ void LZ_encoder_base::reset() { Matchfinder_base::reset(); crc_ = 0xFFFFFFFFU; - bm_literal[0][0].reset( ( 1 << literal_context_bits ) * 0x300 ); + bm_literal[0][0].reset( (1 << literal_context_bits) * 0x300 ); bm_match[0][0].reset( State::states * pos_states ); bm_rep[0].reset( State::states ); bm_rep0[0].reset( State::states ); bm_rep1[0].reset( State::states ); bm_rep2[0].reset( State::states ); bm_len[0][0].reset( State::states * pos_states ); - bm_dis_slot[0][0].reset( len_states * (1 << dis_slot_bits ) ); + bm_dis_slot[0][0].reset( len_states * (1 << dis_slot_bits) ); bm_dis[0].reset( modeled_distances - end_dis_model + 1 ); bm_align[0].reset( dis_align_size ); match_len_model.reset(); diff --git a/encoder_base.h 
b/encoder_base.h index 6a651f0..4c1962f 100644 --- a/encoder_base.h +++ b/encoder_base.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -232,7 +232,7 @@ class Range_encoder unsigned ff_count; const int outfd; // output file descriptor uint8_t cache; - File_header header; + Lzip_header header; void shift_low() { @@ -260,7 +260,7 @@ public: ff_count = 0; cache = 0; header.dictionary_size( dictionary_size ); - for( int i = 0; i < File_header::size; ++i ) + for( int i = 0; i < Lzip_header::size; ++i ) put_byte( header.data[i] ); } diff --git a/fast_encoder.cc b/fast_encoder.cc index 3c5d32b..8fc0d3a 100644 --- a/fast_encoder.cc +++ b/fast_encoder.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -73,13 +73,13 @@ int FLZ_encoder::longest_match_len( int * const distance ) bool FLZ_encoder::encode_member( const unsigned long long member_size ) { const unsigned long long member_size_limit = - member_size - File_trailer::size - max_marker_size; + member_size - Lzip_trailer::size - max_marker_size; int rep = 0; int reps[num_rep_distances]; State state; for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0; - if( data_position() != 0 || renc.member_position() != File_header::size ) + if( data_position() != 0 || renc.member_position() != Lzip_header::size ) return false; // can be called only once if( !data_finished() ) // encode first byte diff --git a/fast_encoder.h b/fast_encoder.h index 89b6a0c..5570a36 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,7 +26,7 @@ #include <sys/stat.h> #include "lzip.h" -#include "file_index.h" +#include "lzip_index.h" namespace { @@ -65,18 +65,18 @@ int list_files( const std::vector< std::string > & filenames, open_instream( input_filename, &in_stats, true, true ); if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } - const File_index file_index( infd, ignore_trailing, loose_trailing ); + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing ); close( infd ); - if( file_index.retval() != 0 ) + if( lzip_index.retval() != 0 ) { - show_file_error( input_filename, file_index.error().c_str() ); - if( retval < file_index.retval() ) retval = file_index.retval(); + show_file_error( input_filename, lzip_index.error().c_str() ); + if( retval < lzip_index.retval() ) retval = lzip_index.retval(); continue; } if( verbosity >= 0 ) { - const unsigned long long udata_size = file_index.udata_size(); - const unsigned long long cdata_size = file_index.cdata_size(); + const unsigned long long udata_size = lzip_index.udata_size(); + const unsigned long long cdata_size = lzip_index.cdata_size(); total_comp += cdata_size; total_uncomp += udata_size; ++files; if( first_post ) { @@ -87,22 +87,22 @@ int list_files( const std::vector< std::string > & filenames, if( verbosity >= 1 ) { unsigned dictionary_size = 0; - for( long i = 0; i < file_index.members(); ++i ) + for( long i = 0; i < lzip_index.members(); ++i ) dictionary_size = - std::max( dictionary_size, file_index.dictionary_size( i ) ); - const long long trailing_size = file_index.file_size() - cdata_size; + std::max( dictionary_size, lzip_index.dictionary_size( i ) ); + const long long trailing_size = lzip_index.file_size() - cdata_size; std::printf( "%s %5ld %6lld ", format_ds( dictionary_size ), - file_index.members(), trailing_size ); + lzip_index.members(), trailing_size ); } 
list_line( udata_size, cdata_size, input_filename ); - if( verbosity >= 2 && file_index.members() > 1 ) + if( verbosity >= 2 && lzip_index.members() > 1 ) { std::fputs( " member data_pos data_size member_pos member_size\n", stdout ); - for( long i = 0; i < file_index.members(); ++i ) + for( long i = 0; i < lzip_index.members(); ++i ) { - const Block & db = file_index.dblock( i ); - const Block & mb = file_index.mblock( i ); + const Block & db = lzip_index.dblock( i ); + const Block & mb = lzip_index.mblock( i ); std::printf( "%5ld %15llu %15llu %15llu %15llu\n", i + 1, db.pos(), db.size(), mb.pos(), mb.size() ); } @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -134,7 +134,7 @@ public: { const std::string & s = filenames[i]; const unsigned len = ( s == "-" ) ? 
stdin_name_len : s.size(); - if( len > longest_name ) longest_name = len; + if( longest_name < len ) longest_name = len; } if( longest_name == 0 ) longest_name = stdin_name_len; } @@ -144,7 +144,7 @@ public: if( filename.size() && filename != "-" ) name_ = filename; else name_ = stdin_name; padded_name = " "; padded_name += name_; padded_name += ": "; - if( name_.size() < longest_name ) + if( longest_name > name_.size() ) padded_name.append( longest_name - name_.size(), ' ' ); first_post = true; } @@ -202,30 +202,30 @@ inline int real_bits( unsigned value ) } -const uint8_t magic_string[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" +const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" -struct File_header +struct Lzip_header { uint8_t data[6]; // 0-3 magic bytes // 4 version // 5 coded_dict_size enum { size = 6 }; - void set_magic() { std::memcpy( data, magic_string, 4 ); data[4] = 1; } + void set_magic() { std::memcpy( data, lzip_magic, 4 ); data[4] = 1; } bool verify_magic() const - { return ( std::memcmp( data, magic_string, 4 ) == 0 ); } + { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); } bool verify_prefix( const int sz ) const // detect (truncated) header { for( int i = 0; i < sz && i < 4; ++i ) - if( data[i] != magic_string[i] ) return false; + if( data[i] != lzip_magic[i] ) return false; return ( sz > 0 ); } bool verify_corrupt() const // detect corrupt header { int matches = 0; for( int i = 0; i < 4; ++i ) - if( data[i] == magic_string[i] ) ++matches; + if( data[i] == lzip_magic[i] ) ++matches; return ( matches > 1 && matches < 4 ); } @@ -257,12 +257,11 @@ struct File_header }; -struct File_trailer +struct Lzip_trailer { uint8_t data[20]; // 0-3 CRC32 of the uncompressed data // 4-11 size of the uncompressed data // 12-19 member size including header and trailer - enum { size = 20 }; unsigned data_crc() const @@ -294,6 +293,20 @@ struct File_trailer void member_size( unsigned long long sz ) { for( int i = 12; i <= 19; ++i ) { data[i] = 
(uint8_t)sz; sz >>= 8; } } + + bool verify_consistency() const // check internal consistency + { + const unsigned crc = data_crc(); + const unsigned long long dsize = data_size(); + if( ( crc == 0 ) != ( dsize == 0 ) ) return false; + const unsigned long long msize = member_size(); + if( msize < min_member_size ) return false; + const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size; + if( mlimit > dsize && msize > mlimit ) return false; + const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1; + if( dlimit > msize && dsize > dlimit ) return false; + return true; + } }; diff --git a/file_index.cc b/lzip_index.cc index 137f82a..4b2aadc 100644 --- a/file_index.cc +++ b/lzip_index.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ #include <unistd.h> #include "lzip.h" -#include "file_index.h" +#include "lzip_index.h" namespace { @@ -43,13 +43,13 @@ int seek_read( const int fd, uint8_t * const buf, const int size, } // end namespace -void File_index::set_errno_error( const char * const msg ) +void Lzip_index::set_errno_error( const char * const msg ) { error_ = msg; error_ += std::strerror( errno ); retval_ = 1; } -void File_index::set_num_error( const char * const msg, unsigned long long num ) +void Lzip_index::set_num_error( const char * const msg, unsigned long long num ) { char buf[80]; snprintf( buf, sizeof buf, "%s%llu", msg, num ); @@ -59,11 +59,11 @@ void File_index::set_num_error( const char * const msg, unsigned long long num ) // If successful, push last member and set pos to member header. 
-bool File_index::skip_trailing_data( const int fd, long long & pos, +bool Lzip_index::skip_trailing_data( const int fd, long long & pos, const bool ignore_trailing, const bool loose_trailing ) { enum { block_size = 16384, - buffer_size = block_size + File_trailer::size - 1 + File_header::size }; + buffer_size = block_size + Lzip_trailer::size - 1 + Lzip_header::size }; uint8_t buffer[buffer_size]; if( pos < min_member_size ) return false; int bsize = pos % block_size; // total bytes in buffer @@ -77,28 +77,28 @@ bool File_index::skip_trailing_data( const int fd, long long & pos, if( seek_read( fd, buffer, rd_size, ipos ) != rd_size ) { set_errno_error( "Error seeking member trailer: " ); return false; } const uint8_t max_msb = ( ipos + search_size ) >> 56; - for( int i = search_size; i >= File_trailer::size; --i ) + for( int i = search_size; i >= Lzip_trailer::size; --i ) if( buffer[i-1] <= max_msb ) // most significant byte of member_size { - File_trailer & trailer = - *(File_trailer *)( buffer + i - File_trailer::size ); + const Lzip_trailer & trailer = + *(const Lzip_trailer *)( buffer + i - Lzip_trailer::size ); const unsigned long long member_size = trailer.member_size(); - if( member_size == 0 ) - { while( i > File_trailer::size && buffer[i-9] == 0 ) --i; continue; } - if( member_size < min_member_size || member_size > ipos + i ) + if( member_size == 0 ) // skip trailing zeros + { while( i > Lzip_trailer::size && buffer[i-9] == 0 ) --i; continue; } + if( member_size > ipos + i || !trailer.verify_consistency() ) continue; - File_header header; - if( seek_read( fd, header.data, File_header::size, - ipos + i - member_size ) != File_header::size ) + Lzip_header header; + if( seek_read( fd, header.data, Lzip_header::size, + ipos + i - member_size ) != Lzip_header::size ) { set_errno_error( "Error reading member header: " ); return false; } const unsigned dictionary_size = header.dictionary_size(); if( !header.verify_magic() || !header.verify_version() || 
!isvalid_ds( dictionary_size ) ) continue; - if( (*(File_header *)( buffer + i )).verify_prefix( bsize - i ) ) + if( (*(const Lzip_header *)( buffer + i )).verify_prefix( bsize - i ) ) { error_ = "Last member in input file is truncated or corrupt."; retval_ = 2; return false; } - if( !loose_trailing && bsize - i >= File_header::size && - (*(File_header *)( buffer + i )).verify_corrupt() ) + if( !loose_trailing && bsize - i >= Lzip_header::size && + (*(const Lzip_header *)( buffer + i )).verify_corrupt() ) { error_ = corrupt_mm_msg; retval_ = 2; return false; } if( !ignore_trailing ) { error_ = trailing_msg; retval_ = 2; return false; } @@ -108,10 +108,10 @@ bool File_index::skip_trailing_data( const int fd, long long & pos, return true; } if( ipos <= 0 ) - { set_num_error( "Member size in trailer is corrupt at pos ", pos - 8 ); + { set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); return false; } bsize = buffer_size; - search_size = bsize - File_header::size; + search_size = bsize - Lzip_header::size; rd_size = block_size; ipos -= rd_size; std::memcpy( buffer + rd_size, buffer, buffer_size - rd_size ); @@ -119,20 +119,20 @@ bool File_index::skip_trailing_data( const int fd, long long & pos, } -File_index::File_index( const int infd, const bool ignore_trailing, +Lzip_index::Lzip_index( const int infd, const bool ignore_trailing, const bool loose_trailing ) - : isize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ) + : insize( lseek( infd, 0, SEEK_END ) ), retval_( 0 ) { - if( isize < 0 ) + if( insize < 0 ) { set_errno_error( "Input file is not seekable: " ); return; } - if( isize < min_member_size ) + if( insize < min_member_size ) { error_ = "Input file is too short."; retval_ = 2; return; } - if( isize > INT64_MAX ) + if( insize > INT64_MAX ) { error_ = "Input file is too long (2^63 bytes or more)."; retval_ = 2; return; } - File_header header; - if( seek_read( infd, header.data, File_header::size, 0 ) != File_header::size ) + Lzip_header header; + if( 
seek_read( infd, header.data, Lzip_header::size, 0 ) != Lzip_header::size ) { set_errno_error( "Error reading member header: " ); return; } if( !header.verify_magic() ) { error_ = bad_magic_msg; retval_ = 2; return; } @@ -141,24 +141,24 @@ File_index::File_index( const int infd, const bool ignore_trailing, if( !isvalid_ds( header.dictionary_size() ) ) { error_ = bad_dict_msg; retval_ = 2; return; } - long long pos = isize; // always points to a header or to EOF + long long pos = insize; // always points to a header or to EOF while( pos >= min_member_size ) { - File_trailer trailer; - if( seek_read( infd, trailer.data, File_trailer::size, - pos - File_trailer::size ) != File_trailer::size ) + Lzip_trailer trailer; + if( seek_read( infd, trailer.data, Lzip_trailer::size, + pos - Lzip_trailer::size ) != Lzip_trailer::size ) { set_errno_error( "Error reading member trailer: " ); break; } const unsigned long long member_size = trailer.member_size(); - if( member_size < min_member_size || member_size > (unsigned long long)pos ) + if( member_size > (unsigned long long)pos || !trailer.verify_consistency() ) { if( member_vector.empty() ) { if( skip_trailing_data( infd, pos, ignore_trailing, loose_trailing ) ) continue; else return; } - set_num_error( "Member size in trailer is corrupt at pos ", pos - 8 ); + set_num_error( "Bad trailer at pos ", pos - Lzip_trailer::size ); break; } - if( seek_read( infd, header.data, File_header::size, - pos - member_size ) != File_header::size ) + if( seek_read( infd, header.data, Lzip_header::size, + pos - member_size ) != Lzip_header::size ) { set_errno_error( "Error reading member header: " ); break; } const unsigned dictionary_size = header.dictionary_size(); if( !header.verify_magic() || !header.verify_version() || @@ -181,7 +181,7 @@ File_index::File_index( const int infd, const bool ignore_trailing, return; } std::reverse( member_vector.begin(), member_vector.end() ); - for( unsigned long i = 0; i < member_vector.size() - 1; ++i ) + 
for( unsigned long i = 0; ; ++i ) { const long long end = member_vector[i].dblock.end(); if( end < 0 || end > INT64_MAX ) @@ -190,6 +190,7 @@ File_index::File_index( const int infd, const bool ignore_trailing, error_ = "Data in input file is too long (2^63 bytes or more)."; retval_ = 2; return; } + if( i + 1 >= member_vector.size() ) break; member_vector[i+1].dblock.pos( end ); } } diff --git a/file_index.h b/lzip_index.h index 7cba508..3be6756 100644 --- a/file_index.h +++ b/lzip_index.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,7 +36,7 @@ public: }; -class File_index +class Lzip_index { struct Member { @@ -50,7 +50,7 @@ class File_index std::vector< Member > member_vector; std::string error_; - const long long isize; + const long long insize; int retval_; void set_errno_error( const char * const msg ); @@ -59,7 +59,7 @@ class File_index const bool ignore_trailing, const bool loose_trailing ); public: - File_index( const int infd, const bool ignore_trailing, + Lzip_index( const int infd, const bool ignore_trailing, const bool loose_trailing ); long members() const { return member_vector.size(); } @@ -76,7 +76,7 @@ public: // total size including trailing data (if any) long long file_size() const - { if( isize >= 0 ) return isize; else return 0; } + { if( insize >= 0 ) return insize; else return 0; } const Block & dblock( const long i ) const { return member_vector[i].dblock; } @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2018 Antonio Diaz Diaz. + Copyright (C) 2008-2019 Antonio Diaz Diaz. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,20 +38,25 @@ #include <unistd.h> #include <utime.h> #include <sys/stat.h> -#if defined(__MSVCRT__) +#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__) #include <io.h> +#if defined(__MSVCRT__) #define fchmod(x,y) 0 #define fchown(x,y,z) 0 #define strtoull std::strtoul #define SIGHUP SIGTERM #define S_ISSOCK(x) 0 +#ifndef S_IRGRP #define S_IRGRP 0 #define S_IWGRP 0 #define S_IROTH 0 #define S_IWOTH 0 #endif -#if defined(__OS2__) -#include <io.h> +#endif +#if defined(__DJGPP__) +#define S_ISSOCK(x) 0 +#define S_ISVTX 0 +#endif #endif #include "arg_parser.h" @@ -73,9 +78,8 @@ int verbosity = 0; namespace { -const char * const Program_name = "Lzip"; const char * const program_name = "lzip"; -const char * const program_year = "2018"; +const char * const program_year = "2019"; const char * invocation_name = 0; const struct { const char * from; const char * to; } known_extensions[] = { @@ -91,6 +95,8 @@ struct Lzma_options enum Mode { m_compress, m_decompress, m_list, m_test }; +/* Variables used in signal handler context. + They are not declared volatile because the handler never returns. */ std::string output_filename; int outfd = -1; bool delete_output_on_interrupt = false; @@ -98,8 +104,14 @@ bool delete_output_on_interrupt = false; void show_help() { - std::printf( "%s - LZMA lossless data compressor.\n", Program_name ); - std::printf( "\nUsage: %s [options] [files]\n", invocation_name ); + std::printf( "Lzip is a lossless data compressor with a user interface similar to the\n" + "one of gzip or bzip2. Lzip can compress about as fast as gzip (lzip -0)\n" + "or compress most files more than bzip2 (lzip -9). Decompression speed is\n" + "intermediate between gzip and bzip2. Lzip is better than gzip and bzip2\n" + "from a data recovery perspective. 
Lzip has been designed, written and\n" + "tested with great care to replace gzip and bzip2 as the standard\n" + "general-purpose compressed format for unix-like systems.\n" + "\nUsage: %s [options] [files]\n", invocation_name ); std::printf( "\nOptions:\n" " -h, --help display this help and exit\n" " -V, --version output version information and exit\n" @@ -115,7 +127,7 @@ void show_help() " -o, --output=<file> if reading standard input, write to <file>\n" " -q, --quiet suppress all messages\n" " -s, --dictionary-size=<bytes> set dictionary size limit in bytes [8 MiB]\n" - " -S, --volume-size=<bytes> set volume size limit in bytes, implies -k\n" + " -S, --volume-size=<bytes> set volume size limit in bytes\n" " -t, --test test compressed file integrity\n" " -v, --verbose be verbose (a 2nd -v gives more)\n" " -0 .. -9 set compression level [default 6]\n" @@ -258,7 +270,7 @@ int get_dict_size( const char * const arg ) const long bits = std::strtol( arg, &tail, 0 ); if( bits >= min_dictionary_bits && bits <= max_dictionary_bits && *tail == 0 ) - return ( 1 << bits ); + return 1 << bits; return getnum( arg, min_dictionary_size, max_dictionary_size ); } @@ -404,8 +416,17 @@ bool check_tty( const char * const input_filename, const int infd, } +void set_signals( void (*action)(int) ) + { + std::signal( SIGHUP, action ); + std::signal( SIGINT, action ); + std::signal( SIGTERM, action ); + } + + void cleanup_and_fail( const int retval ) { + set_signals( SIG_IGN ); // ignore signals if( delete_output_on_interrupt ) { delete_output_on_interrupt = false; @@ -420,6 +441,13 @@ void cleanup_and_fail( const int retval ) } +extern "C" void signal_handler( int ) + { + show_error( "Control-C or similar caught, quitting." ); + cleanup_and_fail( 1 ); + } + + // Set permissions, owner and times. 
void close_and_set_permissions( const struct stat * const in_statsp ) { @@ -483,7 +511,7 @@ int compress( const unsigned long long cfile_size, encoder = new FLZ_encoder( infd, outfd ); else { - File_header header; + Lzip_header header; if( header.dictionary_size( encoder_options.dictionary_size ) && encoder_options.match_len_limit >= min_match_len_limit && encoder_options.match_len_limit <= max_match_len ) @@ -534,12 +562,12 @@ int compress( const unsigned long long cfile_size, in_size, out_size ); } } - catch( std::bad_alloc ) + catch( std::bad_alloc & ) { pp( "Not enough memory. Try a smaller dictionary size." ); retval = 1; } - catch( Error e ) { pp(); show_error( e.msg, errno ); retval = 1; } + catch( Error & e ) { pp(); show_error( e.msg, errno ); retval = 1; } delete encoder; return retval; } @@ -590,9 +618,9 @@ int decompress( const unsigned long long cfile_size, const int infd, Range_decoder rdec( infd ); for( bool first_member = true; ; first_member = false ) { - File_header header; + Lzip_header header; rdec.reset_member_position(); - const int size = rdec.read_data( header.data, File_header::size ); + const int size = rdec.read_data( header.data, Lzip_header::size ); if( rdec.finished() ) // End Of File { if( first_member ) @@ -646,28 +674,13 @@ int decompress( const unsigned long long cfile_size, const int infd, { std::fputs( testing ? "ok\n" : "done\n", stderr ); pp.reset(); } } } - catch( std::bad_alloc ) { pp( "Not enough memory." ); retval = 1; } - catch( Error e ) { pp(); show_error( e.msg, errno ); retval = 1; } + catch( std::bad_alloc & ) { pp( "Not enough memory." ); retval = 1; } + catch( Error & e ) { pp(); show_error( e.msg, errno ); retval = 1; } if( verbosity == 1 && retval == 0 ) std::fputs( testing ? "ok\n" : "done\n", stderr ); return retval; } - -extern "C" void signal_handler( int ) - { - show_error( "Control-C or similar caught, quitting." 
); - cleanup_and_fail( 1 ); - } - - -void set_signals() - { - std::signal( SIGHUP, signal_handler ); - std::signal( SIGINT, signal_handler ); - std::signal( SIGTERM, signal_handler ); - } - } // end namespace @@ -675,11 +688,9 @@ void show_error( const char * const msg, const int errcode, const bool help ) { if( verbosity < 0 ) return; if( msg && msg[0] ) - { - std::fprintf( stderr, "%s: %s", program_name, msg ); - if( errcode > 0 ) std::fprintf( stderr, ": %s", std::strerror( errcode ) ); - std::fputc( '\n', stderr ); - } + std::fprintf( stderr, "%s: %s%s%s\n", program_name, msg, + ( errcode > 0 ) ? ": " : "", + ( errcode > 0 ) ? std::strerror( errcode ) : "" ); if( help ) std::fprintf( stderr, "Try '%s --help' for more information.\n", invocation_name ); @@ -689,10 +700,10 @@ void show_error( const char * const msg, const int errcode, const bool help ) void show_file_error( const char * const filename, const char * const msg, const int errcode ) { - if( verbosity < 0 ) return; - std::fprintf( stderr, "%s: %s: %s", program_name, filename, msg ); - if( errcode > 0 ) std::fprintf( stderr, ": %s", std::strerror( errcode ) ); - std::fputc( '\n', stderr ); + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: %s: %s%s%s\n", program_name, filename, msg, + ( errcode > 0 ) ? ": " : "", + ( errcode > 0 ) ? 
std::strerror( errcode ) : "" ); } @@ -874,7 +885,7 @@ int main( const int argc, const char * const argv[] ) } } // end process options -#if defined(__MSVCRT__) || defined(__OS2__) +#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__) setmode( STDIN_FILENO, O_BINARY ); setmode( STDOUT_FILENO, O_BINARY ); #endif @@ -900,7 +911,7 @@ int main( const int argc, const char * const argv[] ) if( !to_stdout && program_mode != m_test && ( filenames_given || default_output_filename.size() ) ) - set_signals(); + set_signals( signal_handler ); Pretty_print pp( filenames ); @@ -979,6 +990,12 @@ int main( const int argc, const char * const argv[] ) else tmp = decompress( cfile_size, infd, pp, ignore_trailing, loose_trailing, program_mode == m_test ); + if( close( infd ) != 0 ) + { + show_error( input_filename.size() ? "Error closing input file" : + "Error closing stdin", errno ); + if( tmp < 1 ) tmp = 1; + } if( tmp > retval ) retval = tmp; if( tmp ) { if( program_mode != m_test ) cleanup_and_fail( retval ); @@ -988,7 +1005,6 @@ int main( const int argc, const char * const argv[] ) close_and_set_permissions( in_statsp ); if( input_filename.size() ) { - close( infd ); if( !keep_input_files && !to_stdout && program_mode != m_test && ( program_mode != m_compress || volume_size == 0 ) ) std::remove( input_filename.c_str() ); diff --git a/testsuite/check.sh b/testsuite/check.sh index 6cb1616..ba85edc 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,6 +1,6 @@ #! /bin/sh # check script for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2018 Antonio Diaz Diaz. +# Copyright (C) 2008-2019 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -36,12 +36,15 @@ test_failed() { fail=1 ; printf " $1" ; [ -z "$2" ] || printf "($2)" ; } printf "testing lzip-%s..." "$2" "${LZIP}" -fkqm4 in -{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO +[ $? = 1 ] || test_failed $LINENO +[ ! 
-e in.lz ] || test_failed $LINENO "${LZIP}" -fkqm274 in -{ [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO +[ $? = 1 ] || test_failed $LINENO +[ ! -e in.lz ] || test_failed $LINENO for i in bad_size -1 0 4095 513MiB 1G 1T 1P 1E 1Z 1Y 10KB ; do "${LZIP}" -fkqs $i in - { [ $? = 1 ] && [ ! -e in.lz ] ; } || test_failed $LINENO $i + [ $? = 1 ] || test_failed $LINENO $i + [ ! -e in.lz ] || test_failed $LINENO $i done "${LZIP}" -lq in [ $? = 2 ] || test_failed $LINENO @@ -91,31 +94,34 @@ printf "\ntesting decompression..." "${LZIP}" -cd "${in_lz}" > copy || test_failed $LINENO cmp in copy || test_failed $LINENO -rm -f copy +rm -f copy || framework_failure cat "${in_lz}" > copy.lz || framework_failure "${LZIP}" -dk copy.lz || test_failed $LINENO cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure "${LZIP}" -d copy.lz 2> /dev/null [ $? = 1 ] || test_failed $LINENO -"${LZIP}" -df copy.lz -{ [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; } || test_failed $LINENO +"${LZIP}" -df copy.lz || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +cmp in copy || test_failed $LINENO -rm -f copy +rm -f copy || framework_failure cat "${in_lz}" > copy.lz || framework_failure -"${LZIP}" -d -S100k copy.lz -{ [ $? = 0 ] && [ ! -e copy.lz ] && cmp in copy ; } || test_failed $LINENO +"${LZIP}" -d -S100k copy.lz || test_failed $LINENO # ignore -S +[ ! -e copy.lz ] || test_failed $LINENO +cmp in copy || test_failed $LINENO printf "to be overwritten" > copy || framework_failure "${LZIP}" -df -o copy < "${in_lz}" || test_failed $LINENO cmp in copy || test_failed $LINENO -rm -f copy +rm -f copy || framework_failure "${LZIP}" < in > anyothername || test_failed $LINENO -"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null -{ [ $? 
= 0 ] && cmp in copy && cmp in anyothername.out ; } || +"${LZIP}" -dv --output copy - anyothername - < "${in_lz}" 2> /dev/null || test_failed $LINENO -rm -f copy anyothername.out +cmp in copy || test_failed $LINENO +cmp in anyothername.out || test_failed $LINENO +rm -f copy anyothername.out || framework_failure "${LZIP}" -lq in "${in_lz}" [ $? = 2 ] || test_failed $LINENO @@ -126,10 +132,12 @@ rm -f copy anyothername.out "${LZIP}" -tq nx_file.lz "${in_lz}" [ $? = 1 ] || test_failed $LINENO "${LZIP}" -cdq in "${in_lz}" > copy -{ [ $? = 2 ] && cat copy in | cmp in - ; } || test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +cat copy in | cmp in - || test_failed $LINENO "${LZIP}" -cdq nx_file.lz "${in_lz}" > copy -{ [ $? = 1 ] && cmp in copy ; } || test_failed $LINENO -rm -f copy +[ $? = 1 ] || test_failed $LINENO +cmp in copy || test_failed $LINENO +rm -f copy || framework_failure cat "${in_lz}" > copy.lz || framework_failure for i in 1 2 3 4 5 6 7 ; do printf "g" >> copy.lz || framework_failure @@ -139,11 +147,15 @@ for i in 1 2 3 4 5 6 7 ; do [ $? = 2 ] || test_failed $LINENO $i done "${LZIP}" -dq in copy.lz -{ [ $? = 2 ] && [ -e copy.lz ] && [ ! -e copy ] && [ ! -e in.out ] ; } || - test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +[ -e copy.lz ] || test_failed $LINENO +[ ! -e copy ] || test_failed $LINENO +[ ! -e in.out ] || test_failed $LINENO "${LZIP}" -dq nx_file.lz copy.lz -{ [ $? = 1 ] && [ ! -e copy.lz ] && [ ! -e nx_file ] && cmp in copy ; } || - test_failed $LINENO +[ $? = 1 ] || test_failed $LINENO +[ ! -e copy.lz ] || test_failed $LINENO +[ ! 
-e nx_file ] || test_failed $LINENO +cmp in copy || test_failed $LINENO cat in in > in2 || framework_failure cat "${in_lz}" "${in_lz}" > in2.lz || framework_failure @@ -160,7 +172,7 @@ cmp in2 copy2 || test_failed $LINENO printf "\ngarbage" >> copy2.lz || framework_failure "${LZIP}" -tvvvv copy2.lz 2> /dev/null || test_failed $LINENO -rm -f copy2 +rm -f copy2 || framework_failure "${LZIP}" -alq copy2.lz [ $? = 2 ] || test_failed $LINENO "${LZIP}" -atq copy2.lz @@ -168,12 +180,15 @@ rm -f copy2 "${LZIP}" -atq < copy2.lz [ $? = 2 ] || test_failed $LINENO "${LZIP}" -adkq copy2.lz -{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy2 ] || test_failed $LINENO "${LZIP}" -adkq -o copy2 < copy2.lz -{ [ $? = 2 ] && [ ! -e copy2 ] ; } || test_failed $LINENO +[ $? = 2 ] || test_failed $LINENO +[ ! -e copy2 ] || test_failed $LINENO printf "to be overwritten" > copy2 || framework_failure "${LZIP}" -df copy2.lz || test_failed $LINENO cmp in2 copy2 || test_failed $LINENO +rm -f in2 copy2 || framework_failure printf "\ntesting compression..." 
@@ -209,73 +224,94 @@ for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do "${LZIP}" -df -o copy < out.lz || test_failed $LINENO $i cmp in copy || test_failed $LINENO $i done +rm -f out.lz || framework_failure cat in in in in in in in in > in8 || framework_failure "${LZIP}" -1s12 -S100k in8 || test_failed $LINENO "${LZIP}" -t in800001.lz in800002.lz || test_failed $LINENO "${LZIP}" -cd in800001.lz in800002.lz | cmp in8 - || test_failed $LINENO -rm -f in800001.lz in800002.lz +rm -f in800001.lz in800002.lz || framework_failure "${LZIP}" -1s12 -S100k -o out.lz < in8 || test_failed $LINENO "${LZIP}" -t out.lz00001.lz out.lz00002.lz || test_failed $LINENO "${LZIP}" -cd out.lz00001.lz out.lz00002.lz | cmp in8 - || test_failed $LINENO -rm -f out.lz00001.lz out.lz00002.lz +rm -f out.lz00001.lz out.lz00002.lz || framework_failure "${LZIP}" -1ks4Ki -b100000 in8 || test_failed $LINENO "${LZIP}" -t in8.lz || test_failed $LINENO "${LZIP}" -cd in8.lz | cmp in8 - || test_failed $LINENO -rm -f in8 +rm -f in8 || framework_failure "${LZIP}" -0 -S100k -o out < in8.lz || test_failed $LINENO "${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO "${LZIP}" -cd out00001.lz out00002.lz | cmp in8.lz - || test_failed $LINENO -rm -f out00001.lz +rm -f out00001.lz || framework_failure "${LZIP}" -1 -S100k -o out < in8.lz || test_failed $LINENO "${LZIP}" -t out00001.lz out00002.lz || test_failed $LINENO "${LZIP}" -cd out00001.lz out00002.lz | cmp in8.lz - || test_failed $LINENO -rm -f out00001.lz out00002.lz +rm -f out00001.lz out00002.lz || framework_failure "${LZIP}" -0 -F -S100k in8.lz || test_failed $LINENO "${LZIP}" -t in8.lz00001.lz in8.lz00002.lz || test_failed $LINENO "${LZIP}" -cd in8.lz00001.lz in8.lz00002.lz | cmp in8.lz - || test_failed $LINENO -rm -f in8.lz00001.lz in8.lz00002.lz +rm -f in8.lz00001.lz in8.lz00002.lz || framework_failure "${LZIP}" -0kF -b100k in8.lz || test_failed $LINENO "${LZIP}" -t in8.lz.lz || test_failed $LINENO "${LZIP}" -cd in8.lz.lz | cmp in8.lz - || test_failed 
$LINENO -rm -f in8.lz in8.lz.lz +rm -f in8.lz in8.lz.lz || framework_failure printf "\ntesting bad input..." headers='LZIp LZiP LZip LzIP LzIp LziP lZIP lZIp lZiP lzIP' body='\001\014\000\203\377\373\377\377\300\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000$\000\000\000\000\000\000\000' -cat "${in_lz}" > in0.lz -printf "LZIP${body}" >> in0.lz -if "${LZIP}" -tq in0.lz ; then +cat "${in_lz}" > int.lz +printf "LZIP${body}" >> int.lz +if "${LZIP}" -tq int.lz ; then for header in ${headers} ; do - printf "${header}${body}" > in0.lz # first member - "${LZIP}" -lq in0.lz + printf "${header}${body}" > int.lz # first member + "${LZIP}" -lq int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq < int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -cdq int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -lq --loose-trailing int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq --loose-trailing int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -tq --loose-trailing < int.lz + [ $? = 2 ] || test_failed $LINENO ${header} + "${LZIP}" -cdq --loose-trailing int.lz > /dev/null + [ $? = 2 ] || test_failed $LINENO ${header} + cat "${in_lz}" > int.lz + printf "${header}${body}" >> int.lz # trailing data + "${LZIP}" -lq int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq in0.lz + "${LZIP}" -tq int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing in0.lz + "${LZIP}" -tq < int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq --loose-trailing in0.lz + "${LZIP}" -cdq int.lz > /dev/null [ $? 
= 2 ] || test_failed $LINENO ${header} - cat "${in_lz}" > in0.lz - printf "${header}${body}" >> in0.lz # trailing data - "${LZIP}" -lq in0.lz + "${LZIP}" -lq --loose-trailing int.lz || + test_failed $LINENO ${header} + "${LZIP}" -t --loose-trailing int.lz || + test_failed $LINENO ${header} + "${LZIP}" -t --loose-trailing < int.lz || + test_failed $LINENO ${header} + "${LZIP}" -cd --loose-trailing int.lz > /dev/null || + test_failed $LINENO ${header} + "${LZIP}" -lq --loose-trailing --trailing-error int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq in0.lz + "${LZIP}" -tq --loose-trailing --trailing-error int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing in0.lz - [ $? = 0 ] || test_failed $LINENO ${header} - "${LZIP}" -t --loose-trailing in0.lz - [ $? = 0 ] || test_failed $LINENO ${header} - "${LZIP}" -lq --loose-trailing --trailing-error in0.lz + "${LZIP}" -tq --loose-trailing --trailing-error < int.lz [ $? = 2 ] || test_failed $LINENO ${header} - "${LZIP}" -tq --loose-trailing --trailing-error in0.lz + "${LZIP}" -cdq --loose-trailing --trailing-error int.lz > /dev/null [ $? = 2 ] || test_failed $LINENO ${header} done else printf "\nwarning: skipping header test: 'printf' does not work on your system." fi -rm -f in0.lz +rm -f int.lz || framework_failure cat "${in_lz}" "${in_lz}" "${in_lz}" > in3.lz || framework_failure if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && @@ -296,7 +332,7 @@ if dd if=in3.lz of=trunc.lz bs=14752 count=1 2> /dev/null && else printf "\nwarning: skipping truncation test: 'dd' does not work on your system." 
fi -rm -f in3.lz trunc.lz +rm -f in2.lz in3.lz trunc.lz out || framework_failure cat "${in_lz}" > ingin.lz || framework_failure printf "g" >> ingin.lz || framework_failure @@ -309,7 +345,7 @@ cmp in copy || test_failed $LINENO "${LZIP}" -t < ingin.lz || test_failed $LINENO "${LZIP}" -d < ingin.lz > copy || test_failed $LINENO cmp in copy || test_failed $LINENO -rm -f ingin.lz +rm -f copy ingin.lz || framework_failure echo if [ ${fail} = 0 ] ; then |