diff options
-rw-r--r-- | AUTHORS | 7 | ||||
-rw-r--r-- | ChangeLog | 14 | ||||
-rw-r--r-- | Makefile.in | 4 | ||||
-rw-r--r-- | NEWS | 20 | ||||
-rw-r--r-- | arg_parser.cc | 4 | ||||
-rw-r--r-- | arg_parser.h | 4 | ||||
-rwxr-xr-x | configure | 4 | ||||
-rw-r--r-- | doc/lzlib.info | 39 | ||||
-rw-r--r-- | doc/lzlib.texinfo | 17 | ||||
-rw-r--r-- | encoder.cc | 44 | ||||
-rw-r--r-- | encoder.h | 25 | ||||
-rw-r--r-- | lzip.h | 8 | ||||
-rw-r--r-- | lzlib.h | 2 | ||||
-rw-r--r-- | main.cc | 35 | ||||
-rwxr-xr-x | testsuite/check.sh | 9 |
15 files changed, 138 insertions, 98 deletions
@@ -1,4 +1,7 @@ Lzlib was written by Antonio Diaz Diaz. -Lzlib implements a simplified version of the LZMA algorithm. -The original LZMA algorithm was designed by Igor Pavlov. +The ideas embodied in lzlib are due to (at least) the following people: +Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for +the definition of Markov chains), G.N.N. Martin (for the definition of +range encoding), Igor Pavlov (for putting all the above together in +LZMA), and Julian Seward (for bzip2's CLI and the idea of unzcrash). @@ -1,3 +1,17 @@ +2011-10-25 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.2 released. + * encoder.h (Lee_update_prices): Update high length symbol prices + independently of the value of `pos_state'. This gives better + compression for large values of `--match-length' without being + slower. + * encoder.h encoder.cc: Optimize pair price calculations. This + reduces compression time for large values of `--match-length' + by up to 6%. + * main.cc: Added new option `-F, --recompress'. + * Makefile.in: `make install' no more tries to run + `/sbin/ldconfig' on systems lacking it. + 2011-01-03 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.1 released. diff --git a/Makefile.in b/Makefile.in index 94e0114..fd9c6b6 100644 --- a/Makefile.in +++ b/Makefile.in @@ -5,7 +5,7 @@ INSTALL = install INSTALL_PROGRAM = $(INSTALL) -p -m 755 INSTALL_DATA = $(INSTALL) -p -m 644 INSTALL_DIR = $(INSTALL) -d -m 755 -LDCONFIG = ldconfig +LDCONFIG = /sbin/ldconfig SHELL = /bin/sh lib_objs = decoder.o encoder.o lzlib.o @@ -101,7 +101,7 @@ install : all install-info else run_ldconfig=yes ; \ fi ; \ cd "$(DESTDIR)$(libdir)" && ln -s lib$(libname).so.$(pkgversion) lib$(libname).so.$(soversion) ; \ - if [ $${run_ldconfig} = yes ] ; then $(LDCONFIG) "$(DESTDIR)$(libdir)" ; fi ; \ + if [ $${run_ldconfig} = yes ] && [ -x "$(LDCONFIG)" ] ; then "$(LDCONFIG)" -n "$(DESTDIR)$(libdir)" ; fi ; \ fi install-info : @@ -1,16 +1,10 @@ -Changes in version 1.1: +Changes in version 1.2: -Compression time has been reduced by 2%. +For large values of "--match-length", compression ratio has been +slightly increased and compression time has been reduced by up to 6%. -All declarations not belonging to the API have been encapsulated in the -namespace "Lzlib". +The option "-F, --recompress", which forces recompression of files whose +name already has the ".lz" or ".tlz" suffix, has been added to minilzip. -New tests have been added to the testsuite. - -Match length limits set by options -1 to -9 of minilzip have been -changed to match those of lzip 1.11. - -Minilzip now sets stdin and stdout in binary mode on OS2. - -The file bbexample.cc, containing example functions for buffer-to-buffer -compression/decompression, has been added. +"make install" no more tries to run "/sbin/ldconfig" on systems lacking +it. diff --git a/arg_parser.cc b/arg_parser.cc index cc9f87d..bc4b4a3 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,5 @@ -/* Arg_parser - A POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006, 2007, 2008, 2009, 2010 Antonio Diaz Diaz. +/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) + Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/arg_parser.h b/arg_parser.h index da1cc94..d1e5c02 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,5 @@ -/* Arg_parser - A POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006, 2007, 2008, 2009, 2010 Antonio Diaz Diaz. +/* Arg_parser - POSIX/GNU command line argument parser. (C++ version) + Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -4,13 +4,11 @@ # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. -# -# Date of this version: 2011-01-03 args= no_create= pkgname=lzlib -pkgversion=1.1 +pkgversion=1.2 soversion=1 progname=minilzip progname_shared= diff --git a/doc/lzlib.info b/doc/lzlib.info index 4fddce8..e502f6b 100644 --- a/doc/lzlib.info +++ b/doc/lzlib.info @@ -12,7 +12,7 @@ File: lzlib.info, Node: Top, Next: Introduction, Up: (dir) Lzlib Manual ************ -This manual is for Lzlib (version 1.1, 3 January 2011). +This manual is for Lzlib (version 1.2, 25 October 2011). * Menu: @@ -180,7 +180,8 @@ verified by calling `LZ_compress_errno' before using it. DICTIONARY_SIZE sets the dictionary size to be used, in bytes. Valid values range from 4KiB to 512MiB. Note that dictionary sizes are quantized. If the specified size does not match one of the - valid sizes, it will be rounded upwards. + valid sizes, it will be rounded upwards by adding up to + (DICTIONARY_SIZE / 16) to it. MATCH_LEN_LIMIT sets the match length limit in bytes. Valid values range from 5 to 273. Larger values usually give better compression @@ -252,7 +253,7 @@ verified by calling `LZ_compress_errno' before using it. -- Function: enum LZ_Errno LZ_compress_errno ( struct LZ_Encoder * const ENCODER ) - Returns the current error code for ENCODER (*note Error Codes::) + Returns the current error code for ENCODER (*note Error Codes::). -- Function: int LZ_compress_finished ( struct LZ_Encoder * const ENCODER ) @@ -367,7 +368,7 @@ be verified by calling `LZ_decompress_errno' before using it. -- Function: enum LZ_Errno LZ_decompress_errno ( struct LZ_Decoder * const DECODER ) - Returns the current error code for DECODER (*note Error Codes::) + Returns the current error code for DECODER (*note Error Codes::). -- Function: int LZ_decompress_finished ( struct LZ_Decoder * const DECODER ) @@ -514,13 +515,13 @@ with no additional information before, between, or after them. All multibyte values are stored in little endian order. `ID string' - A four byte string, identifying the member type, with the value + A four byte string, identifying the lzip format, with the value "LZIP". `VN (version number, 1 byte)' Just in case something needs to be modified in the future. Valid - values are 0 and 1. Version 0 files have only one member and lack - `Member size'. + values are 0 and 1. Version 0 files are deprecated. They can + contain only one member and lack the `Member size' field. `DS (coded dictionary size, 1 byte)' Bits 4-0 contain the base 2 logarithm of the base dictionary size. @@ -703,17 +704,17 @@ Concept Index Tag Table: Node: Top219 -Node: Introduction1310 -Node: Library Version3135 -Node: Buffering3780 -Node: Parameter Limits4900 -Node: Compression Functions5857 -Node: Decompression Functions11903 -Node: Error Codes17974 -Node: Error Messages19913 -Node: Data Format20492 -Node: Examples22462 -Node: Problems26328 -Node: Concept Index26900 +Node: Introduction1311 +Node: Library Version3136 +Node: Buffering3781 +Node: Parameter Limits4901 +Node: Compression Functions5858 +Node: Decompression Functions11955 +Node: Error Codes18027 +Node: Error Messages19966 +Node: Data Format20545 +Node: Examples22553 +Node: Problems26419 +Node: Concept Index26991 End Tag Table diff --git a/doc/lzlib.texinfo b/doc/lzlib.texinfo index d03d817..55e5fa6 100644 --- a/doc/lzlib.texinfo +++ b/doc/lzlib.texinfo @@ -5,8 +5,8 @@ @finalout @c %**end of header -@set UPDATED 3 January 2011 -@set VERSION 1.1 +@set UPDATED 25 October 2011 +@set VERSION 1.2 @dircategory Data Compression @direntry @@ -204,7 +204,8 @@ should be freed with @samp{LZ_compress_close} to avoid memory leaks. @var{dictionary_size} sets the dictionary size to be used, in bytes. Valid values range from 4KiB to 512MiB. Note that dictionary sizes are quantized. If the specified size does not match one of the valid sizes, -it will be rounded upwards. +it will be rounded upwards by adding up to (@var{dictionary_size} / 16) +to it. @var{match_len_limit} sets the match length limit in bytes. Valid values range from 5 to 273. Larger values usually give better compression @@ -285,7 +286,7 @@ accept a @var{size} up to the returned number of bytes. @deftypefun {enum LZ_Errno} LZ_compress_errno ( struct LZ_Encoder * const @var{encoder} ) -Returns the current error code for @var{encoder} (@pxref{Error Codes}) +Returns the current error code for @var{encoder} (@pxref{Error Codes}). @end deftypefun @@ -417,7 +418,7 @@ will accept a @var{size} up to the returned number of bytes. @deftypefun {enum LZ_Errno} LZ_decompress_errno ( struct LZ_Decoder * const @var{decoder} ) -Returns the current error code for @var{decoder} (@pxref{Error Codes}) +Returns the current error code for @var{decoder} (@pxref{Error Codes}). @end deftypefun @@ -585,12 +586,12 @@ All multibyte values are stored in little endian order. @table @samp @item ID string -A four byte string, identifying the member type, with the value "LZIP". +A four byte string, identifying the lzip format, with the value "LZIP". @item VN (version number, 1 byte) Just in case something needs to be modified in the future. Valid values -are 0 and 1. Version 0 files have only one member and lack @samp{Member -size}. +are 0 and 1. Version 0 files are deprecated. They can contain only one +member and lack the @samp{Member size} field. @item DS (coded dictionary size, 1 byte) Bits 4-0 contain the base 2 logarithm of the base dictionary size.@* @@ -353,22 +353,23 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], return 1; } - { - const int normal_match_price = match_price + price0( bm_rep[state()] ); - int len = min_match_len; if( main_len <= replens[rep_index] ) { main_len = replens[rep_index]; - for( ; len <= main_len; ++len ) trials[len].price = infinite_price; + for( int len = min_match_len; len <= main_len; ++len ) + trials[len].price = infinite_price; } - else for( ; len <= main_len; ++len ) + else { - trials[len].dis = match_distances[len] + num_rep_distances; - trials[len].prev_index = 0; - trials[len].price = normal_match_price + - price_pair( match_distances[len], len, pos_state ); + const int normal_match_price = match_price + price0( bm_rep[state()] ); + for( int len = min_match_len; len <= main_len; ++len ) + { + trials[len].dis = match_distances[len] + num_rep_distances; + trials[len].prev_index = 0; + trials[len].price = normal_match_price + + price_pair( match_distances[len], len, pos_state ); + } } - } for( int rep = 0; rep < num_rep_distances; ++rep ) { @@ -474,10 +475,25 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], while( num_trials < cur + newlen ) trials[++num_trials].price = infinite_price; - for( int len = min_match_len; len <= newlen; ++len ) - trials[cur+len].update( match_distances[len] + num_rep_distances, cur, - normal_match_price + - price_pair( match_distances[len], len, pos_state ) ); + int dis = match_distances[min_match_len]; + int dis_state = get_dis_state( min_match_len ); + int dis_price = infinite_price; + if( dis < modeled_distances ) + trials[cur+min_match_len].update( dis + num_rep_distances, cur, + normal_match_price + dis_prices[dis_state][dis] + + len_encoder.price( min_match_len, pos_state ) ); + for( int len = min_match_len + 1; len <= newlen; ++len ) + { + if( dis != match_distances[len] || dis_state < max_dis_states - 1 ) + { + dis = match_distances[len]; + dis_state = get_dis_state( len ); + dis_price = price_dis( dis, dis_state ); + } + trials[cur+len].update( dis + num_rep_distances, cur, + normal_match_price + dis_price + + len_encoder.price( len, pos_state ) ); + } } } } @@ -383,7 +383,9 @@ class Len_encoder pps[len] = tmp + price0( choice2 ) + price_symbol( bm_mid[pos_state], len - len_low_symbols, len_mid_bits ); for( ; len < len_symbols; ++len ) - pps[len] = tmp + price1( choice2 ) + + // using 4 slots per value makes "price" faster + prices[3][len] = prices[2][len] = prices[1][len] = prices[0][len] = + tmp + price1( choice2 ) + price_symbol( bm_high, len - len_low_symbols - len_mid_symbols, len_high_bits ); counters[pos_state] = len_symbols; } @@ -407,7 +409,7 @@ class Literal_encoder { Bit_model bm_literal[1<<literal_context_bits][0x300]; - int lstate( const int prev_byte ) const throw() + int lstate( const uint8_t prev_byte ) const throw() { return ( prev_byte >> ( 8 - literal_context_bits ) ); } public: @@ -522,18 +524,21 @@ class LZ_encoder return price; } + int price_dis( const int dis, const int dis_state ) const throw() + { + if( dis < modeled_distances ) + return dis_prices[dis_state][dis]; + else + return dis_slot_prices[dis_state][dis_slots[dis]] + + align_prices[dis & (dis_align_size - 1)]; + } + int price_pair( const int dis, const int len, const int pos_state ) const throw() { if( len <= min_match_len && dis >= modeled_distances ) return infinite_price; - int price = len_encoder.price( len, pos_state ); - const int dis_state = get_dis_state( len ); - if( dis < modeled_distances ) - price += dis_prices[dis_state][dis]; - else - price += dis_slot_prices[dis_state][dis_slots[dis]] + - align_prices[dis & (dis_align_size - 1)]; - return price; + return len_encoder.price( len, pos_state ) + + price_dis( dis, get_dis_state( len ) ); } void encode_pair( const uint32_t dis, const int len, const int pos_state ) throw() @@ -40,28 +40,28 @@ public: void set_char() throw() { static const unsigned char next[states] = - {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5}; + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; st = next[st]; } void set_match() throw() { static const unsigned char next[states] = - {7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10}; + { 7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10 }; st = next[st]; } void set_rep() throw() { static const unsigned char next[states] = - {8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11}; + { 8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11 }; st = next[st]; } void set_short_rep() throw() { static const unsigned char next[states] = - {9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11}; + { 9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11 }; st = next[st]; } }; @@ -29,7 +29,7 @@ extern "C" { #endif -const char * const LZ_version_string = "1.1"; +const char * const LZ_version_string = "1.2"; enum LZ_Errno { LZ_ok = 0, LZ_bad_argument, LZ_mem_error, LZ_sequence_error, LZ_header_error, LZ_unexpected_eof, @@ -18,7 +18,7 @@ Return values: 0 for a normal exit, 1 for environmental problems (file not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid input file, 3 for an internal consistency error - (eg, bug) which caused lzip to panic. + (eg, bug) which caused minilzip to panic. */ #define _FILE_OFFSET_BITS 64 @@ -138,6 +138,7 @@ void show_help() throw() std::printf( " -c, --stdout send output to standard output\n" ); std::printf( " -d, --decompress decompress\n" ); std::printf( " -f, --force overwrite existing output files\n" ); + std::printf( " -F, --recompress force recompression of compressed files\n" ); std::printf( " -k, --keep keep (don't delete) input files\n" ); std::printf( " -m, --match-length=<n> set match length limit in bytes [36]\n" ); std::printf( " -o, --output=<file> if reading stdin, place the output into <file>\n" ); @@ -184,7 +185,7 @@ const char * format_num( long long num ) throw() } -long long getnum( const char * const ptr, const int bs = 0, +long long getnum( const char * const ptr, const long long llimit = LLONG_MIN + 1, const long long ulimit = LLONG_MAX ) throw() { @@ -205,9 +206,6 @@ long long getnum( const char * const ptr, const int bs = 0, switch( tail[0] ) { case ' ': break; - case 'b': if( bs > 0 ) { factor = bs; exponent = 1; } - else bad_multiplier = true; - break; case 'Y': exponent = 8; break; case 'Z': exponent = 7; break; case 'E': exponent = 6; break; @@ -249,7 +247,7 @@ int get_dict_size( const char * const arg ) throw() if( bits >= LZ_min_dictionary_bits() && bits <= LZ_max_dictionary_bits() && *tail == 0 ) return ( 1 << bits ); - return getnum( arg, 0, LZ_min_dictionary_size(), LZ_max_dictionary_size() ); + return getnum( arg, LZ_min_dictionary_size(), LZ_max_dictionary_size() ); } @@ -268,10 +266,10 @@ int extension_index( const std::string & name ) throw() int open_instream( const std::string & name, struct stat * const in_statsp, const Mode program_mode, const int eindex, - const bool force, const bool to_stdout ) throw() + const bool recompress, const bool to_stdout ) throw() { int infd = -1; - if( program_mode == m_compress && !force && eindex >= 0 ) + if( program_mode == m_compress && !recompress && eindex >= 0 ) { if( verbosity >= 0 ) std::fprintf( stderr, "%s: Input file `%s' already has `%s' suffix.\n", @@ -331,7 +329,7 @@ void set_d_outname( const std::string & name, const int i ) throw() } } output_filename = name; output_filename += ".out"; - if( verbosity >= 0 ) + if( verbosity >= 1 ) std::fprintf( stderr, "%s: Can't guess original name for `%s' -- using `%s'.\n", program_name, name.c_str(), output_filename.c_str() ); } @@ -382,7 +380,7 @@ void cleanup_and_fail( const int retval ) throw() std::fprintf( stderr, "%s: Deleting output file `%s', if it exists.\n", program_name, output_filename.c_str() ); if( outfd >= 0 ) { close( outfd ); outfd = -1; } - if( std::remove( output_filename.c_str() ) != 0 ) + if( std::remove( output_filename.c_str() ) != 0 && errno != ENOENT ) show_error( "WARNING: deletion of output file (apparently) failed." ); } std::exit( retval ); @@ -589,12 +587,12 @@ int do_decompress( LZ_Decoder * const decoder, const int infd, std::fprintf( stderr, "version %d, dictionary size %7sB. ", LZ_decompress_member_version( decoder ), format_num( LZ_decompress_dictionary_size( decoder ) ) ); - if( verbosity >= 4 && data_position > 0 && member_size > 0 ) + if( verbosity >= 3 && data_position > 0 && member_size > 0 ) std::fprintf( stderr, "%6.3f:1, %6.3f bits/byte, %5.2f%% saved. ", (double)data_position / member_size, ( 8.0 * member_size ) / data_position, 100.0 * ( 1.0 - ( (double)member_size / data_position ) ) ); - if( verbosity >= 3 ) + if( verbosity >= 4 ) std::fprintf( stderr, "data CRC %08X, data size %9lld, member size %8lld. ", LZ_decompress_data_crc( decoder ), data_position, member_size ); @@ -778,6 +776,7 @@ int main( const int argc, const char * const argv[] ) Mode program_mode = m_compress; bool force = false; bool keep_input_files = false; + bool recompress = false; bool to_stdout = false; std::string input_filename; std::string default_output_filename; @@ -807,6 +806,7 @@ int main( const int argc, const char * const argv[] ) { 'd', "decompress", Arg_parser::no }, { 'e', "extreme", Arg_parser::no }, { 'f', "force", Arg_parser::no }, + { 'F', "recompress", Arg_parser::no }, { 'h', "help", Arg_parser::no }, { 'k', "keep", Arg_parser::no }, { 'm', "match-length", Arg_parser::yes }, @@ -834,21 +834,22 @@ int main( const int argc, const char * const argv[] ) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': encoder_options = option_mapping[code-'0']; break; - case 'b': member_size = getnum( arg, 0, 100000, LLONG_MAX / 2 ); break; + case 'b': member_size = getnum( arg, 100000, LLONG_MAX / 2 ); break; case 'c': to_stdout = true; break; case 'd': program_mode = m_decompress; break; case 'e': break; // ignored by now case 'f': force = true; break; + case 'F': recompress = true; break; case 'h': show_help(); return 0; case 'k': keep_input_files = true; break; case 'm': encoder_options.match_len_limit = - getnum( arg, 0, LZ_min_match_len_limit(), - LZ_max_match_len_limit() ); break; + getnum( arg, LZ_min_match_len_limit(), + LZ_max_match_len_limit() ); break; case 'o': default_output_filename = arg; break; case 'q': verbosity = -1; break; case 's': encoder_options.dictionary_size = get_dict_size( arg ); break; - case 'S': volume_size = getnum( arg, 0, 100000, LLONG_MAX / 2 ); break; + case 'S': volume_size = getnum( arg, 100000, LLONG_MAX / 2 ); break; case 't': program_mode = m_test; break; case 'v': if( verbosity < 4 ) ++verbosity; break; case 'V': show_version(); return 0; @@ -911,7 +912,7 @@ int main( const int argc, const char * const argv[] ) input_filename = filenames[i]; const int eindex = extension_index( input_filename ); infd = open_instream( input_filename, &in_stats, program_mode, - eindex, force, to_stdout ); + eindex, recompress, to_stdout ); if( infd < 0 ) { if( retval < 1 ) retval = 1; continue; } if( program_mode != m_test ) { diff --git a/testsuite/check.sh b/testsuite/check.sh index be6204b..b74e550 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -45,6 +45,13 @@ printf . cmp in copy || fail=1 printf . +"${LZIP}" -cf "${testdir}"/test_v1.lz > out 2>/dev/null +if [ $? != 1 ] ; then fail=1 ; printf - ; else printf . ; fi +"${LZIP}" -cF "${testdir}"/test_v1.lz > out || fail=1 +"${LZIP}" -cd out | "${LZIP}" -d > copy || fail=1 +cmp in copy || fail=1 +printf . + for i in s4Ki 0 1 2 3 4 5 6 7 8s16 9s16 ; do "${LZIP}" -k -$i in || fail=1 mv -f in.lz copy.lz || fail=1 @@ -77,7 +84,7 @@ for i in s4Ki 0 1 2 3 4 5 6 7 8s16 9s16 ; do done "${LZIP}" -$i < in > anyothername || fail=1 -"${LZIP}" -dq anyothername || fail=1 +"${LZIP}" -d anyothername || fail=1 cmp in anyothername.out || fail=1 printf . |