From 418b05fcd64815e6928cb9751285fa2d29a34361 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 7 Nov 2015 10:31:21 +0100 Subject: Adding upstream version 1.13~rc2. Signed-off-by: Daniel Baumann --- ChangeLog | 110 ++++++++++-------- INSTALL | 26 ++--- Makefile.in | 6 +- NEWS | 9 ++ README | 4 +- arg_parser.cc | 3 +- arg_parser.h | 5 +- configure | 6 +- decoder.cc | 26 ++--- decoder.h | 23 +++- doc/lzip.1 | 8 +- doc/lzip.info | 37 +++--- doc/lzip.texinfo | 19 +-- encoder.cc | 166 +++++++++++++------------- encoder.h | 249 +++++++++++++++++++++++---------------- fast_encoder.cc | 316 ++++++++++++++------------------------------------ fast_encoder.h | 156 +++++-------------------- lzip.h | 7 +- main.cc | 65 ++++++----- testsuite/check.sh | 4 +- testsuite/unzcrash.cc | 7 +- 21 files changed, 550 insertions(+), 702 deletions(-) diff --git a/ChangeLog b/ChangeLog index a173610..3a3eccb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2012-01-05 Antonio Diaz Diaz + + * Version 1.13-rc2 released. + * Compression time of option '-0' has been reduced by 2%. + * Reorganization of the compression code. + * Small change in '--help' output and man page. + * Changed quote characters in messages as advised by GNU Standards. + 2011-11-12 Antonio Diaz Diaz * Version 1.13-rc1 released. @@ -8,58 +16,58 @@ 2011-04-30 Antonio Diaz Diaz * Version 1.12 released. - * main.cc: Added new option `-F, --recompress'. + * main.cc: Added new option '-F, --recompress'. * encoder.h (update_prices): Update high length symbol prices - independently of the value of `pos_state'. This gives better - compression for large values of `--match-length' without being + independently of the value of 'pos_state'. This gives better + compression for large values of '--match-length' without being slower. * encoder.h encoder.cc: Optimize pair price calculations. This - reduces compression time for large values of `--match-length' + reduces compression time for large values of '--match-length' by up to 6%. - * Compression time of option `-0' has been reduced by 2%. + * Compression time of option '-0' has been reduced by 2%. * main.cc (decompress): Print only one status line for each - multimember file when only one `-v' is specified. + multi-member file when only one '-v' is specified. * main.cc (decompress): Print up to 6 bytes of trailing garbage - when `-vvvv' is specified. + when '-vvvv' is specified. * main.cc (open_instream): Do not show the message - " and `--stdout' was not specified" for directories, etc. - * lziprecover.cc: If `-v' is not specified show errors only. + " and '--stdout' was not specified" for directories, etc. + * lziprecover.cc: If '-v' is not specified show errors only. * testsuite/unzcrash.cc: Use Arg_parser. - * testsuite/unzcrash.cc: Added new option `-b, --bits'. - * testsuite/unzcrash.cc: Added new option `-p, --position'. - * testsuite/unzcrash.cc: Added new option `-s, --size'. + * testsuite/unzcrash.cc: Added new option '-b, --bits'. + * testsuite/unzcrash.cc: Added new option '-p, --position'. + * testsuite/unzcrash.cc: Added new option '-s, --size'. 2010-09-16 Antonio Diaz Diaz * Version 1.11 released. - * Added new option `-0' which produces a compression speed and - ratio comparable to those of `gzip -9'. + * Added new option '-0' which produces a compression speed and + ratio comparable to those of 'gzip -9'. * fast_encoder.h fast_encoder.cc: New files. * main.cc: Match length limit set by options -1 to -8 has been reduced to extend range of use towards gzip. Lower numbers now compress less but faster. (-1 now takes 43% less time for only 20% larger compressed size). * encoder.cc: Compression of option -9 has been slightly increased. - * lziprecover.cc: Added new option `-m, --merge' which tries to + * lziprecover.cc: Added new option '-m, --merge' which tries to produce a correct file merging the good parts of two or more damaged copies. - * lziprecover.cc: Added new option `-R, --repair' for repairing + * lziprecover.cc: Added new option '-R, --repair' for repairing a 1-byte error in single-member files. * decoder.cc (decode_member): Detect file errors earlier to improve efficiency of lziprecover's new repair capability. This change also prevents (harmless) access to uninitialized memory when decompressing a corrupt file. - * lziprecover.cc: Added new option `-f, --force'. - * lziprecover.cc: Added new option `-o, --output'. - * lziprecover.cc: Added new option `-s, --split' to select the - until now only operation of splitting multimember files. + * lziprecover.cc: Added new option '-f, --force'. + * lziprecover.cc: Added new option '-o, --output'. + * lziprecover.cc: Added new option '-s, --split' to select the + until now only operation of splitting multi-member files. * lziprecover.cc: If no operation is specified, warn the user and do nothing. * main.cc: Fixed warning about fchown's return value being ignored. - * decoder.cc: `-tvvvv' now also shows compression ratio. + * decoder.cc: '-tvvvv' now also shows compression ratio. * main.cc: Set stdin/stdout in binary mode on MSVC and OS2. * New examples have been added to the manual. - * testsuite: `test1' renamed to `test.txt'. Added new tests. + * testsuite: 'test1' renamed to 'test.txt'. Added new tests. * Matchfinder types HC4 (4 bytes hash-chain) and HT4 (4 bytes hash-table) have been tested and found no better than the current BT4. @@ -68,9 +76,9 @@ * Version 1.10 released. * decoder.h: Input_buffer integrated in Range_decoder. - * main.cc: File specified with option `-o' is now created with + * main.cc: File specified with option '-o' is now created with mode 0666 if umask allows it, deleted if interrupted by user. - * main.cc: New constant `o_binary'. + * main.cc: New constant 'o_binary'. * main.cc: Dictionary size for options -2, -3, -4 and -8 has been changed to improve linearity of compressed sizes. * lzip.h: Fixed warnings produced by over-optimization (-O3). @@ -80,8 +88,8 @@ * Version 1.9 released. * main.cc (main): Return at least 1 if closing stdout fails. - * Makefile.in: Added `--name' option to help2man invocation. - * testsuite/check.sh: Use `test1' instead of `COPYING' for testing. + * Makefile.in: Added '--name' option to help2man invocation. + * testsuite/check.sh: Use 'test1' instead of 'COPYING' for testing. 2009-09-02 Antonio Diaz Diaz @@ -106,25 +114,25 @@ is smaller than dictionary size limit. * decoder.cc: Added extra flush calls to improve partial decompression of corrupt files. - * `--test' no more needs `/dev/null'. - * Removed some `bashisms' from lzdiff and lzgrep. - * Dictionary size for options `-1' to `-4' has been changed. - * main.cc (signal_handler): Declared as `extern "C"'. + * '--test' no more needs '/dev/null'. + * Removed some 'bashisms' from lzdiff and lzgrep. + * Dictionary size for options '-1' to '-4' has been changed. + * main.cc (signal_handler): Declared as 'extern "C"'. * Makefile.in: Extra files are now installed by default. * testsuite/check.sh: Test lziprecover. - * Added `export LC_ALL=C' to all scripts. + * Added 'export LC_ALL=C' to all scripts. 2009-04-12 Antonio Diaz Diaz * Version 1.5 released. * lzip.h: Coded dictionary size implemented in File_header. * Fixed some includes that prevented compilation with GCC 4.4. - * `member_size' and `volume_size' are now accurate limits. + * 'member_size' and 'volume_size' are now accurate limits. * Compression speed has been improved. * Implemented bt4 type matchfinder. - * Added chapter `Algorithm' to the manual. - * Lzdiff and lzgrep now accept `-h' for `--help' and - `-V' for `--version'. + * Added chapter 'Algorithm' to the manual. + * Lzdiff and lzgrep now accept '-h' for '--help' and + '-V' for '--version'. * Makefile.in: Man page is now installed by default. * testsuite/check.sh: Verify that files are open in binary mode. @@ -132,11 +140,11 @@ * Version 1.4 released. * Implemented compression of version 1 files. - * Added new option `-b, --member-size'. - * Added new option `-S, --volume-size'. - * Added new option `-o, --output'. - * main.cc: Read from non regular files if `--stdout' is specified. - * Added `lziprecover', a member recoverer program. + * Added new option '-b, --member-size'. + * Added new option '-S, --volume-size'. + * Added new option '-o, --output'. + * main.cc: Read from non regular files if '--stdout' is specified. + * Added 'lziprecover', a member recoverer program. * testsuite/unzcrash.cc: Test all 1-byte errors. 2008-12-21 Antonio Diaz Diaz @@ -146,7 +154,7 @@ dictionary size for each file during compression, saving memory during decompression. * Implemented decompression of version 1 files. - * testsuite/check.sh: Replaced `diff -q' with `cmp'. + * testsuite/check.sh: Replaced 'diff -q' with 'cmp'. 2008-12-10 Antonio Diaz Diaz @@ -154,23 +162,23 @@ * encoder.cc: A 1-byte read outside allocated memory has been fixed. * lzip.h: Dictionary size limit has been reduced to 512MiB because setting it to 1GiB causes overflow of a 32 bit integer. - * Added `lzdiff', a diff/cmp wrapper for gzip, bzip2, lzip and + * Added 'lzdiff', a diff/cmp wrapper for gzip, bzip2, lzip and non-compressed files. - * Added `lzgrep', a grep wrapper for gzip, bzip2, lzip and + * Added 'lzgrep', a grep wrapper for gzip, bzip2, lzip and non-compressed files. - * `make install-info' should now work on Debian and OS X. + * 'make install-info' should now work on Debian and OS X. 2008-11-17 Antonio Diaz Diaz * Version 1.1 released. - * Changed short name of option `--dictionary-size' to `-s'. - * Changed short name of option `--match-length' to `-m'. + * Changed short name of option '--dictionary-size' to '-s'. + * Changed short name of option '--match-length' to '-m'. * Changed LONG_LONG_MAX to LLONG_MAX. 2008-10-14 Antonio Diaz Diaz * Version 1.0 released. - * `-tvv' shows file version and dictionary size. + * '-tvv' shows file version and dictionary size. 2008-09-30 Antonio Diaz Diaz @@ -180,7 +188,7 @@ 2008-09-23 Antonio Diaz Diaz * Version 0.4 released. - * Code cleanup for global variable `verbosity'. + * Code cleanup for global variable 'verbosity'. * Regained the compression ratio of 0.2 with 5% faster speed. * Fixed compilation on sistems where size_t != unsigned int. @@ -188,8 +196,8 @@ * Version 0.3 released. * encoder.cc: Compression is now 15% faster, 1% worse. - * main.cc (main): Make `-t' option override `-c'. - * main.cc (decompress): Show `done' instead of `ok' when not testing. + * main.cc (main): Make '-t' option override '-c'. + * main.cc (decompress): Show 'done' instead of 'ok' when not testing. * encoder.h: Use trials[] to return the list of pairs. 2008-09-09 Antonio Diaz Diaz @@ -203,7 +211,7 @@ * Version 0.1 released. -Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This file is a collection of facts, and thus it is not copyrightable, but just in case, you have unlimited permission to copy, distribute and diff --git a/INSTALL b/INSTALL index b516f57..4885d81 100644 --- a/INSTALL +++ b/INSTALL @@ -18,7 +18,7 @@ This creates the directory ./lzip[version] containing the source from the main archive. 2. Change to lzip directory and run configure. - (Try `configure --help' for usage instructions). + (Try 'configure --help' for usage instructions). cd lzip[version] ./configure @@ -27,30 +27,30 @@ the main archive. make -4. Optionally, type `make check' to run the tests that come with lzip. +4. Optionally, type 'make check' to run the tests that come with lzip. -5. Type `make install' to install the programs and any data files and +5. Type 'make install' to install the program and any data files and documentation. Another way ----------- You can also compile lzip into a separate directory. To do this, you -must use a version of `make' that supports the `VPATH' variable, such -as GNU `make'. `cd' to the directory where you want the object files -and executables to go and run the `configure' script. `configure' -automatically checks for the source code in `.', in `..' and in the -directory that `configure' is in. - -`configure' recognizes the option `--srcdir=DIR' to control where to -look for the sources. Usually `configure' can determine that directory +must use a version of 'make' that supports the 'VPATH' variable, such +as GNU 'make'. 'cd' to the directory where you want the object files +and executables to go and run the 'configure' script. 'configure' +automatically checks for the source code in '.', in '..' and in the +directory that 'configure' is in. + +'configure' recognizes the option '--srcdir=DIR' to control where to +look for the sources. Usually 'configure' can determine that directory automatically. -After running `configure', you can run `make' and `make install' as +After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/Makefile.in b/Makefile.in index 98a9a98..83991ef 100644 --- a/Makefile.in +++ b/Makefile.in @@ -17,13 +17,13 @@ unzobjs = arg_parser.o unzcrash.o all : $(progname) $(progname) : $(objs) - $(CXX) $(LDFLAGS) -o $@ $(objs) + $(CXX) $(LDFLAGS) -o $@ $^ $(progname)_profiled : $(objs) - $(CXX) $(LDFLAGS) -pg -o $@ $(objs) + $(CXX) $(LDFLAGS) -pg -o $@ $^ unzcrash : $(unzobjs) - $(CXX) $(LDFLAGS) -o $@ $(unzobjs) + $(CXX) $(LDFLAGS) -o $@ $^ main.o : main.cc $(CXX) $(CPPFLAGS) $(CXXFLAGS) -DPROGVERSION=\"$(pkgversion)\" -c -o $@ $< diff --git a/NEWS b/NEWS index 6b4c70d..df56781 100644 --- a/NEWS +++ b/NEWS @@ -4,3 +4,12 @@ Lziprecover has been moved to its own package. Inability to change output file attributes has been downgraded from error to warning. + +Compression time of option "-0" has been reduced by 2%. + +A reorganization of the compression code has been made. + +A small change has been made in the "--help" output and man page. + +Quote characters in messages have been changed as advised by GNU Coding +Standards. diff --git a/README b/README index 51a03e5..75674e1 100644 --- a/README +++ b/README @@ -26,7 +26,7 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Lzip can produce multimember files and safely recover, with lziprecover, +Lzip can produce multi-member files and safely recover, with lziprecover, the undamaged members in case of file damage. Lzip can also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume @@ -63,7 +63,7 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI and the idea of unzcrash). -Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/arg_parser.cc b/arg_parser.cc index bc4b4a3..27137a1 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,6 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 + Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/arg_parser.h b/arg_parser.h index d1e5c02..5d036ab 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,6 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 + Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -65,7 +66,7 @@ private: { int code; std::string argument; - Record( const int c = 0 ) : code( c ) {} + explicit Record( const int c = 0 ) : code( c ) {} }; std::string error_; diff --git a/configure b/configure index f2d87bc..633f4d3 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # configure script for Lzip - Data compressor based on the LZMA algorithm -# Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +# Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -8,7 +8,7 @@ args= no_create= pkgname=lzip -pkgversion=1.13-rc1 +pkgversion=1.13-rc2 progname=lzip srctrigger=lzip.h @@ -165,7 +165,7 @@ echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF # Makefile for Lzip - Data compressor based on the LZMA algorithm -# Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +# Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. # This file was generated automatically by configure. Do not edit. # # This Makefile is free software: you have unlimited permission diff --git a/decoder.cc b/decoder.cc index dafd73d..a060797 100644 --- a/decoder.cc +++ b/decoder.cc @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -82,7 +82,7 @@ int writeblock( const int fd, const uint8_t * const buf, const int size ) throw( errno = 0; const int n = write( fd, buf + size - rest, rest ); if( n > 0 ) rest -= n; - else if( errno && errno != EINTR && errno != EAGAIN ) break; + else if( n < 0 && errno != EINTR && errno != EAGAIN ) break; } return ( rest > 0 ) ? size - rest : size; } @@ -124,21 +124,17 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const const long long member_size = range_decoder.member_position() + trailer_size; bool error = false; - for( int i = 0; i < trailer_size && !error; ++i ) + const int size = range_decoder.read( trailer.data, trailer_size ); + if( size < trailer_size ) { - if( !range_decoder.finished() ) - trailer.data[i] = range_decoder.get_byte(); - else + error = true; + if( pp.verbosity() >= 0 ) { - error = true; - if( pp.verbosity() >= 0 ) - { - pp(); - std::fprintf( stderr, "Trailer truncated at trailer position %d;" - " some checks may fail.\n", i ); - } - for( ; i < trailer_size; ++i ) trailer.data[i] = 0; + pp(); + std::fprintf( stderr, "Trailer truncated at trailer position %d;" + " some checks may fail.\n", size ); } + for( int i = size; i < trailer_size; ++i ) trailer.data[i] = 0; } if( member_version == 0 ) trailer.member_size( member_size ); if( !range_decoder.code_is_zero() ) @@ -292,7 +288,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) if( pp.verbosity() >= 0 ) { pp(); - std::fprintf( stderr, "Unsupported marker code `%d'.\n", len ); + std::fprintf( stderr, "Unsupported marker code '%d'.\n", len ); } return 4; } diff --git a/decoder.h b/decoder.h index a974b33..ad3ff7c 100644 --- a/decoder.h +++ b/decoder.h @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -29,8 +29,11 @@ class Range_decoder bool read_block(); + Range_decoder( const Range_decoder & ); + void operator=( const Range_decoder & ); + public: - Range_decoder( const int ifd ) + explicit Range_decoder( const int ifd ) : partial_member_pos( 0 ), buffer( new uint8_t[buffer_size] ), @@ -56,6 +59,19 @@ public: return buffer[pos++]; } + int read( uint8_t * const outbuf, const int size ) + { + int rest = size; + while( rest > 0 && !finished() ) + { + const int rd = std::min( rest, stream_pos - pos ); + std::memcpy( outbuf + size - rest, buffer + pos, rd ); + pos += rd; + rest -= rd; + } + return ( rest > 0 ) ? size - rest : size; + } + void load() { code = 0; @@ -242,6 +258,9 @@ class LZ_decoder } } + LZ_decoder( const LZ_decoder & ); + void operator=( const LZ_decoder & ); + public: LZ_decoder( const File_header & header, Range_decoder & rdec, const int ofd ) : diff --git a/doc/lzip.1 b/doc/lzip.1 index caac739..60df32e 100644 --- a/doc/lzip.1 +++ b/doc/lzip.1 @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.37.1. -.TH LZIP "1" "November 2011" "Lzip 1.13-rc1" "User Commands" +.TH LZIP "1" "January 2012" "Lzip 1.13-rc2" "User Commands" .SH NAME Lzip \- reduces the size of files .SH SYNOPSIS @@ -67,12 +67,16 @@ If no file names are given, lzip compresses or decompresses from standard input to standard output. Numbers may be followed by a multiplier: k = kB = 10^3 = 1000, Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc... +The bidimensional parameter space of LZMA can't be mapped to a linear +scale optimal for all files. If your files are large, very repetitive, +etc, you may need to use the \fB\-\-match\-length\fR and \fB\-\-dictionary\-size\fR +options directly to achieve optimal performance. .SH "REPORTING BUGS" Report bugs to lzip\-bug@nongnu.org .br Lzip home page: http://www.nongnu.org/lzip/lzip.html .SH COPYRIGHT -Copyright \(co 2011 Antonio Diaz Diaz. +Copyright \(co 2012 Antonio Diaz Diaz. License GPLv3+: GNU GPL version 3 or later .br This is free software: you are free to change and redistribute it. diff --git a/doc/lzip.info b/doc/lzip.info index 2981447..5d2c6ec 100644 --- a/doc/lzip.info +++ b/doc/lzip.info @@ -11,7 +11,7 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir) Lzip Manual *********** -This manual is for Lzip (version 1.13-rc1, 12 November 2011). +This manual is for Lzip (version 1.13-rc2, 5 January 2012). * Menu: @@ -24,7 +24,7 @@ This manual is for Lzip (version 1.13-rc1, 12 November 2011). * Concept Index:: Index of concepts - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -61,7 +61,7 @@ two or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. - Lzip can produce multimember files and safely recover, with + Lzip can produce multi-member files and safely recover, with lziprecover, the undamaged members in case of file damage. Lzip can also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of @@ -185,7 +185,7 @@ The format for running lzip is: `-b BYTES' `--member-size=BYTES' - Produce a multimember file and set the member size limit to BYTES. + Produce a multi-member file and set the member size limit to BYTES. Minimum member size limit is 100kB. Small member size may degrade compression ratio, so use it only when needed. The default is to produce single-member files. @@ -254,7 +254,7 @@ The format for running lzip is: Split the compressed output into several volume files with names `original_name00001.lz', `original_name00002.lz', etc, and set the volume size limit to BYTES. Each volume is a complete, maybe - multimember, lzip file. Minimum volume size limit is 100kB. Small + multi-member, lzip file. Minimum volume size limit is 100kB. Small volume size may degrade compression ratio, so use it only when needed. @@ -377,7 +377,7 @@ additional information before, between, or after them. `Member size (8 bytes)' Total size of the member, including header and trailer. This - facilitates safe recovery of undamaged members from multimember + facilitates safe recovery of undamaged members from multi-member files. @@ -401,8 +401,8 @@ and show the compression ratio. lzip -v file -Example 2: Like example 1 but the created `file.lz' is multimember with -a member size of 1MiB. The compression ratio is not shown. +Example 2: Like example 1 but the created `file.lz' is multi-member +with a member size of 1MiB. The compression ratio is not shown. lzip -b 1MiB file @@ -449,7 +449,7 @@ Example 9: Extract a multivolume compressed tar archive. Example 10: Create a multivolume compressed backup of a big database -file with a volume size of 650MB, where each volume is a multimember +file with a volume size of 650MB, where each volume is a multi-member file with a member size of 32MiB. lzip -b 32MiB -S 650MB big_db @@ -493,12 +493,17 @@ Concept Index  Tag Table: Node: Top224 -Node: Introduction917 -Node: Algorithm4417 -Node: Invoking Lzip6935 -Node: File Format12285 -Node: Examples14277 -Node: Problems16221 -Node: Concept Index16743 +Node: Introduction921 +Node: Algorithm4422 +Node: Invoking Lzip6940 +Node: File Format12292 +Node: Examples14285 +Node: Problems16231 +Node: Concept Index16753  End Tag Table + + +Local Variables: +coding: iso-8859-15 +End: diff --git a/doc/lzip.texinfo b/doc/lzip.texinfo index 190f20f..cf7b21e 100644 --- a/doc/lzip.texinfo +++ b/doc/lzip.texinfo @@ -1,12 +1,13 @@ \input texinfo @c -*-texinfo-*- @c %**start of header @setfilename lzip.info +@documentencoding ISO-8859-15 @settitle Lzip Manual @finalout @c %**end of header -@set UPDATED 12 November 2011 -@set VERSION 1.13-rc1 +@set UPDATED 5 January 2012 +@set VERSION 1.13-rc2 @dircategory Data Compression @direntry @@ -44,7 +45,7 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}). @end menu @sp 1 -Copyright @copyright{} 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +Copyright @copyright{} 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -80,7 +81,7 @@ or more compressed files. The result is the concatenation of the corresponding uncompressed files. Integrity testing of concatenated compressed files is also supported. -Lzip can produce multimember files and safely recover, with lziprecover, +Lzip can produce multi-member files and safely recover, with lziprecover, the undamaged members in case of file damage. Lzip can also split the compressed output in volumes of a given size, even when reading from standard input. This allows the direct creation of multivolume @@ -211,7 +212,7 @@ Print the version number of lzip on the standard output and exit. @item -b @var{bytes} @itemx --member-size=@var{bytes} -Produce a multimember file and set the member size limit to @var{bytes}. +Produce a multi-member file and set the member size limit to @var{bytes}. Minimum member size limit is 100kB. Small member size may degrade compression ratio, so use it only when needed. The default is to produce single-member files. @@ -275,7 +276,7 @@ is affected at compression time by the choice of dictionary size limit. Split the compressed output into several volume files with names @samp{original_name00001.lz}, @samp{original_name00002.lz}, etc, and set the volume size limit to @var{bytes}. Each volume is a complete, maybe -multimember, lzip file. Minimum volume size limit is 100kB. Small volume +multi-member, lzip file. Minimum volume size limit is 100kB. Small volume size may degrade compression ratio, so use it only when needed. @item -t @@ -405,7 +406,7 @@ Size of the uncompressed original data. @item Member size (8 bytes) Total size of the member, including header and trailer. This facilitates -safe recovery of undamaged members from multimember files. +safe recovery of undamaged members from multi-member files. @end table @@ -432,7 +433,7 @@ lzip -v file @sp 1 @noindent -Example 2: Like example 1 but the created @samp{file.lz} is multimember +Example 2: Like example 1 but the created @samp{file.lz} is multi-member with a member size of 1MiB. The compression ratio is not shown. @example @@ -505,7 +506,7 @@ lzip -cd volume_name*.lz | tar -xf - @sp 1 @noindent Example 10: Create a multivolume compressed backup of a big database -file with a volume size of 650MB, where each volume is a multimember +file with a volume size of 650MB, where each volume is a multi-member file with a member size of 32MiB. @example diff --git a/encoder.cc b/encoder.cc index 79dd5cf..67e9d95 100644 --- a/encoder.cc +++ b/encoder.cc @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,7 +33,7 @@ Dis_slots dis_slots; Prob_prices prob_prices; -bool Matchfinder::read_block() +bool Matchfinder_base::read_block() { if( !at_stream_end && stream_pos < buffer_size ) { @@ -41,26 +41,51 @@ bool Matchfinder::read_block() const int rd = readblock( infd, buffer + stream_pos, size ); stream_pos += rd; if( rd != size && errno ) throw Error( "Read error" ); - at_stream_end = ( rd < size ); + if( rd < size ) { at_stream_end = true; pos_limit = buffer_size; } } return pos < stream_pos; } -Matchfinder::Matchfinder( const int dict_size, const int len_limit, - const int ifd ) +void Matchfinder_base::normalize_pos() + { + if( pos > stream_pos ) + internal_error( "pos > stream_pos in Matchfinder_base::normalize_pos" ); + if( !at_stream_end ) + { + const int offset = pos - dictionary_size_ - before_size; + const int size = stream_pos - offset; + std::memmove( buffer, buffer + offset, size ); + partial_data_pos += offset; + pos -= offset; + stream_pos -= offset; + for( int i = 0; i < num_prev_positions; ++i ) + if( prev_positions[i] >= 0 ) prev_positions[i] -= offset; + for( int i = 0; i < pos_array_size; ++i ) + if( pos_array[i] >= 0 ) pos_array[i] -= offset; + read_block(); + } + } + + +Matchfinder_base::Matchfinder_base( const int before, const int dict_size, + const int dict_factor, const int len_limit, + const int num_prev_pos, const int ifd, + const int pos_array_factor ) : partial_data_pos( 0 ), - prev_positions( new int32_t[num_prev_positions] ), + prev_positions( new int32_t[num_prev_pos] ), + num_prev_positions( num_prev_pos ), + before_size( before ), + match_len_limit_( len_limit ), + infd( ifd ), pos( 0 ), cyclic_pos( 0 ), stream_pos( 0 ), - match_len_limit_( len_limit ), - cycles( ( len_limit < max_match_len ) ? 16 + ( len_limit / 2 ) : 256 ), - infd( ifd ), at_stream_end( false ) { - const int buffer_size_limit = ( 2 * dict_size ) + before_size + after_size; + const int buffer_size_limit = + ( dict_size * dict_factor ) + before_size + after_size; buffer_size = std::max( 65536, dict_size ); buffer = (uint8_t *)std::malloc( buffer_size ); if( !buffer ) { delete[] prev_positions; throw std::bad_alloc(); } @@ -78,14 +103,15 @@ Matchfinder::Matchfinder( const int dict_size, const int len_limit, else dictionary_size_ = dict_size; pos_limit = buffer_size; if( !at_stream_end ) pos_limit -= after_size; - prev_pos_tree = new( std::nothrow ) int32_t[2*dictionary_size_]; - if( !prev_pos_tree ) - { std::free( buffer ); delete[] prev_positions; throw std::bad_alloc(); } for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = -1; + pos_array_size = pos_array_factor * dictionary_size_; + pos_array = new( std::nothrow ) int32_t[pos_array_size]; + if( !pos_array ) + { std::free( buffer ); delete[] prev_positions; throw std::bad_alloc(); } } -void Matchfinder::reset() +void Matchfinder_base::reset() { const int size = stream_pos - pos; if( size > 0 ) std::memmove( buffer, buffer + pos, size ); @@ -98,28 +124,13 @@ void Matchfinder::reset() } -void Matchfinder::move_pos() +bool Matchfinder::dec_pos( const int ahead ) throw() { - if( ++cyclic_pos >= dictionary_size_ ) cyclic_pos = 0; - if( ++pos >= pos_limit ) - { - if( pos > stream_pos ) - internal_error( "pos > stream_pos in Matchfinder::move_pos" ); - if( !at_stream_end ) - { - const int offset = pos - dictionary_size_ - before_size; - const int size = stream_pos - offset; - std::memmove( buffer, buffer + offset, size ); - partial_data_pos += offset; - pos -= offset; - stream_pos -= offset; - for( int i = 0; i < num_prev_positions; ++i ) - if( prev_positions[i] >= 0 ) prev_positions[i] -= offset; - for( int i = 0; i < 2 * dictionary_size_; ++i ) - if( prev_pos_tree[i] >= 0 ) prev_pos_tree[i] -= offset; - read_block(); - } - } + if( ahead < 0 || pos < ahead ) return false; + pos -= ahead; + cyclic_pos -= ahead; + if( cyclic_pos < 0 ) cyclic_pos += dictionary_size_; + return true; } @@ -162,7 +173,7 @@ int Matchfinder::longest_match_len( int * const distances ) throw() int newpos = prev_positions[key4]; prev_positions[key4] = pos; - int32_t * ptr0 = prev_pos_tree + ( cyclic_pos << 1 ); + int32_t * ptr0 = pos_array + ( cyclic_pos << 1 ); int32_t * ptr1 = ptr0 + 1; int len = 0, len0 = 0, len1 = 0; @@ -175,7 +186,7 @@ int Matchfinder::longest_match_len( int * const distances ) throw() const int delta = pos - newpos; if( distances ) while( maxlen < len ) distances[++maxlen] = delta - 1; - int32_t * const newptr = prev_pos_tree + + int32_t * const newptr = pos_array + ( ( cyclic_pos - delta + ( ( cyclic_pos >= delta ) ? 0 : dictionary_size_ ) ) << 1 ); @@ -251,6 +262,25 @@ void Len_encoder::encode( Range_encoder & range_encoder, int symbol, } + // End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) +void LZ_encoder_base::full_flush( const long long data_position, + const State & state ) + { + const int pos_state = data_position & pos_state_mask; + range_encoder.encode_bit( bm_match[state()][pos_state], 1 ); + range_encoder.encode_bit( bm_rep[state()], 0 ); + encode_pair( 0xFFFFFFFFU, min_match_len, pos_state ); + range_encoder.flush(); + File_trailer trailer; + trailer.data_crc( crc() ); + trailer.data_size( data_position ); + trailer.member_size( range_encoder.member_position() + File_trailer::size() ); + for( int i = 0; i < File_trailer::size(); ++i ) + range_encoder.put_byte( trailer.data[i] ); + range_encoder.flush_data(); + } + + void LZ_encoder::fill_align_prices() throw() { for( int i = 0; i < dis_align_size; ++i ) @@ -300,7 +330,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], const State & state ) { int main_len; - if( longest_match_found > 0 ) // from previous call + if( longest_match_found > 0 ) // from previous call { main_len = longest_match_found; longest_match_found = 0; @@ -318,7 +348,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], { trials[0].dis = rep_index; trials[0].price = replens[rep_index]; - move_pos( replens[rep_index], true ); + move_pos( replens[rep_index] ); return replens[rep_index]; } @@ -327,7 +357,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[0].dis = match_distances[matchfinder.match_len_limit()] + num_rep_distances; trials[0].price = main_len; - move_pos( main_len, true ); + move_pos( main_len ); return main_len; } @@ -415,6 +445,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], for( int i = 0; i < num_rep_distances; ++i ) cur_trial.reps[i] = trials[prev_index].reps[i]; + if( prev_index == cur - 1 ) { if( cur_trial.dis == 0 ) cur_trial.state.set_short_rep(); @@ -507,42 +538,6 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], } - // End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) -void LZ_encoder::full_flush( const State & state ) - { - const int pos_state = matchfinder.data_position() & pos_state_mask; - range_encoder.encode_bit( bm_match[state()][pos_state], 1 ); - range_encoder.encode_bit( bm_rep[state()], 0 ); - encode_pair( 0xFFFFFFFFU, min_match_len, pos_state ); - range_encoder.flush(); - File_trailer trailer; - trailer.data_crc( crc() ); - trailer.data_size( matchfinder.data_position() ); - trailer.member_size( range_encoder.member_position() + File_trailer::size() ); - for( int i = 0; i < File_trailer::size(); ++i ) - range_encoder.put_byte( trailer.data[i] ); - range_encoder.flush_data(); - } - - -LZ_encoder::LZ_encoder( Matchfinder & mf, const File_header & header, - const int outfd ) - : - longest_match_found( 0 ), - crc_( 0xFFFFFFFFU ), - matchfinder( mf ), - range_encoder( outfd ), - len_encoder( matchfinder.match_len_limit() ), - rep_match_len_encoder( matchfinder.match_len_limit() ), - num_dis_slots( 2 * real_bits( matchfinder.dictionary_size() - 1 ) ) - { - fill_align_prices(); - - for( int i = 0; i < File_header::size; ++i ) - range_encoder.put_byte( header.data[i] ); - } - - bool LZ_encoder::encode_member( const long long member_size ) { const long long member_size_limit = @@ -564,17 +559,17 @@ bool LZ_encoder::encode_member( const long long member_size ) range_encoder.encode_bit( bm_match[state()][0], 0 ); literal_encoder.encode( range_encoder, prev_byte, cur_byte ); crc32.update( crc_, cur_byte ); - move_pos( 1 ); + matchfinder.longest_match_len(); + matchfinder.move_pos(); } - while( true ) + while( !matchfinder.finished() ) { - if( matchfinder.finished() ) { full_flush( state ); return true; } if( fill_counter <= 0 ) { fill_distance_prices(); fill_counter = fill_count; } int ahead = sequence_optimizer( rep_distances, state ); - if( ahead <= 0 ) return false; + if( ahead <= 0 ) return false; // can't happen fill_counter -= ahead; for( int i = 0; ; ) @@ -586,7 +581,7 @@ bool LZ_encoder::encode_member( const long long member_size ) bool bit = ( dis < 0 && len == 1 ); range_encoder.encode_bit( bm_match[state()][pos_state], !bit ); - if( bit ) // literal byte + if( bit ) // literal byte { const uint8_t prev_byte = matchfinder[-ahead-1]; const uint8_t cur_byte = matchfinder[-ahead]; @@ -601,7 +596,7 @@ bool LZ_encoder::encode_member( const long long member_size ) } state.set_char(); } - else // match or repeated match + else // match or repeated match { crc32.update( crc_, matchfinder.ptr_to_current_pos() - ahead, len ); mtf_reps( dis, rep_distances ); @@ -629,6 +624,9 @@ bool LZ_encoder::encode_member( const long long member_size ) else { encode_pair( dis - num_rep_distances, len, pos_state ); + if( dis_slots[dis - num_rep_distances] >= end_dis_model && + --align_price_count <= 0 ) + fill_align_prices(); state.set_match(); } } @@ -636,10 +634,12 @@ bool LZ_encoder::encode_member( const long long member_size ) if( range_encoder.member_position() >= member_size_limit ) { if( !matchfinder.dec_pos( ahead ) ) return false; - full_flush( state ); + full_flush( matchfinder.data_position(), state ); return true; } if( ahead <= 0 ) break; } } + full_flush( matchfinder.data_position(), state ); + return true; } diff --git a/encoder.h b/encoder.h index 86c23c2..21c683d 100644 --- a/encoder.h +++ b/encoder.h @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -140,41 +140,43 @@ inline int price_matched( const Bit_model bm[], const int symbol, } -class Matchfinder +class Matchfinder_base { - enum { // bytes to keep in buffer before dictionary - before_size = max_num_trials + 1, - // bytes to keep in buffer after pos - after_size = max_match_len, - num_prev_positions4 = 1 << 20, - num_prev_positions3 = 1 << 18, - num_prev_positions2 = 1 << 16, - num_prev_positions = num_prev_positions4 + num_prev_positions3 + - num_prev_positions2 }; + Matchfinder_base( const Matchfinder_base & ); // declared as private + void operator=( const Matchfinder_base & ); // declared as private + + bool read_block(); + void normalize_pos(); + +protected: + enum { after_size = max_match_len }; // bytes to keep in buffer after pos long long partial_data_pos; - uint8_t * buffer; // input buffer int32_t * const prev_positions; // last seen position of key - int32_t * prev_pos_tree; - int dictionary_size_; // bytes to keep in buffer before pos + uint8_t * buffer; // input buffer + int32_t * pos_array; // may be tree or chain + const int num_prev_positions; + const int before_size; // bytes to keep in buffer before dictionary + const int match_len_limit_; + const int infd; // input file descriptor int buffer_size; + int dictionary_size_; // bytes to keep in buffer before pos int pos; // current pos in buffer int cyclic_pos; // current pos in dictionary - int stream_pos; // first byte not yet read from file int pos_limit; // when reached, a new block must be read - const int match_len_limit_; - const int cycles; - const int infd; // input file descriptor + int stream_pos; // first byte not yet read from file + int pos_array_size; bool at_stream_end; // stream_pos shows real end of file - bool read_block(); - -public: - Matchfinder( const int dict_size, const int len_limit, const int ifd ); + Matchfinder_base( const int before, const int dict_size, + const int dict_factor, const int len_limit, + const int num_prev_pos, const int ifd, + const int pos_array_factor ); - ~Matchfinder() - { delete[] prev_pos_tree; std::free( buffer ); delete[] prev_positions; } + ~Matchfinder_base() + { delete[] pos_array; std::free( buffer ); delete[] prev_positions; } +public: uint8_t operator[]( const int i ) const throw() { return buffer[pos+i]; } int available_bytes() const throw() { return stream_pos - pos; } long long data_position() const throw() { return partial_data_pos + pos; } @@ -183,27 +185,47 @@ public: int match_len_limit() const throw() { return match_len_limit_; } const uint8_t * ptr_to_current_pos() const throw() { return buffer + pos; } - bool dec_pos( const int ahead ) throw() - { - if( ahead < 0 || pos < ahead ) return false; - pos -= ahead; - cyclic_pos -= ahead; - if( cyclic_pos < 0 ) cyclic_pos += dictionary_size_; - return true; - } - int true_match_len( const int index, const int distance, int len_limit ) const throw() { if( index + len_limit > available_bytes() ) len_limit = available_bytes() - index; - const uint8_t * const data = buffer + pos + index - distance; + const uint8_t * const data = buffer + pos + index; int i = 0; - while( i < len_limit && data[i] == data[i+distance] ) ++i; + while( i < len_limit && data[i-distance] == data[i] ) ++i; return i; } void reset(); - void move_pos(); + void move_pos() + { + if( ++cyclic_pos >= dictionary_size_ ) cyclic_pos = 0; + if( ++pos >= pos_limit ) normalize_pos(); + } + }; + + +class Matchfinder : public Matchfinder_base + { + enum { before = max_num_trials + 1, + dict_factor = 2, + num_prev_positions4 = 1 << 20, + num_prev_positions3 = 1 << 18, + num_prev_positions2 = 1 << 16, + num_prev_pos = num_prev_positions4 + num_prev_positions3 + + num_prev_positions2, + pos_array_factor = 2 }; + + const int cycles; + +public: + Matchfinder( const int dict_size, const int len_limit, const int ifd ) + : + Matchfinder_base( before, dict_size, dict_factor, len_limit, + num_prev_pos, ifd, pos_array_factor ), + cycles( ( len_limit < max_match_len ) ? 16 + ( len_limit / 2 ) : 256 ) + {} + + bool dec_pos( const int ahead ) throw(); int longest_match_len( int * const distances = 0 ) throw(); }; @@ -233,8 +255,11 @@ class Range_encoder low = ( low & 0x00FFFFFFU ) << 8; } + Range_encoder( const Range_encoder & ); + void operator=( const Range_encoder & ); + public: - Range_encoder( const int ofd ) + explicit Range_encoder( const int ofd ) : low( 0 ), partial_member_pos( 0 ), @@ -367,7 +392,7 @@ class Len_encoder } public: - Len_encoder( const int len_limit ) + explicit Len_encoder( const int len_limit ) : len_symbols( len_limit + 1 - min_match_len ) { for( int i = 0; i < pos_states; ++i ) update_prices( i ); @@ -408,24 +433,12 @@ public: }; -class LZ_encoder +class LZ_encoder_base { - enum { infinite_price = 0x0FFFFFFF, - max_marker_size = 16, +protected: + enum { max_marker_size = 16, num_rep_distances = 4 }; // must be 4 - struct Trial - { - State state; - int dis; - int prev_index; // index of prev trial in trials[] - int price; // dual use var; cumulative price, match length - int reps[num_rep_distances]; - void update( const int d, const int p_i, const int pr ) throw() - { if( pr < price ) { dis = d; prev_index = p_i; price = pr; } } - }; - - int longest_match_found; uint32_t crc_; Bit_model bm_match[State::states][pos_states]; @@ -438,26 +451,28 @@ class LZ_encoder Bit_model bm_dis[modeled_distances-end_dis_model+1]; Bit_model bm_align[dis_align_size]; - Matchfinder & matchfinder; Range_encoder range_encoder; Len_encoder len_encoder; Len_encoder rep_match_len_encoder; Literal_encoder literal_encoder; const int num_dis_slots; - int match_distances[max_match_len+1]; - Trial trials[max_num_trials]; - - int dis_slot_prices[max_dis_states][2*max_dictionary_bits]; - int dis_prices[max_dis_states][modeled_distances]; - int align_prices[dis_align_size]; - int align_price_count; - - void fill_align_prices() throw(); - void fill_distance_prices() throw(); uint32_t crc() const throw() { return crc_ ^ 0xFFFFFFFFU; } + LZ_encoder_base( const File_header & header, const int dictionary_size, + const int match_len_limit, const int outfd ) + : + crc_( 0xFFFFFFFFU ), + range_encoder( outfd ), + len_encoder( match_len_limit ), + rep_match_len_encoder( match_len_limit ), + num_dis_slots( 2 * real_bits( dictionary_size - 1 ) ) + { + for( int i = 0; i < File_header::size; ++i ) + range_encoder.put_byte( header.data[i] ); + } + // move-to-front dis in/into reps void mtf_reps( const int dis, int reps[num_rep_distances] ) throw() { @@ -474,6 +489,65 @@ class LZ_encoder } } + void encode_pair( const uint32_t dis, const int len, const int pos_state ) throw() + { + len_encoder.encode( range_encoder, len, pos_state ); + const int dis_slot = dis_slots[dis]; + range_encoder.encode_tree( bm_dis_slot[get_dis_state(len)], dis_slot, dis_slot_bits ); + + if( dis_slot >= start_dis_model ) + { + const int direct_bits = ( dis_slot >> 1 ) - 1; + const uint32_t base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + const uint32_t direct_dis = dis - base; + + if( dis_slot < end_dis_model ) + range_encoder.encode_tree_reversed( bm_dis + base - dis_slot, + direct_dis, direct_bits ); + else + { + range_encoder.encode( direct_dis >> dis_align_bits, direct_bits - dis_align_bits ); + range_encoder.encode_tree_reversed( bm_align, direct_dis, dis_align_bits ); + } + } + } + + void full_flush( const long long data_position, const State & state ); + +public: + long long member_position() const throw() + { return range_encoder.member_position(); } + }; + + +class LZ_encoder : public LZ_encoder_base + { + enum { infinite_price = 0x0FFFFFFF }; + + struct Trial + { + State state; + int dis; + int prev_index; // index of prev trial in trials[] + int price; // dual use var; cumulative price, match length + int reps[num_rep_distances]; + void update( const int d, const int p_i, const int pr ) throw() + { if( pr < price ) { dis = d; prev_index = p_i; price = pr; } } + }; + + Matchfinder & matchfinder; + int longest_match_found; + int match_distances[max_match_len+1]; + Trial trials[max_num_trials]; + + int dis_slot_prices[max_dis_states][2*max_dictionary_bits]; + int dis_prices[max_dis_states][modeled_distances]; + int align_prices[dis_align_size]; + int align_price_count; + + void fill_align_prices() throw(); + void fill_distance_prices() throw(); + int price_rep_len1( const State & state, const int pos_state ) const throw() { return price0( bm_rep0[state()] ) + price0( bm_len[state()][pos_state] ); @@ -512,44 +586,21 @@ class LZ_encoder price_dis( dis, get_dis_state( len ) ); } - void encode_pair( const uint32_t dis, const int len, const int pos_state ) throw() - { - len_encoder.encode( range_encoder, len, pos_state ); - const int dis_slot = dis_slots[dis]; - range_encoder.encode_tree( bm_dis_slot[get_dis_state(len)], dis_slot, dis_slot_bits ); - - if( dis_slot >= start_dis_model ) - { - const int direct_bits = ( dis_slot >> 1 ) - 1; - const uint32_t base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - const uint32_t direct_dis = dis - base; - - if( dis_slot < end_dis_model ) - range_encoder.encode_tree_reversed( bm_dis + base - dis_slot, - direct_dis, direct_bits ); - else - { - range_encoder.encode( direct_dis >> dis_align_bits, direct_bits - dis_align_bits ); - range_encoder.encode_tree_reversed( bm_align, direct_dis, dis_align_bits ); - if( --align_price_count <= 0 ) fill_align_prices(); - } - } - } - int read_match_distances() throw() { int len = matchfinder.longest_match_len( match_distances ); - if( len == matchfinder.match_len_limit() ) - len += matchfinder.true_match_len( len, match_distances[len] + 1, max_match_len - len ); + if( len == matchfinder.match_len_limit() && len < max_match_len ) + len += matchfinder.true_match_len( len, match_distances[len] + 1, + max_match_len - len ); return len; } - void move_pos( int n, bool skip = false ) + void move_pos( int n ) { + if( --n >= 0 ) matchfinder.move_pos(); while( --n >= 0 ) { - if( skip ) skip = false; - else matchfinder.longest_match_len(); + matchfinder.longest_match_len(); matchfinder.move_pos(); } } @@ -570,13 +621,13 @@ class LZ_encoder int sequence_optimizer( const int reps[num_rep_distances], const State & state ); - void full_flush( const State & state ); - public: - LZ_encoder( Matchfinder & mf, const File_header & header, const int outfd ); + LZ_encoder( Matchfinder & mf, const File_header & header, const int outfd ) + : + LZ_encoder_base( header, mf.dictionary_size(), mf.match_len_limit(), outfd ), + matchfinder( mf ), + longest_match_found( 0 ) + { fill_align_prices(); } bool encode_member( const long long member_size ); - - long long member_position() const throw() - { return range_encoder.member_position(); } }; diff --git a/fast_encoder.cc b/fast_encoder.cc index ba7c34c..319250a 100644 --- a/fast_encoder.cc +++ b/fast_encoder.cc @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,97 +30,6 @@ #include "fast_encoder.h" -bool Fmatchfinder::read_block() - { - if( !at_stream_end && stream_pos < buffer_size ) - { - const int size = buffer_size - stream_pos; - const int rd = readblock( infd, buffer + stream_pos, size ); - stream_pos += rd; - if( rd != size && errno ) throw Error( "Read error" ); - at_stream_end = ( rd < size ); - } - return pos < stream_pos; - } - - -Fmatchfinder::Fmatchfinder( const int ifd ) - : - partial_data_pos( 0 ), - prev_positions( new int32_t[num_prev_positions] ), - pos( 0 ), - cyclic_pos( 0 ), - key4( 0 ), - stream_pos( 0 ), - match_len_limit_( 16 ), - infd( ifd ), - at_stream_end( false ) - { - const int dict_size = 65536; - const int buffer_size_limit = ( 16 * dict_size ) + before_size + after_size; - buffer_size = dict_size; - buffer = (uint8_t *)std::malloc( buffer_size ); - if( !buffer ) { delete[] prev_positions; throw std::bad_alloc(); } - if( read_block() && !at_stream_end && buffer_size < buffer_size_limit ) - { - buffer_size = buffer_size_limit; - uint8_t * const tmp = (uint8_t *)std::realloc( buffer, buffer_size ); - if( !tmp ) - { std::free( buffer ); delete[] prev_positions; throw std::bad_alloc(); } - buffer = tmp; - read_block(); - } - if( at_stream_end && stream_pos < dict_size ) - dictionary_size_ = std::max( (int)min_dictionary_size, stream_pos ); - else dictionary_size_ = dict_size; - pos_limit = buffer_size; - if( !at_stream_end ) pos_limit -= after_size; - prev_pos_chain = new( std::nothrow ) int32_t[dictionary_size_]; - if( !prev_pos_chain ) - { std::free( buffer ); delete[] prev_positions; throw std::bad_alloc(); } - for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = -1; - } - - -void Fmatchfinder::reset() - { - const int size = stream_pos - pos; - if( size > 0 ) std::memmove( buffer, buffer + pos, size ); - partial_data_pos = 0; - stream_pos -= pos; - pos = 0; - cyclic_pos = 0; - key4 = 0; - for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = -1; - read_block(); - } - - -void Fmatchfinder::move_pos() - { - if( ++cyclic_pos >= dictionary_size_ ) cyclic_pos = 0; - if( ++pos >= pos_limit ) - { - if( pos > stream_pos ) - internal_error( "pos > stream_pos in Fmatchfinder::move_pos" ); - if( !at_stream_end ) - { - const int offset = pos - dictionary_size_ - before_size; - const int size = stream_pos - offset; - std::memmove( buffer, buffer + offset, size ); - partial_data_pos += offset; - pos -= offset; - stream_pos -= offset; - for( int i = 0; i < num_prev_positions; ++i ) - if( prev_positions[i] >= 0 ) prev_positions[i] -= offset; - for( int i = 0; i < dictionary_size_; ++i ) - if( prev_pos_chain[i] >= 0 ) prev_pos_chain[i] -= offset; - read_block(); - } - } - } - - int Fmatchfinder::longest_match_len( int * const distance ) throw() { int len_limit = match_len_limit_; @@ -130,28 +39,26 @@ int Fmatchfinder::longest_match_len( int * const distance ) throw() if( len_limit < 4 ) return 0; } - const uint8_t * const data = buffer + pos; - key4 = ( ( key4 << 4 ) ^ data[3] ) & ( num_prev_positions - 1 ); + key4 = ( ( key4 << 4 ) ^ buffer[pos+3] ) & ( num_prev_pos - 1 ); int newpos = prev_positions[key4]; prev_positions[key4] = pos; - int32_t * ptr0 = prev_pos_chain + cyclic_pos; + int32_t * ptr0 = pos_array + cyclic_pos; int maxlen = 0; for( int count = 4; ; ) { if( newpos < (pos - dictionary_size_ + 1) || newpos < 0 || --count < 0 ) { *ptr0 = -1; break; } - const uint8_t * const newdata = buffer + newpos; int len = 0; - if( newdata[maxlen] == data[maxlen] ) - while( len < len_limit && newdata[len] == data[len] ) ++len; + if( buffer[maxlen+newpos] == buffer[maxlen+pos] ) + while( len < len_limit && buffer[len+newpos] == buffer[len+pos] ) ++len; const int delta = pos - newpos; if( maxlen < len ) { maxlen = len; *distance = delta - 1; } - int32_t * const newptr = prev_pos_chain + + int32_t * const newptr = pos_array + ( cyclic_pos - delta + ( ( cyclic_pos >= delta ) ? 0 : dictionary_size_ ) ); @@ -167,6 +74,8 @@ int Fmatchfinder::longest_match_len( int * const distance ) throw() break; } } + if( maxlen == match_len_limit_ && maxlen < max_match_len ) + maxlen += true_match_len( maxlen, *distance + 1, max_match_len - maxlen ); return maxlen; } @@ -180,68 +89,86 @@ void Fmatchfinder::longest_match_len() throw() if( len_limit < 4 ) return; } - const uint8_t * const data = buffer + pos; - key4 = ( ( key4 << 4 ) ^ data[3] ) & ( num_prev_positions - 1 ); + key4 = ( ( key4 << 4 ) ^ buffer[pos+3] ) & ( num_prev_pos - 1 ); const int newpos = prev_positions[key4]; prev_positions[key4] = pos; - int32_t * const ptr0 = prev_pos_chain + cyclic_pos; + int32_t * const ptr0 = pos_array + cyclic_pos; - if( newpos < (pos - dictionary_size_ + 1) || newpos < 0 ) *ptr0 = -1; + if( newpos < (pos - dictionary_size_ + 1) || newpos < 0 ) + *ptr0 = -1; + else if( buffer[len_limit-1+newpos] != buffer[len_limit-1+pos] || + std::memcmp( buffer + newpos, buffer + pos, len_limit - 1 ) ) + *ptr0 = newpos; else { - const uint8_t * const newdata = buffer + newpos; - if( newdata[len_limit-1] != data[len_limit-1] || - std::memcmp( newdata, data, len_limit - 1 ) ) *ptr0 = newpos; - else - { - int idx = cyclic_pos - pos + newpos; - if( idx < 0 ) idx += dictionary_size_; - *ptr0 = prev_pos_chain[idx]; - } + int idx = cyclic_pos - pos + newpos; + if( idx < 0 ) idx += dictionary_size_; + *ptr0 = pos_array[idx]; } } -// Return value == number of bytes advanced (len). -// *disp returns the distance to encode. -// ( *disp == -1 && len == 1 ) means literal. -int FLZ_encoder::sequence_optimizer( const int reps[num_rep_distances], - int * const disp, const State & state ) +void FLZ_encoder::sequence_optimizer( int reps[num_rep_distances], + State & state ) { - const int main_len = read_match_distances(); + int match_distance; + const int main_len = fmatchfinder.longest_match_len( &match_distance ); + const int pos_state = fmatchfinder.data_position() & pos_state_mask; + int dis = 0; + int len = 0; - int replen = 0; - int rep_index = 0; for( int i = 0; i < num_rep_distances; ++i ) { - const int len = fmatchfinder.true_match_len( 0, reps[i] + 1, max_match_len ); - if( len > replen ) { replen = len; rep_index = i; } + const int tlen = + fmatchfinder.true_match_len( 0, reps[i] + 1, max_match_len ); + if( tlen > len ) { len = tlen; dis = i; } } - if( replen > min_match_len && replen + 4 > main_len ) + if( len > min_match_len && len + 4 > main_len ) { - *disp = rep_index; - move_pos( replen, true ); - return replen; + crc32.update( crc_, fmatchfinder.ptr_to_current_pos(), len ); + mtf_reps( dis, reps ); + range_encoder.encode_bit( bm_match[state()][pos_state], 1 ); + range_encoder.encode_bit( bm_rep[state()], 1 ); + const bool bit = ( dis == 0 ); + range_encoder.encode_bit( bm_rep0[state()], !bit ); + if( bit ) + range_encoder.encode_bit( bm_len[state()][pos_state], 1 ); + else + { + range_encoder.encode_bit( bm_rep1[state()], dis > 1 ); + if( dis > 1 ) + range_encoder.encode_bit( bm_rep2[state()], dis > 2 ); + } + rep_match_len_encoder.encode( range_encoder, len, pos_state ); + state.set_rep(); + move_pos( len ); + return; } if( main_len > min_match_len || ( main_len == min_match_len && match_distance < modeled_distances ) ) { - *disp = num_rep_distances + match_distance; - move_pos( main_len, true ); - return main_len; + crc32.update( crc_, fmatchfinder.ptr_to_current_pos(), main_len ); + dis = match_distance; + mtf_reps( dis + num_rep_distances, reps ); + range_encoder.encode_bit( bm_match[state()][pos_state], 1 ); + range_encoder.encode_bit( bm_rep[state()], 0 ); + encode_pair( dis, main_len, pos_state ); + state.set_match(); + move_pos( main_len ); + return; } + const uint8_t prev_byte = fmatchfinder[-1]; const uint8_t cur_byte = fmatchfinder[0]; const uint8_t match_byte = fmatchfinder[-reps[0]-1]; + crc32.update( crc_, cur_byte ); + fmatchfinder.move_pos(); - *disp = -1; if( match_byte == cur_byte ) { - const uint8_t prev_byte = fmatchfinder[-1]; - const int pos_state = fmatchfinder.data_position() & pos_state_mask; int price = price0( bm_match[state()][pos_state] ); if( state.is_char() ) price += literal_encoder.price_symbol( prev_byte, cur_byte ); @@ -251,44 +178,25 @@ int FLZ_encoder::sequence_optimizer( const int reps[num_rep_distances], price1( bm_rep[state()] ) + price0( bm_rep0[state()] ) + price0( bm_len[state()][pos_state] ); - if( short_rep_price < price ) *disp = 0; + if( short_rep_price < price ) + { + range_encoder.encode_bit( bm_match[state()][pos_state], 1 ); + range_encoder.encode_bit( bm_rep[state()], 1 ); + range_encoder.encode_bit( bm_rep0[state()], 0 ); + range_encoder.encode_bit( bm_len[state()][pos_state], 0 ); + state.set_short_rep(); + return; + } } - fmatchfinder.move_pos(); - return 1; - } - - - // End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) -void FLZ_encoder::full_flush( const State & state ) - { - const int pos_state = fmatchfinder.data_position() & pos_state_mask; - range_encoder.encode_bit( bm_match[state()][pos_state], 1 ); - range_encoder.encode_bit( bm_rep[state()], 0 ); - encode_pair( 0xFFFFFFFFU, min_match_len, pos_state ); - range_encoder.flush(); - File_trailer trailer; - trailer.data_crc( crc() ); - trailer.data_size( fmatchfinder.data_position() ); - trailer.member_size( range_encoder.member_position() + File_trailer::size() ); - for( int i = 0; i < File_trailer::size(); ++i ) - range_encoder.put_byte( trailer.data[i] ); - range_encoder.flush_data(); - } - - -FLZ_encoder::FLZ_encoder( Fmatchfinder & mf, const File_header & header, - const int outfd ) - : - crc_( 0xFFFFFFFFU ), - fmatchfinder( mf ), - range_encoder( outfd ), - len_encoder( fmatchfinder.match_len_limit() ), - rep_match_len_encoder( fmatchfinder.match_len_limit() ), - num_dis_slots( 2 * real_bits( fmatchfinder.dictionary_size() - 1 ) ) - { - for( int i = 0; i < File_header::size; ++i ) - range_encoder.put_byte( header.data[i] ); + // literal byte + range_encoder.encode_bit( bm_match[state()][pos_state], 0 ); + if( state.is_char() ) + literal_encoder.encode( range_encoder, prev_byte, cur_byte ); + else + literal_encoder.encode_matched( range_encoder, + prev_byte, cur_byte, match_byte ); + state.set_char(); } @@ -304,77 +212,21 @@ bool FLZ_encoder::encode_member( const long long member_size ) range_encoder.member_position() != File_header::size ) return false; // can be called only once - if( !fmatchfinder.finished() ) // encode first byte + if( !fmatchfinder.finished() ) // encode first byte { const uint8_t prev_byte = 0; const uint8_t cur_byte = fmatchfinder[0]; range_encoder.encode_bit( bm_match[state()][0], 0 ); literal_encoder.encode( range_encoder, prev_byte, cur_byte ); crc32.update( crc_, cur_byte ); - move_pos( 1 ); + fmatchfinder.longest_match_len(); + fmatchfinder.move_pos(); } - while( true ) - { - if( fmatchfinder.finished() ) { full_flush( state ); return true; } - - const int pos_state = fmatchfinder.data_position() & pos_state_mask; - int dis; - const int len = sequence_optimizer( rep_distances, &dis, state ); - if( len <= 0 ) return false; + while( !fmatchfinder.finished() && + range_encoder.member_position() < member_size_limit ) + sequence_optimizer( rep_distances, state ); - bool bit = ( dis < 0 && len == 1 ); - range_encoder.encode_bit( bm_match[state()][pos_state], !bit ); - if( bit ) // literal byte - { - const uint8_t prev_byte = fmatchfinder[-len-1]; - const uint8_t cur_byte = fmatchfinder[-len]; - crc32.update( crc_, cur_byte ); - if( state.is_char() ) - literal_encoder.encode( range_encoder, prev_byte, cur_byte ); - else - { - const uint8_t match_byte = fmatchfinder[-len-rep_distances[0]-1]; - literal_encoder.encode_matched( range_encoder, - prev_byte, cur_byte, match_byte ); - } - state.set_char(); - } - else // match or repeated match - { - crc32.update( crc_, fmatchfinder.ptr_to_current_pos() - len, len ); - mtf_reps( dis, rep_distances ); - bit = ( dis < num_rep_distances ); - range_encoder.encode_bit( bm_rep[state()], bit ); - if( bit ) - { - bit = ( dis == 0 ); - range_encoder.encode_bit( bm_rep0[state()], !bit ); - if( bit ) - range_encoder.encode_bit( bm_len[state()][pos_state], len > 1 ); - else - { - range_encoder.encode_bit( bm_rep1[state()], dis > 1 ); - if( dis > 1 ) - range_encoder.encode_bit( bm_rep2[state()], dis > 2 ); - } - if( len == 1 ) state.set_short_rep(); - else - { - rep_match_len_encoder.encode( range_encoder, len, pos_state ); - state.set_rep(); - } - } - else - { - encode_pair( dis - num_rep_distances, len, pos_state ); - state.set_match(); - } - } - if( range_encoder.member_position() >= member_size_limit ) - { - full_flush( state ); - return true; - } - } + full_flush( fmatchfinder.data_position(), state ); + return true; } diff --git a/fast_encoder.h b/fast_encoder.h index 188eb92..ce5bae5 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,157 +15,53 @@ along with this program. If not, see . */ -class Fmatchfinder +class Fmatchfinder : public Matchfinder_base { - enum { // bytes to keep in buffer before dictionary - before_size = max_match_len + 1, - // bytes to keep in buffer after pos - after_size = max_match_len, - num_prev_positions = 1 << 16 }; + enum { before = max_match_len + 1, + dict_size = 65536, + dict_factor = 16, + len_limit = 16, + num_prev_pos = 1 << 16, + pos_array_factor = 1 }; - long long partial_data_pos; - uint8_t * buffer; // input buffer - int32_t * const prev_positions; // last seen position of key - int32_t * prev_pos_chain; - int dictionary_size_; // bytes to keep in buffer before pos - int buffer_size; - int pos; // current pos in buffer - int cyclic_pos; // current pos in dictionary int key4; // key made from latest 4 bytes - int stream_pos; // first byte not yet read from file - int pos_limit; // when reached, a new block must be read - const int match_len_limit_; - const int infd; // input file descriptor - bool at_stream_end; // stream_pos shows real end of file - - bool read_block(); public: - Fmatchfinder( const int ifd ); - - ~Fmatchfinder() - { delete[] prev_pos_chain; delete[] prev_positions; std::free( buffer ); } - - uint8_t operator[]( const int i ) const throw() { return buffer[pos+i]; } - int available_bytes() const throw() { return stream_pos - pos; } - long long data_position() const throw() { return partial_data_pos + pos; } - int dictionary_size() const throw() { return dictionary_size_; } - bool finished() const throw() { return at_stream_end && pos >= stream_pos; } - int match_len_limit() const throw() { return match_len_limit_; } - const uint8_t * ptr_to_current_pos() const throw() { return buffer + pos; } - - int true_match_len( const int index, const int distance, int len_limit ) const throw() - { - if( index + len_limit > available_bytes() ) - len_limit = available_bytes() - index; - const uint8_t * const data = buffer + pos + index - distance; - int i = 0; - while( i < len_limit && data[i] == data[i+distance] ) ++i; - return i; - } - - void reset(); - void move_pos(); + explicit Fmatchfinder( const int ifd ) + : + Matchfinder_base( before, dict_size, dict_factor, len_limit, + num_prev_pos, ifd, pos_array_factor ), + key4( 0 ) + {} + + void reset() { Matchfinder_base::reset(); key4 = 0; } int longest_match_len( int * const distance ) throw(); void longest_match_len() throw(); }; -class FLZ_encoder +class FLZ_encoder : public LZ_encoder_base { - enum { max_marker_size = 16, - num_rep_distances = 4 }; // must be 4 - - uint32_t crc_; - - Bit_model bm_match[State::states][pos_states]; - Bit_model bm_rep[State::states]; - Bit_model bm_rep0[State::states]; - Bit_model bm_rep1[State::states]; - Bit_model bm_rep2[State::states]; - Bit_model bm_len[State::states][pos_states]; - Bit_model bm_dis_slot[max_dis_states][1<= num_rep_distances ) - { - for( int i = num_rep_distances - 1; i > 0; --i ) reps[i] = reps[i-1]; - reps[0] = dis - num_rep_distances; - } - else if( dis > 0 ) - { - const int distance = reps[dis]; - for( int i = dis; i > 0; --i ) reps[i] = reps[i-1]; - reps[0] = distance; - } - } - - void encode_pair( const uint32_t dis, const int len, const int pos_state ) throw() - { - len_encoder.encode( range_encoder, len, pos_state ); - const int dis_slot = dis_slots[dis]; - range_encoder.encode_tree( bm_dis_slot[get_dis_state(len)], dis_slot, dis_slot_bits ); - - if( dis_slot >= start_dis_model ) - { - const int direct_bits = ( dis_slot >> 1 ) - 1; - const uint32_t base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - const uint32_t direct_dis = dis - base; - - if( dis_slot < end_dis_model ) - range_encoder.encode_tree_reversed( bm_dis + base - dis_slot, - direct_dis, direct_bits ); - else - { - range_encoder.encode( direct_dis >> dis_align_bits, direct_bits - dis_align_bits ); - range_encoder.encode_tree_reversed( bm_align, direct_dis, dis_align_bits ); - } - } - } - - int read_match_distances() throw() - { - int len = fmatchfinder.longest_match_len( &match_distance ); - if( len == fmatchfinder.match_len_limit() ) - len += fmatchfinder.true_match_len( len, match_distance + 1, max_match_len - len ); - return len; - } - - void move_pos( int n, bool skip = false ) + void move_pos( int n ) { + if( --n >= 0 ) fmatchfinder.move_pos(); while( --n >= 0 ) { - if( skip ) skip = false; - else fmatchfinder.longest_match_len(); + fmatchfinder.longest_match_len(); fmatchfinder.move_pos(); } } - int sequence_optimizer( const int reps[num_rep_distances], - int * const disp, const State & state ); - - void full_flush( const State & state ); + void sequence_optimizer( int reps[num_rep_distances], State & state ); public: - FLZ_encoder( Fmatchfinder & mf, const File_header & header, const int outfd ); + FLZ_encoder( Fmatchfinder & mf, const File_header & header, const int outfd ) + : + LZ_encoder_base( header, mf.dictionary_size(), mf.match_len_limit(), outfd ), + fmatchfinder( mf ) + {} bool encode_member( const long long member_size ); - - long long member_position() const throw() - { return range_encoder.member_position(); } }; diff --git a/lzip.h b/lzip.h index f789ba3..9776710 100644 --- a/lzip.h +++ b/lzip.h @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -179,6 +179,7 @@ inline int real_bits( const int value ) throw() return bits; } + const uint8_t magic_string[4] = { 'L', 'Z', 'I', 'P' }; struct File_header @@ -273,14 +274,14 @@ struct File_trailer struct Error { const char * const msg; - Error( const char * const s ) throw() : msg( s ) {} + explicit Error( const char * const s ) throw() : msg( s ) {} }; // defined in main.cc void show_error( const char * const msg, const int errcode = 0, const bool help = false ) throw(); -void internal_error( const char * const msg ); +void internal_error( const char * const msg ) throw(); // defined in decoder.cc int readblock( const int fd, uint8_t * const buf, const int size ) throw(); diff --git a/main.cc b/main.cc index 2de410b..3e95eb9 100644 --- a/main.cc +++ b/main.cc @@ -1,5 +1,5 @@ /* Lzip - Data compressor based on the LZMA algorithm - Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. + Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -77,7 +77,7 @@ namespace { const char * const Program_name = "Lzip"; const char * const program_name = "lzip"; -const char * const program_year = "2011"; +const char * const program_year = "2012"; const char * invocation_name = 0; #ifdef O_BINARY @@ -135,6 +135,10 @@ void show_help() throw() "from standard input to standard output.\n" "Numbers may be followed by a multiplier: k = kB = 10^3 = 1000,\n" "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc...\n" + "The bidimensional parameter space of LZMA can't be mapped to a linear\n" + "scale optimal for all files. If your files are large, very repetitive,\n" + "etc, you may need to use the --match-length and --dictionary-size\n" + "options directly to achieve optimal performance.\n" "\nReport bugs to lzip-bug@nongnu.org\n" "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" ); } @@ -254,7 +258,7 @@ int open_instream( const std::string & name, struct stat * const in_statsp, if( program_mode == m_compress && !recompress && eindex >= 0 ) { if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Input file `%s' already has `%s' suffix.\n", + std::fprintf( stderr, "%s: Input file '%s' already has '%s' suffix.\n", program_name, name.c_str(), known_extensions[eindex].from ); } @@ -264,7 +268,7 @@ int open_instream( const std::string & name, struct stat * const in_statsp, if( infd < 0 ) { if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Can't open input file `%s': %s.\n", + std::fprintf( stderr, "%s: Can't open input file '%s': %s.\n", program_name, name.c_str(), std::strerror( errno ) ); } else @@ -277,10 +281,10 @@ int open_instream( const std::string & name, struct stat * const in_statsp, if( i != 0 || ( !S_ISREG( mode ) && ( !to_stdout || !can_read ) ) ) { if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Input file `%s' is not a regular file%s.\n", + std::fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n", program_name, name.c_str(), ( can_read && !to_stdout ) ? - " and `--stdout' was not specified" : "" ); + " and '--stdout' was not specified" : "" ); close( infd ); infd = -1; } @@ -312,7 +316,7 @@ void set_d_outname( const std::string & name, const int i ) throw() } output_filename = name; output_filename += ".out"; if( verbosity >= 1 ) - std::fprintf( stderr, "%s: Can't guess original name for `%s' -- using `%s'.\n", + std::fprintf( stderr, "%s: Can't guess original name for '%s' -- using '%s'.\n", program_name, name.c_str(), output_filename.c_str() ); } @@ -326,10 +330,10 @@ bool open_outstream( const bool force ) throw() if( outfd < 0 && verbosity >= 0 ) { if( errno == EEXIST ) - std::fprintf( stderr, "%s: Output file `%s' already exists, skipping.\n", + std::fprintf( stderr, "%s: Output file '%s' already exists, skipping.\n", program_name, output_filename.c_str() ); else - std::fprintf( stderr, "%s: Can't create output file `%s': %s.\n", + std::fprintf( stderr, "%s: Can't create output file '%s': %s.\n", program_name, output_filename.c_str(), std::strerror( errno ) ); } return ( outfd >= 0 ); @@ -359,7 +363,7 @@ void cleanup_and_fail( const int retval ) throw() { delete_output_on_interrupt = false; if( verbosity >= 0 ) - std::fprintf( stderr, "%s: Deleting output file `%s', if it exists.\n", + std::fprintf( stderr, "%s: Deleting output file '%s', if it exists.\n", program_name, output_filename.c_str() ); if( outfd >= 0 ) { close( outfd ); outfd = -1; } if( std::remove( output_filename.c_str() ) != 0 && errno != ENOENT ) @@ -412,14 +416,14 @@ int compress( const long long member_size, const long long volume_size, const Lzma_options & encoder_options, const int infd, const Pretty_print & pp, const struct stat * const in_statsp ) { - if( verbosity >= 1 ) pp(); + int retval = 0; File_header header; header.set_magic(); + if( verbosity >= 1 ) pp(); if( !header.dictionary_size( encoder_options.dictionary_size ) || encoder_options.match_len_limit < min_match_len_limit || encoder_options.match_len_limit > max_match_len ) internal_error( "invalid argument to encoder" ); - int retval = 0; try { Matchfinder matchfinder( header.dictionary_size(), @@ -480,10 +484,10 @@ int fcompress( const long long member_size, const long long volume_size, const int infd, const Pretty_print & pp, const struct stat * const in_statsp ) { - if( verbosity >= 1 ) pp(); + int retval = 0; File_header header; header.set_magic(); - int retval = 0; + if( verbosity >= 1 ) pp(); try { Fmatchfinder fmatchfinder( infd ); @@ -558,7 +562,7 @@ void show_trailing_garbage( const uint8_t * const data, const int size, if( !std::isprint( data[i] ) ) { text = false; break; } if( text ) { - garbage_msg += '`'; + garbage_msg += '\''; garbage_msg.append( (const char *)data, size ); garbage_msg += '\''; } @@ -585,10 +589,8 @@ int decompress( const int infd, const Pretty_print & pp, const bool testing ) for( bool first_member = true; ; first_member = false, pp.reset() ) { File_header header; - int size; rdec.reset_member_position(); - for( size = 0; size < File_header::size && !rdec.finished(); ++size ) - header.data[size] = rdec.get_byte(); + const int size = rdec.read( header.data, File_header::size ); if( rdec.finished() ) // End Of File { if( first_member ) @@ -625,8 +627,8 @@ int decompress( const int infd, const Pretty_print & pp, const bool testing ) header.version(), format_num( header.dictionary_size() ) ); } - LZ_decoder decoder( header, rdec, outfd ); + LZ_decoder decoder( header, rdec, outfd ); const int result = decoder.decode_member( pp ); partial_file_pos += rdec.member_position(); if( result != 0 ) @@ -690,13 +692,13 @@ void show_error( const char * const msg, const int errcode, const bool help ) th std::fprintf( stderr, "\n" ); } if( help && invocation_name && invocation_name[0] ) - std::fprintf( stderr, "Try `%s --help' for more information.\n", + std::fprintf( stderr, "Try '%s --help' for more information.\n", invocation_name ); } } -void internal_error( const char * const msg ) +void internal_error( const char * const msg ) throw() { if( verbosity >= 0 ) std::fprintf( stderr, "%s: internal error: %s.\n", program_name, msg ); @@ -750,7 +752,6 @@ int main( const int argc, const char * const argv[] ) { 'b', "member-size", Arg_parser::yes }, { 'c', "stdout", Arg_parser::no }, { 'd', "decompress", Arg_parser::no }, - { 'e', "extreme", Arg_parser::no }, { 'f', "force", Arg_parser::no }, { 'F', "recompress", Arg_parser::no }, { 'h', "help", Arg_parser::no }, @@ -785,7 +786,6 @@ int main( const int argc, const char * const argv[] ) case 'b': member_size = getnum( arg, 100000, LLONG_MAX / 2 ); break; case 'c': to_stdout = true; break; case 'd': program_mode = m_decompress; break; - case 'e': break; // ignored by now case 'f': force = true; break; case 'F': recompress = true; break; case 'h': show_help(); return 0; @@ -806,10 +806,18 @@ int main( const int argc, const char * const argv[] ) } // end process options #if defined(__MSVCRT__) || defined(__OS2__) - _setmode( STDIN_FILENO, O_BINARY ); - _setmode( STDOUT_FILENO, O_BINARY ); + _fsetmode( stdin, "b" ); + _fsetmode( stdout, "b" ); #endif + if( program_mode == m_test ) + outfd = -1; + else if( program_mode == m_compress ) + { + dis_slots.init(); + prob_prices.init(); + } + bool filenames_given = false; for( ; argind < parser.arguments(); ++argind ) { @@ -823,13 +831,6 @@ int main( const int argc, const char * const argv[] ) set_signals(); Pretty_print pp( filenames, verbosity ); - if( program_mode == m_test ) - outfd = -1; - else if( program_mode == m_compress ) - { - dis_slots.init(); - prob_prices.init(); - } int retval = 0; for( unsigned int i = 0; i < filenames.size(); ++i ) diff --git a/testsuite/check.sh b/testsuite/check.sh index 69291e2..a82d0de 100755 --- a/testsuite/check.sh +++ b/testsuite/check.sh @@ -1,6 +1,6 @@ #! /bin/sh # check script for Lzip - Data compressor based on the LZMA algorithm -# Copyright (C) 2008, 2009, 2010, 2011 Antonio Diaz Diaz. +# Copyright (C) 2008, 2009, 2010, 2011, 2012 Antonio Diaz Diaz. # # This script is free software: you have unlimited permission # to copy, distribute and modify it. @@ -82,7 +82,7 @@ for i in s4Ki 0 1 2 3 4 5 6 7 8 9 ; do printf . done -"${LZIP}" -$i < in > anyothername || fail=1 +"${LZIP}" < in > anyothername || fail=1 "${LZIP}" -d anyothername || fail=1 cmp in anyothername.out || fail=1 printf . diff --git a/testsuite/unzcrash.cc b/testsuite/unzcrash.cc index b80580f..65d160d 100644 --- a/testsuite/unzcrash.cc +++ b/testsuite/unzcrash.cc @@ -106,7 +106,8 @@ void show_error( const char * const msg, const int errcode = 0, } -void internal_error( const char * const msg ) +void internal_error( const char * const msg ) throw() __attribute__ ((noreturn)); +void internal_error( const char * const msg ) throw() { if( verbosity >= 0 ) std::fprintf( stderr, "%s: internal error: %s.\n", program_name, msg ); @@ -340,7 +341,9 @@ int main( const int argc, const char * const argv[] ) if( !f ) { show_error( "Can't open pipe", errno ); return 1; } std::fwrite( buffer, 1, size, f ); - pclose( f ); + if( pclose( f ) == 0 && verbosity >= 0 ) + std::fprintf( stderr, "0x%02X (0x%02X+0x%02X) passed the test\n", + buffer[i], byte, j + 1 ); } } buffer[i] = byte; -- cgit v1.2.3