From 38f9d48a0b7370eb50a55d5de4b06da954742ea9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 7 Nov 2015 11:05:53 +0100 Subject: Adding upstream version 1.17~pre1. Signed-off-by: Daniel Baumann --- ChangeLog | 8 +- INSTALL | 6 +- Makefile.in | 39 ++-- NEWS | 21 +- README | 13 +- arg_parser.cc | 2 +- arg_parser.h | 2 +- configure | 6 +- decoder.cc | 22 +-- decoder.h | 22 +-- doc/lzip.1 | 7 +- doc/lzip.info | 76 ++++---- doc/lzip.texi | 61 +++--- encoder.cc | 282 +++++++-------------------- encoder.h | 589 ++++++-------------------------------------------------- encoder_base.cc | 180 +++++++++++++++++ encoder_base.h | 487 ++++++++++++++++++++++++++++++++++++++++++++++ fast_encoder.cc | 67 ++++--- fast_encoder.h | 41 ++-- lzip.h | 40 ++-- main.cc | 226 ++++++++-------------- 21 files changed, 1093 insertions(+), 1104 deletions(-) create mode 100644 encoder_base.cc create mode 100644 encoder_base.h diff --git a/ChangeLog b/ChangeLog index 6f1ac88..490e992 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2015-03-26 Antonio Diaz Diaz + + * Version 1.17-pre1 released. + * Reorganization of the compression code. + * Makefile.in: Added new targets 'install*-compress'. + 2014-08-26 Antonio Diaz Diaz * Version 1.16 released. @@ -241,7 +247,7 @@ * Version 0.1 released. -Copyright (C) 2008-2014 Antonio Diaz Diaz. +Copyright (C) 2008-2015 Antonio Diaz Diaz. This file is a collection of facts, and thus it is not copyrightable, but just in case, you have unlimited permission to copy, distribute and diff --git a/INSTALL b/INSTALL index 2da3542..4e5ae2f 100644 --- a/INSTALL +++ b/INSTALL @@ -32,6 +32,10 @@ the main archive. 5. Type 'make install' to install the program and any data files and documentation. + Or type 'make install-compress', which additionally compresses the + info manual and the man page after installation. (Installing + compressed docs may become the default in the future). + You can install only the program, the info manual or the man page by typing 'make install-bin', 'make install-info' or 'make install-man' respectively. @@ -54,7 +58,7 @@ After running 'configure', you can run 'make' and 'make install' as explained above. -Copyright (C) 2008-2014 Antonio Diaz Diaz. +Copyright (C) 2008-2015 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/Makefile.in b/Makefile.in index df11a7d..cfddf4f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -6,10 +6,12 @@ INSTALL_DATA = $(INSTALL) -m 644 INSTALL_DIR = $(INSTALL) -d -m 755 SHELL = /bin/sh -objs = arg_parser.o encoder.o fast_encoder.o decoder.o main.o +objs = arg_parser.o encoder_base.o encoder.o fast_encoder.o decoder.o main.o -.PHONY : all install install-bin install-info install-man install-strip \ +.PHONY : all install install-bin install-info install-man \ + install-strip install-compress install-strip-compress \ + install-bin-strip install-info-compress install-man-compress \ uninstall uninstall-bin uninstall-info uninstall-man \ doc info man check dist clean distclean @@ -18,9 +20,6 @@ all : $(progname) $(progname) : $(objs) $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $(objs) -$(progname)_profiled : $(objs) - $(CXX) $(CXXFLAGS) $(LDFLAGS) -pg -o $@ $(objs) - main.o : main.cc $(CXX) $(CXXFLAGS) $(CPPFLAGS) -DPROGVERSION=\"$(pkgversion)\" -c -o $@ $< @@ -30,9 +29,10 @@ main.o : main.cc $(objs) : Makefile arg_parser.o : arg_parser.h decoder.o : lzip.h decoder.h -encoder.o : lzip.h encoder.h -fast_encoder.o : lzip.h encoder.h fast_encoder.h -main.o : arg_parser.h lzip.h decoder.h encoder.h fast_encoder.h +encoder_base.o : lzip.h encoder_base.h +encoder.o : lzip.h encoder_base.h encoder.h +fast_encoder.o : lzip.h encoder_base.h fast_encoder.h +main.o : arg_parser.h lzip.h decoder.h encoder_base.h encoder.h fast_encoder.h doc : info man @@ -54,34 +54,45 @@ check : all @$(VPATH)/testsuite/check.sh $(VPATH)/testsuite $(pkgversion) install : install-bin install-info install-man +install-strip : install-bin-strip install-info install-man +install-compress : install-bin install-info-compress install-man-compress +install-strip-compress : install-bin-strip install-info-compress install-man-compress install-bin : all if [ ! -d "$(DESTDIR)$(bindir)" ] ; then $(INSTALL_DIR) "$(DESTDIR)$(bindir)" ; fi $(INSTALL_PROGRAM) ./$(progname) "$(DESTDIR)$(bindir)/$(progname)" +install-bin-strip : all + $(MAKE) INSTALL_PROGRAM='$(INSTALL_PROGRAM) -s' install-bin + install-info : if [ ! -d "$(DESTDIR)$(infodir)" ] ; then $(INSTALL_DIR) "$(DESTDIR)$(infodir)" ; fi + -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info"* $(INSTALL_DATA) $(VPATH)/doc/$(pkgname).info "$(DESTDIR)$(infodir)/$(pkgname).info" -install-info --info-dir="$(DESTDIR)$(infodir)" "$(DESTDIR)$(infodir)/$(pkgname).info" +install-info-compress : install-info + lzip -v -9 "$(DESTDIR)$(infodir)/$(pkgname).info" + install-man : if [ ! -d "$(DESTDIR)$(mandir)/man1" ] ; then $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" ; fi + -rm -f "$(DESTDIR)$(mandir)/man1/$(progname).1"* $(INSTALL_DATA) $(VPATH)/doc/$(progname).1 "$(DESTDIR)$(mandir)/man1/$(progname).1" -install-strip : all - $(MAKE) INSTALL_PROGRAM='$(INSTALL_PROGRAM) -s' install +install-man-compress : install-man + lzip -v -9 "$(DESTDIR)$(mandir)/man1/$(progname).1" -uninstall : uninstall-bin uninstall-info uninstall-man +uninstall : uninstall-man uninstall-info uninstall-bin uninstall-bin : -rm -f "$(DESTDIR)$(bindir)/$(progname)" uninstall-info : -install-info --info-dir="$(DESTDIR)$(infodir)" --remove "$(DESTDIR)$(infodir)/$(pkgname).info" - -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info" + -rm -f "$(DESTDIR)$(infodir)/$(pkgname).info"* uninstall-man : - -rm -f "$(DESTDIR)$(mandir)/man1/$(progname).1" + -rm -f "$(DESTDIR)$(mandir)/man1/$(progname).1"* dist : doc ln -sf $(VPATH) $(DISTNAME) @@ -106,7 +117,7 @@ dist : doc lzip -v -9 $(DISTNAME).tar clean : - -rm -f $(progname) $(progname)_profiled $(objs) + -rm -f $(progname) $(objs) distclean : clean -rm -f Makefile config.status *.tar *.tar.lz diff --git a/NEWS b/NEWS index c8bf7a9..7949a14 100644 --- a/NEWS +++ b/NEWS @@ -1,17 +1,8 @@ -Changes in version 1.16: +Changes in version 1.17: -Compression ratio of option -9 has been slightly increased. +The compression code has been reorganized to ease the porting of the +fast encoder to clzip and lzlib. -Compression time has been reduced by 4%. - -"lzip -0" is now comparable in compression speed and ratio to "gzip -6". - -Copying of file dates, permissions, and ownership now behaves like "cp -p". -(If the user ID or the group ID can't be duplicated, the file permission -bits S_ISUID and S_ISGID are cleared). - -Some minor improvements have been made. - -"lzip.texinfo" has been renamed to "lzip.texi". - -The license has been changed to GPL version 2 or later. +The targets "install-compress", "install-strip-compress", +"install-info-compress" and "install-man-compress" have been added to +the Makefile. diff --git a/README b/README index 356c5e9..0db23e7 100644 --- a/README +++ b/README @@ -5,8 +5,9 @@ one of gzip or bzip2. Lzip is about as fast as gzip, compresses most files more than bzip2, and is better than both from a data recovery perspective. Lzip is a clean implementation of the LZMA "algorithm". -The lzip file format is designed for long-term data archiving, taking -into account both data integrity and decoder availability: +The lzip file format is designed for data sharing and long-term +archiving, taking into account both data integrity and decoder +availability: * The lzip format provides very safe integrity checking and some data recovery means. The lziprecover program can repair bit-flip errors @@ -21,8 +22,8 @@ into account both data integrity and decoder availability: extract the data from a lzip file long after quantum computers eventually render LZMA obsolete. - * Additionally lzip is copylefted, which guarantees that it will - remain free forever. + * Additionally the lzip reference implementation is copylefted, which + guarantees that it will remain free forever. A nice feature of the lzip format is that a corrupt byte is easier to repair the nearer it is from the beginning of the file. Therefore, with @@ -81,7 +82,7 @@ There is no such thing as a "LZMA algorithm"; it is more like a "LZMA coding scheme". For example, the option '-0' of lzip uses the scheme in almost the simplest way possible; issuing the longest match it can find, or a literal byte if it can't find a match. Inversely, a much more -elaborated way of finding coding sequences of minimum price than the one +elaborated way of finding coding sequences of minimum size than the one currently used by lzip could be developed, and the resulting sequence could also be coded using the LZMA coding scheme. @@ -101,7 +102,7 @@ range encoding), Igor Pavlov (for putting all the above together in LZMA), and Julian Seward (for bzip2's CLI). -Copyright (C) 2008-2014 Antonio Diaz Diaz. +Copyright (C) 2008-2015 Antonio Diaz Diaz. This file is free documentation: you have unlimited permission to copy, distribute and modify it. diff --git a/arg_parser.cc b/arg_parser.cc index 74f9298..55764bd 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2014 Antonio Diaz Diaz. + Copyright (C) 2006-2015 Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/arg_parser.h b/arg_parser.h index d80c353..2e8731c 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006-2014 Antonio Diaz Diaz. + Copyright (C) 2006-2015 Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/configure b/configure index 0483e78..0bc2585 100755 --- a/configure +++ b/configure @@ -1,12 +1,12 @@ #! /bin/sh # configure script for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2014 Antonio Diaz Diaz. +# Copyright (C) 2008-2015 Antonio Diaz Diaz. # # This configure script is free software: you have unlimited permission # to copy, distribute and modify it. pkgname=lzip -pkgversion=1.16 +pkgversion=1.17-pre1 progname=lzip srctrigger=doc/${pkgname}.texi @@ -165,7 +165,7 @@ echo "LDFLAGS = ${LDFLAGS}" rm -f Makefile cat > Makefile << EOF # Makefile for Lzip - LZMA lossless data compressor -# Copyright (C) 2008-2014 Antonio Diaz Diaz. +# Copyright (C) 2008-2015 Antonio Diaz Diaz. # This file was generated automatically by configure. Do not edit. # # This Makefile is free software: you have unlimited permission diff --git a/decoder.cc b/decoder.cc index 0784011..d10bc09 100644 --- a/decoder.cc +++ b/decoder.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -62,7 +62,7 @@ int readblock( const int fd, uint8_t * const buf, const int size ) { const int n = read( fd, buf + sz, size - sz ); if( n > 0 ) sz += n; - else if( n == 0 ) break; // EOF + else if( n == 0 ) break; /* EOF */ else if( errno != EINTR ) break; errno = 0; } @@ -201,9 +201,9 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) Bit_model bm_align[dis_align_size]; Len_model match_len_model; Len_model rep_len_model; - unsigned rep0 = 0; // rep[0-3] latest four distances - unsigned rep1 = 0; // used for efficient coding of - unsigned rep2 = 0; // repeated distances + unsigned rep0 = 0; /* rep[0-3] latest four distances */ + unsigned rep1 = 0; /* used for efficient coding of */ + unsigned rep2 = 0; /* repeated distances */ unsigned rep3 = 0; State state; @@ -213,7 +213,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { - const uint8_t prev_byte = get_prev_byte(); + const uint8_t prev_byte = peek1(); if( state.is_char() ) { state.set_char1(); @@ -223,7 +223,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) { state.set_char2(); put_byte( rdec.decode_matched( bm_literal[get_lit_state(prev_byte)], - get_byte( rep0 ) ) ); + peek( rep0 ) ) ); } } else @@ -250,7 +250,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) else { if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } } state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); @@ -272,16 +272,16 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) { rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed4( bm_align ); - if( rep0 == 0xFFFFFFFFU ) // Marker found + if( rep0 == 0xFFFFFFFFU ) /* marker found */ { rep0 = rep0_saved; rdec.normalize(); flush_data(); - if( len == min_match_len ) // End Of Stream marker + if( len == min_match_len ) /* End Of Stream marker */ { if( verify_trailer( pp ) ) return 0; else return 3; } - if( len == min_match_len + 1 ) // Sync Flush marker + if( len == min_match_len + 1 ) /* Sync Flush marker */ { rdec.load(); continue; } diff --git a/decoder.h b/decoder.h index 623e19b..c445ccf 100644 --- a/decoder.h +++ b/decoder.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,12 +19,12 @@ class Range_decoder { enum { buffer_size = 16384 }; unsigned long long partial_member_pos; - uint8_t * const buffer; // input buffer - int pos; // current pos in buffer - int stream_pos; // when reached, a new block must be read + uint8_t * const buffer; /* input buffer */ + int pos; /* current pos in buffer */ + int stream_pos; /* when reached, a new block must be read */ uint32_t code; uint32_t range; - const int infd; // input file descriptor + const int infd; /* input file descriptor */ bool at_stream_end; bool read_block(); @@ -213,23 +213,23 @@ class LZ_decoder Range_decoder & rdec; const unsigned dictionary_size; const int buffer_size; - uint8_t * const buffer; // output buffer - int pos; // current pos in buffer - int stream_pos; // first byte not yet written to file + uint8_t * const buffer; /* output buffer */ + int pos; /* current pos in buffer */ + int stream_pos; /* first byte not yet written to file */ uint32_t crc_; - const int outfd; // output file descriptor + const int outfd; /* output file descriptor */ const int member_version; void flush_data(); bool verify_trailer( const Pretty_print & pp ) const; - uint8_t get_prev_byte() const + uint8_t peek1() const { const int i = ( ( pos > 0 ) ? pos : buffer_size ) - 1; return buffer[i]; } - uint8_t get_byte( const int distance ) const + uint8_t peek( const int distance ) const { int i = pos - distance - 1; if( i < 0 ) i += buffer_size; diff --git a/doc/lzip.1 b/doc/lzip.1 index 0c4ed50..7160512 100644 --- a/doc/lzip.1 +++ b/doc/lzip.1 @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1. -.TH LZIP "1" "August 2014" "lzip 1.16" "User Commands" +.TH LZIP "1" "March 2015" "lzip 1.17-pre1" "User Commands" .SH NAME lzip \- reduces the size of files .SH SYNOPSIS @@ -70,8 +70,7 @@ Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc... The bidimensional parameter space of LZMA can't be mapped to a linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the \fB\-\-match\-length\fR and \fB\-\-dictionary\-size\fR -options directly to achieve optimal performance. For example, \fB\-9m64\fR -usually compresses executables more (and faster) than \fB\-9\fR. +options directly to achieve optimal performance. .PP Exit status: 0 for a normal exit, 1 for environmental problems (file not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or @@ -82,7 +81,7 @@ Report bugs to lzip\-bug@nongnu.org .br Lzip home page: http://www.nongnu.org/lzip/lzip.html .SH COPYRIGHT -Copyright \(co 2014 Antonio Diaz Diaz. +Copyright \(co 2015 Antonio Diaz Diaz. License GPLv2+: GNU GPL version 2 or later .br This is free software: you are free to change and redistribute it. diff --git a/doc/lzip.info b/doc/lzip.info index a2d18bc..dd8858b 100644 --- a/doc/lzip.info +++ b/doc/lzip.info @@ -11,7 +11,7 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir) Lzip Manual *********** -This manual is for Lzip (version 1.16, 26 August 2014). +This manual is for Lzip (version 1.17-pre1, 26 March 2015). * Menu: @@ -26,7 +26,7 @@ This manual is for Lzip (version 1.16, 26 August 2014). * Concept index:: Index of concepts - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -43,8 +43,9 @@ files more than bzip2, and is better than both from a data recovery perspective. Lzip is a clean implementation of the LZMA (Lempel-Ziv-Markov chain-Algorithm) "algorithm". - The lzip file format is designed for long-term data archiving, taking -into account both data integrity and decoder availability: + The lzip file format is designed for data sharing and long-term +archiving, taking into account both data integrity and decoder +availability: * The lzip format provides very safe integrity checking and some data recovery means. The lziprecover program can repair bit-flip errors @@ -59,8 +60,8 @@ into account both data integrity and decoder availability: archaeologist to extract the data from a lzip file long after quantum computers eventually render LZMA obsolete. - * Additionally lzip is copylefted, which guarantees that it will - remain free forever. + * Additionally the lzip reference implementation is copylefted, which + guarantees that it will remain free forever. A nice feature of the lzip format is that a corrupt byte is easier to repair the nearer it is from the beginning of the file. Therefore, with @@ -144,7 +145,7 @@ There is no such thing as a "LZMA algorithm"; it is more like a "LZMA coding scheme". For example, the option '-0' of lzip uses the scheme in almost the simplest way possible; issuing the longest match it can find, or a literal byte if it can't find a match. Inversely, a much more -elaborated way of finding coding sequences of minimum price than the one +elaborated way of finding coding sequences of minimum size than the one currently used by lzip could be developed, and the resulting sequence could also be coded using the LZMA coding scheme. @@ -319,8 +320,7 @@ The format for running lzip is: linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the '--match-length' and '--dictionary-size' options directly to achieve optimal - performance. For example, '-9m64' usually compresses executables - more (and faster) than '-9'. + performance. Level Dictionary size Match length limit -0 64 KiB 16 bytes @@ -404,21 +404,19 @@ additional information before, between, or after them. now. 'DS (coded dictionary size, 1 byte)' - Lzip divides the distance between any two powers of 2 into 8 - equally spaced intervals, named "wedges". The dictionary size is - calculated by taking a power of 2 (the base size) and substracting - from it a number of wedges between 0 and 7. The size of a wedge is - (base_size / 16). + The dictionary size is calculated by taking a power of 2 (the base + size) and substracting from it a fraction between 0/16 and 7/16 of + the base size. Bits 4-0 contain the base 2 logarithm of the base size (12 to 29). - Bits 7-5 contain the number of wedges (0 to 7) to substract from - the base size to obtain the dictionary size. + Bits 7-5 contain the numerator of the fraction (0 to 7) to + substract from the base size to obtain the dictionary size. Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB Valid values for dictionary size range from 4 KiB to 512 MiB. 'Lzma stream' The lzma stream, finished by an end of stream marker. Uses default - values for encoder properties. See the chapter 'Stream format' - (*note Stream format::) for a complete description. + values for encoder properties. *Note Stream format::, for a + complete description. 'CRC32 (4 bytes)' CRC of the uncompressed original data. @@ -741,7 +739,7 @@ Appendix A Reference source code ******************************** /* Lzd - Educational decompressor for lzip files - Copyright (C) 2013, 2014 Antonio Diaz Diaz. + Copyright (C) 2013-2015 Antonio Diaz Diaz. This program is free software: you have unlimited permission to copy, distribute and modify it. @@ -979,7 +977,7 @@ class LZ_decoder void flush_data(); - uint8_t get_byte( const unsigned distance ) const + uint8_t peek( const unsigned distance ) const { unsigned i = pos - distance - 1; if( pos <= distance ) i += dictionary_size; @@ -1053,13 +1051,13 @@ bool LZ_decoder::decode_member() // Returns false if error const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { - const uint8_t prev_byte = get_byte( 0 ); + const uint8_t prev_byte = peek( 0 ); const int literal_state = prev_byte >> ( 8 - literal_context_bits ); Bit_model * const bm = bm_literal[literal_state]; if( state.is_char() ) put_byte( rdec.decode_tree( bm, 8 ) ); else - put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) ); + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); state.set_char(); } else @@ -1086,7 +1084,7 @@ bool LZ_decoder::decode_member() // Returns false if error else { if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } } state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); @@ -1110,7 +1108,7 @@ bool LZ_decoder::decode_member() // Returns false if error { rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); - if( rep0 == 0xFFFFFFFFU ) // Marker found + if( rep0 == 0xFFFFFFFFU ) // marker found { flush_data(); return ( len == min_match_len ); // End Of Stream marker @@ -1121,7 +1119,7 @@ bool LZ_decoder::decode_member() // Returns false if error if( rep0 >= dictionary_size || rep0 >= data_position() ) { flush_data(); return false; } } - for( int i = 0; i < len; ++i ) put_byte( get_byte( rep0 ) ); + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); } } flush_data(); @@ -1140,7 +1138,7 @@ int main( const int argc, const char * const argv[] ) "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" - "\nCopyright (C) 2014 Antonio Diaz Diaz.\n" + "\nCopyright (C) 2015 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" @@ -1152,19 +1150,13 @@ int main( const int argc, const char * const argv[] ) { File_header header; for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); - if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 ) + if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { if( first_member ) { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); return 2; } break; } - if( header[4] != 1 ) - { - std::fprintf( stderr, "Version %d member format not supported.\n", - header[4] ); - return 2; - } unsigned dict_size = 1 << ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) @@ -1217,15 +1209,15 @@ Concept index  Tag Table: Node: Top208 -Node: Introduction1022 -Node: Algorithm5992 -Node: Invoking lzip8750 -Node: File format14426 -Node: Stream format16974 -Node: Examples26406 -Node: Problems28363 -Node: Reference source code28893 -Node: Concept index42410 +Node: Introduction1026 +Node: Algorithm6037 +Node: Invoking lzip8794 +Node: File format14384 +Node: Stream format16769 +Node: Examples26201 +Node: Problems28158 +Node: Reference source code28688 +Node: Concept index42025  End Tag Table diff --git a/doc/lzip.texi b/doc/lzip.texi index 037dd6e..58d6f9a 100644 --- a/doc/lzip.texi +++ b/doc/lzip.texi @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 26 August 2014 -@set VERSION 1.16 +@set UPDATED 26 March 2015 +@set VERSION 1.17-pre1 @dircategory Data Compression @direntry @@ -47,7 +47,7 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}). @end menu @sp 1 -Copyright @copyright{} 2008-2014 Antonio Diaz Diaz. +Copyright @copyright{} 2008-2015 Antonio Diaz Diaz. This manual is free documentation: you have unlimited permission to copy, distribute and modify it. @@ -63,8 +63,9 @@ files more than bzip2, and is better than both from a data recovery perspective. Lzip is a clean implementation of the LZMA (Lempel-Ziv-Markov chain-Algorithm) "algorithm". -The lzip file format is designed for long-term data archiving, taking -into account both data integrity and decoder availability: +The lzip file format is designed for data sharing and long-term +archiving, taking into account both data integrity and decoder +availability: @itemize @bullet @item @@ -83,8 +84,8 @@ data from a lzip file long after quantum computers eventually render LZMA obsolete. @item -Additionally lzip is copylefted, which guarantees that it will remain -free forever. +Additionally the lzip reference implementation is copylefted, which +guarantees that it will remain free forever. @end itemize A nice feature of the lzip format is that a corrupt byte is easier to @@ -169,7 +170,7 @@ There is no such thing as a "LZMA algorithm"; it is more like a "LZMA coding scheme". For example, the option '-0' of lzip uses the scheme in almost the simplest way possible; issuing the longest match it can find, or a literal byte if it can't find a match. Inversely, a much more -elaborated way of finding coding sequences of minimum price than the one +elaborated way of finding coding sequences of minimum size than the one currently used by lzip could be developed, and the resulting sequence could also be coded using the LZMA coding scheme. @@ -344,8 +345,7 @@ The bidimensional parameter space of LZMA can't be mapped to a linear scale optimal for all files. If your files are large, very repetitive, etc, you may need to use the @samp{--match-length} and @samp{--dictionary-size} options directly to achieve optimal -performance. For example, @samp{-9m64} usually compresses executables -more (and faster) than @samp{-9}. +performance. @multitable {Level} {Dictionary size} {Match length limit} @item Level @tab Dictionary size @tab Match length limit @@ -439,20 +439,19 @@ A four byte string, identifying the lzip format, with the value "LZIP" Just in case something needs to be modified in the future. 1 for now. @item DS (coded dictionary size, 1 byte) -Lzip divides the distance between any two powers of 2 into 8 equally -spaced intervals, named "wedges". The dictionary size is calculated by -taking a power of 2 (the base size) and substracting from it a number of -wedges between 0 and 7. The size of a wedge is (base_size / 16).@* +The dictionary size is calculated by taking a power of 2 (the base size) +and substracting from it a fraction between 0/16 and 7/16 of the base +size.@* Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@* -Bits 7-5 contain the number of wedges (0 to 7) to substract from the -base size to obtain the dictionary size.@* +Bits 7-5 contain the numerator of the fraction (0 to 7) to substract +from the base size to obtain the dictionary size.@* Example: 0xD3 = 2^19 - 6 * 2^15 = 512 KiB - 6 * 32 KiB = 320 KiB@* Valid values for dictionary size range from 4 KiB to 512 MiB. @item Lzma stream The lzma stream, finished by an end of stream marker. Uses default -values for encoder properties. See the chapter @samp{Stream format} -(@pxref{Stream format}) for a complete description. +values for encoder properties. @xref{Stream format}, for a complete +description. @item CRC32 (4 bytes) CRC of the uncompressed original data. @@ -805,7 +804,7 @@ for all eternity, if not longer. If you find a bug in lzip, please send electronic mail to @email{lzip-bug@@nongnu.org}. Include the version number, which you can -find by running @w{@samp{lzip --version}}. +find by running @w{@code{lzip --version}}. @node Reference source code @@ -814,7 +813,7 @@ find by running @w{@samp{lzip --version}}. @verbatim /* Lzd - Educational decompressor for lzip files - Copyright (C) 2013, 2014 Antonio Diaz Diaz. + Copyright (C) 2013-2015 Antonio Diaz Diaz. This program is free software: you have unlimited permission to copy, distribute and modify it. @@ -1052,7 +1051,7 @@ class LZ_decoder void flush_data(); - uint8_t get_byte( const unsigned distance ) const + uint8_t peek( const unsigned distance ) const { unsigned i = pos - distance - 1; if( pos <= distance ) i += dictionary_size; @@ -1126,13 +1125,13 @@ bool LZ_decoder::decode_member() // Returns false if error const int pos_state = data_position() & pos_state_mask; if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit { - const uint8_t prev_byte = get_byte( 0 ); + const uint8_t prev_byte = peek( 0 ); const int literal_state = prev_byte >> ( 8 - literal_context_bits ); Bit_model * const bm = bm_literal[literal_state]; if( state.is_char() ) put_byte( rdec.decode_tree( bm, 8 ) ); else - put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) ); + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); state.set_char(); } else @@ -1159,7 +1158,7 @@ bool LZ_decoder::decode_member() // Returns false if error else { if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit - { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } } state.set_rep(); len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); @@ -1183,7 +1182,7 @@ bool LZ_decoder::decode_member() // Returns false if error { rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); - if( rep0 == 0xFFFFFFFFU ) // Marker found + if( rep0 == 0xFFFFFFFFU ) // marker found { flush_data(); return ( len == min_match_len ); // End Of Stream marker @@ -1194,7 +1193,7 @@ bool LZ_decoder::decode_member() // Returns false if error if( rep0 >= dictionary_size || rep0 >= data_position() ) { flush_data(); return false; } } - for( int i = 0; i < len; ++i ) put_byte( get_byte( rep0 ) ); + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); } } flush_data(); @@ -1213,7 +1212,7 @@ int main( const int argc, const char * const argv[] ) "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" - "\nCopyright (C) 2014 Antonio Diaz Diaz.\n" + "\nCopyright (C) 2015 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" @@ -1225,19 +1224,13 @@ int main( const int argc, const char * const argv[] ) { File_header header; for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); - if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 ) + if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) { if( first_member ) { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); return 2; } break; } - if( header[4] != 1 ) - { - std::fprintf( stderr, "Version %d member format not supported.\n", - header[4] ); - return 2; - } unsigned dict_size = 1 << ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) diff --git a/encoder.cc b/encoder.cc index 0d34ad1..4833dd4 100644 --- a/encoder.cc +++ b/encoder.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,112 +26,13 @@ #include #include "lzip.h" +#include "encoder_base.h" #include "encoder.h" -Dis_slots dis_slots; -Prob_prices prob_prices; - - -bool Matchfinder_base::read_block() - { - if( !at_stream_end && stream_pos < buffer_size ) - { - const int size = buffer_size - stream_pos; - const int rd = readblock( infd, buffer + stream_pos, size ); - stream_pos += rd; - if( rd != size && errno ) throw Error( "Read error" ); - if( rd < size ) { at_stream_end = true; pos_limit = buffer_size; } - } - return pos < stream_pos; - } - - -void Matchfinder_base::normalize_pos() - { - if( pos > stream_pos ) - internal_error( "pos > stream_pos in Matchfinder_base::normalize_pos." ); - if( !at_stream_end ) - { - const int offset = pos - dictionary_size_ - before_size; - const int size = stream_pos - offset; - std::memmove( buffer, buffer + offset, size ); - partial_data_pos += offset; - pos -= offset; - stream_pos -= offset; - for( int i = 0; i < num_prev_positions; ++i ) - prev_positions[i] -= std::min( prev_positions[i], offset ); - for( int i = 0; i < pos_array_size; ++i ) - pos_array[i] -= std::min( pos_array[i], offset ); - read_block(); - } - } - - -Matchfinder_base::Matchfinder_base( const int before, const int dict_size, - const int after_size, const int dict_factor, - const int num_prev_positions23, - const int pos_array_factor, const int ifd ) - : - partial_data_pos( 0 ), - before_size( before ), - pos( 0 ), - cyclic_pos( 0 ), - stream_pos( 0 ), - infd( ifd ), - at_stream_end( false ) +int LZ_encoder::get_match_pairs( Pair * pairs ) { - const int buffer_size_limit = - ( dict_factor * dict_size ) + before_size + after_size; - buffer_size = std::max( 65536, dict_size ); - buffer = (uint8_t *)std::malloc( buffer_size ); - if( !buffer ) throw std::bad_alloc(); - if( read_block() && !at_stream_end && buffer_size < buffer_size_limit ) - { - buffer_size = buffer_size_limit; - uint8_t * const tmp = (uint8_t *)std::realloc( buffer, buffer_size ); - if( !tmp ) { std::free( buffer ); throw std::bad_alloc(); } - buffer = tmp; - read_block(); - } - if( at_stream_end && stream_pos < dict_size ) - dictionary_size_ = std::max( (int)min_dictionary_size, stream_pos ); - else - dictionary_size_ = dict_size; - pos_limit = buffer_size; - if( !at_stream_end ) pos_limit -= after_size; - unsigned size = 1 << std::max( 16, real_bits( dictionary_size_ - 1 ) - 2 ); - if( dictionary_size_ > 1 << 26 ) // 64 MiB - size >>= 1; - key4_mask = size - 1; - size += num_prev_positions23; - - num_prev_positions = size; - pos_array_size = pos_array_factor * ( dictionary_size_ + 1 ); - size += pos_array_size; - prev_positions = new( std::nothrow ) int32_t[size]; - if( !prev_positions ) { std::free( buffer ); throw std::bad_alloc(); } - pos_array = prev_positions + num_prev_positions; - for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = 0; - } - - -void Matchfinder_base::reset() - { - if( stream_pos > pos ) - std::memmove( buffer, buffer + pos, stream_pos - pos ); - partial_data_pos = 0; - stream_pos -= pos; - pos = 0; - cyclic_pos = 0; - for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = 0; - read_block(); - } - - -int Matchfinder::get_match_pairs( Pair * pairs ) - { - int len_limit = match_len_limit_; + int len_limit = match_len_limit; if( len_limit > available_bytes() ) { len_limit = available_bytes(); @@ -141,8 +42,8 @@ int Matchfinder::get_match_pairs( Pair * pairs ) int maxlen = 0; int num_pairs = 0; const int pos1 = pos + 1; - const int min_pos = ( pos > dictionary_size_ ) ? pos - dictionary_size_ : 0; - const uint8_t * const data = buffer + pos; + const int min_pos = ( pos > dictionary_size ) ? pos - dictionary_size : 0; + const uint8_t * const data = ptr_to_current_pos(); unsigned tmp = crc32[data[0]] ^ data[1]; const int key2 = tmp & ( num_prev_positions2 - 1 ); @@ -174,7 +75,7 @@ int Matchfinder::get_match_pairs( Pair * pairs ) while( maxlen < len_limit && data[maxlen-delta] == data[maxlen] ) ++maxlen; pairs[num_pairs-1].len = maxlen; - if( maxlen >= len_limit ) pairs = 0; // done. now just skip + if( maxlen >= len_limit ) pairs = 0; /* done. now just skip */ } if( maxlen < 3 ) maxlen = 3; } @@ -195,7 +96,7 @@ int Matchfinder::get_match_pairs( Pair * pairs ) const int delta = pos1 - newpos; int32_t * const newptr = pos_array + ( ( cyclic_pos - delta + - ( ( cyclic_pos >= delta ) ? 0 : dictionary_size_ + 1 ) ) << 1 ); + ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ) << 1 ); if( data[len-delta] == data[len] ) { while( ++len < len_limit && data[len-delta] == data[len] ) {} @@ -231,38 +132,6 @@ int Matchfinder::get_match_pairs( Pair * pairs ) } -void Range_encoder::flush_data() - { - if( pos > 0 ) - { - if( outfd >= 0 && writeblock( outfd, buffer, pos ) != pos ) - throw Error( "Write error" ); - partial_member_pos += pos; - pos = 0; - if( verbosity >= 2 ) show_progress(); - } - } - - - // End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) -void LZ_encoder_base::full_flush( const unsigned long long data_position, - const State state ) - { - const int pos_state = data_position & pos_state_mask; - renc.encode_bit( bm_match[state()][pos_state], 1 ); - renc.encode_bit( bm_rep[state()], 0 ); - encode_pair( 0xFFFFFFFFU, min_match_len, pos_state ); - renc.flush(); - File_trailer trailer; - trailer.data_crc( crc() ); - trailer.data_size( data_position ); - trailer.member_size( renc.member_position() + File_trailer::size() ); - for( int i = 0; i < File_trailer::size(); ++i ) - renc.put_byte( trailer.data[i] ); - renc.flush_data(); - } - - void LZ_encoder::update_distance_prices() { for( int dis = start_dis_model; dis < modeled_distances; ++dis ) @@ -297,7 +166,7 @@ void LZ_encoder::update_distance_prices() } -/* Return value == number of bytes advanced (ahead). +/* Returns the number of bytes advanced (ahead). trials[0]..trials[ahead-1] contain the steps to encode. ( trials[0].dis == -1 ) means literal. A match/rep longer or equal than match_len_limit finishes the sequence. @@ -320,29 +189,29 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], int rep_index = 0; for( int i = 0; i < num_rep_distances; ++i ) { - replens[i] = matchfinder.true_match_len( 0, reps[i] + 1, max_match_len ); + replens[i] = true_match_len( 0, reps[i] + 1, max_match_len ); if( replens[i] > replens[rep_index] ) rep_index = i; } - if( replens[rep_index] >= matchfinder.match_len_limit() ) + if( replens[rep_index] >= match_len_limit ) { trials[0].dis = rep_index; trials[0].price = replens[rep_index]; - move_pos( replens[rep_index] ); + move_and_update( replens[rep_index] ); return replens[rep_index]; } - if( main_len >= matchfinder.match_len_limit() ) + if( main_len >= match_len_limit ) { trials[0].dis = pairs[num_pairs-1].dis + num_rep_distances; trials[0].price = main_len; - move_pos( main_len ); + move_and_update( main_len ); return main_len; } - const int pos_state = matchfinder.data_position() & pos_state_mask; - const uint8_t prev_byte = matchfinder[1]; - const uint8_t cur_byte = matchfinder[0]; - const uint8_t match_byte = matchfinder[reps[0]+1]; + const int pos_state = data_position() & pos_state_mask; + const uint8_t prev_byte = peek( 1 ); + const uint8_t cur_byte = peek( 0 ); + const uint8_t match_byte = peek( reps[0] + 1 ); trials[0].state = state; trials[1].dis = -1; // literal @@ -364,7 +233,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], { trials[0].dis = trials[1].dis; trials[0].price = 1; - matchfinder.move_pos(); + move_pos(); return 1; } @@ -400,10 +269,10 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], } int cur = 0; - while( true ) // price optimization loop + while( true ) /* price optimization loop */ { - matchfinder.move_pos(); - if( ++cur >= num_trials ) // no more initialized trials + move_pos(); + if( ++cur >= num_trials ) /* no more initialized trials */ { backward( cur ); return cur; @@ -411,14 +280,14 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], const int num_pairs = read_match_distances(); const int newlen = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0; - if( newlen >= matchfinder.match_len_limit() ) + if( newlen >= match_len_limit ) { pending_num_pairs = num_pairs; backward( cur ); return cur; } - // give final values to current trial + /* give final values to current trial */ Trial & cur_trial = trials[cur]; State cur_state; { @@ -429,7 +298,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], if( prev_index2 == single_step_trial ) { cur_state = trials[prev_index].state; - if( prev_index + 1 == cur ) // len == 1 + if( prev_index + 1 == cur ) /* len == 1 */ { if( dis == 0 ) cur_state.set_short_rep(); else cur_state.set_char(); // literal @@ -437,14 +306,14 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], else if( dis < num_rep_distances ) cur_state.set_rep(); else cur_state.set_match(); } - else if( prev_index2 == dual_step_trial ) // dis == 0 + else if( prev_index2 == dual_step_trial ) /* dis == 0 */ { --prev_index; cur_state = trials[prev_index].state; cur_state.set_char(); cur_state.set_rep(); } - else // if( prev_index2 >= 0 ) + else /* if( prev_index2 >= 0 ) */ { prev_index = prev_index2; cur_state = trials[prev_index].state; @@ -459,10 +328,10 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], mtf_reps( dis, cur_trial.reps ); } - const int pos_state = matchfinder.data_position() & pos_state_mask; - const uint8_t prev_byte = matchfinder[1]; - const uint8_t cur_byte = matchfinder[0]; - const uint8_t match_byte = matchfinder[cur_trial.reps[0]+1]; + const int pos_state = data_position() & pos_state_mask; + const uint8_t prev_byte = peek( 1 ); + const uint8_t cur_byte = peek( 0 ); + const uint8_t match_byte = peek( cur_trial.reps[0] + 1 ); int next_price = cur_trial.price + price0( bm_match[cur_state()][pos_state] ); @@ -471,7 +340,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], else next_price += price_matched( prev_byte, cur_byte, match_byte ); - // try last updates to next trial + /* try last updates to next trial */ Trial & next_trial = trials[cur+1]; next_trial.update( next_price, -1, cur ); // literal @@ -482,8 +351,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], if( match_byte == cur_byte && next_trial.dis != 0 && next_trial.prev_index2 == single_step_trial ) { - const int price = rep_match_price + - price_shortrep( cur_state, pos_state ); + const int price = rep_match_price + price_shortrep( cur_state, pos_state ); if( price <= next_trial.price ) { next_trial.price = price; @@ -492,20 +360,18 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], } } - const int available_bytes = std::min( matchfinder.available_bytes(), - max_num_trials - 1 - cur ); - if( available_bytes < min_match_len ) continue; + const int triable_bytes = + std::min( available_bytes(), max_num_trials - 1 - cur ); + if( triable_bytes < min_match_len ) continue; - const int len_limit = std::min( matchfinder.match_len_limit(), - available_bytes ); + const int len_limit = std::min( match_len_limit, triable_bytes ); - // try literal + rep0 + /* try literal + rep0 */ if( match_byte != cur_byte && next_trial.prev_index != cur ) { - const uint8_t * const data = matchfinder.ptr_to_current_pos(); + const uint8_t * const data = ptr_to_current_pos(); const int dis = cur_trial.reps[0] + 1; - const int limit = std::min( matchfinder.match_len_limit() + 1, - available_bytes ); + const int limit = std::min( match_len_limit + 1, triable_bytes ); int len = 1; while( len < limit && data[len-dis] == data[len] ) ++len; if( --len >= min_match_len ) @@ -524,10 +390,10 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], int start_len = min_match_len; - // try rep distances + /* try rep distances */ for( int rep = 0; rep < num_rep_distances; ++rep ) { - const uint8_t * const data = matchfinder.ptr_to_current_pos(); + const uint8_t * const data = ptr_to_current_pos(); int len; const int dis = cur_trial.reps[rep] + 1; @@ -541,12 +407,11 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[cur+i].update( price + rep_len_prices.price( i, pos_state ), rep, cur ); - if( rep == 0 ) start_len = len + 1; // discard shorter matches + if( rep == 0 ) start_len = len + 1; /* discard shorter matches */ - // try rep + literal + rep0 + /* try rep + literal + rep0 */ int len2 = len + 1; - const int limit = std::min( matchfinder.match_len_limit() + len2, - available_bytes ); + const int limit = std::min( match_len_limit + len2, triable_bytes ); while( len2 < limit && data[len2-dis] == data[len2] ) ++len2; len2 -= len + 1; if( len2 < min_match_len ) continue; @@ -566,7 +431,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[cur+len+1+len2].update3( price, rep, cur + len + 1, cur ); } - // try matches + /* try matches */ if( newlen >= start_len && newlen <= len_limit ) { const int normal_match_price = match_price + @@ -584,14 +449,13 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[cur+len].update( price, dis + num_rep_distances, cur ); - // try match + literal + rep0 + /* try match + literal + rep0 */ if( len == pairs[i].len ) { - const uint8_t * const data = matchfinder.ptr_to_current_pos(); + const uint8_t * const data = ptr_to_current_pos(); const int dis2 = dis + 1; int len2 = len + 1; - const int limit = std::min( matchfinder.match_len_limit() + len2, - available_bytes ); + const int limit = std::min( match_len_limit + len2, triable_bytes ); while( len2 < limit && data[len2-dis2] == data[len2] ) ++len2; len2 -= len + 1; if( len2 >= min_match_len ) @@ -624,10 +488,10 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) { const unsigned long long member_size_limit = member_size - File_trailer::size() - max_marker_size; - const bool best = ( matchfinder.match_len_limit() > 12 ); + const bool best = ( match_len_limit > 12 ); const int dis_price_count = best ? 1 : 512; const int align_price_count = best ? 1 : dis_align_size; - const int price_count = ( matchfinder.match_len_limit() > 36 ) ? 1013 : 4093; + const int price_count = ( match_len_limit > 36 ) ? 1013 : 4093; int price_counter = 0; int dis_price_counter = 0; int align_price_counter = 0; @@ -635,26 +499,25 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) State state; for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0; - if( matchfinder.data_position() != 0 || - renc.member_position() != File_header::size ) - return false; // can be called only once + if( data_position() != 0 || renc.member_position() != File_header::size ) + return false; /* can be called only once */ - if( !matchfinder.finished() ) // encode first byte + if( !data_finished() ) // encode first byte { const uint8_t prev_byte = 0; - const uint8_t cur_byte = matchfinder[0]; + const uint8_t cur_byte = peek( 0 ); renc.encode_bit( bm_match[state()][0], 0 ); encode_literal( prev_byte, cur_byte ); crc32.update_byte( crc_, cur_byte ); - matchfinder.get_match_pairs(); - matchfinder.move_pos(); + get_match_pairs(); + move_pos(); } - while( !matchfinder.finished() ) + while( !data_finished() ) { if( price_counter <= 0 && pending_num_pairs == 0 ) { - price_counter = price_count; // recalculate prices every these bytes + price_counter = price_count; /* recalculate prices every these bytes */ if( dis_price_counter <= 0 ) { dis_price_counter = dis_price_count; update_distance_prices(); } if( align_price_counter <= 0 ) @@ -668,39 +531,38 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) } int ahead = sequence_optimizer( reps, state ); - if( ahead <= 0 ) return false; // can't happen + if( ahead <= 0 ) return false; /* can't happen */ price_counter -= ahead; for( int i = 0; ahead > 0; ) { - const int pos_state = - ( matchfinder.data_position() - ahead ) & pos_state_mask; + const int pos_state = ( data_position() - ahead ) & pos_state_mask; const int dis = trials[i].dis; const int len = trials[i].price; bool bit = ( dis < 0 ); renc.encode_bit( bm_match[state()][pos_state], !bit ); - if( bit ) // literal byte + if( bit ) /* literal byte */ { - const uint8_t prev_byte = matchfinder[ahead+1]; - const uint8_t cur_byte = matchfinder[ahead]; + const uint8_t prev_byte = peek( ahead + 1 ); + const uint8_t cur_byte = peek( ahead ); crc32.update_byte( crc_, cur_byte ); if( state.is_char() ) encode_literal( prev_byte, cur_byte ); else { - const uint8_t match_byte = matchfinder[ahead+reps[0]+1]; + const uint8_t match_byte = peek( ahead + reps[0] + 1 ); encode_matched( prev_byte, cur_byte, match_byte ); } state.set_char(); } - else // match or repeated match + else /* match or repeated match */ { - crc32.update_buf( crc_, matchfinder.ptr_to_current_pos() - ahead, len ); + crc32.update_buf( crc_, ptr_to_current_pos() - ahead, len ); mtf_reps( dis, reps ); bit = ( dis < num_rep_distances ); renc.encode_bit( bm_rep[state()], bit ); - if( bit ) // repeated match + if( bit ) /* repeated match */ { bit = ( dis == 0 ); renc.encode_bit( bm_rep0[state()], !bit ); @@ -720,7 +582,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) state.set_rep(); } } - else // match + else /* match */ { encode_pair( dis - num_rep_distances, len, pos_state ); if( get_slot( dis - num_rep_distances ) >= end_dis_model ) @@ -733,12 +595,12 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) ahead -= len; i += len; if( renc.member_position() >= member_size_limit ) { - if( !matchfinder.dec_pos( ahead ) ) return false; - full_flush( matchfinder.data_position(), state ); + if( !dec_pos( ahead ) ) return false; + full_flush( state ); return true; } } } - full_flush( matchfinder.data_position(), state ); + full_flush( state ); return true; } diff --git a/encoder.h b/encoder.h index 6f50da7..81cc1e0 100644 --- a/encoder.h +++ b/encoder.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,429 +15,6 @@ along with this program. If not, see . */ -enum { max_num_trials = 1 << 13, - price_shift_bits = 6, - price_step_bits = 2, - price_step = 1 << price_step_bits }; - -class Dis_slots - { - uint8_t data[1<<10]; - -public: - void init() - { - for( int slot = 0; slot < 4; ++slot ) data[slot] = slot; - for( int i = 4, size = 2, slot = 4; slot < 20; slot += 2 ) - { - std::memset( &data[i], slot, size ); - std::memset( &data[i+size], slot + 1, size ); - size <<= 1; - i += size; - } - } - - uint8_t operator[]( const int dis ) const { return data[dis]; } - }; - -extern Dis_slots dis_slots; - -inline uint8_t get_slot( const unsigned dis ) - { - if( dis < (1 << 10) ) return dis_slots[dis]; - if( dis < (1 << 19) ) return dis_slots[dis>> 9] + 18; - if( dis < (1 << 28) ) return dis_slots[dis>>18] + 36; - return dis_slots[dis>>27] + 54; - } - - -class Prob_prices - { - short data[bit_model_total >> price_step_bits]; - -public: - void init() - { - for( int i = 0; i < bit_model_total >> price_step_bits; ++i ) - { - unsigned val = ( i * price_step ) + ( price_step / 2 ); - int bits = 0; // base 2 logarithm of val - for( int j = 0; j < price_shift_bits; ++j ) - { - val = val * val; - bits <<= 1; - while( val >= 1 << 16 ) { val >>= 1; ++bits; } - } - bits += 15; // remaining bits in val - data[i] = ( bit_model_total_bits << price_shift_bits ) - bits; - } - } - - int operator[]( const int probability ) const - { return data[probability >> price_step_bits]; } - }; - -extern Prob_prices prob_prices; - - -inline int price0( const Bit_model bm ) - { return prob_prices[bm.probability]; } - -inline int price1( const Bit_model bm ) - { return prob_prices[bit_model_total - bm.probability]; } - -inline int price_bit( const Bit_model bm, const int bit ) - { if( bit ) return price1( bm ); else return price0( bm ); } - - -inline int price_symbol3( const Bit_model bm[], int symbol ) - { - int bit = symbol & 1; - symbol |= 8; symbol >>= 1; - int price = price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - return price + price_bit( bm[1], symbol & 1 ); - } - - -inline int price_symbol6( const Bit_model bm[], int symbol ) - { - int bit = symbol & 1; - symbol |= 64; symbol >>= 1; - int price = price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - return price + price_bit( bm[1], symbol & 1 ); - } - - -inline int price_symbol8( const Bit_model bm[], int symbol ) - { - int bit = symbol & 1; - symbol |= 0x100; symbol >>= 1; - int price = price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); - return price + price_bit( bm[1], symbol & 1 ); - } - - -inline int price_symbol_reversed( const Bit_model bm[], int symbol, - const int num_bits ) - { - int price = 0; - int model = 1; - for( int i = num_bits; i > 0; --i ) - { - const int bit = symbol & 1; - price += price_bit( bm[model], bit ); - model = ( model << 1 ) | bit; - symbol >>= 1; - } - return price; - } - - -inline int price_matched( const Bit_model bm[], int symbol, int match_byte ) - { - int price = 0; - int mask = 0x100; - symbol |= mask; - - do { - match_byte <<= 1; - const int match_bit = match_byte & mask; - symbol <<= 1; - const int bit = symbol & 0x100; - price += price_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); // if( match_bit != bit ) mask = 0; - } - while( symbol < 0x10000 ); - return price; - } - - -class Matchfinder_base - { - bool read_block(); - void normalize_pos(); - - Matchfinder_base( const Matchfinder_base & ); // declared as private - void operator=( const Matchfinder_base & ); // declared as private - -protected: - unsigned long long partial_data_pos; - uint8_t * buffer; // input buffer - int32_t * prev_positions; // 1 + last seen position of key. else 0 - int32_t * pos_array; // may be tree or chain - const int before_size; // bytes to keep in buffer before dictionary - int buffer_size; - int dictionary_size_; // bytes to keep in buffer before pos - int pos; // current pos in buffer - int cyclic_pos; // cycles through [0, dictionary_size] - int stream_pos; // first byte not yet read from file - int pos_limit; // when reached, a new block must be read - int key4_mask; - int num_prev_positions; // size of prev_positions - int pos_array_size; - const int infd; // input file descriptor - bool at_stream_end; // stream_pos shows real end of file - - Matchfinder_base( const int before, const int dict_size, - const int after_size, const int dict_factor, - const int num_prev_positions23, - const int pos_array_factor, const int ifd ); - - ~Matchfinder_base() - { delete[] prev_positions; std::free( buffer ); } - -public: - uint8_t operator[]( const int distance ) const - { return buffer[pos-distance]; } - int available_bytes() const { return stream_pos - pos; } - unsigned long long data_position() const { return partial_data_pos + pos; } - int dictionary_size() const { return dictionary_size_; } - bool finished() const { return at_stream_end && pos >= stream_pos; } - const uint8_t * ptr_to_current_pos() const { return buffer + pos; } - - int true_match_len( const int index, const int distance, int len_limit ) const - { - if( index + len_limit > available_bytes() ) - len_limit = available_bytes() - index; - const uint8_t * const data = buffer + pos + index; - int i = 0; - while( i < len_limit && data[i-distance] == data[i] ) ++i; - return i; - } - - void move_pos() - { - if( ++cyclic_pos > dictionary_size_ ) cyclic_pos = 0; - if( ++pos >= pos_limit ) normalize_pos(); - } - - void reset(); - }; - - -struct Pair /* distance-length pair */ - { - int dis; - int len; - }; - - -class Matchfinder : public Matchfinder_base - { - enum { before_size = max_num_trials + 1, - // bytes to keep in buffer after pos - after_size = ( 2 * max_match_len ) + 1, - dict_factor = 2, - num_prev_positions3 = 1 << 16, - num_prev_positions2 = 1 << 10, - num_prev_positions23 = num_prev_positions2 + num_prev_positions3, - pos_array_factor = 2 }; - - const int cycles; - const int match_len_limit_; - -public: - Matchfinder( const int dict_size, const int len_limit, const int ifd ) - : - Matchfinder_base( before_size, dict_size, after_size, dict_factor, - num_prev_positions23, pos_array_factor, ifd ), - cycles( ( len_limit < max_match_len ) ? 16 + ( len_limit / 2 ) : 256 ), - match_len_limit_( len_limit ) - {} - - bool dec_pos( const int ahead ) - { - if( ahead < 0 || pos < ahead ) return false; - pos -= ahead; - cyclic_pos -= ahead; - if( cyclic_pos < 0 ) cyclic_pos += dictionary_size_ + 1; - return true; - } - - int match_len_limit() const { return match_len_limit_; } - int get_match_pairs( Pair * pairs = 0 ); - }; - - -class Range_encoder - { - enum { buffer_size = 65536 }; - uint64_t low; - unsigned long long partial_member_pos; - uint8_t * const buffer; // output buffer - int pos; // current pos in buffer - uint32_t range; - unsigned ff_count; - const int outfd; // output file descriptor - uint8_t cache; - - void shift_low() - { - const bool carry = ( low > 0xFFFFFFFFU ); - if( carry || low < 0xFF000000U ) - { - put_byte( cache + carry ); - for( ; ff_count > 0; --ff_count ) put_byte( 0xFF + carry ); - cache = low >> 24; - } - else ++ff_count; - low = ( low & 0x00FFFFFFU ) << 8; - } - - Range_encoder( const Range_encoder & ); // declared as private - void operator=( const Range_encoder & ); // declared as private - -public: - explicit Range_encoder( const int ofd ) - : - low( 0 ), - partial_member_pos( 0 ), - buffer( new uint8_t[buffer_size] ), - pos( 0 ), - range( 0xFFFFFFFFU ), - ff_count( 0 ), - outfd( ofd ), - cache( 0 ) - {} - - ~Range_encoder() { delete[] buffer; } - - unsigned long long member_position() const - { return partial_member_pos + pos + ff_count; } - - void flush() { for( int i = 0; i < 5; ++i ) shift_low(); } - void flush_data(); - - void put_byte( const uint8_t b ) - { - buffer[pos] = b; - if( ++pos >= buffer_size ) flush_data(); - } - - void encode( const int symbol, const int num_bits ) - { - for( int i = num_bits - 1; i >= 0; --i ) - { - range >>= 1; - if( (symbol >> i) & 1 ) low += range; - if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); } - } - } - - void encode_bit( Bit_model & bm, const int bit ) - { - const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; - if( !bit ) - { - range = bound; - bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; - } - else - { - low += bound; - range -= bound; - bm.probability -= bm.probability >> bit_model_move_bits; - } - if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); } - } - - void encode_tree3( Bit_model bm[], const int symbol ) - { - int model = 1; - int bit = ( symbol >> 2 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - bit = ( symbol >> 1 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - encode_bit( bm[model], symbol & 1 ); - } - - void encode_tree6( Bit_model bm[], const int symbol ) - { - int model = 1; - int bit = ( symbol >> 5 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - bit = ( symbol >> 4 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - bit = ( symbol >> 3 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - bit = ( symbol >> 2 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - bit = ( symbol >> 1 ) & 1; - encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; - encode_bit( bm[model], symbol & 1 ); - } - - void encode_tree8( Bit_model bm[], const int symbol ) - { - int model = 1; - int mask = ( 1 << 7 ); - do { - const int bit = ( symbol & mask ); - encode_bit( bm[model], bit ); - model <<= 1; if( bit ) ++model; - } - while( mask >>= 1 ); - } - - void encode_tree_reversed( Bit_model bm[], int symbol, const int num_bits ) - { - int model = 1; - for( int i = num_bits; i > 0; --i ) - { - const int bit = symbol & 1; - encode_bit( bm[model], bit ); - model = ( model << 1 ) | bit; - symbol >>= 1; - } - } - - void encode_matched( Bit_model bm[], int symbol, int match_byte ) - { - int mask = 0x100; - symbol |= mask; - - do { - match_byte <<= 1; - const int match_bit = match_byte & mask; - symbol <<= 1; - const int bit = symbol & 0x100; - encode_bit( bm[match_bit+(symbol>>9)+mask], bit ); - mask &= ~(match_byte ^ symbol); // if( match_bit != bit ) mask = 0; - } - while( symbol < 0x10000 ); - } - - void encode_len( Len_model & lm, int symbol, const int pos_state ) - { - symbol -= min_match_len; - bool bit = ( symbol >= len_low_symbols ); - encode_bit( lm.choice1, bit ); - if( !bit ) - encode_tree3( lm.bm_low[pos_state], symbol ); - else - { - bit = ( symbol >= len_low_symbols + len_mid_symbols ); - encode_bit( lm.choice2, bit ); - if( !bit ) - encode_tree3( lm.bm_mid[pos_state], symbol - len_low_symbols ); - else - encode_tree8( lm.bm_high, symbol - len_low_symbols - len_mid_symbols ); - } - } - }; - - class Len_prices { const Len_model & lm; @@ -471,14 +48,14 @@ class Len_prices } public: + void reset() { for( int i = 0; i < pos_states; ++i ) counters[i] = 0; } + Len_prices( const Len_model & m, const int match_len_limit ) : lm( m ), len_symbols( match_len_limit + 1 - min_match_len ), count( ( match_len_limit > 12 ) ? 1 : len_symbols ) - { - for( int i = 0; i < pos_states; ++i ) counters[i] = 0; - } + { reset(); } void decrement_counter( const int pos_state ) { --counters[pos_state]; } @@ -497,101 +74,28 @@ public: }; -class LZ_encoder_base +class LZ_encoder : public LZ_encoder_base { -protected: - enum { max_marker_size = 16, - num_rep_distances = 4 }; // must be 4 - - uint32_t crc_; - - Bit_model bm_literal[1<= start_dis_model ) - { - const int direct_bits = ( dis_slot >> 1 ) - 1; - const unsigned base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; - const unsigned direct_dis = dis - base; - - if( dis_slot < end_dis_model ) - renc.encode_tree_reversed( bm_dis + base - dis_slot - 1, direct_dis, - direct_bits ); - else - { - renc.encode( direct_dis >> dis_align_bits, direct_bits - dis_align_bits ); - renc.encode_tree_reversed( bm_align, direct_dis, dis_align_bits ); - } - } - } - - void full_flush( const unsigned long long data_position, const State state ); - -public: - unsigned long long member_position() const { return renc.member_position(); } - }; - + int dis; + int len; + }; -class LZ_encoder : public LZ_encoder_base - { enum { infinite_price = 0x0FFFFFFF, + max_num_trials = 1 << 13, single_step_trial = -2, dual_step_trial = -1 }; struct Trial { State state; - int price; // dual use var; cumulative price, match length - int dis; // rep index or match distance. (-1 for literal) - int prev_index; // index of prev trial in trials[] - int prev_index2; // -2 trial is single step - // -1 literal + rep0 - // >= 0 ( rep or match ) + literal + rep0 + int price; /* dual use var; cumulative price, match length */ + int dis; /* rep index or match distance. (-1 for literal) */ + int prev_index; /* index of prev trial in trials[] */ + int prev_index2; /* -2 trial is single step */ + /* -1 literal + rep0 */ + /* >= 0 ( rep or match ) + literal + rep0 */ int reps[num_rep_distances]; void update( const int pr, const int distance, const int p_i ) @@ -616,7 +120,8 @@ class LZ_encoder : public LZ_encoder_base } }; - Matchfinder & matchfinder; + const int cycles; + const int match_len_limit; Len_prices match_len_prices; Len_prices rep_len_prices; int pending_num_pairs; @@ -628,9 +133,19 @@ class LZ_encoder : public LZ_encoder_base int align_prices[dis_align_size]; const int num_dis_slots; + bool dec_pos( const int ahead ) + { + if( ahead < 0 || pos < ahead ) return false; + pos -= ahead; + cyclic_pos -= ahead; + if( cyclic_pos < 0 ) cyclic_pos += dictionary_size + 1; + return true; + } + + int get_match_pairs( Pair * pairs = 0 ); void update_distance_prices(); - // move-to-front dis in/into reps if( dis > 0 ) + /* move-to-front dis in/into reps if( dis > 0 ) */ static void mtf_reps( const int dis, int reps[num_rep_distances] ) { if( dis >= num_rep_distances ) @@ -685,27 +200,27 @@ class LZ_encoder : public LZ_encoder_base int read_match_distances() { - const int num_pairs = matchfinder.get_match_pairs( pairs ); + const int num_pairs = get_match_pairs( pairs ); if( num_pairs > 0 ) { int len = pairs[num_pairs-1].len; - if( len == matchfinder.match_len_limit() && len < max_match_len ) + if( len == match_len_limit && len < max_match_len ) { - len += matchfinder.true_match_len( len, pairs[num_pairs-1].dis + 1, - max_match_len - len ); + len += true_match_len( len, pairs[num_pairs-1].dis + 1, + max_match_len - len ); pairs[num_pairs-1].len = len; } } return num_pairs; } - void move_pos( int n ) + void move_and_update( int n ) { while( true ) { - matchfinder.move_pos(); + move_pos(); if( --n <= 0 ) break; - matchfinder.get_match_pairs(); + get_match_pairs(); } } @@ -739,16 +254,36 @@ class LZ_encoder : public LZ_encoder_base int sequence_optimizer( const int reps[num_rep_distances], const State state ); + enum { before = max_num_trials + 1, + /* bytes to keep in buffer after pos */ + after_size = ( 2 * max_match_len ) + 1, + dict_factor = 2, + num_prev_positions3 = 1 << 16, + num_prev_positions2 = 1 << 10, + num_prev_positions23 = num_prev_positions2 + num_prev_positions3, + pos_array_factor = 2 }; + public: - LZ_encoder( Matchfinder & mf, const File_header & header, const int outfd ) + LZ_encoder( const int dict_size, const int len_limit, + const int ifd, const int outfd ) : - LZ_encoder_base( header, outfd ), - matchfinder( mf ), - match_len_prices( match_len_model, mf.match_len_limit() ), - rep_len_prices( rep_len_model, mf.match_len_limit() ), + LZ_encoder_base( before, dict_size, after_size, dict_factor, + num_prev_positions23, pos_array_factor, ifd, outfd ), + cycles( ( len_limit < max_match_len ) ? 16 + ( len_limit / 2 ) : 256 ), + match_len_limit( len_limit ), + match_len_prices( match_len_model, match_len_limit ), + rep_len_prices( rep_len_model, match_len_limit ), pending_num_pairs( 0 ), - num_dis_slots( 2 * real_bits( mf.dictionary_size() - 1 ) ) + num_dis_slots( 2 * real_bits( dictionary_size - 1 ) ) {} + void reset() + { + LZ_encoder_base::reset(); + match_len_prices.reset(); + rep_len_prices.reset(); + pending_num_pairs = 0; + } + bool encode_member( const unsigned long long member_size ); }; diff --git a/encoder_base.cc b/encoder_base.cc new file mode 100644 index 0000000..982f12c --- /dev/null +++ b/encoder_base.cc @@ -0,0 +1,180 @@ +/* Lzip - LZMA lossless data compressor + Copyright (C) 2008-2015 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include +#include + +#include "lzip.h" +#include "encoder_base.h" + + +Dis_slots dis_slots; +Prob_prices prob_prices; + + +bool Matchfinder_base::read_block() + { + if( !at_stream_end && stream_pos < buffer_size ) + { + const int size = buffer_size - stream_pos; + const int rd = readblock( infd, buffer + stream_pos, size ); + stream_pos += rd; + if( rd != size && errno ) throw Error( "Read error" ); + if( rd < size ) { at_stream_end = true; pos_limit = buffer_size; } + } + return pos < stream_pos; + } + + +void Matchfinder_base::normalize_pos() + { + if( pos > stream_pos ) + internal_error( "pos > stream_pos in Matchfinder_base::normalize_pos." ); + if( !at_stream_end ) + { + const int offset = pos - dictionary_size - before_size; + const int size = stream_pos - offset; + std::memmove( buffer, buffer + offset, size ); + partial_data_pos += offset; + pos -= offset; + stream_pos -= offset; + for( int i = 0; i < num_prev_positions; ++i ) + prev_positions[i] -= std::min( prev_positions[i], offset ); + for( int i = 0; i < pos_array_size; ++i ) + pos_array[i] -= std::min( pos_array[i], offset ); + read_block(); + } + } + + +Matchfinder_base::Matchfinder_base( const int before, const int dict_size, + const int after_size, const int dict_factor, + const int num_prev_positions23, + const int pos_array_factor, const int ifd ) + : + partial_data_pos( 0 ), + before_size( before ), + pos( 0 ), + cyclic_pos( 0 ), + stream_pos( 0 ), + infd( ifd ), + at_stream_end( false ) + { + const int buffer_size_limit = + ( dict_factor * dict_size ) + before_size + after_size; + buffer_size = std::max( 65536, dict_size ); + buffer = (uint8_t *)std::malloc( buffer_size ); + if( !buffer ) throw std::bad_alloc(); + if( read_block() && !at_stream_end && buffer_size < buffer_size_limit ) + { + buffer_size = buffer_size_limit; + uint8_t * const tmp = (uint8_t *)std::realloc( buffer, buffer_size ); + if( !tmp ) { std::free( buffer ); throw std::bad_alloc(); } + buffer = tmp; + read_block(); + } + if( at_stream_end && stream_pos < dict_size ) + dictionary_size = std::max( (int)min_dictionary_size, stream_pos ); + else + dictionary_size = dict_size; + pos_limit = buffer_size; + if( !at_stream_end ) pos_limit -= after_size; + unsigned size = 1 << std::max( 16, real_bits( dictionary_size - 1 ) - 2 ); + if( dictionary_size > 1 << 26 ) // 64 MiB + size >>= 1; + key4_mask = size - 1; + size += num_prev_positions23; + + num_prev_positions = size; + pos_array_size = pos_array_factor * ( dictionary_size + 1 ); + size += pos_array_size; + prev_positions = new( std::nothrow ) int32_t[size]; + if( !prev_positions ) { std::free( buffer ); throw std::bad_alloc(); } + pos_array = prev_positions + num_prev_positions; + for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = 0; + } + + +void Matchfinder_base::reset() + { + if( stream_pos > pos ) + std::memmove( buffer, buffer + pos, stream_pos - pos ); + partial_data_pos = 0; + stream_pos -= pos; + pos = 0; + cyclic_pos = 0; + for( int i = 0; i < num_prev_positions; ++i ) prev_positions[i] = 0; + read_block(); + } + + +void Range_encoder::flush_data() + { + if( pos > 0 ) + { + if( outfd >= 0 && writeblock( outfd, buffer, pos ) != pos ) + throw Error( "Write error" ); + partial_member_pos += pos; + pos = 0; + show_progress(); + } + } + + + /* End Of Stream mark => (dis == 0xFFFFFFFFU, len == min_match_len) */ +void LZ_encoder_base::full_flush( const State state ) + { + const int pos_state = data_position() & pos_state_mask; + renc.encode_bit( bm_match[state()][pos_state], 1 ); + renc.encode_bit( bm_rep[state()], 0 ); + encode_pair( 0xFFFFFFFFU, min_match_len, pos_state ); + renc.flush(); + File_trailer trailer; + trailer.data_crc( crc() ); + trailer.data_size( data_position() ); + trailer.member_size( renc.member_position() + File_trailer::size() ); + for( int i = 0; i < File_trailer::size(); ++i ) + renc.put_byte( trailer.data[i] ); + renc.flush_data(); + } + + +void LZ_encoder_base::reset() + { + Matchfinder_base::reset(); + crc_ = 0xFFFFFFFFU; + bm_literal[0][0].reset( ( 1 << literal_context_bits ) * 0x300 ); + bm_match[0][0].reset( State::states * pos_states ); + bm_rep[0].reset( State::states ); + bm_rep0[0].reset( State::states ); + bm_rep1[0].reset( State::states ); + bm_rep2[0].reset( State::states ); + bm_len[0][0].reset( State::states * pos_states ); + bm_dis_slot[0][0].reset( len_states * (1 << dis_slot_bits ) ); + bm_dis[0].reset( modeled_distances - end_dis_model ); + bm_align[0].reset( dis_align_size ); + match_len_model.reset(); + rep_len_model.reset(); + renc.reset(); + } diff --git a/encoder_base.h b/encoder_base.h new file mode 100644 index 0000000..27c7a90 --- /dev/null +++ b/encoder_base.h @@ -0,0 +1,487 @@ +/* Lzip - LZMA lossless data compressor + Copyright (C) 2008-2015 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +enum { price_shift_bits = 6, + price_step_bits = 2, + price_step = 1 << price_step_bits }; + +class Dis_slots + { + uint8_t data[1<<10]; + +public: + void init() + { + for( int slot = 0; slot < 4; ++slot ) data[slot] = slot; + for( int i = 4, size = 2, slot = 4; slot < 20; slot += 2 ) + { + std::memset( &data[i], slot, size ); + std::memset( &data[i+size], slot + 1, size ); + size <<= 1; + i += size; + } + } + + uint8_t operator[]( const int dis ) const { return data[dis]; } + }; + +extern Dis_slots dis_slots; + +inline uint8_t get_slot( const unsigned dis ) + { + if( dis < (1 << 10) ) return dis_slots[dis]; + if( dis < (1 << 19) ) return dis_slots[dis>> 9] + 18; + if( dis < (1 << 28) ) return dis_slots[dis>>18] + 36; + return dis_slots[dis>>27] + 54; + } + + +class Prob_prices + { + short data[bit_model_total >> price_step_bits]; + +public: + void init() + { + for( int i = 0; i < bit_model_total >> price_step_bits; ++i ) + { + unsigned val = ( i * price_step ) + ( price_step / 2 ); + int bits = 0; /* base 2 logarithm of val */ + for( int j = 0; j < price_shift_bits; ++j ) + { + val = val * val; + bits <<= 1; + while( val >= 1 << 16 ) { val >>= 1; ++bits; } + } + bits += 15; /* remaining bits in val */ + data[i] = ( bit_model_total_bits << price_shift_bits ) - bits; + } + } + + int operator[]( const int probability ) const + { return data[probability >> price_step_bits]; } + }; + +extern Prob_prices prob_prices; + + +inline int price0( const Bit_model bm ) + { return prob_prices[bm.probability]; } + +inline int price1( const Bit_model bm ) + { return prob_prices[bit_model_total - bm.probability]; } + +inline int price_bit( const Bit_model bm, const int bit ) + { if( bit ) return price1( bm ); else return price0( bm ); } + + +inline int price_symbol3( const Bit_model bm[], int symbol ) + { + int bit = symbol & 1; + symbol |= 8; symbol >>= 1; + int price = price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + return price + price_bit( bm[1], symbol & 1 ); + } + + +inline int price_symbol6( const Bit_model bm[], int symbol ) + { + int bit = symbol & 1; + symbol |= 64; symbol >>= 1; + int price = price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + return price + price_bit( bm[1], symbol & 1 ); + } + + +inline int price_symbol8( const Bit_model bm[], int symbol ) + { + int bit = symbol & 1; + symbol |= 0x100; symbol >>= 1; + int price = price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit ); + return price + price_bit( bm[1], symbol & 1 ); + } + + +inline int price_symbol_reversed( const Bit_model bm[], int symbol, + const int num_bits ) + { + int price = 0; + int model = 1; + for( int i = num_bits; i > 0; --i ) + { + const int bit = symbol & 1; + price += price_bit( bm[model], bit ); + model = ( model << 1 ) | bit; + symbol >>= 1; + } + return price; + } + + +inline int price_matched( const Bit_model bm[], int symbol, int match_byte ) + { + int price = 0; + int mask = 0x100; + symbol |= mask; + + do { + match_byte <<= 1; + const int match_bit = match_byte & mask; + symbol <<= 1; + const int bit = symbol & 0x100; + price += price_bit( bm[match_bit+(symbol>>9)+mask], bit ); + mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */ + } + while( symbol < 0x10000 ); + return price; + } + + +class Matchfinder_base + { + bool read_block(); + void normalize_pos(); + + Matchfinder_base( const Matchfinder_base & ); // declared as private + void operator=( const Matchfinder_base & ); // declared as private + +protected: + unsigned long long partial_data_pos; + uint8_t * buffer; /* input buffer */ + int32_t * prev_positions; /* 1 + last seen position of key. else 0 */ + int32_t * pos_array; /* may be tree or chain */ + const int before_size; /* bytes to keep in buffer before dictionary */ + int buffer_size; + int dictionary_size; /* bytes to keep in buffer before pos */ + int pos; /* current pos in buffer */ + int cyclic_pos; /* cycles through [0, dictionary_size] */ + int stream_pos; /* first byte not yet read from file */ + int pos_limit; /* when reached, a new block must be read */ + int key4_mask; + int num_prev_positions; /* size of prev_positions */ + int pos_array_size; + const int infd; /* input file descriptor */ + bool at_stream_end; /* stream_pos shows real end of file */ + + Matchfinder_base( const int before, const int dict_size, + const int after_size, const int dict_factor, + const int num_prev_positions23, + const int pos_array_factor, const int ifd ); + + ~Matchfinder_base() + { delete[] prev_positions; std::free( buffer ); } + +public: + uint8_t peek( const int distance ) const { return buffer[pos-distance]; } + int available_bytes() const { return stream_pos - pos; } + unsigned long long data_position() const { return partial_data_pos + pos; } + bool data_finished() const { return at_stream_end && pos >= stream_pos; } + const uint8_t * ptr_to_current_pos() const { return buffer + pos; } + + int true_match_len( const int index, const int distance, int len_limit ) const + { + const uint8_t * const data = buffer + pos + index; + int i = 0; + if( index + len_limit > available_bytes() ) + len_limit = available_bytes() - index; + while( i < len_limit && data[i-distance] == data[i] ) ++i; + return i; + } + + void move_pos() + { + if( ++cyclic_pos > dictionary_size ) cyclic_pos = 0; + if( ++pos >= pos_limit ) normalize_pos(); + } + + void reset(); + }; + + +class Range_encoder + { + enum { buffer_size = 65536 }; + uint64_t low; + unsigned long long partial_member_pos; + uint8_t * const buffer; /* output buffer */ + int pos; /* current pos in buffer */ + uint32_t range; + unsigned ff_count; + const int outfd; /* output file descriptor */ + uint8_t cache; + File_header header; + + void shift_low() + { + const bool carry = ( low > 0xFFFFFFFFU ); + if( carry || low < 0xFF000000U ) + { + put_byte( cache + carry ); + for( ; ff_count > 0; --ff_count ) put_byte( 0xFF + carry ); + cache = low >> 24; + } + else ++ff_count; + low = ( low & 0x00FFFFFFU ) << 8; + } + + Range_encoder( const Range_encoder & ); // declared as private + void operator=( const Range_encoder & ); // declared as private + +public: + void reset() + { + low = 0; + partial_member_pos = 0; + pos = 0; + range = 0xFFFFFFFFU; + ff_count = 0; + cache = 0; + for( int i = 0; i < File_header::size; ++i ) + put_byte( header.data[i] ); + } + + Range_encoder( const unsigned dictionary_size, const int ofd ) + : + buffer( new uint8_t[buffer_size] ), + outfd( ofd ) + { + header.set_magic(); + header.dictionary_size( dictionary_size ); + reset(); + } + + ~Range_encoder() { delete[] buffer; } + + unsigned long long member_position() const + { return partial_member_pos + pos + ff_count; } + + void flush() { for( int i = 0; i < 5; ++i ) shift_low(); } + void flush_data(); + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= buffer_size ) flush_data(); + } + + void encode( const int symbol, const int num_bits ) + { + for( int i = num_bits - 1; i >= 0; --i ) + { + range >>= 1; + if( (symbol >> i) & 1 ) low += range; + if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); } + } + } + + void encode_bit( Bit_model & bm, const int bit ) + { + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( !bit ) + { + range = bound; + bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + } + else + { + low += bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + } + if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); } + } + + void encode_tree3( Bit_model bm[], const int symbol ) + { + int model = 1; + int bit = ( symbol >> 2 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 1 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + encode_bit( bm[model], symbol & 1 ); + } + + void encode_tree6( Bit_model bm[], const int symbol ) + { + int model = 1; + int bit = ( symbol >> 5 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 4 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 3 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 2 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + bit = ( symbol >> 1 ) & 1; + encode_bit( bm[model], bit ); model = ( model << 1 ) | bit; + encode_bit( bm[model], symbol & 1 ); + } + + void encode_tree8( Bit_model bm[], const int symbol ) + { + int model = 1; + int mask = ( 1 << 7 ); + do { + const int bit = ( symbol & mask ); + encode_bit( bm[model], bit ); + model <<= 1; if( bit ) ++model; + } + while( mask >>= 1 ); + } + + void encode_tree_reversed( Bit_model bm[], int symbol, const int num_bits ) + { + int model = 1; + for( int i = num_bits; i > 0; --i ) + { + const int bit = symbol & 1; + encode_bit( bm[model], bit ); + model = ( model << 1 ) | bit; + symbol >>= 1; + } + } + + void encode_matched( Bit_model bm[], int symbol, int match_byte ) + { + int mask = 0x100; + symbol |= mask; + + do { + match_byte <<= 1; + const int match_bit = match_byte & mask; + symbol <<= 1; + const int bit = symbol & 0x100; + encode_bit( bm[match_bit+(symbol>>9)+mask], bit ); + mask &= ~(match_byte ^ symbol); /* if( match_bit != bit ) mask = 0; */ + } + while( symbol < 0x10000 ); + } + + void encode_len( Len_model & lm, int symbol, const int pos_state ) + { + symbol -= min_match_len; + bool bit = ( symbol >= len_low_symbols ); + encode_bit( lm.choice1, bit ); + if( !bit ) + encode_tree3( lm.bm_low[pos_state], symbol ); + else + { + bit = ( symbol >= len_low_symbols + len_mid_symbols ); + encode_bit( lm.choice2, bit ); + if( !bit ) + encode_tree3( lm.bm_mid[pos_state], symbol - len_low_symbols ); + else + encode_tree8( lm.bm_high, symbol - len_low_symbols - len_mid_symbols ); + } + } + }; + + +class LZ_encoder_base : public Matchfinder_base + { +protected: + enum { max_marker_size = 16, + num_rep_distances = 4 }; /* must be 4 */ + + uint32_t crc_; + + Bit_model bm_literal[1<= start_dis_model ) + { + const int direct_bits = ( dis_slot >> 1 ) - 1; + const unsigned base = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + const unsigned direct_dis = dis - base; + + if( dis_slot < end_dis_model ) + renc.encode_tree_reversed( bm_dis + base - dis_slot - 1, direct_dis, + direct_bits ); + else + { + renc.encode( direct_dis >> dis_align_bits, direct_bits - dis_align_bits ); + renc.encode_tree_reversed( bm_align, direct_dis, dis_align_bits ); + } + } + } + + void full_flush( const State state ); + +public: + virtual ~LZ_encoder_base() {} + + unsigned long long member_position() const { return renc.member_position(); } + virtual void reset(); + + virtual bool encode_member( const unsigned long long member_size ) = 0; + }; diff --git a/fast_encoder.cc b/fast_encoder.cc index 9642e54..1ecd169 100644 --- a/fast_encoder.cc +++ b/fast_encoder.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,15 +26,16 @@ #include #include "lzip.h" -#include "encoder.h" +#include "encoder_base.h" #include "fast_encoder.h" -int Fmatchfinder::longest_match_len( int * const distance ) +int FLZ_encoder::longest_match_len( int * const distance ) { + enum { len_limit = 16 }; if( len_limit > available_bytes() ) return 0; - const uint8_t * const data = buffer + pos; + const uint8_t * const data = ptr_to_current_pos(); key4 = ( ( key4 << 4 ) ^ data[3] ) & key4_mask; const int pos1 = pos + 1; @@ -48,10 +49,10 @@ int Fmatchfinder::longest_match_len( int * const distance ) { if( --count < 0 || newpos <= 0 ) { *ptr0 = 0; break; } const int delta = pos1 - newpos; - if( delta > dictionary_size_ ) { *ptr0 = 0; break; } + if( delta > dictionary_size ) { *ptr0 = 0; break; } int32_t * const newptr = pos_array + ( cyclic_pos - delta + - ( ( cyclic_pos >= delta ) ? 0 : dictionary_size_ + 1 ) ); + ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ); if( data[maxlen-delta] == data[maxlen] ) { @@ -86,37 +87,35 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) State state; for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0; - if( fmatchfinder.data_position() != 0 || - renc.member_position() != File_header::size ) - return false; // can be called only once + if( data_position() != 0 || renc.member_position() != File_header::size ) + return false; /* can be called only once */ - if( !fmatchfinder.finished() ) // encode first byte + if( !data_finished() ) // encode first byte { const uint8_t prev_byte = 0; - const uint8_t cur_byte = fmatchfinder[0]; + const uint8_t cur_byte = peek( 0 ); renc.encode_bit( bm_match[state()][0], 0 ); encode_literal( prev_byte, cur_byte ); crc32.update_byte( crc_, cur_byte ); - fmatchfinder.update_and_move( 1 ); + reset_key4(); + update_and_move( 1 ); } - while( !fmatchfinder.finished() && - renc.member_position() < member_size_limit ) + while( !data_finished() && renc.member_position() < member_size_limit ) { int match_distance; - const int main_len = fmatchfinder.longest_match_len( &match_distance ); - const int pos_state = fmatchfinder.data_position() & pos_state_mask; + const int main_len = longest_match_len( &match_distance ); + const int pos_state = data_position() & pos_state_mask; int len = 0; for( int i = 0; i < num_rep_distances; ++i ) { - const int tlen = - fmatchfinder.true_match_len( 0, reps[i] + 1, max_match_len ); + const int tlen = true_match_len( 0, reps[i] + 1, max_match_len ); if( tlen > len ) { len = tlen; rep = i; } } if( len > min_match_len && len + 3 > main_len ) { - crc32.update_buf( crc_, fmatchfinder.ptr_to_current_pos(), len ); + crc32.update_buf( crc_, ptr_to_current_pos(), len ); renc.encode_bit( bm_match[state()][pos_state], 1 ); renc.encode_bit( bm_rep[state()], 1 ); renc.encode_bit( bm_rep0[state()], rep != 0 ); @@ -133,42 +132,42 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) } state.set_rep(); renc.encode_len( rep_len_model, len, pos_state ); - fmatchfinder.move_pos(); - fmatchfinder.update_and_move( len - 1 ); + move_pos(); + update_and_move( len - 1 ); continue; } if( main_len > min_match_len ) { - crc32.update_buf( crc_, fmatchfinder.ptr_to_current_pos(), main_len ); + crc32.update_buf( crc_, ptr_to_current_pos(), main_len ); renc.encode_bit( bm_match[state()][pos_state], 1 ); renc.encode_bit( bm_rep[state()], 0 ); state.set_match(); for( int i = num_rep_distances - 1; i > 0; --i ) reps[i] = reps[i-1]; reps[0] = match_distance; encode_pair( match_distance, main_len, pos_state ); - fmatchfinder.move_pos(); - fmatchfinder.update_and_move( main_len - 1 ); + move_pos(); + update_and_move( main_len - 1 ); continue; } - const uint8_t prev_byte = fmatchfinder[1]; - const uint8_t cur_byte = fmatchfinder[0]; - const uint8_t match_byte = fmatchfinder[reps[0]+1]; - fmatchfinder.move_pos(); + const uint8_t prev_byte = peek( 1 ); + const uint8_t cur_byte = peek( 0 ); + const uint8_t match_byte = peek( reps[0] + 1 ); + move_pos(); crc32.update_byte( crc_, cur_byte ); if( match_byte == cur_byte ) { + const int short_rep_price = price1( bm_match[state()][pos_state] ) + + price1( bm_rep[state()] ) + + price0( bm_rep0[state()] ) + + price0( bm_len[state()][pos_state] ); int price = price0( bm_match[state()][pos_state] ); if( state.is_char() ) price += price_literal( prev_byte, cur_byte ); else price += price_matched( prev_byte, cur_byte, match_byte ); - const int short_rep_price = price1( bm_match[state()][pos_state] ) + - price1( bm_rep[state()] ) + - price0( bm_rep0[state()] ) + - price0( bm_len[state()][pos_state] ); if( short_rep_price < price ) { renc.encode_bit( bm_match[state()][pos_state], 1 ); @@ -180,7 +179,7 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) } } - // literal byte + /* literal byte */ renc.encode_bit( bm_match[state()][pos_state], 0 ); if( state.is_char() ) encode_literal( prev_byte, cur_byte ); @@ -189,6 +188,6 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) state.set_char(); } - full_flush( fmatchfinder.data_position(), state ); + full_flush( state ); return true; } diff --git a/fast_encoder.h b/fast_encoder.h index e37ad7f..b26e388 100644 --- a/fast_encoder.h +++ b/fast_encoder.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,17 +15,9 @@ along with this program. If not, see . */ -class Fmatchfinder : public Matchfinder_base +class FLZ_encoder : public LZ_encoder_base { - enum { before_size = max_match_len + 1, - dict_size = 65536, - // bytes to keep in buffer after pos - after_size = max_match_len, - dict_factor = 16, - num_prev_positions23 = 0, - pos_array_factor = 1 }; - - int key4; // key made from latest 4 bytes + int key4; /* key made from latest 4 bytes */ void reset_key4() { @@ -34,15 +26,6 @@ class Fmatchfinder : public Matchfinder_base key4 = ( key4 << 4 ) ^ buffer[i]; } -public: - explicit Fmatchfinder( const int ifd ) - : - Matchfinder_base( before_size, dict_size, after_size, dict_factor, - num_prev_positions23, pos_array_factor, ifd ) - { reset_key4(); } - - enum { len_limit = 16 }; - void reset() { Matchfinder_base::reset(); reset_key4(); } int longest_match_len( int * const distance ); void update_and_move( int n ) @@ -59,18 +42,20 @@ public: move_pos(); } } - }; - -class FLZ_encoder : public LZ_encoder_base - { - Fmatchfinder & fmatchfinder; + enum { before = 0, + dict_size = 65536, + /* bytes to keep in buffer after pos */ + after_size = max_match_len, + dict_factor = 16, + num_prev_positions23 = 0, + pos_array_factor = 1 }; public: - FLZ_encoder( Fmatchfinder & mf, const File_header & header, const int outfd ) + FLZ_encoder( const int ifd, const int outfd ) : - LZ_encoder_base( header, outfd ), - fmatchfinder( mf ) + LZ_encoder_base( before, dict_size, after_size, dict_factor, + num_prev_positions23, pos_array_factor, ifd, outfd ) {} bool encode_member( const unsigned long long member_size ); diff --git a/lzip.h b/lzip.h index 4cad6fa..4a8bc98 100644 --- a/lzip.h +++ b/lzip.h @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,7 +40,7 @@ public: enum { min_dictionary_bits = 12, - min_dictionary_size = 1 << min_dictionary_bits, // >= modeled_distances + min_dictionary_size = 1 << min_dictionary_bits, /* >= modeled_distances */ max_dictionary_bits = 29, max_dictionary_size = 1 << max_dictionary_bits, literal_context_bits = 3, @@ -52,7 +52,7 @@ enum { dis_slot_bits = 6, start_dis_model = 4, end_dis_model = 14, - modeled_distances = 1 << (end_dis_model / 2), // 128 + modeled_distances = 1 << (end_dis_model / 2), /* 128 */ dis_align_bits = 4, dis_align_size = 1 << dis_align_bits, @@ -64,8 +64,8 @@ enum { len_high_symbols = 1 << len_high_bits, max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, - min_match_len = 2, // must be 2 - max_match_len = min_match_len + max_len_symbols - 1, // 273 + min_match_len = 2, /* must be 2 */ + max_match_len = min_match_len + max_len_symbols - 1, /* 273 */ min_match_len_limit = 5 }; inline int get_len_state( const int len ) @@ -82,7 +82,10 @@ enum { bit_model_move_bits = 5, struct Bit_model { int probability; - Bit_model() : probability( bit_model_total / 2 ) {} + void reset() { probability = bit_model_total / 2; } + void reset( const int size ) + { for( int i = 0; i < size; ++i ) this[i].reset(); } + Bit_model() { reset(); } }; struct Len_model @@ -92,6 +95,15 @@ struct Len_model Bit_model bm_low[pos_states][len_low_symbols]; Bit_model bm_mid[pos_states][len_mid_symbols]; Bit_model bm_high[len_high_symbols]; + + void reset() + { + choice1.reset(); + choice2.reset(); + bm_low[0][0].reset( pos_states * len_low_symbols ); + bm_mid[0][0].reset( pos_states * len_mid_symbols ); + bm_high[0].reset( len_high_symbols ); + } }; @@ -173,9 +185,9 @@ const uint8_t magic_string[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP" struct File_header { - uint8_t data[6]; // 0-3 magic bytes - // 4 version - // 5 coded_dict_size + uint8_t data[6]; /* 0-3 magic bytes */ + /* 4 version */ + /* 5 coded_dict_size */ enum { size = 6 }; void set_magic() { std::memcpy( data, magic_string, 4 ); data[4] = 1; } @@ -201,9 +213,9 @@ struct File_header if( sz > min_dictionary_size ) { const unsigned base_size = 1 << data[5]; - const unsigned wedge = base_size / 16; + const unsigned fraction = base_size / 16; for( int i = 7; i >= 1; --i ) - if( base_size - ( i * wedge ) >= sz ) + if( base_size - ( i * fraction ) >= sz ) { data[5] |= ( i << 5 ); break; } } return true; @@ -215,9 +227,9 @@ struct File_header struct File_trailer { - uint8_t data[20]; // 0-3 CRC32 of the uncompressed data - // 4-11 size of the uncompressed data - // 12-19 member size including header and trailer + uint8_t data[20]; /* 0-3 CRC32 of the uncompressed data */ + /* 4-11 size of the uncompressed data */ + /* 12-19 member size including header and trailer */ static int size( const int version = 1 ) { return ( ( version >= 1 ) ? 20 : 12 ); } diff --git a/main.cc b/main.cc index dd1fb27..6b5c430 100644 --- a/main.cc +++ b/main.cc @@ -1,5 +1,5 @@ /* Lzip - LZMA lossless data compressor - Copyright (C) 2008-2014 Antonio Diaz Diaz. + Copyright (C) 2008-2015 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -56,6 +56,7 @@ #include "arg_parser.h" #include "lzip.h" #include "decoder.h" +#include "encoder_base.h" #include "encoder.h" #include "fast_encoder.h" @@ -72,7 +73,7 @@ namespace { const char * const Program_name = "Lzip"; const char * const program_name = "lzip"; -const char * const program_year = "2014"; +const char * const program_year = "2015"; const char * invocation_name = 0; struct { const char * from; const char * to; } const known_extensions[] = { @@ -82,8 +83,8 @@ struct { const char * from; const char * to; } const known_extensions[] = { struct Lzma_options { - int dictionary_size; // 4 KiB .. 512 MiB - int match_len_limit; // 5 .. 273 + int dictionary_size; /* 4 KiB .. 512 MiB */ + int match_len_limit; /* 5 .. 273 */ }; enum Mode { m_compress, m_decompress, m_test }; @@ -126,8 +127,7 @@ void show_help() "The bidimensional parameter space of LZMA can't be mapped to a linear\n" "scale optimal for all files. If your files are large, very repetitive,\n" "etc, you may need to use the --match-length and --dictionary-size\n" - "options directly to achieve optimal performance. For example, -9m64\n" - "usually compresses executables more (and faster) than -9.\n" + "options directly to achieve optimal performance.\n" "\nExit status: 0 for a normal exit, 1 for environmental problems (file\n" "not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or\n" "invalid input file, 3 for an internal consistency error (eg, bug) which\n" @@ -149,20 +149,23 @@ void show_version() void show_header( const File_header & header ) { - const char * const prefix[8] = - { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; - enum { factor = 1024 }; - const char * p = ""; - const char * np = " "; - unsigned num = header.dictionary_size(); - bool exact = ( num % factor == 0 ); - - for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) - { num /= factor; if( num % factor != 0 ) exact = false; - p = prefix[i]; np = ""; } - if( verbosity >= 4 && header.version() != 1 ) - std::fprintf( stderr, "version %d, ", header.version() ); - std::fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p ); + if( verbosity >= 3 ) + { + const char * const prefix[8] = + { "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; + enum { factor = 1024 }; + const char * p = ""; + const char * np = " "; + unsigned num = header.dictionary_size(); + bool exact = ( num % factor == 0 ); + + for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) + { num /= factor; if( num % factor != 0 ) exact = false; + p = prefix[i]; np = ""; } + if( verbosity >= 4 && header.version() != 1 ) + std::fprintf( stderr, "version %d, ", header.version() ); + std::fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p ); + } } @@ -337,7 +340,7 @@ bool open_outstream( const bool force ) bool check_tty( const int infd, const Mode program_mode ) { - if( program_mode == m_compress && outfd >= 0 && isatty( outfd ) ) + if( program_mode == m_compress && isatty( outfd ) ) { show_error( "I won't write compressed data to a terminal.", 0, true ); return false; @@ -368,14 +371,14 @@ void cleanup_and_fail( const int retval ) } - // Set permissions, owner and times. + /* Set permissions, owner and times. */ void close_and_set_permissions( const struct stat * const in_statsp ) { bool warning = false; if( in_statsp ) { const mode_t mode = in_statsp->st_mode; - // fchown will in many cases return with EPERM, which can be safely ignored. + /* fchown will in many cases return with EPERM, which can be safely ignored. */ if( fchown( outfd, in_statsp->st_uid, in_statsp->st_gid ) == 0 ) { if( fchmod( outfd, mode ) != 0 ) warning = true; } else @@ -400,9 +403,9 @@ void close_and_set_permissions( const struct stat * const in_statsp ) bool next_filename() { - const unsigned len = std::strlen( known_extensions[0].from ); - if( output_filename.size() >= len + 5 ) // "*00001.lz" - for( int i = output_filename.size() - len - 1, j = 0; j < 5; --i, ++j ) + const unsigned ext_len = std::strlen( known_extensions[0].from ); + if( output_filename.size() >= ext_len + 5 ) // "*00001.lz" + for( int i = output_filename.size() - ext_len - 1, j = 0; j < 5; --i, ++j ) { if( output_filename[i] < '9' ) { ++output_filename[i]; return true; } else output_filename[i] = '0'; @@ -412,114 +415,44 @@ bool next_filename() int compress( const unsigned long long member_size, - const unsigned long long volume_size, - const Lzma_options & encoder_options, const int infd, - const Pretty_print & pp, const struct stat * const in_statsp ) + const unsigned long long volume_size, const int infd, + const Lzma_options & encoder_options, const Pretty_print & pp, + const struct stat * const in_statsp, const bool zero ) { const unsigned long long cfile_size = (in_statsp && S_ISREG( in_statsp->st_mode )) ? in_statsp->st_size / 100 : 0; int retval = 0; - File_header header; - header.set_magic(); - + LZ_encoder_base * encoder = 0; // polymorphic encoder if( verbosity >= 1 ) pp(); - if( !header.dictionary_size( encoder_options.dictionary_size ) || - encoder_options.match_len_limit < min_match_len_limit || - encoder_options.match_len_limit > max_match_len ) - internal_error( "invalid argument to encoder." ); try { - Matchfinder matchfinder( header.dictionary_size(), - encoder_options.match_len_limit, infd ); - header.dictionary_size( matchfinder.dictionary_size() ); - - unsigned long long in_size = 0, out_size = 0, partial_volume_size = 0; - while( true ) // encode one member per iteration - { - LZ_encoder encoder( matchfinder, header, outfd ); - const unsigned long long size = ( volume_size > 0 ) ? - std::min( member_size, volume_size - partial_volume_size ) : member_size; - if( verbosity >= 2 ) - show_progress( in_size, &matchfinder, &pp, cfile_size ); // init - if( !encoder.encode_member( size ) ) - { pp( "Encoder error." ); retval = 1; break; } - in_size += matchfinder.data_position(); - out_size += encoder.member_position(); - if( matchfinder.finished() ) break; - if( volume_size > 0 ) - { - partial_volume_size += encoder.member_position(); - if( partial_volume_size >= volume_size - min_dictionary_size ) - { - partial_volume_size = 0; - if( delete_output_on_interrupt ) - { - close_and_set_permissions( in_statsp ); - if( !next_filename() ) - { pp( "Too many volume files." ); retval = 1; break; } - if( !open_outstream( true ) ) { retval = 1; break; } - delete_output_on_interrupt = true; - } - } - } - matchfinder.reset(); - } - - if( retval == 0 && verbosity >= 1 ) + if( zero ) + encoder = new FLZ_encoder( infd, outfd ); + else { - if( in_size == 0 || out_size == 0 ) - std::fprintf( stderr, " no data compressed.\n" ); - else - std::fprintf( stderr, "%6.3f:1, %6.3f bits/byte, " - "%5.2f%% saved, %llu in, %llu out.\n", - (double)in_size / out_size, - ( 8.0 * out_size ) / in_size, - 100.0 * ( 1.0 - ( (double)out_size / in_size ) ), - in_size, out_size ); + File_header header; + if( !header.dictionary_size( encoder_options.dictionary_size ) || + encoder_options.match_len_limit < min_match_len_limit || + encoder_options.match_len_limit > max_match_len ) + internal_error( "invalid argument to encoder." ); + encoder = new LZ_encoder( header.dictionary_size(), + encoder_options.match_len_limit, infd, outfd ); } - } - catch( std::bad_alloc ) - { - pp( "Not enough memory. Try a smaller dictionary size." ); - retval = 1; - } - catch( Error e ) { pp(); show_error( e.msg, errno ); retval = 1; } - return retval; - } - - -int fcompress( const unsigned long long member_size, - const unsigned long long volume_size, const int infd, - const Pretty_print & pp, const struct stat * const in_statsp ) - { - const unsigned long long cfile_size = - (in_statsp && S_ISREG( in_statsp->st_mode )) ? in_statsp->st_size / 100 : 0; - int retval = 0; - File_header header; - header.set_magic(); - if( verbosity >= 1 ) pp(); - - try { - Fmatchfinder fmatchfinder( infd ); - if( !header.dictionary_size( fmatchfinder.dictionary_size() ) ) - internal_error( "invalid argument to encoder." ); unsigned long long in_size = 0, out_size = 0, partial_volume_size = 0; - while( true ) // encode one member per iteration + while( true ) /* encode one member per iteration */ { - FLZ_encoder encoder( fmatchfinder, header, outfd ); const unsigned long long size = ( volume_size > 0 ) ? std::min( member_size, volume_size - partial_volume_size ) : member_size; - if( verbosity >= 2 ) - show_progress( in_size, &fmatchfinder, &pp, cfile_size ); // init - if( !encoder.encode_member( size ) ) + show_progress( in_size, encoder, &pp, cfile_size ); // init + if( !encoder->encode_member( size ) ) { pp( "Encoder error." ); retval = 1; break; } - in_size += fmatchfinder.data_position(); - out_size += encoder.member_position(); - if( fmatchfinder.finished() ) break; + in_size += encoder->data_position(); + out_size += encoder->member_position(); + if( encoder->data_finished() ) break; if( volume_size > 0 ) { - partial_volume_size += encoder.member_position(); + partial_volume_size += encoder->member_position(); if( partial_volume_size >= volume_size - min_dictionary_size ) { partial_volume_size = 0; @@ -533,7 +466,7 @@ int fcompress( const unsigned long long member_size, } } } - fmatchfinder.reset(); + encoder->reset(); } if( retval == 0 && verbosity >= 1 ) @@ -555,6 +488,7 @@ int fcompress( const unsigned long long member_size, retval = 1; } catch( Error e ) { pp(); show_error( e.msg, errno ); retval = 1; } + delete encoder; return retval; } @@ -637,7 +571,7 @@ int decompress( const int infd, const Pretty_print & pp, const bool testing ) { pp( "Invalid dictionary size in member header." ); retval = 2; break; } if( verbosity >= 2 || ( verbosity == 1 && first_member ) ) - { pp(); if( verbosity >= 3 ) show_header( header ); } + { pp(); show_header( header ); } LZ_decoder decoder( header, rdec, outfd ); const int result = decoder.decode_member( pp ); @@ -719,20 +653,23 @@ void show_progress( const unsigned long long partial_size, const Pretty_print * const p, const unsigned long long cfile_size ) { - static unsigned long long csize = 0; // file_size / 100 + static unsigned long long csize = 0; /* file_size / 100 */ static unsigned long long psize = 0; static const Matchfinder_base * mb = 0; static const Pretty_print * pp = 0; - if( m ) // initialize static vars - { csize = cfile_size; psize = partial_size; mb = m; pp = p; } - if( mb && pp ) + if( verbosity >= 2 ) { - const unsigned long long pos = psize + mb->data_position(); - if( csize > 0 ) - std::fprintf( stderr, "%4llu%%", pos / csize ); - std::fprintf( stderr, " %.1f MB\r", pos / 1000000.0 ); - pp->reset(); (*pp)(); // restore cursor position + if( m ) /* initialize static vars */ + { csize = cfile_size; psize = partial_size; mb = m; pp = p; } + if( mb && pp ) + { + const unsigned long long pos = psize + mb->data_position(); + if( csize > 0 ) + std::fprintf( stderr, "%4llu%%", pos / csize ); + std::fprintf( stderr, " %.1f MB\r", pos / 1000000.0 ); + pp->reset(); (*pp)(); // restore cursor position + } } } @@ -743,16 +680,16 @@ int main( const int argc, const char * const argv[] ) to the corresponding LZMA compression modes. */ const Lzma_options option_mapping[] = { - { 1 << 16, 16 }, // -0 entry values not used - { 1 << 20, 5 }, // -1 - { 3 << 19, 6 }, // -2 - { 1 << 21, 8 }, // -3 - { 3 << 20, 12 }, // -4 - { 1 << 22, 20 }, // -5 - { 1 << 23, 36 }, // -6 - { 1 << 24, 68 }, // -7 - { 3 << 23, 132 }, // -8 - { 1 << 25, 273 } }; // -9 + { 1 << 16, 16 }, /* -0 entry values not used */ + { 1 << 20, 5 }, /* -1 */ + { 3 << 19, 6 }, /* -2 */ + { 1 << 21, 8 }, /* -3 */ + { 3 << 20, 12 }, /* -4 */ + { 1 << 22, 20 }, /* -5 */ + { 1 << 23, 36 }, /* -6 */ + { 1 << 24, 68 }, /* -7 */ + { 3 << 23, 132 }, /* -8 */ + { 1 << 25, 273 } }; /* -9 */ Lzma_options encoder_options = option_mapping[6]; // default = "-6" const unsigned long long max_member_size = 0x0100000000000000ULL; const unsigned long long max_volume_size = 0x4000000000000000ULL; @@ -808,7 +745,7 @@ int main( const int argc, const char * const argv[] ) for( ; argind < parser.arguments(); ++argind ) { const int code = parser.code( argind ); - if( !code ) break; // no more options + if( !code ) break; /* no more options */ const char * const arg = parser.argument( argind ).c_str(); switch( code ) { @@ -837,7 +774,7 @@ int main( const int argc, const char * const argv[] ) case 'V': show_version(); return 0; default : internal_error( "uncaught option." ); } - } // end process options + } /* end process options */ #if defined(__MSVCRT__) || defined(__OS2__) setmode( STDIN_FILENO, O_BINARY ); @@ -929,13 +866,8 @@ int main( const int argc, const char * const argv[] ) pp.set_name( input_filename ); int tmp; if( program_mode == m_compress ) - { - if( zero ) - tmp = fcompress( member_size, volume_size, infd, pp, in_statsp ); - else - tmp = compress( member_size, volume_size, encoder_options, infd, - pp, in_statsp ); - } + tmp = compress( member_size, volume_size, infd, encoder_options, pp, + in_statsp, zero ); else tmp = decompress( infd, pp, program_mode == m_test ); if( tmp > retval ) retval = tmp; -- cgit v1.2.3