diff options
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | NEWS | 14 | ||||
-rw-r--r-- | arg_parser.cc | 4 | ||||
-rw-r--r-- | arg_parser.h | 2 | ||||
-rwxr-xr-x | configure | 4 | ||||
-rw-r--r-- | decoder.cc | 67 | ||||
-rw-r--r-- | decoder.h | 51 | ||||
-rw-r--r-- | doc/lzip.1 | 2 | ||||
-rw-r--r-- | doc/lzip.info | 695 | ||||
-rw-r--r-- | doc/lzip.texinfo | 676 | ||||
-rw-r--r-- | encoder.cc | 9 | ||||
-rw-r--r-- | encoder.h | 23 | ||||
-rw-r--r-- | fast_encoder.cc | 2 | ||||
-rw-r--r-- | lzip.h | 9 | ||||
-rw-r--r-- | main.cc | 7 |
15 files changed, 1443 insertions, 131 deletions
@@ -1,3 +1,12 @@ +2013-03-21 Antonio Diaz Diaz <ant_diaz@teleline.es> + + * Version 1.15-pre1 released. + * Decompression time has been reduced by 1%. + * main.cc (show_header): Show header version if verbosity >= 4. + * Ignore option '-n, --threads' for compatibility with plzip. + * Added chapter 'Stream Format' and appendix 'Reference source code' + to the manual. + 2013-02-17 Antonio Diaz Diaz <ant_diaz@teleline.es> * Version 1.14 released. @@ -1,11 +1,11 @@ -Changes in version 1.14: +Changes in version 1.15: -Multi-step trials have been implemented. +Decompression time has been reduced by 1%. -Compression ratio has been slightly increased. +File version is now shown only if verbosity >= 4. -Compression time has been reduced by 5%. +Option "-n, --threads" is now accepted and ignored for compatibility +with plzip. -Decompression time has been reduced by 12%. - -The target "install-bin" has been added to the Makefile. +The chapter "Stream Format" and the appendix "Reference source code" +have been added to the manual. diff --git a/arg_parser.cc b/arg_parser.cc index 13bcbcb..a28d2ba 100644 --- a/arg_parser.cc +++ b/arg_parser.cc @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 + Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify @@ -44,7 +44,7 @@ bool Arg_parser::parse_long_option( const char * const opt, const char * const a // Test all long options for either exact match or abbreviated matches. for( int i = 0; options[i].code != 0; ++i ) - if( options[i].name && !std::strncmp( options[i].name, &opt[2], len ) ) + if( options[i].name && std::strncmp( options[i].name, &opt[2], len ) == 0 ) { if( std::strlen( options[i].name ) == len ) // Exact match found { index = i; exact = true; break; } diff --git a/arg_parser.h b/arg_parser.h index 4fbd1af..5248cb1 100644 --- a/arg_parser.h +++ b/arg_parser.h @@ -1,5 +1,5 @@ /* Arg_parser - POSIX/GNU command line argument parser. (C++ version) - Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 + Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Antonio Diaz Diaz. This library is free software: you can redistribute it and/or modify @@ -8,9 +8,9 @@ args= no_create= pkgname=lzip -pkgversion=1.14 +pkgversion=1.15-pre1 progname=lzip -srctrigger=lzip.h +srctrigger=doc/lzip.texinfo # clear some things potentially inherited from environment. LC_ALL=C @@ -122,10 +122,10 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const File_trailer trailer; const int trailer_size = File_trailer::size( member_version ); const unsigned long long member_size = - range_decoder.member_position() + trailer_size; + rdec.member_position() + trailer_size; bool error = false; - int size = range_decoder.read_data( trailer.data, trailer_size ); + int size = rdec.read_data( trailer.data, trailer_size ); if( size < trailer_size ) { error = true; @@ -140,7 +140,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const if( member_version == 0 ) trailer.member_size( member_size ); - if( !range_decoder.code_is_zero() ) + if( !rdec.code_is_zero() ) { error = true; pp( "Range decoder final code is not zero" ); @@ -201,82 +201,79 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits]; Bit_model bm_dis[modeled_distances-end_dis_model]; Bit_model bm_align[dis_align_size]; - Len_decoder len_decoder; - Len_decoder rep_match_len_decoder; + Len_model match_len_model; + Len_model rep_len_model; unsigned rep0 = 0; // rep[0-3] latest four distances unsigned rep1 = 0; // used for efficient coding of unsigned rep2 = 0; // repeated distances unsigned rep3 = 0; - State state; - range_decoder.load(); - while( !range_decoder.finished() ) + rdec.load(); + while( !rdec.finished() ) { const int pos_state = data_position() & pos_state_mask; - if( range_decoder.decode_bit( bm_match[state()][pos_state] ) == 0 ) + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) { const uint8_t prev_byte = get_prev_byte(); if( state.is_char() ) - put_byte( range_decoder.decode_tree( bm_literal[get_lit_state(prev_byte)], 8 ) ); + put_byte( rdec.decode_tree( bm_literal[get_lit_state(prev_byte)], 8 ) ); else - put_byte( range_decoder.decode_matched( bm_literal[get_lit_state(prev_byte)], - get_byte( rep0 ) ) ); + put_byte( rdec.decode_matched( bm_literal[get_lit_state(prev_byte)], + get_byte( rep0 ) ) ); state.set_char(); } else { int len; - if( range_decoder.decode_bit( bm_rep[state()] ) == 1 ) + if( rdec.decode_bit( bm_rep[state()] ) == 1 ) { - len = 0; - if( range_decoder.decode_bit( bm_rep0[state()] ) == 1 ) + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) + { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } + } + else { unsigned distance; - if( range_decoder.decode_bit( bm_rep1[state()] ) == 0 ) + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) distance = rep1; else { - if( range_decoder.decode_bit( bm_rep2[state()] ) == 0 ) + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) distance = rep2; - else { distance = rep3; rep3 = rep2; } + else + { distance = rep3; rep3 = rep2; } rep2 = rep1; } rep1 = rep0; rep0 = distance; } - else - { - if( range_decoder.decode_bit( bm_len[state()][pos_state] ) == 0 ) - { state.set_short_rep(); len = 1; } - } - if( len == 0 ) - { - state.set_rep(); - len = min_match_len + rep_match_len_decoder.decode( range_decoder, pos_state ); - } + len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); + state.set_rep(); } else { const unsigned rep0_saved = rep0; - len = min_match_len + len_decoder.decode( range_decoder, pos_state ); - const int dis_slot = range_decoder.decode_tree6( bm_dis_slot[get_dis_state(len)] ); + len = min_match_len + rdec.decode_len( match_len_model, pos_state ); + const int dis_slot = rdec.decode_tree6( bm_dis_slot[get_dis_state(len)] ); if( dis_slot < start_dis_model ) rep0 = dis_slot; else { const int direct_bits = ( dis_slot >> 1 ) - 1; rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) - rep0 += range_decoder.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, direct_bits ); + rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, + direct_bits ); else { - rep0 += range_decoder.decode( direct_bits - dis_align_bits ) << dis_align_bits; - rep0 += range_decoder.decode_tree_reversed4( bm_align ); + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed4( bm_align ); if( rep0 == 0xFFFFFFFFU ) // Marker found { rep0 = rep0_saved; - range_decoder.normalize(); + rdec.normalize(); flush_data(); if( len == min_match_len ) // End Of Stream marker { @@ -284,7 +281,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp ) } if( len == min_match_len + 1 ) // Sync Flush marker { - range_decoder.load(); continue; + rdec.load(); continue; } if( pp.verbosity() >= 0 ) { @@ -122,22 +122,22 @@ public: int decode_tree( Bit_model bm[], const int num_bits ) { - int model = 1; + int symbol = 1; for( int i = num_bits; i > 0; --i ) - model = ( model << 1 ) | decode_bit( bm[model] ); - return model - (1 << num_bits); + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << num_bits); } int decode_tree6( Bit_model bm[] ) { - int model = 1; - model = ( model << 1 ) | decode_bit( bm[model] ); - model = ( model << 1 ) | decode_bit( bm[model] ); - model = ( model << 1 ) | decode_bit( bm[model] ); - model = ( model << 1 ) | decode_bit( bm[model] ); - model = ( model << 1 ) | decode_bit( bm[model] ); - model = ( model << 1 ) | decode_bit( bm[model] ); - return model - (1 << 6); + int symbol = 1; + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << 6); } int decode_tree_reversed( Bit_model bm[], const int num_bits ) @@ -186,27 +186,16 @@ public: } return symbol - 0x100; } - }; - -class Len_decoder - { - Bit_model choice1; - Bit_model choice2; - Bit_model bm_low[pos_states][len_low_symbols]; - Bit_model bm_mid[pos_states][len_mid_symbols]; - Bit_model bm_high[len_high_symbols]; - -public: - int decode( Range_decoder & range_decoder, const int pos_state ) + int decode_len( Len_model & lm, const int pos_state ) { - if( range_decoder.decode_bit( choice1 ) == 0 ) - return range_decoder.decode_tree( bm_low[pos_state], len_low_bits ); - if( range_decoder.decode_bit( choice2 ) == 0 ) + if( decode_bit( lm.choice1 ) == 0 ) + return decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) return len_low_symbols + - range_decoder.decode_tree( bm_mid[pos_state], len_mid_bits ); + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); return len_low_symbols + len_mid_symbols + - range_decoder.decode_tree( bm_high, len_high_bits ); + decode_tree( lm.bm_high, len_high_bits ); } }; @@ -214,7 +203,7 @@ public: class LZ_decoder { unsigned long long partial_data_pos; - Range_decoder & range_decoder; + Range_decoder & rdec; const int dictionary_size; const int buffer_size; uint8_t * const buffer; // output buffer @@ -267,10 +256,10 @@ class LZ_decoder void operator=( const LZ_decoder & ); // declared as private public: - LZ_decoder( const File_header & header, Range_decoder & rdec, const int ofd ) + LZ_decoder( const File_header & header, Range_decoder & rde, const int ofd ) : partial_data_pos( 0 ), - range_decoder( rdec ), + rdec( rde ), dictionary_size( header.dictionary_size() ), buffer_size( std::max( 65536, dictionary_size ) ), buffer( new uint8_t[buffer_size] ), @@ -1,5 +1,5 @@ .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.37.1. -.TH LZIP "1" "February 2013" "Lzip 1.14" "User Commands" +.TH LZIP "1" "March 2013" "Lzip 1.15-pre1" "User Commands" .SH NAME Lzip \- reduces the size of files .SH SYNOPSIS diff --git a/doc/lzip.info b/doc/lzip.info index e61550e..5c4cf81 100644 --- a/doc/lzip.info +++ b/doc/lzip.info @@ -11,17 +11,19 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir) Lzip Manual *********** -This manual is for Lzip (version 1.14, 17 February 2013). +This manual is for Lzip (version 1.15-pre1, 21 March 2013). * Menu: -* Introduction:: Purpose and features of lzip -* Algorithm:: How lzip compresses the data -* Invoking Lzip:: Command line interface -* File Format:: Detailed format of the compressed file -* Examples:: A small tutorial with examples -* Problems:: Reporting bugs -* Concept Index:: Index of concepts +* Introduction:: Purpose and features of lzip +* Algorithm:: How lzip compresses the data +* Invoking Lzip:: Command line interface +* File Format:: Detailed format of the compressed file +* Stream Format:: Format of the LZMA stream in lzip files +* Examples:: A small tutorial with examples +* Problems:: Reporting bugs +* Reference source code:: Source code illustrating stream format +* Concept Index:: Index of concepts Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 Antonio Diaz Diaz. @@ -324,7 +326,7 @@ Z zettabyte (10^21) | Zi zebibyte (2^70) Y yottabyte (10^24) | Yi yobibyte (2^80) -File: lzip.info, Node: File Format, Next: Examples, Prev: Invoking Lzip, Up: Top +File: lzip.info, Node: File Format, Next: Stream Format, Prev: Invoking Lzip, Up: Top 4 File Format ************* @@ -375,6 +377,7 @@ additional information before, between, or after them. Bits 4-0 contain the base 2 logarithm of the base size (12 to 29). Bits 7-5 contain the number of wedges (0 to 7) to substract from the base size to obtain the dictionary size. + Example: 0xD3 = (2^19 - 6 * 2^15) = (512KiB - 6 * 32KiB) = 320KiB Valid values for dictionary size range from 4KiB to 512MiB. `Lzma stream' @@ -394,9 +397,201 @@ additional information before, between, or after them. -File: lzip.info, Node: Examples, Next: Problems, Prev: File Format, Up: Top +File: lzip.info, Node: Stream Format, Next: Examples, Prev: File Format, Up: Top + +5 Format of the LZMA stream in lzip files +***************************************** + +The LZMA algorithm has three parameters, called "special LZMA +properties", to adjust it for some kinds of binary data. These +parameters are; `literal_context_bits' (with a default value of 3), +`literal_pos_state_bits' (with a default value of 0), and +`pos_state_bits' (with a default value of 2). As a general purpose +compressor, lzip only uses the default values for these parameters. + + Lzip also finishes the LZMA stream with an "End Of Stream" marker +(the distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the +"member size" field in the member trailer allows the verification of +stream integrity. The LZMA stream in lzip files always has these two +features (default properties and EOS marker) and is referred to in this +document as LZMA-302eos or LZMA-lzip. + + The second stage of LZMA is a range encoder that uses a different +probability model for each type of symbol; distances, lengths, literal +bytes, etc. Range encoding conceptually encodes all the symbols of the +message into one number. Unlike Huffman coding, which assigns to each +symbol a bit-pattern and concatenates all the bit-patterns together, +range encoding can compress one symbol to less than one bit. Therefore +the compressed data produced by a range encoder can't be split in pieces +that could be individually described. + + It seems that the only way of describing the LZMA-302eos stream is +describing the algorithm that decodes it. And given the many details +about the range decoder that need to be described accurately, the source +code of a real decoder seems the only appropiate reference to use. + + What follows is a description of the decoding algorithm for +LZMA-302eos streams using as reference the source code of "lzd", an +educational decompressor for lzip files which can be downloaded from +the lzip download directory. The source code of lzd is included in +appendix A. *note Reference source code:: + + +5.1 What is coded +================= + +The LZMA stream includes literals, matches and repeated matches (matches +reusing a recently used distance). There are 7 different coding +sequences: + +Bit sequence Name Description +--------------------------------------------------------------------------- +0 + byte literal literal byte +1 + 0 + len + dis match distance-length pair +1 + 1 + 0 + 0 shortrep 1 byte match at latest used distance +1 + 1 + 0 + 1 + len rep0 len bytes match at latest used + distance +1 + 1 + 1 + 0 + len rep1 len bytes match at second latest + used distance +1 + 1 + 1 + 1 + 0 + len rep2 len bytes match at third latest used + distance +1 + 1 + 1 + 1 + 1 + len rep3 len bytes match at fourth latest + used distance + + + In the following tables, multi-bit sequences are coded in normal +order, from MSB to LSB, except where noted otherwise. + + Lengths (the `len' in the table above) are coded as follows: + +Bit sequence Description +-------------------------------------------------------------------------- +0 + 3 bits lengths from 2 to 9 +1 + 0 + 3 bits lengths from 10 to 17 +1 + 1 + 8 bits lengths from 18 to 273 + + + The coding of distances is a little more complicated. LZMA divides +the interval between any two powers of 2 into 2 halves, named slots. As +possible distances range from 0 to (2^32 - 1), there are 64 slots (0 to +63). The slot number is context-coded in 6 bits. `direct_bits' are the +remaining bits (from 0 to 30) needed to form a complete distance, and +are calculated as (slot >> 1) - 1. If a distance needs 6 or more +direct_bits, the last 4 bits are coded separately. The last piece +(direct_bits for distances 4 to 127 or the last 4 bits for distances >= +128) is context-coded in reverse order (from LSB to MSB). For distances +>= 128, the `direct_bits - 4' part is coded with fixed 0.5 probability. + +Bit sequence Description +-------------------------------------------------------------------------- +slot distances from 0 to 3 +slot + direct_bits distances from 4 to 127 +slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1 + + +5.2 The coding contexts +======================= + +These contexts (`Bit_model' in the source), are integers or arrays of +integers representing the probability of the corresponding bit being 0. + + The indices used in these arrays are: + +`state' + A state machine (`State' in the source) coding the latest 2 to 4 + types of sequences processed. The initial state is 0. + +`pos_state' + Value of the 2 least significant bits of the current position in + the decoded data. + +`literal_state' + Value of the 3 most significant bits of the latest byte decoded. + +`dis_state' + Coded value of length (real length - 2), with a maximum of 3. The + resulting value is in the range 0 to 3. + + + The contexts for decoding the type of coding sequence are: + +Name Indices Used when +--------------------------------------------------------------------------- +bm_match state, pos_state sequence start +bm_rep state after sequence 1 +bm_rep0 state after sequence 11 +bm_rep1 state after sequence 111 +bm_rep2 state after sequence 1111 +bm_len state, pos_state after sequence 110 -5 A small tutorial with examples + + The contexts for decoding distances are: + +Name Indices Used when +--------------------------------------------------------------------------- +bm_dis_slot dis_state, bit tree distance start +bm_dis reverse bit tree after slots 4 to 13 +bm_align reverse bit tree for distances >= 128, after + fixed probability bits + + + There are two separated sets of length contexts (`Len_model' in the +source). One for normal matches, the other for repeated matches. The +contexts in each Len_model are (see `decode_len' in the source): + +Name Indices Used when +--------------------------------------------------------------------------- +choice1 none length start +choice2 none after sequence 1 +bm_low pos_state, bit tree after sequence 0 +bm_mid pos_state, bit tree after sequence 10 +bm_high bit tree after sequence 11 + + + The context array `bm_literal' is special. In principle it acts as a +normal bit tree context, the one selected by `literal_state'. But if +the previous decoded byte was not a literal, two other bit tree +contexts are used depending on the value of each bit in `match_byte' +(the byte at the latest used distance), until a bit is decoded that is +different from its corresponding bit in `match_byte'. After the first +difference is found, the rest of the byte is decoded using the normal +bit tree context. (See `decode_matched' in the source). + + +5.3 The range decoder +===================== + +The LZMA stream is consumed one byte at a time by the range decoder. +(See `normalize' in the source). Every byte consumed produces a +variable number of decoded bits, depending on how well these bits agree +with their context. (See `decode_bit' in the source). + + The range decoder state consists of two unsigned 32-bit variables, +`range' (representing the most significant part of the range size not +yet decoded), and `code' (representing the current point within +`range'). `range' is initialized to (2^32 - 1), and `code' is +initialized to 0. + + The range encoder produces a first 0 byte that must be ignored by the +range decoder. This is done by shifting 5 bytes in the initialization of +`code' instead of 4. (See the `Range_decoder' constructor in the +source). + + +5.4 Decoding the LZMA stream +============================ + +After decoding the member header and obtaining the dictionary size, the +range decoder is initialized and then the LZMA decoder enters a loop +(See `decode_member' in the source) where it invokes the range decoder +with the appropiate contexts to decode the different coding sequences +(matches, repeated matches, and literal bytes), until the "End Of +Stream" marker is decoded. + + +File: lzip.info, Node: Examples, Next: Problems, Prev: Stream Format, Up: Top + +6 A small tutorial with examples ******************************** WARNING! Even if lzip is bug-free, other causes may result in a corrupt @@ -467,9 +662,9 @@ file with a member size of 32MiB. lzip -b 32MiB -S 650MB big_db -File: lzip.info, Node: Problems, Next: Concept Index, Prev: Examples, Up: Top +File: lzip.info, Node: Problems, Next: Reference source code, Prev: Examples, Up: Top -6 Reporting Bugs +7 Reporting Bugs **************** There are probably bugs in lzip. There are certainly errors and @@ -482,7 +677,461 @@ for all eternity, if not longer. by running `lzip --version'. -File: lzip.info, Node: Concept Index, Prev: Problems, Up: Top +File: lzip.info, Node: Reference source code, Next: Concept Index, Prev: Problems, Up: Top + +Appendix A Reference source code +******************************** + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <stdint.h> +#include <unistd.h> + + +class State + { + int st; + +public: + enum { states = 12 }; + State() : st( 0 ) {} + int operator()() const { return st; } + bool is_char() const { return st < 7; } + + void set_char() + { + static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; + st = next[st]; + } + + void set_match() { st = ( ( st < 7 ) ? 7 : 10 ); } + void set_rep() { st = ( ( st < 7 ) ? 8 : 11 ); } + void set_short_rep() { st = ( ( st < 7 ) ? 9 : 11 ); } + }; + + +enum { + literal_context_bits = 3, + pos_state_bits = 2, + pos_states = 1 << pos_state_bits, + pos_state_mask = pos_states - 1, + + max_dis_states = 4, + dis_slot_bits = 6, + start_dis_model = 4, + end_dis_model = 14, + modeled_distances = 1 << (end_dis_model / 2), // 128 + dis_align_bits = 4, + dis_align_size = 1 << dis_align_bits, + + len_low_bits = 3, + len_mid_bits = 3, + len_high_bits = 8, + len_low_symbols = 1 << len_low_bits, + len_mid_symbols = 1 << len_mid_bits, + len_high_symbols = 1 << len_high_bits, + max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, + min_match_len = 2, // must be 2 + + bit_model_move_bits = 5, + bit_model_total_bits = 11, + bit_model_total = 1 << bit_model_total_bits }; + + +struct Bit_model + { + int probability; + Bit_model() : probability( bit_model_total / 2 ) {} + }; + + +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + + +class Range_decoder + { + uint32_t code; + uint32_t range; + +public: + Range_decoder() : code( 0 ), range( 0xFFFFFFFFU ) + { + for( int i = 0; i < 5; ++i ) code = (code << 8) | std::getc( stdin ); + } + + int decode( const int num_bits ) + { + int symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + range >>= 1; + symbol <<= 1; + if( code >= range ) { code -= range; symbol |= 1; } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | std::getc( stdin ); } + } + return symbol; + } + + int decode_bit( Bit_model & bm ) + { + int symbol; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + symbol = 0; + } + else + { + range -= bound; + code -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol = 1; + } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | std::getc( stdin ); } + return symbol; + } + + int decode_tree( Bit_model bm[], const int num_bits ) + { + int symbol = 1; + for( int i = 0; i < num_bits; ++i ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << num_bits); + } + + int decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + int symbol = decode_tree( bm, num_bits ); + int reversed_symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); + symbol >>= 1; + } + return reversed_symbol; + } + + int decode_matched( Bit_model bm[], const int match_byte ) + { + Bit_model * const bm1 = bm + 0x100; + int symbol = 1; + for( int i = 7; i >= 0; --i ) + { + const int match_bit = ( match_byte >> i ) & 1; + const int bit = decode_bit( bm1[(match_bit<<8)+symbol] ); + symbol = ( symbol << 1 ) | bit; + if( match_bit != bit ) + { + while( symbol < 0x100 ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + break; + } + } + return symbol - 0x100; + } + + int decode_len( Len_model & lm, const int pos_state ) + { + if( decode_bit( lm.choice1 ) == 0 ) + return min_match_len + + decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) + return min_match_len + len_low_symbols + + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); + return min_match_len + len_low_symbols + len_mid_symbols + + decode_tree( lm.bm_high, len_high_bits ); + } + }; + + +class LZ_decoder + { + unsigned long long partial_data_pos; + Range_decoder rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to stdout + uint32_t crc_; + + void flush_data(); + + uint8_t get_byte( const unsigned distance ) const + { + int i = pos - distance - 1; + if( i < 0 ) i += dictionary_size; + return buffer[i]; + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + +public: + LZ_decoder( const unsigned dict_size ) + : + partial_data_pos( 0 ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ) + { buffer[dictionary_size-1] = 0; } // prev_byte of first_byte + + ~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + + bool decode_member(); + }; + + +class CRC32 + { + uint32_t data[256]; // Table of CRCs of all 8-bit messages. + +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + void update( uint32_t & crc, const uint8_t * buffer, const int size ) const + { + for( int i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + } + }; + +const CRC32 crc32; + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const unsigned size = pos - stream_pos; + crc32.update( crc_, buffer + stream_pos, size ); + errno = 0; + if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); + std::exit( 1 ); } + if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; } + stream_pos = pos; + } + } + + +bool LZ_decoder::decode_member() // Returns false if error + { + Bit_model bm_literal[1<<literal_context_bits][0x300]; + Bit_model bm_match[State::states][pos_states]; + Bit_model bm_rep[State::states]; + Bit_model bm_rep0[State::states]; + Bit_model bm_rep1[State::states]; + Bit_model bm_rep2[State::states]; + Bit_model bm_len[State::states][pos_states]; + Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits]; + Bit_model bm_dis[modeled_distances-end_dis_model]; + Bit_model bm_align[dis_align_size]; + Len_model match_len_model; + Len_model rep_len_model; + unsigned rep0 = 0; // rep[0-3] latest four distances + unsigned rep1 = 0; // used for efficient coding of + unsigned rep2 = 0; // repeated distances + unsigned rep3 = 0; + State state; + + while( !std::feof( stdin ) && !std::ferror( stdin ) ) + { + const int pos_state = data_position() & pos_state_mask; + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit + { + const uint8_t prev_byte = get_byte( 0 ); + const int literal_state = prev_byte >> ( 8 - literal_context_bits ); + Bit_model * const bm = bm_literal[literal_state]; + if( state.is_char() ) + put_byte( rdec.decode_tree( bm, 8 ) ); + else + put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) ); + state.set_char(); + } + else + { + int len; + if( rdec.decode_bit( bm_rep[state()] ) == 1 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + len = rdec.decode_len( rep_len_model, pos_state ); + state.set_rep(); + } + else + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = rdec.decode_len( match_len_model, pos_state ); + const int dis_state = std::min( len - min_match_len, max_dis_states - 1 ); + const int dis_slot = + rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits ); + if( dis_slot < start_dis_model ) rep0 = dis_slot; + else + { + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, + direct_bits ); + else + { + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // Marker found + { + flush_data(); + return ( len == min_match_len ); // End Of Stream marker + } + } + } + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !partial_data_pos ) ) + return false; + } + for( int i = 0; i < len; ++i ) + put_byte( get_byte( rep0 ) ); + } + } + return false; + } + + +enum { min_dictionary_size = 1 << 12, + max_dictionary_size = 1 << 29 }; + +typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size + +typedef uint8_t File_trailer[20]; + // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + + +/* Return values: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file. */ + +int main( const int argc, const char * const argv[] ) + { + if( argc > 1 ) + { + std::printf( "Lzd %s - Educational decompressor for lzip files.\n", + PROGVERSION ); + std::printf( "Study the source to learn how a simple lzip decompressor works.\n" + "It is not safe to use it for any real work.\n" + "\nUsage: %s < file.lz > file\n", argv[0] ); + std::printf( "Lzd decompresses from standard input to standard output.\n" + "\nCopyright (C) 2013 Antonio Diaz Diaz.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "Report bugs to lzip-bug@nongnu.org\n" + "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" ); + return 0; + } + + if( isatty( STDIN_FILENO ) ) + { + std::fprintf( stderr, "I won't read compressed data from a terminal.\n" + "Try '%s --help' for more information.\n", argv[0] ); + return 1; + } + + for( bool first_member = true; ; first_member = false ) + { + File_header header; + for( int i = 0; i < 6; ++i ) + header[i] = std::getc( stdin ); + if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 ) + { + if( first_member ) + { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); + return 2; } + break; + } + if( header[4] != 1 ) + { + std::fprintf( stderr, "Version %d member format not supported.\n", + header[4] ); + return 2; + } + unsigned dict_size = 1 << ( header[5] & 0x1F ); + dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); + if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) + { std::fprintf( stderr, "Invalid dictionary size in member header\n" ); + return 2; } + + LZ_decoder decoder( dict_size ); + if( !decoder.decode_member() ) + { std::fprintf( stderr, "Data error\n" ); return 2; } + + File_trailer trailer; + for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); + unsigned crc = 0; + for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } + unsigned long long data_size = 0; + for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } + if( crc != decoder.crc() || data_size != decoder.data_position() ) + { std::fprintf( stderr, "CRC error\n" ); return 2; } + } + + if( std::fclose( stdout ) != 0 ) + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); + return 1; } + return 0; + } + + +File: lzip.info, Node: Concept Index, Prev: Reference source code, Up: Top Concept Index ************* @@ -494,10 +1143,12 @@ Concept Index * bugs: Problems. (line 6) * examples: Examples. (line 6) * file format: File Format. (line 6) +* format of the LZMA stream: Stream Format. (line 6) * getting help: Problems. (line 6) * introduction: Introduction. (line 6) * invoking: Invoking Lzip. (line 6) * options: Invoking Lzip. (line 6) +* reference source code: Reference source code. (line 6) * usage: Invoking Lzip. (line 6) * version: Invoking Lzip. (line 6) @@ -505,13 +1156,15 @@ Concept Index Tag Table: Node: Top224 -Node: Introduction925 -Node: Algorithm4590 -Node: Invoking Lzip7108 -Node: File Format12460 -Node: Examples14767 -Node: Problems16714 -Node: Concept Index17236 +Node: Introduction1067 +Node: Algorithm4732 +Node: Invoking Lzip7250 +Node: File Format12602 +Node: Stream Format14985 +Node: Examples23695 +Node: Problems25644 +Node: Reference source code26174 +Node: Concept Index39424 End Tag Table diff --git a/doc/lzip.texinfo b/doc/lzip.texinfo index 76ccfc9..632abc5 100644 --- a/doc/lzip.texinfo +++ b/doc/lzip.texinfo @@ -6,8 +6,8 @@ @finalout @c %**end of header -@set UPDATED 17 February 2013 -@set VERSION 1.14 +@set UPDATED 21 March 2013 +@set VERSION 1.15-pre1 @dircategory Data Compression @direntry @@ -35,13 +35,15 @@ This manual is for Lzip (version @value{VERSION}, @value{UPDATED}). @menu -* Introduction:: Purpose and features of lzip -* Algorithm:: How lzip compresses the data -* Invoking Lzip:: Command line interface -* File Format:: Detailed format of the compressed file -* Examples:: A small tutorial with examples -* Problems:: Reporting bugs -* Concept Index:: Index of concepts +* Introduction:: Purpose and features of lzip +* Algorithm:: How lzip compresses the data +* Invoking Lzip:: Command line interface +* File Format:: Detailed format of the compressed file +* Stream Format:: Format of the LZMA stream in lzip files +* Examples:: A small tutorial with examples +* Problems:: Reporting bugs +* Reference source code:: Source code illustrating stream format +* Concept Index:: Index of concepts @end menu @sp 1 @@ -403,6 +405,7 @@ wedges between 0 and 7. The size of a wedge is (base_size / 16).@* Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@* Bits 7-5 contain the number of wedges (0 to 7) to substract from the base size to obtain the dictionary size.@* +Example: 0xD3 = (2^19 - 6 * 2^15) = (512KiB - 6 * 32KiB) = 320KiB@* Valid values for dictionary size range from 4KiB to 512MiB. @item Lzma stream @@ -423,6 +426,206 @@ members from multi-member files. @end table +@node Stream Format +@chapter Format of the LZMA stream in lzip files +@cindex format of the LZMA stream + +The LZMA algorithm has three parameters, called "special LZMA +properties", to adjust it for some kinds of binary data. These +parameters are; @samp{literal_context_bits} (with a default value of 3), +@samp{literal_pos_state_bits} (with a default value of 0), and +@samp{pos_state_bits} (with a default value of 2). As a general purpose +compressor, lzip only uses the default values for these parameters. + +Lzip also finishes the LZMA stream with an "End Of Stream" marker (the +distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the +"member size" field in the member trailer allows the verification of +stream integrity. The LZMA stream in lzip files always has these two +features (default properties and EOS marker) and is referred to in this +document as LZMA-302eos or LZMA-lzip. + +The second stage of LZMA is a range encoder that uses a different +probability model for each type of symbol; distances, lengths, literal +bytes, etc. Range encoding conceptually encodes all the symbols of the +message into one number. Unlike Huffman coding, which assigns to each +symbol a bit-pattern and concatenates all the bit-patterns together, +range encoding can compress one symbol to less than one bit. Therefore +the compressed data produced by a range encoder can't be split in pieces +that could be individually described. + +It seems that the only way of describing the LZMA-302eos stream is +describing the algorithm that decodes it. And given the many details +about the range decoder that need to be described accurately, the source +code of a real decoder seems the only appropiate reference to use. + +What follows is a description of the decoding algorithm for LZMA-302eos +streams using as reference the source code of "lzd", an educational +decompressor for lzip files which can be downloaded from the lzip +download directory. The source code of lzd is included in appendix A. +@ref{Reference source code} + +@sp 1 +@section What is coded + +The LZMA stream includes literals, matches and repeated matches (matches +reusing a recently used distance). There are 7 different coding sequences: + +@multitable @columnfractions .35 .14 .51 +@headitem Bit sequence @tab Name @tab Description +@item 0 + byte @tab literal @tab literal byte +@item 1 + 0 + len + dis @tab match @tab distance-length pair +@item 1 + 1 + 0 + 0 @tab shortrep @tab 1 byte match at latest used +distance +@item 1 + 1 + 0 + 1 + len @tab rep0 @tab len bytes match at latest used +distance +@item 1 + 1 + 1 + 0 + len @tab rep1 @tab len bytes match at second +latest used distance +@item 1 + 1 + 1 + 1 + 0 + len @tab rep2 @tab len bytes match at third +latest used distance +@item 1 + 1 + 1 + 1 + 1 + len @tab rep3 @tab len bytes match at fourth +latest used distance +@end multitable + +@sp 1 +In the following tables, multi-bit sequences are coded in normal order, +from MSB to LSB, except where noted otherwise. + +Lengths (the @samp{len} in the table above) are coded as follows: + +@multitable @columnfractions .5 .5 +@headitem Bit sequence @tab Description +@item 0 + 3 bits @tab lengths from 2 to 9 +@item 1 + 0 + 3 bits @tab lengths from 10 to 17 +@item 1 + 1 + 8 bits @tab lengths from 18 to 273 +@end multitable + +@sp 1 +The coding of distances is a little more complicated. LZMA divides the +interval between any two powers of 2 into 2 halves, named slots. As +possible distances range from 0 to (2^32 - 1), there are 64 slots (0 to +63). The slot number is context-coded in 6 bits. @samp{direct_bits} are +the remaining bits (from 0 to 30) needed to form a complete distance, +and are calculated as (slot >> 1) - 1. If a distance needs 6 or more +direct_bits, the last 4 bits are coded separately. The last piece +(direct_bits for distances 4 to 127 or the last 4 bits for distances >= +128) is context-coded in reverse order (from LSB to MSB). For distances +>= 128, the @samp{direct_bits - 4} part is coded with fixed 0.5 +probability. + +@multitable @columnfractions .5 .5 +@headitem Bit sequence @tab Description +@item slot @tab distances from 0 to 3 +@item slot + direct_bits @tab distances from 4 to 127 +@item slot + (direct_bits - 4) + 4 bits @tab distances from 128 to +2^32 - 1 +@end multitable + +@sp 1 +@section The coding contexts + +These contexts (@samp{Bit_model} in the source), are integers or arrays +of integers representing the probability of the corresponding bit being 0. + +The indices used in these arrays are: + +@table @samp +@item state + +A state machine (@samp{State} in the source) coding the latest 2 to 4 +types of sequences processed. The initial state is 0. + +@item pos_state +Value of the 2 least significant bits of the current position in the +decoded data. + +@item literal_state +Value of the 3 most significant bits of the latest byte decoded. + +@item dis_state +Coded value of length (real length - 2), with a maximum of 3. The +resulting value is in the range 0 to 3. + +@end table + + +The contexts for decoding the type of coding sequence are: + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item bm_match @tab state, pos_state @tab sequence start +@item bm_rep @tab state @tab after sequence 1 +@item bm_rep0 @tab state @tab after sequence 11 +@item bm_rep1 @tab state @tab after sequence 111 +@item bm_rep2 @tab state @tab after sequence 1111 +@item bm_len @tab state, pos_state @tab after sequence 110 +@end multitable + +@sp 1 +The contexts for decoding distances are: + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item bm_dis_slot @tab dis_state, bit tree @tab distance start +@item bm_dis @tab reverse bit tree @tab after slots 4 to 13 +@item bm_align @tab reverse bit tree @tab for distances >= 128, after +fixed probability bits +@end multitable + +@sp 1 +There are two separated sets of length contexts (@samp{Len_model} in the +source). One for normal matches, the other for repeated matches. The +contexts in each Len_model are (see @samp{decode_len} in the source): + +@multitable @columnfractions .2 .4 .4 +@headitem Name @tab Indices @tab Used when +@item choice1 @tab none @tab length start +@item choice2 @tab none @tab after sequence 1 +@item bm_low @tab pos_state, bit tree @tab after sequence 0 +@item bm_mid @tab pos_state, bit tree @tab after sequence 10 +@item bm_high @tab bit tree @tab after sequence 11 +@end multitable + +@sp 1 +The context array @samp{bm_literal} is special. In principle it acts as +a normal bit tree context, the one selected by @samp{literal_state}. But +if the previous decoded byte was not a literal, two other bit tree +contexts are used depending on the value of each bit in +@samp{match_byte} (the byte at the latest used distance), until a bit is +decoded that is different from its corresponding bit in +@samp{match_byte}. After the first difference is found, the rest of the +byte is decoded using the normal bit tree context. (See +@samp{decode_matched} in the source). + +@sp 1 +@section The range decoder + +The LZMA stream is consumed one byte at a time by the range decoder. +(See @samp{normalize} in the source). Every byte consumed produces a +variable number of decoded bits, depending on how well these bits agree +with their context. (See @samp{decode_bit} in the source). + +The range decoder state consists of two unsigned 32-bit variables, +@code{range} (representing the most significant part of the range size +not yet decoded), and @code{code} (representing the current point within +@code{range}). @code{range} is initialized to (2^32 - 1), and +@code{code} is initialized to 0. + +The range encoder produces a first 0 byte that must be ignored by the +range decoder. This is done by shifting 5 bytes in the initialization of +@code{code} instead of 4. (See the @samp{Range_decoder} constructor in +the source). + +@sp 1 +@section Decoding the LZMA stream + +After decoding the member header and obtaining the dictionary size, the +range decoder is initialized and then the LZMA decoder enters a loop +(See @samp{decode_member} in the source) where it invokes the range +decoder with the appropiate contexts to decode the different coding +sequences (matches, repeated matches, and literal bytes), until the "End +Of Stream" marker is decoded. + + @node Examples @chapter A small tutorial with examples @cindex examples @@ -541,6 +744,461 @@ If you find a bug in lzip, please send electronic mail to find by running @w{@samp{lzip --version}}. +@node Reference source code +@appendix Reference source code +@cindex reference source code + +@verbatim +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <stdint.h> +#include <unistd.h> + + +class State + { + int st; + +public: + enum { states = 12 }; + State() : st( 0 ) {} + int operator()() const { return st; } + bool is_char() const { return st < 7; } + + void set_char() + { + static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; + st = next[st]; + } + + void set_match() { st = ( ( st < 7 ) ? 7 : 10 ); } + void set_rep() { st = ( ( st < 7 ) ? 8 : 11 ); } + void set_short_rep() { st = ( ( st < 7 ) ? 9 : 11 ); } + }; + + +enum { + literal_context_bits = 3, + pos_state_bits = 2, + pos_states = 1 << pos_state_bits, + pos_state_mask = pos_states - 1, + + max_dis_states = 4, + dis_slot_bits = 6, + start_dis_model = 4, + end_dis_model = 14, + modeled_distances = 1 << (end_dis_model / 2), // 128 + dis_align_bits = 4, + dis_align_size = 1 << dis_align_bits, + + len_low_bits = 3, + len_mid_bits = 3, + len_high_bits = 8, + len_low_symbols = 1 << len_low_bits, + len_mid_symbols = 1 << len_mid_bits, + len_high_symbols = 1 << len_high_bits, + max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, + min_match_len = 2, // must be 2 + + bit_model_move_bits = 5, + bit_model_total_bits = 11, + bit_model_total = 1 << bit_model_total_bits }; + + +struct Bit_model + { + int probability; + Bit_model() : probability( bit_model_total / 2 ) {} + }; + + +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + + +class Range_decoder + { + uint32_t code; + uint32_t range; + +public: + Range_decoder() : code( 0 ), range( 0xFFFFFFFFU ) + { + for( int i = 0; i < 5; ++i ) code = (code << 8) | std::getc( stdin ); + } + + int decode( const int num_bits ) + { + int symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + range >>= 1; + symbol <<= 1; + if( code >= range ) { code -= range; symbol |= 1; } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | std::getc( stdin ); } + } + return symbol; + } + + int decode_bit( Bit_model & bm ) + { + int symbol; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; + symbol = 0; + } + else + { + range -= bound; + code -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol = 1; + } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = (code << 8) | std::getc( stdin ); } + return symbol; + } + + int decode_tree( Bit_model bm[], const int num_bits ) + { + int symbol = 1; + for( int i = 0; i < num_bits; ++i ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - (1 << num_bits); + } + + int decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + int symbol = decode_tree( bm, num_bits ); + int reversed_symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); + symbol >>= 1; + } + return reversed_symbol; + } + + int decode_matched( Bit_model bm[], const int match_byte ) + { + Bit_model * const bm1 = bm + 0x100; + int symbol = 1; + for( int i = 7; i >= 0; --i ) + { + const int match_bit = ( match_byte >> i ) & 1; + const int bit = decode_bit( bm1[(match_bit<<8)+symbol] ); + symbol = ( symbol << 1 ) | bit; + if( match_bit != bit ) + { + while( symbol < 0x100 ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + break; + } + } + return symbol - 0x100; + } + + int decode_len( Len_model & lm, const int pos_state ) + { + if( decode_bit( lm.choice1 ) == 0 ) + return min_match_len + + decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) + return min_match_len + len_low_symbols + + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); + return min_match_len + len_low_symbols + len_mid_symbols + + decode_tree( lm.bm_high, len_high_bits ); + } + }; + + +class LZ_decoder + { + unsigned long long partial_data_pos; + Range_decoder rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to stdout + uint32_t crc_; + + void flush_data(); + + uint8_t get_byte( const unsigned distance ) const + { + int i = pos - distance - 1; + if( i < 0 ) i += dictionary_size; + return buffer[i]; + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + +public: + LZ_decoder( const unsigned dict_size ) + : + partial_data_pos( 0 ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ) + { buffer[dictionary_size-1] = 0; } // prev_byte of first_byte + + ~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const { return partial_data_pos + pos; } + + bool decode_member(); + }; + + +class CRC32 + { + uint32_t data[256]; // Table of CRCs of all 8-bit messages. + +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + void update( uint32_t & crc, const uint8_t * buffer, const int size ) const + { + for( int i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + } + }; + +const CRC32 crc32; + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const unsigned size = pos - stream_pos; + crc32.update( crc_, buffer + stream_pos, size ); + errno = 0; + if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); + std::exit( 1 ); } + if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; } + stream_pos = pos; + } + } + + +bool LZ_decoder::decode_member() // Returns false if error + { + Bit_model bm_literal[1<<literal_context_bits][0x300]; + Bit_model bm_match[State::states][pos_states]; + Bit_model bm_rep[State::states]; + Bit_model bm_rep0[State::states]; + Bit_model bm_rep1[State::states]; + Bit_model bm_rep2[State::states]; + Bit_model bm_len[State::states][pos_states]; + Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits]; + Bit_model bm_dis[modeled_distances-end_dis_model]; + Bit_model bm_align[dis_align_size]; + Len_model match_len_model; + Len_model rep_len_model; + unsigned rep0 = 0; // rep[0-3] latest four distances + unsigned rep1 = 0; // used for efficient coding of + unsigned rep2 = 0; // repeated distances + unsigned rep3 = 0; + State state; + + while( !std::feof( stdin ) && !std::ferror( stdin ) ) + { + const int pos_state = data_position() & pos_state_mask; + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit + { + const uint8_t prev_byte = get_byte( 0 ); + const int literal_state = prev_byte >> ( 8 - literal_context_bits ); + Bit_model * const bm = bm_literal[literal_state]; + if( state.is_char() ) + put_byte( rdec.decode_tree( bm, 8 ) ); + else + put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) ); + state.set_char(); + } + else + { + int len; + if( rdec.decode_bit( bm_rep[state()] ) == 1 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + len = rdec.decode_len( rep_len_model, pos_state ); + state.set_rep(); + } + else + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = rdec.decode_len( match_len_model, pos_state ); + const int dis_state = std::min( len - min_match_len, max_dis_states - 1 ); + const int dis_slot = + rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits ); + if( dis_slot < start_dis_model ) rep0 = dis_slot; + else + { + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, + direct_bits ); + else + { + rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // Marker found + { + flush_data(); + return ( len == min_match_len ); // End Of Stream marker + } + } + } + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !partial_data_pos ) ) + return false; + } + for( int i = 0; i < len; ++i ) + put_byte( get_byte( rep0 ) ); + } + } + return false; + } + + +enum { min_dictionary_size = 1 << 12, + max_dictionary_size = 1 << 29 }; + +typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size + +typedef uint8_t File_trailer[20]; + // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + + +/* Return values: 0 for a normal exit, 1 for environmental problems + (file not found, invalid flags, I/O errors, etc), 2 to indicate a + corrupt or invalid input file. */ + +int main( const int argc, const char * const argv[] ) + { + if( argc > 1 ) + { + std::printf( "Lzd %s - Educational decompressor for lzip files.\n", + PROGVERSION ); + std::printf( "Study the source to learn how a simple lzip decompressor works.\n" + "It is not safe to use it for any real work.\n" + "\nUsage: %s < file.lz > file\n", argv[0] ); + std::printf( "Lzd decompresses from standard input to standard output.\n" + "\nCopyright (C) 2013 Antonio Diaz Diaz.\n" + "This is free software: you are free to change and redistribute it.\n" + "There is NO WARRANTY, to the extent permitted by law.\n" + "Report bugs to lzip-bug@nongnu.org\n" + "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" ); + return 0; + } + + if( isatty( STDIN_FILENO ) ) + { + std::fprintf( stderr, "I won't read compressed data from a terminal.\n" + "Try '%s --help' for more information.\n", argv[0] ); + return 1; + } + + for( bool first_member = true; ; first_member = false ) + { + File_header header; + for( int i = 0; i < 6; ++i ) + header[i] = std::getc( stdin ); + if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 ) + { + if( first_member ) + { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); + return 2; } + break; + } + if( header[4] != 1 ) + { + std::fprintf( stderr, "Version %d member format not supported.\n", + header[4] ); + return 2; + } + unsigned dict_size = 1 << ( header[5] & 0x1F ); + dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); + if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) + { std::fprintf( stderr, "Invalid dictionary size in member header\n" ); + return 2; } + + LZ_decoder decoder( dict_size ); + if( !decoder.decode_member() ) + { std::fprintf( stderr, "Data error\n" ); return 2; } + + File_trailer trailer; + for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); + unsigned crc = 0; + for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } + unsigned long long data_size = 0; + for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } + if( crc != decoder.crc() || data_size != decoder.data_position() ) + { std::fprintf( stderr, "CRC error\n" ); return 2; } + } + + if( std::fclose( stdout ) != 0 ) + { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); + return 1; } + return 0; + } +@end verbatim + + @node Concept Index @unnumbered Concept Index @@ -416,7 +416,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], if( replens[rep] < min_match_len ) continue; const int price = rep_match_price + price_rep( rep, state, pos_state ); for( int len = min_match_len; len <= replens[rep]; ++len ) - trials[len].update( price + rep_match_len_encoder.price( len, pos_state ), + trials[len].update( price + rep_len_encoder.price( len, pos_state ), rep, 0 ); } @@ -586,8 +586,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], trials[++num_trials].price = infinite_price; int price = rep_match_price + price_rep( rep, cur_state, pos_state ); for( int i = min_match_len; i <= len; ++i ) - trials[cur+i].update( price + - rep_match_len_encoder.price( i, pos_state ), + trials[cur+i].update( price + rep_len_encoder.price( i, pos_state ), rep, cur ); if( rep == 0 ) start_len = len + 1; // discard shorter matches @@ -602,7 +601,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances], int pos_state2 = ( pos_state + len ) & pos_state_mask; State state2 = cur_state; state2.set_rep(); - price += rep_match_len_encoder.price( len, pos_state ) + + price += rep_len_encoder.price( len, pos_state ) + price0( bm_match[state2()][pos_state2] ) + price_matched( data[len-1], data[len], data[len-dis] ); pos_state2 = ( pos_state2 + 1 ) & pos_state_mask; @@ -750,7 +749,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size ) if( len == 1 ) state.set_short_rep(); else { - rep_match_len_encoder.encode( range_encoder, len, pos_state ); + rep_len_encoder.encode( range_encoder, len, pos_state ); state.set_rep(); } } @@ -113,9 +113,9 @@ inline int price_symbol_reversed( const Bit_model bm[], int symbol, for( int i = num_bits; i > 0; --i ) { const int bit = symbol & 1; - symbol >>= 1; price += price_bit( bm[model], bit ); model = ( model << 1 ) | bit; + symbol >>= 1; } return price; } @@ -372,13 +372,8 @@ public: }; -class Len_encoder +class Len_encoder : public Len_model { - Bit_model choice1; - Bit_model choice2; - Bit_model bm_low[pos_states][len_low_symbols]; - Bit_model bm_mid[pos_states][len_mid_symbols]; - Bit_model bm_high[len_high_symbols]; int prices[pos_states][max_len_symbols]; const int len_symbols; int counters[pos_states]; @@ -438,8 +433,8 @@ protected: Bit_model bm_align[dis_align_size]; Range_encoder range_encoder; - Len_encoder len_encoder; - Len_encoder rep_match_len_encoder; + Len_encoder match_len_encoder; + Len_encoder rep_len_encoder; const int num_dis_slots; @@ -450,8 +445,8 @@ protected: : crc_( 0xFFFFFFFFU ), range_encoder( outfd ), - len_encoder( match_len_limit ), - rep_match_len_encoder( match_len_limit ), + match_len_encoder( match_len_limit ), + rep_len_encoder( match_len_limit ), num_dis_slots( 2 * real_bits( dictionary_size - 1 ) ) { for( int i = 0; i < File_header::size; ++i ) @@ -492,7 +487,7 @@ protected: void encode_pair( const uint32_t dis, const int len, const int pos_state ) { - len_encoder.encode( range_encoder, len, pos_state ); + match_len_encoder.encode( range_encoder, len, pos_state ); const int dis_slot = get_slot( dis ); range_encoder.encode_tree( bm_dis_slot[get_dis_state(len)], dis_slot, dis_slot_bits ); @@ -598,7 +593,7 @@ class LZ_encoder : public LZ_encoder_base int price_rep0_len( const int len, const State state, const int pos_state ) const { return price_rep( 0, state, pos_state ) + - rep_match_len_encoder.price( len, pos_state ); + rep_len_encoder.price( len, pos_state ); } int price_dis( const int dis, const int dis_state ) const @@ -612,7 +607,7 @@ class LZ_encoder : public LZ_encoder_base int price_pair( const int dis, const int len, const int pos_state ) const { - return len_encoder.price( len, pos_state ) + + return match_len_encoder.price( len, pos_state ) + price_dis( dis, get_dis_state( len ) ); } diff --git a/fast_encoder.cc b/fast_encoder.cc index 954e671..fc5f891 100644 --- a/fast_encoder.cc +++ b/fast_encoder.cc @@ -169,7 +169,7 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size ) if( dis > 1 ) range_encoder.encode_bit( bm_rep2[state()], dis > 2 ); } - rep_match_len_encoder.encode( range_encoder, len, pos_state ); + rep_len_encoder.encode( range_encoder, len, pos_state ); state.set_rep(); move_pos( len ); continue; @@ -85,6 +85,15 @@ struct Bit_model Bit_model() : probability( bit_model_total / 2 ) {} }; +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + class Pretty_print { @@ -160,8 +160,9 @@ void show_header( const File_header & header ) for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i ) { num /= factor; if( num % factor != 0 ) exact = false; p = prefix[i]; np = ""; } - std::fprintf( stderr, "version %d, dictionary size %s%4u %sB. ", - header.version(), np, num, p ); + if( verbosity >= 4 ) + std::fprintf( stderr, "version %d, ", header.version() ); + std::fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p ); } @@ -754,6 +755,7 @@ int main( const int argc, const char * const argv[] ) { 'h', "help", Arg_parser::no }, { 'k', "keep", Arg_parser::no }, { 'm', "match-length", Arg_parser::yes }, + { 'n', "threads", Arg_parser::yes }, { 'o', "output", Arg_parser::yes }, { 'q', "quiet", Arg_parser::no }, { 's', "dictionary-size", Arg_parser::yes }, @@ -790,6 +792,7 @@ int main( const int argc, const char * const argv[] ) case 'm': encoder_options.match_len_limit = getnum( arg, min_match_len_limit, max_match_len ); zero = false; break; + case 'n': break; case 'o': default_output_filename = arg; break; case 'q': verbosity = -1; break; case 's': encoder_options.dictionary_size = get_dict_size( arg ); |