summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog9
-rw-r--r--NEWS14
-rw-r--r--arg_parser.cc4
-rw-r--r--arg_parser.h2
-rwxr-xr-xconfigure4
-rw-r--r--decoder.cc67
-rw-r--r--decoder.h51
-rw-r--r--doc/lzip.12
-rw-r--r--doc/lzip.info695
-rw-r--r--doc/lzip.texinfo676
-rw-r--r--encoder.cc9
-rw-r--r--encoder.h23
-rw-r--r--fast_encoder.cc2
-rw-r--r--lzip.h9
-rw-r--r--main.cc7
15 files changed, 1443 insertions, 131 deletions
diff --git a/ChangeLog b/ChangeLog
index 296049a..62ceec2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-03-21 Antonio Diaz Diaz <ant_diaz@teleline.es>
+
+ * Version 1.15-pre1 released.
+ * Decompression time has been reduced by 1%.
+ * main.cc (show_header): Show header version if verbosity >= 4.
+ * Ignore option '-n, --threads' for compatibility with plzip.
+ * Added chapter 'Stream Format' and appendix 'Reference source code'
+ to the manual.
+
2013-02-17 Antonio Diaz Diaz <ant_diaz@teleline.es>
* Version 1.14 released.
diff --git a/NEWS b/NEWS
index 0bf236a..b39fd81 100644
--- a/NEWS
+++ b/NEWS
@@ -1,11 +1,11 @@
-Changes in version 1.14:
+Changes in version 1.15:
-Multi-step trials have been implemented.
+Decompression time has been reduced by 1%.
-Compression ratio has been slightly increased.
+File version is now shown only if verbosity >= 4.
-Compression time has been reduced by 5%.
+Option "-n, --threads" is now accepted and ignored for compatibility
+with plzip.
-Decompression time has been reduced by 12%.
-
-The target "install-bin" has been added to the Makefile.
+The chapter "Stream Format" and the appendix "Reference source code"
+have been added to the manual.
diff --git a/arg_parser.cc b/arg_parser.cc
index 13bcbcb..a28d2ba 100644
--- a/arg_parser.cc
+++ b/arg_parser.cc
@@ -1,5 +1,5 @@
/* Arg_parser - POSIX/GNU command line argument parser. (C++ version)
- Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012
+ Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013
Antonio Diaz Diaz.
This library is free software: you can redistribute it and/or modify
@@ -44,7 +44,7 @@ bool Arg_parser::parse_long_option( const char * const opt, const char * const a
// Test all long options for either exact match or abbreviated matches.
for( int i = 0; options[i].code != 0; ++i )
- if( options[i].name && !std::strncmp( options[i].name, &opt[2], len ) )
+ if( options[i].name && std::strncmp( options[i].name, &opt[2], len ) == 0 )
{
if( std::strlen( options[i].name ) == len ) // Exact match found
{ index = i; exact = true; break; }
diff --git a/arg_parser.h b/arg_parser.h
index 4fbd1af..5248cb1 100644
--- a/arg_parser.h
+++ b/arg_parser.h
@@ -1,5 +1,5 @@
/* Arg_parser - POSIX/GNU command line argument parser. (C++ version)
- Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012
+ Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013
Antonio Diaz Diaz.
This library is free software: you can redistribute it and/or modify
diff --git a/configure b/configure
index b30d534..91ff0e4 100755
--- a/configure
+++ b/configure
@@ -8,9 +8,9 @@
args=
no_create=
pkgname=lzip
-pkgversion=1.14
+pkgversion=1.15-pre1
progname=lzip
-srctrigger=lzip.h
+srctrigger=doc/lzip.texinfo
# clear some things potentially inherited from environment.
LC_ALL=C
diff --git a/decoder.cc b/decoder.cc
index 537143f..1d4774f 100644
--- a/decoder.cc
+++ b/decoder.cc
@@ -122,10 +122,10 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const
File_trailer trailer;
const int trailer_size = File_trailer::size( member_version );
const unsigned long long member_size =
- range_decoder.member_position() + trailer_size;
+ rdec.member_position() + trailer_size;
bool error = false;
- int size = range_decoder.read_data( trailer.data, trailer_size );
+ int size = rdec.read_data( trailer.data, trailer_size );
if( size < trailer_size )
{
error = true;
@@ -140,7 +140,7 @@ bool LZ_decoder::verify_trailer( const Pretty_print & pp ) const
if( member_version == 0 ) trailer.member_size( member_size );
- if( !range_decoder.code_is_zero() )
+ if( !rdec.code_is_zero() )
{
error = true;
pp( "Range decoder final code is not zero" );
@@ -201,82 +201,79 @@ int LZ_decoder::decode_member( const Pretty_print & pp )
Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits];
Bit_model bm_dis[modeled_distances-end_dis_model];
Bit_model bm_align[dis_align_size];
- Len_decoder len_decoder;
- Len_decoder rep_match_len_decoder;
+ Len_model match_len_model;
+ Len_model rep_len_model;
unsigned rep0 = 0; // rep[0-3] latest four distances
unsigned rep1 = 0; // used for efficient coding of
unsigned rep2 = 0; // repeated distances
unsigned rep3 = 0;
-
State state;
- range_decoder.load();
- while( !range_decoder.finished() )
+ rdec.load();
+ while( !rdec.finished() )
{
const int pos_state = data_position() & pos_state_mask;
- if( range_decoder.decode_bit( bm_match[state()][pos_state] ) == 0 )
+ if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 )
{
const uint8_t prev_byte = get_prev_byte();
if( state.is_char() )
- put_byte( range_decoder.decode_tree( bm_literal[get_lit_state(prev_byte)], 8 ) );
+ put_byte( rdec.decode_tree( bm_literal[get_lit_state(prev_byte)], 8 ) );
else
- put_byte( range_decoder.decode_matched( bm_literal[get_lit_state(prev_byte)],
- get_byte( rep0 ) ) );
+ put_byte( rdec.decode_matched( bm_literal[get_lit_state(prev_byte)],
+ get_byte( rep0 ) ) );
state.set_char();
}
else
{
int len;
- if( range_decoder.decode_bit( bm_rep[state()] ) == 1 )
+ if( rdec.decode_bit( bm_rep[state()] ) == 1 )
{
- len = 0;
- if( range_decoder.decode_bit( bm_rep0[state()] ) == 1 )
+ if( rdec.decode_bit( bm_rep0[state()] ) == 0 )
+ {
+ if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 )
+ { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; }
+ }
+ else
{
unsigned distance;
- if( range_decoder.decode_bit( bm_rep1[state()] ) == 0 )
+ if( rdec.decode_bit( bm_rep1[state()] ) == 0 )
distance = rep1;
else
{
- if( range_decoder.decode_bit( bm_rep2[state()] ) == 0 )
+ if( rdec.decode_bit( bm_rep2[state()] ) == 0 )
distance = rep2;
- else { distance = rep3; rep3 = rep2; }
+ else
+ { distance = rep3; rep3 = rep2; }
rep2 = rep1;
}
rep1 = rep0;
rep0 = distance;
}
- else
- {
- if( range_decoder.decode_bit( bm_len[state()][pos_state] ) == 0 )
- { state.set_short_rep(); len = 1; }
- }
- if( len == 0 )
- {
- state.set_rep();
- len = min_match_len + rep_match_len_decoder.decode( range_decoder, pos_state );
- }
+ len = min_match_len + rdec.decode_len( rep_len_model, pos_state );
+ state.set_rep();
}
else
{
const unsigned rep0_saved = rep0;
- len = min_match_len + len_decoder.decode( range_decoder, pos_state );
- const int dis_slot = range_decoder.decode_tree6( bm_dis_slot[get_dis_state(len)] );
+ len = min_match_len + rdec.decode_len( match_len_model, pos_state );
+ const int dis_slot = rdec.decode_tree6( bm_dis_slot[get_dis_state(len)] );
if( dis_slot < start_dis_model ) rep0 = dis_slot;
else
{
const int direct_bits = ( dis_slot >> 1 ) - 1;
rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
if( dis_slot < end_dis_model )
- rep0 += range_decoder.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, direct_bits );
+ rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1,
+ direct_bits );
else
{
- rep0 += range_decoder.decode( direct_bits - dis_align_bits ) << dis_align_bits;
- rep0 += range_decoder.decode_tree_reversed4( bm_align );
+ rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+ rep0 += rdec.decode_tree_reversed4( bm_align );
if( rep0 == 0xFFFFFFFFU ) // Marker found
{
rep0 = rep0_saved;
- range_decoder.normalize();
+ rdec.normalize();
flush_data();
if( len == min_match_len ) // End Of Stream marker
{
@@ -284,7 +281,7 @@ int LZ_decoder::decode_member( const Pretty_print & pp )
}
if( len == min_match_len + 1 ) // Sync Flush marker
{
- range_decoder.load(); continue;
+ rdec.load(); continue;
}
if( pp.verbosity() >= 0 )
{
diff --git a/decoder.h b/decoder.h
index a3559ad..a25f0f1 100644
--- a/decoder.h
+++ b/decoder.h
@@ -122,22 +122,22 @@ public:
int decode_tree( Bit_model bm[], const int num_bits )
{
- int model = 1;
+ int symbol = 1;
for( int i = num_bits; i > 0; --i )
- model = ( model << 1 ) | decode_bit( bm[model] );
- return model - (1 << num_bits);
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << num_bits);
}
int decode_tree6( Bit_model bm[] )
{
- int model = 1;
- model = ( model << 1 ) | decode_bit( bm[model] );
- model = ( model << 1 ) | decode_bit( bm[model] );
- model = ( model << 1 ) | decode_bit( bm[model] );
- model = ( model << 1 ) | decode_bit( bm[model] );
- model = ( model << 1 ) | decode_bit( bm[model] );
- model = ( model << 1 ) | decode_bit( bm[model] );
- return model - (1 << 6);
+ int symbol = 1;
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << 6);
}
int decode_tree_reversed( Bit_model bm[], const int num_bits )
@@ -186,27 +186,16 @@ public:
}
return symbol - 0x100;
}
- };
-
-class Len_decoder
- {
- Bit_model choice1;
- Bit_model choice2;
- Bit_model bm_low[pos_states][len_low_symbols];
- Bit_model bm_mid[pos_states][len_mid_symbols];
- Bit_model bm_high[len_high_symbols];
-
-public:
- int decode( Range_decoder & range_decoder, const int pos_state )
+ int decode_len( Len_model & lm, const int pos_state )
{
- if( range_decoder.decode_bit( choice1 ) == 0 )
- return range_decoder.decode_tree( bm_low[pos_state], len_low_bits );
- if( range_decoder.decode_bit( choice2 ) == 0 )
+ if( decode_bit( lm.choice1 ) == 0 )
+ return decode_tree( lm.bm_low[pos_state], len_low_bits );
+ if( decode_bit( lm.choice2 ) == 0 )
return len_low_symbols +
- range_decoder.decode_tree( bm_mid[pos_state], len_mid_bits );
+ decode_tree( lm.bm_mid[pos_state], len_mid_bits );
return len_low_symbols + len_mid_symbols +
- range_decoder.decode_tree( bm_high, len_high_bits );
+ decode_tree( lm.bm_high, len_high_bits );
}
};
@@ -214,7 +203,7 @@ public:
class LZ_decoder
{
unsigned long long partial_data_pos;
- Range_decoder & range_decoder;
+ Range_decoder & rdec;
const int dictionary_size;
const int buffer_size;
uint8_t * const buffer; // output buffer
@@ -267,10 +256,10 @@ class LZ_decoder
void operator=( const LZ_decoder & ); // declared as private
public:
- LZ_decoder( const File_header & header, Range_decoder & rdec, const int ofd )
+ LZ_decoder( const File_header & header, Range_decoder & rde, const int ofd )
:
partial_data_pos( 0 ),
- range_decoder( rdec ),
+ rdec( rde ),
dictionary_size( header.dictionary_size() ),
buffer_size( std::max( 65536, dictionary_size ) ),
buffer( new uint8_t[buffer_size] ),
diff --git a/doc/lzip.1 b/doc/lzip.1
index e0b9ad0..0c6f516 100644
--- a/doc/lzip.1
+++ b/doc/lzip.1
@@ -1,5 +1,5 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.37.1.
-.TH LZIP "1" "February 2013" "Lzip 1.14" "User Commands"
+.TH LZIP "1" "March 2013" "Lzip 1.15-pre1" "User Commands"
.SH NAME
Lzip \- reduces the size of files
.SH SYNOPSIS
diff --git a/doc/lzip.info b/doc/lzip.info
index e61550e..5c4cf81 100644
--- a/doc/lzip.info
+++ b/doc/lzip.info
@@ -11,17 +11,19 @@ File: lzip.info, Node: Top, Next: Introduction, Up: (dir)
Lzip Manual
***********
-This manual is for Lzip (version 1.14, 17 February 2013).
+This manual is for Lzip (version 1.15-pre1, 21 March 2013).
* Menu:
-* Introduction:: Purpose and features of lzip
-* Algorithm:: How lzip compresses the data
-* Invoking Lzip:: Command line interface
-* File Format:: Detailed format of the compressed file
-* Examples:: A small tutorial with examples
-* Problems:: Reporting bugs
-* Concept Index:: Index of concepts
+* Introduction:: Purpose and features of lzip
+* Algorithm:: How lzip compresses the data
+* Invoking Lzip:: Command line interface
+* File Format:: Detailed format of the compressed file
+* Stream Format:: Format of the LZMA stream in lzip files
+* Examples:: A small tutorial with examples
+* Problems:: Reporting bugs
+* Reference source code:: Source code illustrating stream format
+* Concept Index:: Index of concepts
Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 Antonio Diaz Diaz.
@@ -324,7 +326,7 @@ Z zettabyte (10^21) | Zi zebibyte (2^70)
Y yottabyte (10^24) | Yi yobibyte (2^80)

-File: lzip.info, Node: File Format, Next: Examples, Prev: Invoking Lzip, Up: Top
+File: lzip.info, Node: File Format, Next: Stream Format, Prev: Invoking Lzip, Up: Top
4 File Format
*************
@@ -375,6 +377,7 @@ additional information before, between, or after them.
Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).
Bits 7-5 contain the number of wedges (0 to 7) to substract from
the base size to obtain the dictionary size.
+ Example: 0xD3 = (2^19 - 6 * 2^15) = (512KiB - 6 * 32KiB) = 320KiB
Valid values for dictionary size range from 4KiB to 512MiB.
`Lzma stream'
@@ -394,9 +397,201 @@ additional information before, between, or after them.

-File: lzip.info, Node: Examples, Next: Problems, Prev: File Format, Up: Top
+File: lzip.info, Node: Stream Format, Next: Examples, Prev: File Format, Up: Top
+
+5 Format of the LZMA stream in lzip files
+*****************************************
+
+The LZMA algorithm has three parameters, called "special LZMA
+properties", to adjust it for some kinds of binary data. These
+parameters are; `literal_context_bits' (with a default value of 3),
+`literal_pos_state_bits' (with a default value of 0), and
+`pos_state_bits' (with a default value of 2). As a general purpose
+compressor, lzip only uses the default values for these parameters.
+
+ Lzip also finishes the LZMA stream with an "End Of Stream" marker
+(the distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the
+"member size" field in the member trailer allows the verification of
+stream integrity. The LZMA stream in lzip files always has these two
+features (default properties and EOS marker) and is referred to in this
+document as LZMA-302eos or LZMA-lzip.
+
+ The second stage of LZMA is a range encoder that uses a different
+probability model for each type of symbol; distances, lengths, literal
+bytes, etc. Range encoding conceptually encodes all the symbols of the
+message into one number. Unlike Huffman coding, which assigns to each
+symbol a bit-pattern and concatenates all the bit-patterns together,
+range encoding can compress one symbol to less than one bit. Therefore
+the compressed data produced by a range encoder can't be split in pieces
+that could be individually described.
+
+ It seems that the only way of describing the LZMA-302eos stream is
+describing the algorithm that decodes it. And given the many details
+about the range decoder that need to be described accurately, the source
+code of a real decoder seems the only appropiate reference to use.
+
+ What follows is a description of the decoding algorithm for
+LZMA-302eos streams using as reference the source code of "lzd", an
+educational decompressor for lzip files which can be downloaded from
+the lzip download directory. The source code of lzd is included in
+appendix A. *note Reference source code::
+
+
+5.1 What is coded
+=================
+
+The LZMA stream includes literals, matches and repeated matches (matches
+reusing a recently used distance). There are 7 different coding
+sequences:
+
+Bit sequence Name Description
+---------------------------------------------------------------------------
+0 + byte literal literal byte
+1 + 0 + len + dis match distance-length pair
+1 + 1 + 0 + 0 shortrep 1 byte match at latest used distance
+1 + 1 + 0 + 1 + len rep0 len bytes match at latest used
+ distance
+1 + 1 + 1 + 0 + len rep1 len bytes match at second latest
+ used distance
+1 + 1 + 1 + 1 + 0 + len rep2 len bytes match at third latest used
+ distance
+1 + 1 + 1 + 1 + 1 + len rep3 len bytes match at fourth latest
+ used distance
+
+
+ In the following tables, multi-bit sequences are coded in normal
+order, from MSB to LSB, except where noted otherwise.
+
+ Lengths (the `len' in the table above) are coded as follows:
+
+Bit sequence Description
+--------------------------------------------------------------------------
+0 + 3 bits lengths from 2 to 9
+1 + 0 + 3 bits lengths from 10 to 17
+1 + 1 + 8 bits lengths from 18 to 273
+
+
+ The coding of distances is a little more complicated. LZMA divides
+the interval between any two powers of 2 into 2 halves, named slots. As
+possible distances range from 0 to (2^32 - 1), there are 64 slots (0 to
+63). The slot number is context-coded in 6 bits. `direct_bits' are the
+remaining bits (from 0 to 30) needed to form a complete distance, and
+are calculated as (slot >> 1) - 1. If a distance needs 6 or more
+direct_bits, the last 4 bits are coded separately. The last piece
+(direct_bits for distances 4 to 127 or the last 4 bits for distances >=
+128) is context-coded in reverse order (from LSB to MSB). For distances
+>= 128, the `direct_bits - 4' part is coded with fixed 0.5 probability.
+
+Bit sequence Description
+--------------------------------------------------------------------------
+slot distances from 0 to 3
+slot + direct_bits distances from 4 to 127
+slot + (direct_bits - 4) + 4 bits distances from 128 to 2^32 - 1
+
+
+5.2 The coding contexts
+=======================
+
+These contexts (`Bit_model' in the source), are integers or arrays of
+integers representing the probability of the corresponding bit being 0.
+
+ The indices used in these arrays are:
+
+`state'
+ A state machine (`State' in the source) coding the latest 2 to 4
+ types of sequences processed. The initial state is 0.
+
+`pos_state'
+ Value of the 2 least significant bits of the current position in
+ the decoded data.
+
+`literal_state'
+ Value of the 3 most significant bits of the latest byte decoded.
+
+`dis_state'
+ Coded value of length (real length - 2), with a maximum of 3. The
+ resulting value is in the range 0 to 3.
+
+
+ The contexts for decoding the type of coding sequence are:
+
+Name Indices Used when
+---------------------------------------------------------------------------
+bm_match state, pos_state sequence start
+bm_rep state after sequence 1
+bm_rep0 state after sequence 11
+bm_rep1 state after sequence 111
+bm_rep2 state after sequence 1111
+bm_len state, pos_state after sequence 110
-5 A small tutorial with examples
+
+ The contexts for decoding distances are:
+
+Name Indices Used when
+---------------------------------------------------------------------------
+bm_dis_slot dis_state, bit tree distance start
+bm_dis reverse bit tree after slots 4 to 13
+bm_align reverse bit tree for distances >= 128, after
+ fixed probability bits
+
+
+ There are two separated sets of length contexts (`Len_model' in the
+source). One for normal matches, the other for repeated matches. The
+contexts in each Len_model are (see `decode_len' in the source):
+
+Name Indices Used when
+---------------------------------------------------------------------------
+choice1 none length start
+choice2 none after sequence 1
+bm_low pos_state, bit tree after sequence 0
+bm_mid pos_state, bit tree after sequence 10
+bm_high bit tree after sequence 11
+
+
+ The context array `bm_literal' is special. In principle it acts as a
+normal bit tree context, the one selected by `literal_state'. But if
+the previous decoded byte was not a literal, two other bit tree
+contexts are used depending on the value of each bit in `match_byte'
+(the byte at the latest used distance), until a bit is decoded that is
+different from its corresponding bit in `match_byte'. After the first
+difference is found, the rest of the byte is decoded using the normal
+bit tree context. (See `decode_matched' in the source).
+
+
+5.3 The range decoder
+=====================
+
+The LZMA stream is consumed one byte at a time by the range decoder.
+(See `normalize' in the source). Every byte consumed produces a
+variable number of decoded bits, depending on how well these bits agree
+with their context. (See `decode_bit' in the source).
+
+ The range decoder state consists of two unsigned 32-bit variables,
+`range' (representing the most significant part of the range size not
+yet decoded), and `code' (representing the current point within
+`range'). `range' is initialized to (2^32 - 1), and `code' is
+initialized to 0.
+
+ The range encoder produces a first 0 byte that must be ignored by the
+range decoder. This is done by shifting 5 bytes in the initialization of
+`code' instead of 4. (See the `Range_decoder' constructor in the
+source).
+
+
+5.4 Decoding the LZMA stream
+============================
+
+After decoding the member header and obtaining the dictionary size, the
+range decoder is initialized and then the LZMA decoder enters a loop
+(See `decode_member' in the source) where it invokes the range decoder
+with the appropiate contexts to decode the different coding sequences
+(matches, repeated matches, and literal bytes), until the "End Of
+Stream" marker is decoded.
+
+
+File: lzip.info, Node: Examples, Next: Problems, Prev: Stream Format, Up: Top
+
+6 A small tutorial with examples
********************************
WARNING! Even if lzip is bug-free, other causes may result in a corrupt
@@ -467,9 +662,9 @@ file with a member size of 32MiB.
lzip -b 32MiB -S 650MB big_db

-File: lzip.info, Node: Problems, Next: Concept Index, Prev: Examples, Up: Top
+File: lzip.info, Node: Problems, Next: Reference source code, Prev: Examples, Up: Top
-6 Reporting Bugs
+7 Reporting Bugs
****************
There are probably bugs in lzip. There are certainly errors and
@@ -482,7 +677,461 @@ for all eternity, if not longer.
by running `lzip --version'.

-File: lzip.info, Node: Concept Index, Prev: Problems, Up: Top
+File: lzip.info, Node: Reference source code, Next: Concept Index, Prev: Problems, Up: Top
+
+Appendix A Reference source code
+********************************
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <stdint.h>
+#include <unistd.h>
+
+
+class State
+ {
+ int st;
+
+public:
+ enum { states = 12 };
+ State() : st( 0 ) {}
+ int operator()() const { return st; }
+ bool is_char() const { return st < 7; }
+
+ void set_char()
+ {
+ static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
+ st = next[st];
+ }
+
+ void set_match() { st = ( ( st < 7 ) ? 7 : 10 ); }
+ void set_rep() { st = ( ( st < 7 ) ? 8 : 11 ); }
+ void set_short_rep() { st = ( ( st < 7 ) ? 9 : 11 ); }
+ };
+
+
+enum {
+ literal_context_bits = 3,
+ pos_state_bits = 2,
+ pos_states = 1 << pos_state_bits,
+ pos_state_mask = pos_states - 1,
+
+ max_dis_states = 4,
+ dis_slot_bits = 6,
+ start_dis_model = 4,
+ end_dis_model = 14,
+ modeled_distances = 1 << (end_dis_model / 2), // 128
+ dis_align_bits = 4,
+ dis_align_size = 1 << dis_align_bits,
+
+ len_low_bits = 3,
+ len_mid_bits = 3,
+ len_high_bits = 8,
+ len_low_symbols = 1 << len_low_bits,
+ len_mid_symbols = 1 << len_mid_bits,
+ len_high_symbols = 1 << len_high_bits,
+ max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,
+ min_match_len = 2, // must be 2
+
+ bit_model_move_bits = 5,
+ bit_model_total_bits = 11,
+ bit_model_total = 1 << bit_model_total_bits };
+
+
+struct Bit_model
+ {
+ int probability;
+ Bit_model() : probability( bit_model_total / 2 ) {}
+ };
+
+
+struct Len_model
+ {
+ Bit_model choice1;
+ Bit_model choice2;
+ Bit_model bm_low[pos_states][len_low_symbols];
+ Bit_model bm_mid[pos_states][len_mid_symbols];
+ Bit_model bm_high[len_high_symbols];
+ };
+
+
+class Range_decoder
+ {
+ uint32_t code;
+ uint32_t range;
+
+public:
+ Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
+ {
+ for( int i = 0; i < 5; ++i ) code = (code << 8) | std::getc( stdin );
+ }
+
+ int decode( const int num_bits )
+ {
+ int symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ range >>= 1;
+ symbol <<= 1;
+ if( code >= range ) { code -= range; symbol |= 1; }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+ }
+ return symbol;
+ }
+
+ int decode_bit( Bit_model & bm )
+ {
+ int symbol;
+ const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
+ if( code < bound )
+ {
+ range = bound;
+ bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
+ symbol = 0;
+ }
+ else
+ {
+ range -= bound;
+ code -= bound;
+ bm.probability -= bm.probability >> bit_model_move_bits;
+ symbol = 1;
+ }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+ return symbol;
+ }
+
+ int decode_tree( Bit_model bm[], const int num_bits )
+ {
+ int symbol = 1;
+ for( int i = 0; i < num_bits; ++i )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << num_bits);
+ }
+
+ int decode_tree_reversed( Bit_model bm[], const int num_bits )
+ {
+ int symbol = decode_tree( bm, num_bits );
+ int reversed_symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
+ symbol >>= 1;
+ }
+ return reversed_symbol;
+ }
+
+ int decode_matched( Bit_model bm[], const int match_byte )
+ {
+ Bit_model * const bm1 = bm + 0x100;
+ int symbol = 1;
+ for( int i = 7; i >= 0; --i )
+ {
+ const int match_bit = ( match_byte >> i ) & 1;
+ const int bit = decode_bit( bm1[(match_bit<<8)+symbol] );
+ symbol = ( symbol << 1 ) | bit;
+ if( match_bit != bit )
+ {
+ while( symbol < 0x100 )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ break;
+ }
+ }
+ return symbol - 0x100;
+ }
+
+ int decode_len( Len_model & lm, const int pos_state )
+ {
+ if( decode_bit( lm.choice1 ) == 0 )
+ return min_match_len +
+ decode_tree( lm.bm_low[pos_state], len_low_bits );
+ if( decode_bit( lm.choice2 ) == 0 )
+ return min_match_len + len_low_symbols +
+ decode_tree( lm.bm_mid[pos_state], len_mid_bits );
+ return min_match_len + len_low_symbols + len_mid_symbols +
+ decode_tree( lm.bm_high, len_high_bits );
+ }
+ };
+
+
+class LZ_decoder
+ {
+ unsigned long long partial_data_pos;
+ Range_decoder rdec;
+ const unsigned dictionary_size;
+ uint8_t * const buffer; // output buffer
+ unsigned pos; // current pos in buffer
+ unsigned stream_pos; // first byte not yet written to stdout
+ uint32_t crc_;
+
+ void flush_data();
+
+ uint8_t get_byte( const unsigned distance ) const
+ {
+ int i = pos - distance - 1;
+ if( i < 0 ) i += dictionary_size;
+ return buffer[i];
+ }
+
+ void put_byte( const uint8_t b )
+ {
+ buffer[pos] = b;
+ if( ++pos >= dictionary_size ) flush_data();
+ }
+
+public:
+ LZ_decoder( const unsigned dict_size )
+ :
+ partial_data_pos( 0 ),
+ dictionary_size( dict_size ),
+ buffer( new uint8_t[dictionary_size] ),
+ pos( 0 ),
+ stream_pos( 0 ),
+ crc_( 0xFFFFFFFFU )
+ { buffer[dictionary_size-1] = 0; } // prev_byte of first_byte
+
+ ~LZ_decoder() { delete[] buffer; }
+
+ unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
+ unsigned long long data_position() const { return partial_data_pos + pos; }
+
+ bool decode_member();
+ };
+
+
+class CRC32
+ {
+ uint32_t data[256]; // Table of CRCs of all 8-bit messages.
+
+public:
+ CRC32()
+ {
+ for( unsigned n = 0; n < 256; ++n )
+ {
+ unsigned c = n;
+ for( int k = 0; k < 8; ++k )
+ { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
+ data[n] = c;
+ }
+ }
+
+ void update( uint32_t & crc, const uint8_t * buffer, const int size ) const
+ {
+ for( int i = 0; i < size; ++i )
+ crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
+ }
+ };
+
+const CRC32 crc32;
+
+
+void LZ_decoder::flush_data()
+ {
+ if( pos > stream_pos )
+ {
+ const unsigned size = pos - stream_pos;
+ crc32.update( crc_, buffer + stream_pos, size );
+ errno = 0;
+ if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
+ { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
+ std::exit( 1 ); }
+ if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; }
+ stream_pos = pos;
+ }
+ }
+
+
+bool LZ_decoder::decode_member() // Returns false if error
+ {
+ Bit_model bm_literal[1<<literal_context_bits][0x300];
+ Bit_model bm_match[State::states][pos_states];
+ Bit_model bm_rep[State::states];
+ Bit_model bm_rep0[State::states];
+ Bit_model bm_rep1[State::states];
+ Bit_model bm_rep2[State::states];
+ Bit_model bm_len[State::states][pos_states];
+ Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits];
+ Bit_model bm_dis[modeled_distances-end_dis_model];
+ Bit_model bm_align[dis_align_size];
+ Len_model match_len_model;
+ Len_model rep_len_model;
+ unsigned rep0 = 0; // rep[0-3] latest four distances
+ unsigned rep1 = 0; // used for efficient coding of
+ unsigned rep2 = 0; // repeated distances
+ unsigned rep3 = 0;
+ State state;
+
+ while( !std::feof( stdin ) && !std::ferror( stdin ) )
+ {
+ const int pos_state = data_position() & pos_state_mask;
+ if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit
+ {
+ const uint8_t prev_byte = get_byte( 0 );
+ const int literal_state = prev_byte >> ( 8 - literal_context_bits );
+ Bit_model * const bm = bm_literal[literal_state];
+ if( state.is_char() )
+ put_byte( rdec.decode_tree( bm, 8 ) );
+ else
+ put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) );
+ state.set_char();
+ }
+ else
+ {
+ int len;
+ if( rdec.decode_bit( bm_rep[state()] ) == 1 ) // 2nd bit
+ {
+ if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit
+ {
+ if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
+ { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; }
+ }
+ else
+ {
+ unsigned distance;
+ if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit
+ distance = rep1;
+ else
+ {
+ if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit
+ distance = rep2;
+ else
+ { distance = rep3; rep3 = rep2; }
+ rep2 = rep1;
+ }
+ rep1 = rep0;
+ rep0 = distance;
+ }
+ len = rdec.decode_len( rep_len_model, pos_state );
+ state.set_rep();
+ }
+ else
+ {
+ rep3 = rep2; rep2 = rep1; rep1 = rep0;
+ len = rdec.decode_len( match_len_model, pos_state );
+ const int dis_state = std::min( len - min_match_len, max_dis_states - 1 );
+ const int dis_slot =
+ rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits );
+ if( dis_slot < start_dis_model ) rep0 = dis_slot;
+ else
+ {
+ const int direct_bits = ( dis_slot >> 1 ) - 1;
+ rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+ if( dis_slot < end_dis_model )
+ rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1,
+ direct_bits );
+ else
+ {
+ rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+ rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
+ if( rep0 == 0xFFFFFFFFU ) // Marker found
+ {
+ flush_data();
+ return ( len == min_match_len ); // End Of Stream marker
+ }
+ }
+ }
+ state.set_match();
+ if( rep0 >= dictionary_size || ( rep0 >= pos && !partial_data_pos ) )
+ return false;
+ }
+ for( int i = 0; i < len; ++i )
+ put_byte( get_byte( rep0 ) );
+ }
+ }
+ return false;
+ }
+
+
+enum { min_dictionary_size = 1 << 12,
+ max_dictionary_size = 1 << 29 };
+
+typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size
+
+typedef uint8_t File_trailer[20];
+ // 0-3 CRC32 of the uncompressed data
+ // 4-11 size of the uncompressed data
+ // 12-19 member size including header and trailer
+
+
+/* Return values: 0 for a normal exit, 1 for environmental problems
+ (file not found, invalid flags, I/O errors, etc), 2 to indicate a
+ corrupt or invalid input file. */
+
+int main( const int argc, const char * const argv[] )
+ {
+ if( argc > 1 )
+ {
+ std::printf( "Lzd %s - Educational decompressor for lzip files.\n",
+ PROGVERSION );
+ std::printf( "Study the source to learn how a simple lzip decompressor works.\n"
+ "It is not safe to use it for any real work.\n"
+ "\nUsage: %s < file.lz > file\n", argv[0] );
+ std::printf( "Lzd decompresses from standard input to standard output.\n"
+ "\nCopyright (C) 2013 Antonio Diaz Diaz.\n"
+ "This is free software: you are free to change and redistribute it.\n"
+ "There is NO WARRANTY, to the extent permitted by law.\n"
+ "Report bugs to lzip-bug@nongnu.org\n"
+ "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" );
+ return 0;
+ }
+
+ if( isatty( STDIN_FILENO ) )
+ {
+ std::fprintf( stderr, "I won't read compressed data from a terminal.\n"
+ "Try '%s --help' for more information.\n", argv[0] );
+ return 1;
+ }
+
+ for( bool first_member = true; ; first_member = false )
+ {
+ File_header header;
+ for( int i = 0; i < 6; ++i )
+ header[i] = std::getc( stdin );
+ if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 )
+ {
+ if( first_member )
+ { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" );
+ return 2; }
+ break;
+ }
+ if( header[4] != 1 )
+ {
+ std::fprintf( stderr, "Version %d member format not supported.\n",
+ header[4] );
+ return 2;
+ }
+ unsigned dict_size = 1 << ( header[5] & 0x1F );
+ dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
+ if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
+ { std::fprintf( stderr, "Invalid dictionary size in member header\n" );
+ return 2; }
+
+ LZ_decoder decoder( dict_size );
+ if( !decoder.decode_member() )
+ { std::fprintf( stderr, "Data error\n" ); return 2; }
+
+ File_trailer trailer;
+ for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
+ unsigned crc = 0;
+ for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
+ unsigned long long data_size = 0;
+ for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
+ if( crc != decoder.crc() || data_size != decoder.data_position() )
+ { std::fprintf( stderr, "CRC error\n" ); return 2; }
+ }
+
+ if( std::fclose( stdout ) != 0 )
+ { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) );
+ return 1; }
+ return 0;
+ }
+
+
+File: lzip.info, Node: Concept Index, Prev: Reference source code, Up: Top
Concept Index
*************
@@ -494,10 +1143,12 @@ Concept Index
* bugs: Problems. (line 6)
* examples: Examples. (line 6)
* file format: File Format. (line 6)
+* format of the LZMA stream: Stream Format. (line 6)
* getting help: Problems. (line 6)
* introduction: Introduction. (line 6)
* invoking: Invoking Lzip. (line 6)
* options: Invoking Lzip. (line 6)
+* reference source code: Reference source code. (line 6)
* usage: Invoking Lzip. (line 6)
* version: Invoking Lzip. (line 6)
@@ -505,13 +1156,15 @@ Concept Index

Tag Table:
Node: Top224
-Node: Introduction925
-Node: Algorithm4590
-Node: Invoking Lzip7108
-Node: File Format12460
-Node: Examples14767
-Node: Problems16714
-Node: Concept Index17236
+Node: Introduction1067
+Node: Algorithm4732
+Node: Invoking Lzip7250
+Node: File Format12602
+Node: Stream Format14985
+Node: Examples23695
+Node: Problems25644
+Node: Reference source code26174
+Node: Concept Index39424

End Tag Table
diff --git a/doc/lzip.texinfo b/doc/lzip.texinfo
index 76ccfc9..632abc5 100644
--- a/doc/lzip.texinfo
+++ b/doc/lzip.texinfo
@@ -6,8 +6,8 @@
@finalout
@c %**end of header
-@set UPDATED 17 February 2013
-@set VERSION 1.14
+@set UPDATED 21 March 2013
+@set VERSION 1.15-pre1
@dircategory Data Compression
@direntry
@@ -35,13 +35,15 @@
This manual is for Lzip (version @value{VERSION}, @value{UPDATED}).
@menu
-* Introduction:: Purpose and features of lzip
-* Algorithm:: How lzip compresses the data
-* Invoking Lzip:: Command line interface
-* File Format:: Detailed format of the compressed file
-* Examples:: A small tutorial with examples
-* Problems:: Reporting bugs
-* Concept Index:: Index of concepts
+* Introduction:: Purpose and features of lzip
+* Algorithm:: How lzip compresses the data
+* Invoking Lzip:: Command line interface
+* File Format:: Detailed format of the compressed file
+* Stream Format:: Format of the LZMA stream in lzip files
+* Examples:: A small tutorial with examples
+* Problems:: Reporting bugs
+* Reference source code:: Source code illustrating stream format
+* Concept Index:: Index of concepts
@end menu
@sp 1
@@ -403,6 +405,7 @@ wedges between 0 and 7. The size of a wedge is (base_size / 16).@*
Bits 4-0 contain the base 2 logarithm of the base size (12 to 29).@*
Bits 7-5 contain the number of wedges (0 to 7) to substract from the
base size to obtain the dictionary size.@*
+Example: 0xD3 = (2^19 - 6 * 2^15) = (512KiB - 6 * 32KiB) = 320KiB@*
Valid values for dictionary size range from 4KiB to 512MiB.
@item Lzma stream
@@ -423,6 +426,206 @@ members from multi-member files.
@end table
+@node Stream Format
+@chapter Format of the LZMA stream in lzip files
+@cindex format of the LZMA stream
+
+The LZMA algorithm has three parameters, called "special LZMA
+properties", to adjust it for some kinds of binary data. These
+parameters are; @samp{literal_context_bits} (with a default value of 3),
+@samp{literal_pos_state_bits} (with a default value of 0), and
+@samp{pos_state_bits} (with a default value of 2). As a general purpose
+compressor, lzip only uses the default values for these parameters.
+
+Lzip also finishes the LZMA stream with an "End Of Stream" marker (the
+distance-length pair 0xFFFFFFFFU, 2), which in conjunction with the
+"member size" field in the member trailer allows the verification of
+stream integrity. The LZMA stream in lzip files always has these two
+features (default properties and EOS marker) and is referred to in this
+document as LZMA-302eos or LZMA-lzip.
+
+The second stage of LZMA is a range encoder that uses a different
+probability model for each type of symbol; distances, lengths, literal
+bytes, etc. Range encoding conceptually encodes all the symbols of the
+message into one number. Unlike Huffman coding, which assigns to each
+symbol a bit-pattern and concatenates all the bit-patterns together,
+range encoding can compress one symbol to less than one bit. Therefore
+the compressed data produced by a range encoder can't be split in pieces
+that could be individually described.
+
+It seems that the only way of describing the LZMA-302eos stream is
+describing the algorithm that decodes it. And given the many details
+about the range decoder that need to be described accurately, the source
+code of a real decoder seems the only appropiate reference to use.
+
+What follows is a description of the decoding algorithm for LZMA-302eos
+streams using as reference the source code of "lzd", an educational
+decompressor for lzip files which can be downloaded from the lzip
+download directory. The source code of lzd is included in appendix A.
+@ref{Reference source code}
+
+@sp 1
+@section What is coded
+
+The LZMA stream includes literals, matches and repeated matches (matches
+reusing a recently used distance). There are 7 different coding sequences:
+
+@multitable @columnfractions .35 .14 .51
+@headitem Bit sequence @tab Name @tab Description
+@item 0 + byte @tab literal @tab literal byte
+@item 1 + 0 + len + dis @tab match @tab distance-length pair
+@item 1 + 1 + 0 + 0 @tab shortrep @tab 1 byte match at latest used
+distance
+@item 1 + 1 + 0 + 1 + len @tab rep0 @tab len bytes match at latest used
+distance
+@item 1 + 1 + 1 + 0 + len @tab rep1 @tab len bytes match at second
+latest used distance
+@item 1 + 1 + 1 + 1 + 0 + len @tab rep2 @tab len bytes match at third
+latest used distance
+@item 1 + 1 + 1 + 1 + 1 + len @tab rep3 @tab len bytes match at fourth
+latest used distance
+@end multitable
+
+@sp 1
+In the following tables, multi-bit sequences are coded in normal order,
+from MSB to LSB, except where noted otherwise.
+
+Lengths (the @samp{len} in the table above) are coded as follows:
+
+@multitable @columnfractions .5 .5
+@headitem Bit sequence @tab Description
+@item 0 + 3 bits @tab lengths from 2 to 9
+@item 1 + 0 + 3 bits @tab lengths from 10 to 17
+@item 1 + 1 + 8 bits @tab lengths from 18 to 273
+@end multitable
+
+@sp 1
+The coding of distances is a little more complicated. LZMA divides the
+interval between any two powers of 2 into 2 halves, named slots. As
+possible distances range from 0 to (2^32 - 1), there are 64 slots (0 to
+63). The slot number is context-coded in 6 bits. @samp{direct_bits} are
+the remaining bits (from 0 to 30) needed to form a complete distance,
+and are calculated as (slot >> 1) - 1. If a distance needs 6 or more
+direct_bits, the last 4 bits are coded separately. The last piece
+(direct_bits for distances 4 to 127 or the last 4 bits for distances >=
+128) is context-coded in reverse order (from LSB to MSB). For distances
+>= 128, the @samp{direct_bits - 4} part is coded with fixed 0.5
+probability.
+
+@multitable @columnfractions .5 .5
+@headitem Bit sequence @tab Description
+@item slot @tab distances from 0 to 3
+@item slot + direct_bits @tab distances from 4 to 127
+@item slot + (direct_bits - 4) + 4 bits @tab distances from 128 to
+2^32 - 1
+@end multitable
+
+@sp 1
+@section The coding contexts
+
+These contexts (@samp{Bit_model} in the source), are integers or arrays
+of integers representing the probability of the corresponding bit being 0.
+
+The indices used in these arrays are:
+
+@table @samp
+@item state
+
+A state machine (@samp{State} in the source) coding the latest 2 to 4
+types of sequences processed. The initial state is 0.
+
+@item pos_state
+Value of the 2 least significant bits of the current position in the
+decoded data.
+
+@item literal_state
+Value of the 3 most significant bits of the latest byte decoded.
+
+@item dis_state
+Coded value of length (real length - 2), with a maximum of 3. The
+resulting value is in the range 0 to 3.
+
+@end table
+
+
+The contexts for decoding the type of coding sequence are:
+
+@multitable @columnfractions .2 .4 .4
+@headitem Name @tab Indices @tab Used when
+@item bm_match @tab state, pos_state @tab sequence start
+@item bm_rep @tab state @tab after sequence 1
+@item bm_rep0 @tab state @tab after sequence 11
+@item bm_rep1 @tab state @tab after sequence 111
+@item bm_rep2 @tab state @tab after sequence 1111
+@item bm_len @tab state, pos_state @tab after sequence 110
+@end multitable
+
+@sp 1
+The contexts for decoding distances are:
+
+@multitable @columnfractions .2 .4 .4
+@headitem Name @tab Indices @tab Used when
+@item bm_dis_slot @tab dis_state, bit tree @tab distance start
+@item bm_dis @tab reverse bit tree @tab after slots 4 to 13
+@item bm_align @tab reverse bit tree @tab for distances >= 128, after
+fixed probability bits
+@end multitable
+
+@sp 1
+There are two separated sets of length contexts (@samp{Len_model} in the
+source). One for normal matches, the other for repeated matches. The
+contexts in each Len_model are (see @samp{decode_len} in the source):
+
+@multitable @columnfractions .2 .4 .4
+@headitem Name @tab Indices @tab Used when
+@item choice1 @tab none @tab length start
+@item choice2 @tab none @tab after sequence 1
+@item bm_low @tab pos_state, bit tree @tab after sequence 0
+@item bm_mid @tab pos_state, bit tree @tab after sequence 10
+@item bm_high @tab bit tree @tab after sequence 11
+@end multitable
+
+@sp 1
+The context array @samp{bm_literal} is special. In principle it acts as
+a normal bit tree context, the one selected by @samp{literal_state}. But
+if the previous decoded byte was not a literal, two other bit tree
+contexts are used depending on the value of each bit in
+@samp{match_byte} (the byte at the latest used distance), until a bit is
+decoded that is different from its corresponding bit in
+@samp{match_byte}. After the first difference is found, the rest of the
+byte is decoded using the normal bit tree context. (See
+@samp{decode_matched} in the source).
+
+@sp 1
+@section The range decoder
+
+The LZMA stream is consumed one byte at a time by the range decoder.
+(See @samp{normalize} in the source). Every byte consumed produces a
+variable number of decoded bits, depending on how well these bits agree
+with their context. (See @samp{decode_bit} in the source).
+
+The range decoder state consists of two unsigned 32-bit variables,
+@code{range} (representing the most significant part of the range size
+not yet decoded), and @code{code} (representing the current point within
+@code{range}). @code{range} is initialized to (2^32 - 1), and
+@code{code} is initialized to 0.
+
+The range encoder produces a first 0 byte that must be ignored by the
+range decoder. This is done by shifting 5 bytes in the initialization of
+@code{code} instead of 4. (See the @samp{Range_decoder} constructor in
+the source).
+
+@sp 1
+@section Decoding the LZMA stream
+
+After decoding the member header and obtaining the dictionary size, the
+range decoder is initialized and then the LZMA decoder enters a loop
+(See @samp{decode_member} in the source) where it invokes the range
+decoder with the appropiate contexts to decode the different coding
+sequences (matches, repeated matches, and literal bytes), until the "End
+Of Stream" marker is decoded.
+
+
@node Examples
@chapter A small tutorial with examples
@cindex examples
@@ -541,6 +744,461 @@ If you find a bug in lzip, please send electronic mail to
find by running @w{@samp{lzip --version}}.
+@node Reference source code
+@appendix Reference source code
+@cindex reference source code
+
+@verbatim
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <stdint.h>
+#include <unistd.h>
+
+
+class State
+ {
+ int st;
+
+public:
+ enum { states = 12 };
+ State() : st( 0 ) {}
+ int operator()() const { return st; }
+ bool is_char() const { return st < 7; }
+
+ void set_char()
+ {
+ static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
+ st = next[st];
+ }
+
+ void set_match() { st = ( ( st < 7 ) ? 7 : 10 ); }
+ void set_rep() { st = ( ( st < 7 ) ? 8 : 11 ); }
+ void set_short_rep() { st = ( ( st < 7 ) ? 9 : 11 ); }
+ };
+
+
+enum {
+ literal_context_bits = 3,
+ pos_state_bits = 2,
+ pos_states = 1 << pos_state_bits,
+ pos_state_mask = pos_states - 1,
+
+ max_dis_states = 4,
+ dis_slot_bits = 6,
+ start_dis_model = 4,
+ end_dis_model = 14,
+ modeled_distances = 1 << (end_dis_model / 2), // 128
+ dis_align_bits = 4,
+ dis_align_size = 1 << dis_align_bits,
+
+ len_low_bits = 3,
+ len_mid_bits = 3,
+ len_high_bits = 8,
+ len_low_symbols = 1 << len_low_bits,
+ len_mid_symbols = 1 << len_mid_bits,
+ len_high_symbols = 1 << len_high_bits,
+ max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,
+ min_match_len = 2, // must be 2
+
+ bit_model_move_bits = 5,
+ bit_model_total_bits = 11,
+ bit_model_total = 1 << bit_model_total_bits };
+
+
+struct Bit_model
+ {
+ int probability;
+ Bit_model() : probability( bit_model_total / 2 ) {}
+ };
+
+
+struct Len_model
+ {
+ Bit_model choice1;
+ Bit_model choice2;
+ Bit_model bm_low[pos_states][len_low_symbols];
+ Bit_model bm_mid[pos_states][len_mid_symbols];
+ Bit_model bm_high[len_high_symbols];
+ };
+
+
+class Range_decoder
+ {
+ uint32_t code;
+ uint32_t range;
+
+public:
+ Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
+ {
+ for( int i = 0; i < 5; ++i ) code = (code << 8) | std::getc( stdin );
+ }
+
+ int decode( const int num_bits )
+ {
+ int symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ range >>= 1;
+ symbol <<= 1;
+ if( code >= range ) { code -= range; symbol |= 1; }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+ }
+ return symbol;
+ }
+
+ int decode_bit( Bit_model & bm )
+ {
+ int symbol;
+ const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
+ if( code < bound )
+ {
+ range = bound;
+ bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
+ symbol = 0;
+ }
+ else
+ {
+ range -= bound;
+ code -= bound;
+ bm.probability -= bm.probability >> bit_model_move_bits;
+ symbol = 1;
+ }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+ return symbol;
+ }
+
+ int decode_tree( Bit_model bm[], const int num_bits )
+ {
+ int symbol = 1;
+ for( int i = 0; i < num_bits; ++i )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << num_bits);
+ }
+
+ int decode_tree_reversed( Bit_model bm[], const int num_bits )
+ {
+ int symbol = decode_tree( bm, num_bits );
+ int reversed_symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
+ symbol >>= 1;
+ }
+ return reversed_symbol;
+ }
+
+ int decode_matched( Bit_model bm[], const int match_byte )
+ {
+ Bit_model * const bm1 = bm + 0x100;
+ int symbol = 1;
+ for( int i = 7; i >= 0; --i )
+ {
+ const int match_bit = ( match_byte >> i ) & 1;
+ const int bit = decode_bit( bm1[(match_bit<<8)+symbol] );
+ symbol = ( symbol << 1 ) | bit;
+ if( match_bit != bit )
+ {
+ while( symbol < 0x100 )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ break;
+ }
+ }
+ return symbol - 0x100;
+ }
+
+ int decode_len( Len_model & lm, const int pos_state )
+ {
+ if( decode_bit( lm.choice1 ) == 0 )
+ return min_match_len +
+ decode_tree( lm.bm_low[pos_state], len_low_bits );
+ if( decode_bit( lm.choice2 ) == 0 )
+ return min_match_len + len_low_symbols +
+ decode_tree( lm.bm_mid[pos_state], len_mid_bits );
+ return min_match_len + len_low_symbols + len_mid_symbols +
+ decode_tree( lm.bm_high, len_high_bits );
+ }
+ };
+
+
+class LZ_decoder
+ {
+ unsigned long long partial_data_pos;
+ Range_decoder rdec;
+ const unsigned dictionary_size;
+ uint8_t * const buffer; // output buffer
+ unsigned pos; // current pos in buffer
+ unsigned stream_pos; // first byte not yet written to stdout
+ uint32_t crc_;
+
+ void flush_data();
+
+ uint8_t get_byte( const unsigned distance ) const
+ {
+ int i = pos - distance - 1;
+ if( i < 0 ) i += dictionary_size;
+ return buffer[i];
+ }
+
+ void put_byte( const uint8_t b )
+ {
+ buffer[pos] = b;
+ if( ++pos >= dictionary_size ) flush_data();
+ }
+
+public:
+ LZ_decoder( const unsigned dict_size )
+ :
+ partial_data_pos( 0 ),
+ dictionary_size( dict_size ),
+ buffer( new uint8_t[dictionary_size] ),
+ pos( 0 ),
+ stream_pos( 0 ),
+ crc_( 0xFFFFFFFFU )
+ { buffer[dictionary_size-1] = 0; } // prev_byte of first_byte
+
+ ~LZ_decoder() { delete[] buffer; }
+
+ unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
+ unsigned long long data_position() const { return partial_data_pos + pos; }
+
+ bool decode_member();
+ };
+
+
+class CRC32
+ {
+ uint32_t data[256]; // Table of CRCs of all 8-bit messages.
+
+public:
+ CRC32()
+ {
+ for( unsigned n = 0; n < 256; ++n )
+ {
+ unsigned c = n;
+ for( int k = 0; k < 8; ++k )
+ { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
+ data[n] = c;
+ }
+ }
+
+ void update( uint32_t & crc, const uint8_t * buffer, const int size ) const
+ {
+ for( int i = 0; i < size; ++i )
+ crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
+ }
+ };
+
+const CRC32 crc32;
+
+
+void LZ_decoder::flush_data()
+ {
+ if( pos > stream_pos )
+ {
+ const unsigned size = pos - stream_pos;
+ crc32.update( crc_, buffer + stream_pos, size );
+ errno = 0;
+ if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
+ { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
+ std::exit( 1 ); }
+ if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; }
+ stream_pos = pos;
+ }
+ }
+
+
+bool LZ_decoder::decode_member() // Returns false if error
+ {
+ Bit_model bm_literal[1<<literal_context_bits][0x300];
+ Bit_model bm_match[State::states][pos_states];
+ Bit_model bm_rep[State::states];
+ Bit_model bm_rep0[State::states];
+ Bit_model bm_rep1[State::states];
+ Bit_model bm_rep2[State::states];
+ Bit_model bm_len[State::states][pos_states];
+ Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits];
+ Bit_model bm_dis[modeled_distances-end_dis_model];
+ Bit_model bm_align[dis_align_size];
+ Len_model match_len_model;
+ Len_model rep_len_model;
+ unsigned rep0 = 0; // rep[0-3] latest four distances
+ unsigned rep1 = 0; // used for efficient coding of
+ unsigned rep2 = 0; // repeated distances
+ unsigned rep3 = 0;
+ State state;
+
+ while( !std::feof( stdin ) && !std::ferror( stdin ) )
+ {
+ const int pos_state = data_position() & pos_state_mask;
+ if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit
+ {
+ const uint8_t prev_byte = get_byte( 0 );
+ const int literal_state = prev_byte >> ( 8 - literal_context_bits );
+ Bit_model * const bm = bm_literal[literal_state];
+ if( state.is_char() )
+ put_byte( rdec.decode_tree( bm, 8 ) );
+ else
+ put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) );
+ state.set_char();
+ }
+ else
+ {
+ int len;
+ if( rdec.decode_bit( bm_rep[state()] ) == 1 ) // 2nd bit
+ {
+ if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit
+ {
+ if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
+ { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; }
+ }
+ else
+ {
+ unsigned distance;
+ if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit
+ distance = rep1;
+ else
+ {
+ if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit
+ distance = rep2;
+ else
+ { distance = rep3; rep3 = rep2; }
+ rep2 = rep1;
+ }
+ rep1 = rep0;
+ rep0 = distance;
+ }
+ len = rdec.decode_len( rep_len_model, pos_state );
+ state.set_rep();
+ }
+ else
+ {
+ rep3 = rep2; rep2 = rep1; rep1 = rep0;
+ len = rdec.decode_len( match_len_model, pos_state );
+ const int dis_state = std::min( len - min_match_len, max_dis_states - 1 );
+ const int dis_slot =
+ rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits );
+ if( dis_slot < start_dis_model ) rep0 = dis_slot;
+ else
+ {
+ const int direct_bits = ( dis_slot >> 1 ) - 1;
+ rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+ if( dis_slot < end_dis_model )
+ rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1,
+ direct_bits );
+ else
+ {
+ rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+ rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
+ if( rep0 == 0xFFFFFFFFU ) // Marker found
+ {
+ flush_data();
+ return ( len == min_match_len ); // End Of Stream marker
+ }
+ }
+ }
+ state.set_match();
+ if( rep0 >= dictionary_size || ( rep0 >= pos && !partial_data_pos ) )
+ return false;
+ }
+ for( int i = 0; i < len; ++i )
+ put_byte( get_byte( rep0 ) );
+ }
+ }
+ return false;
+ }
+
+
+enum { min_dictionary_size = 1 << 12,
+ max_dictionary_size = 1 << 29 };
+
+typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size
+
+typedef uint8_t File_trailer[20];
+ // 0-3 CRC32 of the uncompressed data
+ // 4-11 size of the uncompressed data
+ // 12-19 member size including header and trailer
+
+
+/* Return values: 0 for a normal exit, 1 for environmental problems
+ (file not found, invalid flags, I/O errors, etc), 2 to indicate a
+ corrupt or invalid input file. */
+
+int main( const int argc, const char * const argv[] )
+ {
+ if( argc > 1 )
+ {
+ std::printf( "Lzd %s - Educational decompressor for lzip files.\n",
+ PROGVERSION );
+ std::printf( "Study the source to learn how a simple lzip decompressor works.\n"
+ "It is not safe to use it for any real work.\n"
+ "\nUsage: %s < file.lz > file\n", argv[0] );
+ std::printf( "Lzd decompresses from standard input to standard output.\n"
+ "\nCopyright (C) 2013 Antonio Diaz Diaz.\n"
+ "This is free software: you are free to change and redistribute it.\n"
+ "There is NO WARRANTY, to the extent permitted by law.\n"
+ "Report bugs to lzip-bug@nongnu.org\n"
+ "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" );
+ return 0;
+ }
+
+ if( isatty( STDIN_FILENO ) )
+ {
+ std::fprintf( stderr, "I won't read compressed data from a terminal.\n"
+ "Try '%s --help' for more information.\n", argv[0] );
+ return 1;
+ }
+
+ for( bool first_member = true; ; first_member = false )
+ {
+ File_header header;
+ for( int i = 0; i < 6; ++i )
+ header[i] = std::getc( stdin );
+ if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 )
+ {
+ if( first_member )
+ { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" );
+ return 2; }
+ break;
+ }
+ if( header[4] != 1 )
+ {
+ std::fprintf( stderr, "Version %d member format not supported.\n",
+ header[4] );
+ return 2;
+ }
+ unsigned dict_size = 1 << ( header[5] & 0x1F );
+ dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
+ if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
+ { std::fprintf( stderr, "Invalid dictionary size in member header\n" );
+ return 2; }
+
+ LZ_decoder decoder( dict_size );
+ if( !decoder.decode_member() )
+ { std::fprintf( stderr, "Data error\n" ); return 2; }
+
+ File_trailer trailer;
+ for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
+ unsigned crc = 0;
+ for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
+ unsigned long long data_size = 0;
+ for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
+ if( crc != decoder.crc() || data_size != decoder.data_position() )
+ { std::fprintf( stderr, "CRC error\n" ); return 2; }
+ }
+
+ if( std::fclose( stdout ) != 0 )
+ { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) );
+ return 1; }
+ return 0;
+ }
+@end verbatim
+
+
@node Concept Index
@unnumbered Concept Index
diff --git a/encoder.cc b/encoder.cc
index 2e00c3d..3387bbf 100644
--- a/encoder.cc
+++ b/encoder.cc
@@ -416,7 +416,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
if( replens[rep] < min_match_len ) continue;
const int price = rep_match_price + price_rep( rep, state, pos_state );
for( int len = min_match_len; len <= replens[rep]; ++len )
- trials[len].update( price + rep_match_len_encoder.price( len, pos_state ),
+ trials[len].update( price + rep_len_encoder.price( len, pos_state ),
rep, 0 );
}
@@ -586,8 +586,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
trials[++num_trials].price = infinite_price;
int price = rep_match_price + price_rep( rep, cur_state, pos_state );
for( int i = min_match_len; i <= len; ++i )
- trials[cur+i].update( price +
- rep_match_len_encoder.price( i, pos_state ),
+ trials[cur+i].update( price + rep_len_encoder.price( i, pos_state ),
rep, cur );
if( rep == 0 ) start_len = len + 1; // discard shorter matches
@@ -602,7 +601,7 @@ int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
int pos_state2 = ( pos_state + len ) & pos_state_mask;
State state2 = cur_state; state2.set_rep();
- price += rep_match_len_encoder.price( len, pos_state ) +
+ price += rep_len_encoder.price( len, pos_state ) +
price0( bm_match[state2()][pos_state2] ) +
price_matched( data[len-1], data[len], data[len-dis] );
pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
@@ -750,7 +749,7 @@ bool LZ_encoder::encode_member( const unsigned long long member_size )
if( len == 1 ) state.set_short_rep();
else
{
- rep_match_len_encoder.encode( range_encoder, len, pos_state );
+ rep_len_encoder.encode( range_encoder, len, pos_state );
state.set_rep();
}
}
diff --git a/encoder.h b/encoder.h
index da83a3c..d3f3e77 100644
--- a/encoder.h
+++ b/encoder.h
@@ -113,9 +113,9 @@ inline int price_symbol_reversed( const Bit_model bm[], int symbol,
for( int i = num_bits; i > 0; --i )
{
const int bit = symbol & 1;
- symbol >>= 1;
price += price_bit( bm[model], bit );
model = ( model << 1 ) | bit;
+ symbol >>= 1;
}
return price;
}
@@ -372,13 +372,8 @@ public:
};
-class Len_encoder
+class Len_encoder : public Len_model
{
- Bit_model choice1;
- Bit_model choice2;
- Bit_model bm_low[pos_states][len_low_symbols];
- Bit_model bm_mid[pos_states][len_mid_symbols];
- Bit_model bm_high[len_high_symbols];
int prices[pos_states][max_len_symbols];
const int len_symbols;
int counters[pos_states];
@@ -438,8 +433,8 @@ protected:
Bit_model bm_align[dis_align_size];
Range_encoder range_encoder;
- Len_encoder len_encoder;
- Len_encoder rep_match_len_encoder;
+ Len_encoder match_len_encoder;
+ Len_encoder rep_len_encoder;
const int num_dis_slots;
@@ -450,8 +445,8 @@ protected:
:
crc_( 0xFFFFFFFFU ),
range_encoder( outfd ),
- len_encoder( match_len_limit ),
- rep_match_len_encoder( match_len_limit ),
+ match_len_encoder( match_len_limit ),
+ rep_len_encoder( match_len_limit ),
num_dis_slots( 2 * real_bits( dictionary_size - 1 ) )
{
for( int i = 0; i < File_header::size; ++i )
@@ -492,7 +487,7 @@ protected:
void encode_pair( const uint32_t dis, const int len, const int pos_state )
{
- len_encoder.encode( range_encoder, len, pos_state );
+ match_len_encoder.encode( range_encoder, len, pos_state );
const int dis_slot = get_slot( dis );
range_encoder.encode_tree( bm_dis_slot[get_dis_state(len)], dis_slot, dis_slot_bits );
@@ -598,7 +593,7 @@ class LZ_encoder : public LZ_encoder_base
int price_rep0_len( const int len, const State state, const int pos_state ) const
{
return price_rep( 0, state, pos_state ) +
- rep_match_len_encoder.price( len, pos_state );
+ rep_len_encoder.price( len, pos_state );
}
int price_dis( const int dis, const int dis_state ) const
@@ -612,7 +607,7 @@ class LZ_encoder : public LZ_encoder_base
int price_pair( const int dis, const int len, const int pos_state ) const
{
- return len_encoder.price( len, pos_state ) +
+ return match_len_encoder.price( len, pos_state ) +
price_dis( dis, get_dis_state( len ) );
}
diff --git a/fast_encoder.cc b/fast_encoder.cc
index 954e671..fc5f891 100644
--- a/fast_encoder.cc
+++ b/fast_encoder.cc
@@ -169,7 +169,7 @@ bool FLZ_encoder::encode_member( const unsigned long long member_size )
if( dis > 1 )
range_encoder.encode_bit( bm_rep2[state()], dis > 2 );
}
- rep_match_len_encoder.encode( range_encoder, len, pos_state );
+ rep_len_encoder.encode( range_encoder, len, pos_state );
state.set_rep();
move_pos( len );
continue;
diff --git a/lzip.h b/lzip.h
index 77aa73e..ffa2fb3 100644
--- a/lzip.h
+++ b/lzip.h
@@ -85,6 +85,15 @@ struct Bit_model
Bit_model() : probability( bit_model_total / 2 ) {}
};
+struct Len_model
+ {
+ Bit_model choice1;
+ Bit_model choice2;
+ Bit_model bm_low[pos_states][len_low_symbols];
+ Bit_model bm_mid[pos_states][len_mid_symbols];
+ Bit_model bm_high[len_high_symbols];
+ };
+
class Pretty_print
{
diff --git a/main.cc b/main.cc
index 068ca62..bd9bda0 100644
--- a/main.cc
+++ b/main.cc
@@ -160,8 +160,9 @@ void show_header( const File_header & header )
for( int i = 0; i < 8 && ( num > 9999 || ( exact && num >= factor ) ); ++i )
{ num /= factor; if( num % factor != 0 ) exact = false;
p = prefix[i]; np = ""; }
- std::fprintf( stderr, "version %d, dictionary size %s%4u %sB. ",
- header.version(), np, num, p );
+ if( verbosity >= 4 )
+ std::fprintf( stderr, "version %d, ", header.version() );
+ std::fprintf( stderr, "dictionary size %s%4u %sB. ", np, num, p );
}
@@ -754,6 +755,7 @@ int main( const int argc, const char * const argv[] )
{ 'h', "help", Arg_parser::no },
{ 'k', "keep", Arg_parser::no },
{ 'm', "match-length", Arg_parser::yes },
+ { 'n', "threads", Arg_parser::yes },
{ 'o', "output", Arg_parser::yes },
{ 'q', "quiet", Arg_parser::no },
{ 's', "dictionary-size", Arg_parser::yes },
@@ -790,6 +792,7 @@ int main( const int argc, const char * const argv[] )
case 'm': encoder_options.match_len_limit =
getnum( arg, min_match_len_limit, max_match_len );
zero = false; break;
+ case 'n': break;
case 'o': default_output_filename = arg; break;
case 'q': verbosity = -1; break;
case 's': encoder_options.dictionary_size = get_dict_size( arg );