summaryrefslogtreecommitdiffstats
path: root/decoder.cc
diff options
context:
space:
mode:
Diffstat (limited to 'decoder.cc')
-rw-r--r--decoder.cc363
1 files changed, 363 insertions, 0 deletions
diff --git a/decoder.cc b/decoder.cc
new file mode 100644
index 0000000..fbfcdb3
--- /dev/null
+++ b/decoder.cc
@@ -0,0 +1,363 @@
+/* Lzd - Educational decompressor for lzip files
+ Copyright (C) 2013 Antonio Diaz Diaz.
+
+ This program is free software: you have unlimited permission
+ to copy, distribute and modify it.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+
+class State
+ {
+ int st;
+
+public:
+ enum { states = 12 };
+ State() : st( 0 ) {}
+ int operator()() const { return st; }
+ bool is_char() const { return st < 7; }
+
+ void set_char()
+ {
+ static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
+ st = next[st];
+ }
+
+ void set_match() { st = ( ( st < 7 ) ? 7 : 10 ); }
+ void set_rep() { st = ( ( st < 7 ) ? 8 : 11 ); }
+ void set_short_rep() { st = ( ( st < 7 ) ? 9 : 11 ); }
+ };
+
+
+enum {
+ literal_context_bits = 3,
+ pos_state_bits = 2,
+ pos_states = 1 << pos_state_bits,
+ pos_state_mask = pos_states - 1,
+
+ max_dis_states = 4,
+ dis_slot_bits = 6,
+ start_dis_model = 4,
+ end_dis_model = 14,
+ modeled_distances = 1 << (end_dis_model / 2), // 128
+ dis_align_bits = 4,
+ dis_align_size = 1 << dis_align_bits,
+
+ len_low_bits = 3,
+ len_mid_bits = 3,
+ len_high_bits = 8,
+ len_low_symbols = 1 << len_low_bits,
+ len_mid_symbols = 1 << len_mid_bits,
+ len_high_symbols = 1 << len_high_bits,
+ max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,
+ min_match_len = 2, // must be 2
+
+ bit_model_move_bits = 5,
+ bit_model_total_bits = 11,
+ bit_model_total = 1 << bit_model_total_bits };
+
+
+struct Bit_model
+ {
+ int probability;
+ Bit_model() : probability( bit_model_total / 2 ) {}
+ };
+
+struct Len_model
+ {
+ Bit_model choice1;
+ Bit_model choice2;
+ Bit_model bm_low[pos_states][len_low_symbols];
+ Bit_model bm_mid[pos_states][len_mid_symbols];
+ Bit_model bm_high[len_high_symbols];
+ };
+
+
+class Range_decoder
+ {
+ uint32_t code;
+ uint32_t range;
+
+public:
+ Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
+ {
+ for( int i = 0; i < 5; ++i ) code = (code << 8) | std::getc( stdin );
+ }
+
+ int decode( const int num_bits )
+ {
+ int symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ range >>= 1;
+ symbol <<= 1;
+ if( code >= range ) { code -= range; symbol |= 1; }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+ }
+ return symbol;
+ }
+
+ int decode_bit( Bit_model & bm )
+ {
+ int symbol;
+ const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
+ if( code < bound )
+ {
+ range = bound;
+ bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
+ symbol = 0;
+ }
+ else
+ {
+ range -= bound;
+ code -= bound;
+ bm.probability -= bm.probability >> bit_model_move_bits;
+ symbol = 1;
+ }
+ if( range <= 0x00FFFFFFU ) // normalize
+ { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+ return symbol;
+ }
+
+ int decode_tree( Bit_model bm[], const int num_bits )
+ {
+ int symbol = 1;
+ for( int i = 0; i < num_bits; ++i )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ return symbol - (1 << num_bits);
+ }
+
+ int decode_tree_reversed( Bit_model bm[], const int num_bits )
+ {
+ int symbol = decode_tree( bm, num_bits );
+ int reversed_symbol = 0;
+ for( int i = 0; i < num_bits; ++i )
+ {
+ reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
+ symbol >>= 1;
+ }
+ return reversed_symbol;
+ }
+
+ int decode_matched( Bit_model bm[], const int match_byte )
+ {
+ Bit_model * const bm1 = bm + 0x100;
+ int symbol = 1;
+ for( int i = 7; i >= 0; --i )
+ {
+ const int match_bit = ( match_byte >> i ) & 1;
+ const int bit = decode_bit( bm1[(match_bit<<8)+symbol] );
+ symbol = ( symbol << 1 ) | bit;
+ if( match_bit != bit )
+ {
+ while( symbol < 0x100 )
+ symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
+ break;
+ }
+ }
+ return symbol - 0x100;
+ }
+
+ int decode_len( Len_model & lm, const int pos_state )
+ {
+ if( decode_bit( lm.choice1 ) == 0 )
+ return min_match_len +
+ decode_tree( lm.bm_low[pos_state], len_low_bits );
+ if( decode_bit( lm.choice2 ) == 0 )
+ return min_match_len + len_low_symbols +
+ decode_tree( lm.bm_mid[pos_state], len_mid_bits );
+ return min_match_len + len_low_symbols + len_mid_symbols +
+ decode_tree( lm.bm_high, len_high_bits );
+ }
+ };
+
+
+class LZ_decoder
+ {
+ unsigned long long partial_data_pos;
+ Range_decoder rdec;
+ const unsigned dictionary_size;
+ uint8_t * const buffer; // output buffer
+ unsigned pos; // current pos in buffer
+ unsigned stream_pos; // first byte not yet written to stdout
+ uint32_t crc_;
+
+ void flush_data();
+
+ uint8_t get_byte( const unsigned distance ) const
+ {
+ int i = pos - distance - 1;
+ if( i < 0 ) i += dictionary_size;
+ return buffer[i];
+ }
+
+ void put_byte( const uint8_t b )
+ {
+ buffer[pos] = b;
+ if( ++pos >= dictionary_size ) flush_data();
+ }
+
+public:
+ LZ_decoder( const unsigned dict_size )
+ :
+ partial_data_pos( 0 ),
+ dictionary_size( dict_size ),
+ buffer( new uint8_t[dictionary_size] ),
+ pos( 0 ),
+ stream_pos( 0 ),
+ crc_( 0xFFFFFFFFU )
+ { buffer[dictionary_size-1] = 0; } // prev_byte of first_byte
+
+ ~LZ_decoder() { delete[] buffer; }
+
+ unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
+ unsigned long long data_position() const { return partial_data_pos + pos; }
+
+ bool decode_member();
+ };
+
+
+class CRC32
+ {
+ uint32_t data[256]; // Table of CRCs of all 8-bit messages.
+
+public:
+ CRC32()
+ {
+ for( unsigned n = 0; n < 256; ++n )
+ {
+ unsigned c = n;
+ for( int k = 0; k < 8; ++k )
+ { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
+ data[n] = c;
+ }
+ }
+
+ void update( uint32_t & crc, const uint8_t * buffer, const int size ) const
+ {
+ for( int i = 0; i < size; ++i )
+ crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
+ }
+ };
+
+const CRC32 crc32;
+
+
+void LZ_decoder::flush_data()
+ {
+ if( pos > stream_pos )
+ {
+ const unsigned size = pos - stream_pos;
+ crc32.update( crc_, buffer + stream_pos, size );
+ errno = 0;
+ if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
+ { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
+ std::exit( 1 ); }
+ if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; }
+ stream_pos = pos;
+ }
+ }
+
+
+bool LZ_decoder::decode_member() // Returns false if error
+ {
+ Bit_model bm_literal[1<<literal_context_bits][0x300];
+ Bit_model bm_match[State::states][pos_states];
+ Bit_model bm_rep[State::states];
+ Bit_model bm_rep0[State::states];
+ Bit_model bm_rep1[State::states];
+ Bit_model bm_rep2[State::states];
+ Bit_model bm_len[State::states][pos_states];
+ Bit_model bm_dis_slot[max_dis_states][1<<dis_slot_bits];
+ Bit_model bm_dis[modeled_distances-end_dis_model];
+ Bit_model bm_align[dis_align_size];
+ Len_model match_len_model;
+ Len_model rep_len_model;
+ unsigned rep0 = 0; // rep[0-3] latest four distances
+ unsigned rep1 = 0; // used for efficient coding of
+ unsigned rep2 = 0; // repeated distances
+ unsigned rep3 = 0;
+ State state;
+
+ while( !std::feof( stdin ) && !std::ferror( stdin ) )
+ {
+ const int pos_state = data_position() & pos_state_mask;
+ if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit
+ {
+ const uint8_t prev_byte = get_byte( 0 );
+ const int literal_state = prev_byte >> ( 8 - literal_context_bits );
+ Bit_model * const bm = bm_literal[literal_state];
+ if( state.is_char() )
+ put_byte( rdec.decode_tree( bm, 8 ) );
+ else
+ put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) );
+ state.set_char();
+ }
+ else
+ {
+ int len;
+ if( rdec.decode_bit( bm_rep[state()] ) == 1 ) // 2nd bit
+ {
+ if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit
+ {
+ if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
+ { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; }
+ }
+ else
+ {
+ unsigned distance;
+ if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit
+ distance = rep1;
+ else
+ {
+ if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit
+ distance = rep2;
+ else
+ { distance = rep3; rep3 = rep2; }
+ rep2 = rep1;
+ }
+ rep1 = rep0;
+ rep0 = distance;
+ }
+ len = rdec.decode_len( rep_len_model, pos_state );
+ state.set_rep();
+ }
+ else
+ {
+ rep3 = rep2; rep2 = rep1; rep1 = rep0;
+ len = rdec.decode_len( match_len_model, pos_state );
+ const int dis_state = std::min( len - min_match_len, max_dis_states - 1 );
+ const int dis_slot =
+ rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits );
+ if( dis_slot < start_dis_model ) rep0 = dis_slot;
+ else
+ {
+ const int direct_bits = ( dis_slot >> 1 ) - 1;
+ rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+ if( dis_slot < end_dis_model )
+ rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1,
+ direct_bits );
+ else
+ {
+ rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+ rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
+ if( rep0 == 0xFFFFFFFFU ) // Marker found
+ {
+ flush_data();
+ return ( len == min_match_len ); // End Of Stream marker
+ }
+ }
+ }
+ state.set_match();
+ if( rep0 >= dictionary_size || ( rep0 >= pos && !partial_data_pos ) )
+ return false;
+ }
+ for( int i = 0; i < len; ++i )
+ put_byte( get_byte( rep0 ) );
+ }
+ }
+ return false;
+ }