/* Lzd - Educational decompressor for lzip files Copyright (C) 2013 Antonio Diaz Diaz. This program is free software: you have unlimited permission to copy, distribute and modify it. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ /* Exit status: 0 for a normal exit, 1 for environmental problems (file not found, invalid flags, I/O errors, etc), 2 to indicate a corrupt or invalid input file. */ #include #include #include #include #include #include #include #if defined(__MSVCRT__) || defined(__OS2__) #include #include #endif class State { int st; public: enum { states = 12 }; State() : st( 0 ) {} int operator()() const { return st; } bool is_char() const { return st < 7; } void set_char() { static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; st = next[st]; } void set_match() { st = ( st < 7 ) ? 7 : 10; } void set_rep() { st = ( st < 7 ) ? 8 : 11; } void set_short_rep() { st = ( st < 7 ) ? 9 : 11; } }; enum { min_dictionary_size = 1 << 12, max_dictionary_size = 1 << 29, literal_context_bits = 3, pos_state_bits = 2, pos_states = 1 << pos_state_bits, pos_state_mask = pos_states - 1, dis_slot_bits = 6, start_dis_model = 4, end_dis_model = 14, modeled_distances = 1 << (end_dis_model / 2), // 128 dis_align_bits = 4, dis_align_size = 1 << dis_align_bits, len_low_bits = 3, len_mid_bits = 3, len_high_bits = 8, len_low_symbols = 1 << len_low_bits, len_mid_symbols = 1 << len_mid_bits, len_high_symbols = 1 << len_high_bits, max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, min_match_len = 2, // must be 2 max_dis_states = 4, bit_model_move_bits = 5, bit_model_total_bits = 11, bit_model_total = 1 << bit_model_total_bits }; struct Bit_model { int probability; Bit_model() : probability( bit_model_total / 2 ) {} }; struct Len_model { Bit_model choice1; Bit_model choice2; Bit_model bm_low[pos_states][len_low_symbols]; Bit_model bm_mid[pos_states][len_mid_symbols]; Bit_model bm_high[len_high_symbols]; }; class CRC32 { uint32_t data[256]; // Table of CRCs of all 8-bit messages. public: CRC32() { for( unsigned n = 0; n < 256; ++n ) { unsigned c = n; for( int k = 0; k < 8; ++k ) { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } data[n] = c; } } void update( uint32_t & crc, const uint8_t * const buffer, const int size ) const { for( int i = 0; i < size; ++i ) crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); } }; const CRC32 crc32; typedef uint8_t File_header[6]; // 0-3 magic, 4 version, 5 coded_dict_size typedef uint8_t File_trailer[20]; // 0-3 CRC32 of the uncompressed data // 4-11 size of the uncompressed data // 12-19 member size including header and trailer class Range_decoder { uint32_t code; uint32_t range; public: Range_decoder() : code( 0 ), range( 0xFFFFFFFFU ) { for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte(); } uint8_t get_byte() { return std::getc( stdin ); } int decode( const int num_bits ) { int symbol = 0; for( int i = 0; i < num_bits; ++i ) { range >>= 1; symbol <<= 1; if( code >= range ) { code -= range; symbol |= 1; } if( range <= 0x00FFFFFFU ) // normalize { range <<= 8; code = (code << 8) | get_byte(); } } return symbol; } int decode_bit( Bit_model & bm ) { int symbol; const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; if( code < bound ) { range = bound; bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits; symbol = 0; } else { range -= bound; code -= bound; bm.probability -= bm.probability >> bit_model_move_bits; symbol = 1; } if( range <= 0x00FFFFFFU ) // normalize { range <<= 8; code = (code << 8) | get_byte(); } return symbol; } int decode_tree( Bit_model bm[], const int num_bits ) { int symbol = 1; for( int i = 0; i < num_bits; ++i ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); return symbol - (1 << num_bits); } int decode_tree_reversed( Bit_model bm[], const int num_bits ) { int symbol = decode_tree( bm, num_bits ); int reversed_symbol = 0; for( int i = 0; i < num_bits; ++i ) { reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); symbol >>= 1; } return reversed_symbol; } int decode_matched( Bit_model bm[], const int match_byte ) { Bit_model * const bm1 = bm + 0x100; int symbol = 1; for( int i = 7; i >= 0; --i ) { const int match_bit = ( match_byte >> i ) & 1; const int bit = decode_bit( bm1[(match_bit<<8)+symbol] ); symbol = ( symbol << 1 ) | bit; if( match_bit != bit ) { while( symbol < 0x100 ) symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); break; } } return symbol - 0x100; } int decode_len( Len_model & lm, const int pos_state ) { if( decode_bit( lm.choice1 ) == 0 ) return decode_tree( lm.bm_low[pos_state], len_low_bits ); if( decode_bit( lm.choice2 ) == 0 ) return len_low_symbols + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); return len_low_symbols + len_mid_symbols + decode_tree( lm.bm_high, len_high_bits ); } }; class LZ_decoder { unsigned long long partial_data_pos; Range_decoder rdec; const unsigned dictionary_size; uint8_t * const buffer; // output buffer unsigned pos; // current pos in buffer unsigned stream_pos; // first byte not yet written to stdout uint32_t crc_; void flush_data(); uint8_t get_byte( const unsigned distance ) const { unsigned i = pos - distance - 1; if( pos <= distance ) i += dictionary_size; return buffer[i]; } void put_byte( const uint8_t b ) { buffer[pos] = b; if( ++pos >= dictionary_size ) flush_data(); } public: LZ_decoder( const unsigned dict_size ) : partial_data_pos( 0 ), dictionary_size( dict_size ), buffer( new uint8_t[dictionary_size] ), pos( 0 ), stream_pos( 0 ), crc_( 0xFFFFFFFFU ) { buffer[dictionary_size-1] = 0; } // prev_byte of first_byte ~LZ_decoder() { delete[] buffer; } unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } unsigned long long data_position() const { return partial_data_pos + pos; } bool decode_member(); }; void LZ_decoder::flush_data() { if( pos > stream_pos ) { const unsigned size = pos - stream_pos; crc32.update( crc_, buffer + stream_pos, size ); errno = 0; if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); std::exit( 1 ); } if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; } stream_pos = pos; } } bool LZ_decoder::decode_member() // Returns false if error { Bit_model bm_literal[1<> ( 8 - literal_context_bits ); Bit_model * const bm = bm_literal[literal_state]; if( state.is_char() ) put_byte( rdec.decode_tree( bm, 8 ) ); else put_byte( rdec.decode_matched( bm, get_byte( rep0 ) ) ); state.set_char(); } else { int len; if( rdec.decode_bit( bm_rep[state()] ) == 1 ) // 2nd bit { if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit { if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit { state.set_short_rep(); put_byte( get_byte( rep0 ) ); continue; } } else { unsigned distance; if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit distance = rep1; else { if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit distance = rep2; else { distance = rep3; rep3 = rep2; } rep2 = rep1; } rep1 = rep0; rep0 = distance; } len = min_match_len + rdec.decode_len( rep_len_model, pos_state ); state.set_rep(); } else { rep3 = rep2; rep2 = rep1; rep1 = rep0; len = min_match_len + rdec.decode_len( match_len_model, pos_state ); const int dis_state = std::min( len - min_match_len, max_dis_states - 1 ); const int dis_slot = rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits ); if( dis_slot < start_dis_model ) rep0 = dis_slot; else { const int direct_bits = ( dis_slot >> 1 ) - 1; rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; if( dis_slot < end_dis_model ) rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1, direct_bits ); else { rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); if( rep0 == 0xFFFFFFFFU ) // Marker found { flush_data(); return ( len == min_match_len ); // End Of Stream marker } } } state.set_match(); if( rep0 >= dictionary_size || ( rep0 >= pos && !partial_data_pos ) ) return false; } for( int i = 0; i < len; ++i ) put_byte( get_byte( rep0 ) ); } } return false; } int main( const int argc, const char * const argv[] ) { if( argc > 1 ) { std::printf( "Lzd %s - Educational decompressor for lzip files.\n", PROGVERSION ); std::printf( "Study the source to learn how a lzip decompressor works.\n" "See the lzip manual for an explanation of the code.\n" "It is not safe to use lzd for any real work.\n" "\nUsage: %s < file.lz > file\n", argv[0] ); std::printf( "Lzd decompresses from standard input to standard output.\n" "\nCopyright (C) 2013 Antonio Diaz Diaz.\n" "This is free software: you are free to change and redistribute it.\n" "There is NO WARRANTY, to the extent permitted by law.\n" "Report bugs to lzip-bug@nongnu.org\n" "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" ); return 0; } #if defined(__MSVCRT__) || defined(__OS2__) setmode( STDIN_FILENO, O_BINARY ); setmode( STDOUT_FILENO, O_BINARY ); #endif for( bool first_member = true; ; first_member = false ) { File_header header; for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin ); if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 ) { if( first_member ) { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" ); return 2; } break; } if( header[4] != 1 ) { std::fprintf( stderr, "Version %d member format not supported.\n", header[4] ); return 2; } unsigned dict_size = 1 << ( header[5] & 0x1F ); dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) { std::fprintf( stderr, "Invalid dictionary size in member header\n" ); return 2; } LZ_decoder decoder( dict_size ); if( !decoder.decode_member() ) { std::fprintf( stderr, "Data error\n" ); return 2; } File_trailer trailer; for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin ); unsigned crc = 0; for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; } unsigned long long data_size = 0; for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; } if( crc != decoder.crc() || data_size != decoder.data_position() ) { std::fprintf( stderr, "CRC error\n" ); return 2; } } if( std::fclose( stdout ) != 0 ) { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) ); return 1; } return 0; }