diff options
Diffstat (limited to 'lzd.cc')
-rw-r--r-- | lzd.cc | 488 |
1 files changed, 488 insertions, 0 deletions
@@ -0,0 +1,488 @@ +/* Lzd - Educational decompressor for the lzip format + Copyright (C) 2013-2024 Antonio Diaz Diaz. + + This program is free software. Redistribution and use in source and + binary forms, with or without modification, are permitted provided + that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +/* + Exit status: 0 for a normal exit, 1 for environmental problems + (file not found, invalid command-line options, I/O errors, etc), 2 to + indicate a corrupt or invalid input file. +*/ + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <stdint.h> +#include <unistd.h> +#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__ +#include <fcntl.h> +#include <io.h> +#endif + + +class State + { + int st; + +public: + enum { states = 12 }; + State() : st( 0 ) {} + int operator()() const { return st; } + bool is_char() const { return st < 7; } + + void set_char() + { + const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; + st = next[st]; + } + void set_match() { st = ( st < 7 ) ? 7 : 10; } + void set_rep() { st = ( st < 7 ) ? 8 : 11; } + void set_short_rep() { st = ( st < 7 ) ? 9 : 11; } + }; + + +enum { + min_dictionary_size = 1 << 12, + max_dictionary_size = 1 << 29, + literal_context_bits = 3, + literal_pos_state_bits = 0, // not used + pos_state_bits = 2, + pos_states = 1 << pos_state_bits, + pos_state_mask = pos_states - 1, + + len_states = 4, + dis_slot_bits = 6, + start_dis_model = 4, + end_dis_model = 14, + modeled_distances = 1 << ( end_dis_model / 2 ), // 128 + dis_align_bits = 4, + dis_align_size = 1 << dis_align_bits, + + len_low_bits = 3, + len_mid_bits = 3, + len_high_bits = 8, + len_low_symbols = 1 << len_low_bits, + len_mid_symbols = 1 << len_mid_bits, + len_high_symbols = 1 << len_high_bits, + max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols, + + min_match_len = 2, // must be 2 + + bit_model_move_bits = 5, + bit_model_total_bits = 11, + bit_model_total = 1 << bit_model_total_bits }; + +struct Bit_model + { + int probability; + Bit_model() : probability( bit_model_total / 2 ) {} + }; + +struct Len_model + { + Bit_model choice1; + Bit_model choice2; + Bit_model bm_low[pos_states][len_low_symbols]; + Bit_model bm_mid[pos_states][len_mid_symbols]; + Bit_model bm_high[len_high_symbols]; + }; + + +class CRC32 + { + uint32_t data[256]; // Table of CRCs of all 8-bit messages. + +public: + CRC32() + { + for( unsigned n = 0; n < 256; ++n ) + { + unsigned c = n; + for( int k = 0; k < 8; ++k ) + { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; } + data[n] = c; + } + } + + void update_buf( uint32_t & crc, const uint8_t * const buffer, + const int size ) const + { + for( int i = 0; i < size; ++i ) + crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 ); + } + }; + +const CRC32 crc32; + + +enum { header_size = 6, trailer_size = 20 }; +typedef uint8_t Lzip_header[header_size]; // 0-3 magic bytes + // 4 version + // 5 coded dictionary size +typedef uint8_t Lzip_trailer[trailer_size]; + // 0-3 CRC32 of the uncompressed data + // 4-11 size of the uncompressed data + // 12-19 member size including header and trailer + +class Range_decoder + { + unsigned long long member_pos; + uint32_t code; + uint32_t range; + +public: + Range_decoder() + : member_pos( header_size ), code( 0 ), range( 0xFFFFFFFFU ) + { + get_byte(); // discard first byte of the LZMA stream + for( int i = 0; i < 4; ++i ) code = ( code << 8 ) | get_byte(); + } + + uint8_t get_byte() { ++member_pos; return std::getc( stdin ); } + unsigned long long member_position() const { return member_pos; } + + unsigned decode( const int num_bits ) + { + unsigned symbol = 0; + for( int i = num_bits; i > 0; --i ) + { + range >>= 1; + symbol <<= 1; + if( code >= range ) { code -= range; symbol |= 1; } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = ( code << 8 ) | get_byte(); } + } + return symbol; + } + + bool decode_bit( Bit_model & bm ) + { + bool symbol; + const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability; + if( code < bound ) + { + range = bound; + bm.probability += + ( bit_model_total - bm.probability ) >> bit_model_move_bits; + symbol = 0; + } + else + { + code -= bound; + range -= bound; + bm.probability -= bm.probability >> bit_model_move_bits; + symbol = 1; + } + if( range <= 0x00FFFFFFU ) // normalize + { range <<= 8; code = ( code << 8 ) | get_byte(); } + return symbol; + } + + unsigned decode_tree( Bit_model bm[], const int num_bits ) + { + unsigned symbol = 1; + for( int i = 0; i < num_bits; ++i ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + return symbol - ( 1 << num_bits ); + } + + unsigned decode_tree_reversed( Bit_model bm[], const int num_bits ) + { + unsigned symbol = decode_tree( bm, num_bits ); + unsigned reversed_symbol = 0; + for( int i = 0; i < num_bits; ++i ) + { + reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 ); + symbol >>= 1; + } + return reversed_symbol; + } + + unsigned decode_matched( Bit_model bm[], const unsigned match_byte ) + { + unsigned symbol = 1; + for( int i = 7; i >= 0; --i ) + { + const bool match_bit = ( match_byte >> i ) & 1; + const bool bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] ); + symbol = ( symbol << 1 ) | bit; + if( match_bit != bit ) + { + while( symbol < 0x100 ) + symbol = ( symbol << 1 ) | decode_bit( bm[symbol] ); + break; + } + } + return symbol & 0xFF; + } + + unsigned decode_len( Len_model & lm, const int pos_state ) + { + if( decode_bit( lm.choice1 ) == 0 ) + return min_match_len + + decode_tree( lm.bm_low[pos_state], len_low_bits ); + if( decode_bit( lm.choice2 ) == 0 ) + return min_match_len + len_low_symbols + + decode_tree( lm.bm_mid[pos_state], len_mid_bits ); + return min_match_len + len_low_symbols + len_mid_symbols + + decode_tree( lm.bm_high, len_high_bits ); + } + }; + + +class LZ_decoder + { + unsigned long long partial_data_pos; + Range_decoder rdec; + const unsigned dictionary_size; + uint8_t * const buffer; // output buffer + unsigned pos; // current pos in buffer + unsigned stream_pos; // first byte not yet written to stdout + uint32_t crc_; + bool pos_wrapped; + + void flush_data(); + + uint8_t peek( const unsigned distance ) const + { + if( pos > distance ) return buffer[pos - distance - 1]; + if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1]; + return 0; // prev_byte of first byte + } + + void put_byte( const uint8_t b ) + { + buffer[pos] = b; + if( ++pos >= dictionary_size ) flush_data(); + } + +public: + explicit LZ_decoder( const unsigned dict_size ) + : + partial_data_pos( 0 ), + dictionary_size( dict_size ), + buffer( new uint8_t[dictionary_size] ), + pos( 0 ), + stream_pos( 0 ), + crc_( 0xFFFFFFFFU ), + pos_wrapped( false ) + {} + + ~LZ_decoder() { delete[] buffer; } + + unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; } + unsigned long long data_position() const + { return partial_data_pos + pos; } + uint8_t get_byte() { return rdec.get_byte(); } + unsigned long long member_position() const + { return rdec.member_position(); } + + bool decode_member(); + }; + + +void LZ_decoder::flush_data() + { + if( pos > stream_pos ) + { + const unsigned size = pos - stream_pos; + crc32.update_buf( crc_, buffer + stream_pos, size ); + if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size ) + { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) ); + std::exit( 1 ); } + if( pos >= dictionary_size ) + { partial_data_pos += pos; pos = 0; pos_wrapped = true; } + stream_pos = pos; + } + } + + +bool LZ_decoder::decode_member() // Return false if error + { + Bit_model bm_literal[1<<literal_context_bits][0x300]; + Bit_model bm_match[State::states][pos_states]; + Bit_model bm_rep[State::states]; + Bit_model bm_rep0[State::states]; + Bit_model bm_rep1[State::states]; + Bit_model bm_rep2[State::states]; + Bit_model bm_len[State::states][pos_states]; + Bit_model bm_dis_slot[len_states][1<<dis_slot_bits]; + Bit_model bm_dis[modeled_distances-end_dis_model+1]; + Bit_model bm_align[dis_align_size]; + Len_model match_len_model; + Len_model rep_len_model; + unsigned rep0 = 0; // rep[0-3] latest four distances + unsigned rep1 = 0; // used for efficient coding of + unsigned rep2 = 0; // repeated distances + unsigned rep3 = 0; + State state; + + while( !std::feof( stdin ) && !std::ferror( stdin ) ) + { + const int pos_state = data_position() & pos_state_mask; + if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit + { + // literal byte + const uint8_t prev_byte = peek( 0 ); + const int literal_state = prev_byte >> ( 8 - literal_context_bits ); + Bit_model * const bm = bm_literal[literal_state]; + if( state.is_char() ) + put_byte( rdec.decode_tree( bm, 8 ) ); + else + put_byte( rdec.decode_matched( bm, peek( rep0 ) ) ); + state.set_char(); + continue; + } + // match or repeated match + int len; + if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit + { + if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit + { + if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit + { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; } + } + else + { + unsigned distance; + if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit + distance = rep1; + else + { + if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit + distance = rep2; + else + { distance = rep3; rep3 = rep2; } + rep2 = rep1; + } + rep1 = rep0; + rep0 = distance; + } + state.set_rep(); + len = rdec.decode_len( rep_len_model, pos_state ); + } + else // match + { + rep3 = rep2; rep2 = rep1; rep1 = rep0; + len = rdec.decode_len( match_len_model, pos_state ); + const int len_state = std::min( len - min_match_len, len_states - 1 ); + rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits ); + if( rep0 >= start_dis_model ) + { + const unsigned dis_slot = rep0; + const int direct_bits = ( dis_slot >> 1 ) - 1; + rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits; + if( dis_slot < end_dis_model ) + rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ), + direct_bits ); + else + { + rep0 += + rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits; + rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits ); + if( rep0 == 0xFFFFFFFFU ) // marker found + { + flush_data(); + return len == min_match_len; // End Of Stream marker + } + } + } + state.set_match(); + if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) ) + { flush_data(); return false; } + } + for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) ); + } + flush_data(); + return false; + } + + +int main( const int argc, const char * const argv[] ) + { + if( argc > 2 || ( argc == 2 && std::strcmp( argv[1], "-d" ) != 0 ) ) + { + std::printf( + "Lzd %s - Educational decompressor for the lzip format.\n" + "Study the source code to learn how a lzip decompressor works.\n" + "See the lzip manual for an explanation of the code.\n" + "\nUsage: %s [-d] < file.lz > file\n" + "Lzd decompresses from standard input to standard output.\n" + "\nCopyright (C) 2024 Antonio Diaz Diaz.\n" + "License 2-clause BSD.\n" + "This is free software: you are free to change and redistribute " + "it.\nThere is NO WARRANTY, to the extent permitted by law.\n" + "Report bugs to lzip-bug@nongnu.org\n" + "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n", + PROGVERSION, argv[0] ); + return 0; + } + +#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__ + setmode( STDIN_FILENO, O_BINARY ); + setmode( STDOUT_FILENO, O_BINARY ); +#endif + + for( bool first_member = true; ; first_member = false ) + { + Lzip_header header; // check header + for( int i = 0; i < header_size; ++i ) header[i] = std::getc( stdin ); + if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 ) + { + if( first_member ) + { std::fputs( "Bad magic number (file not in lzip format).\n", + stderr ); return 2; } + break; // ignore trailing data + } + unsigned dict_size = 1 << ( header[5] & 0x1F ); + dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 ); + if( dict_size < min_dictionary_size || dict_size > max_dictionary_size ) + { std::fputs( "Invalid dictionary size in member header.\n", + stderr ); return 2; } + + LZ_decoder decoder( dict_size ); // decode LZMA stream + if( !decoder.decode_member() ) + { std::fputs( "Data error\n", stderr ); return 2; } + + Lzip_trailer trailer; // check trailer + for( int i = 0; i < trailer_size; ++i ) trailer[i] = decoder.get_byte(); + int retval = 0; + unsigned crc = 0; + for( int i = 3; i >= 0; --i ) crc = ( crc << 8 ) + trailer[i]; + if( crc != decoder.crc() ) + { std::fputs( "CRC mismatch\n", stderr ); retval = 2; } + + unsigned long long data_size = 0; + for( int i = 11; i >= 4; --i ) + data_size = ( data_size << 8 ) + trailer[i]; + if( data_size != decoder.data_position() ) + { std::fputs( "Data size mismatch\n", stderr ); retval = 2; } + + unsigned long long member_size = 0; + for( int i = 19; i >= 12; --i ) + member_size = ( member_size << 8 ) + trailer[i]; + if( member_size != decoder.member_position() ) + { std::fputs( "Member size mismatch\n", stderr ); retval = 2; } + if( retval ) return retval; + } + + if( std::fclose( stdout ) != 0 ) + { std::fprintf( stderr, "Error closing stdout: %s\n", + std::strerror( errno ) ); return 1; } + return 0; + } |