1 files changed, 139 insertions, 43 deletions
diff --git a/decoder.cc b/lzd.cc
index fbfcdb3..0ac7b64 100644
--- a/decoder.cc
+++ b/lzd.cc
@@ -8,6 +8,24 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */
+/*
+    Exit status: 0 for a normal exit, 1 for environmental problems
+    (file not found, invalid flags, I/O errors, etc), 2 to indicate a
+    corrupt or invalid input file.
+*/
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <stdint.h>
+#include <unistd.h>
+#if defined(__MSVCRT__) || defined(__OS2__)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
 
 class State
   {
@@ -24,20 +42,20 @@ public:
     static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
     st = next[st];
     }
-
-  void set_match()     { st = ( ( st < 7 ) ? 7 : 10 ); }
-  void set_rep()       { st = ( ( st < 7 ) ? 8 : 11 ); }
-  void set_short_rep() { st = ( ( st < 7 ) ? 9 : 11 ); }
+  void set_match()     { st = ( st < 7 ) ? 7 : 10; }
+  void set_rep()       { st = ( st < 7 ) ? 8 : 11; }
+  void set_short_rep() { st = ( st < 7 ) ? 9 : 11; }
   };
 
 
 enum {
+  min_dictionary_size = 1 << 12,
+  max_dictionary_size = 1 << 29,
   literal_context_bits = 3,
   pos_state_bits = 2,
   pos_states = 1 << pos_state_bits,
   pos_state_mask = pos_states - 1,
 
-  max_dis_states = 4,
   dis_slot_bits = 6,
   start_dis_model = 4,
   end_dis_model = 14,
@@ -52,13 +70,14 @@ enum {
   len_mid_symbols = 1 << len_mid_bits,
   len_high_symbols = 1 << len_high_bits,
   max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,
+
   min_match_len = 2,					// must be 2
+  max_dis_states = 4,
 
   bit_model_move_bits = 5,
   bit_model_total_bits = 11,
   bit_model_total = 1 << bit_model_total_bits };
 
-
 struct Bit_model
   {
   int probability;
@@ -75,6 +94,39 @@ struct Len_model
   };
 
 
+class CRC32		// Table-driven CRC calculator: one table lookup per input byte.
+  {
+  uint32_t data[256];		// Table of CRCs of all 8-bit messages.
+
+public:
+  CRC32()			// Fills data[] once, processing each byte value bit by bit.
+    {
+    for( unsigned n = 0; n < 256; ++n )
+      {
+      unsigned c = n;
+      for( int k = 0; k < 8; ++k )	// 8 steps, testing the low bit each time
+        { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
+      data[n] = c;
+      }
+    }
+  // Folds buffer[0..size-1] into crc in place; no initial or final inversion is done here.
+  void update( uint32_t & crc, const uint8_t * const buffer, const int size ) const
+    {
+    for( int i = 0; i < size; ++i )
+      crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
+    }
+  };
+
+const CRC32 crc32;		// Single shared instance; its constructor fills the table.
+
+
+typedef uint8_t File_header[6];	// 0-3 magic, 4 version, 5 coded_dict_size
+				// main() requires magic "LZIP" and version 1.
+typedef uint8_t File_trailer[20];	// main() reads the CRC and size low byte first
+			//  0-3  CRC32 of the uncompressed data
+			//  4-11 size of the uncompressed data
+			// 12-19 member size including header and trailer
+
 class Range_decoder
   {
   uint32_t code;
@@ -83,9 +135,11 @@ class Range_decoder
 public:
   Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
     {
-    for( int i = 0; i < 5; ++i ) code = (code << 8) | std::getc( stdin );
+    for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte();	// 5 bytes read; the 32-bit code keeps the last 4
     }
 
+  uint8_t get_byte() { return std::getc( stdin ); }	// EOF is not reported: getc's int result is narrowed to a byte
+
   int decode( const int num_bits )
     {
     int symbol = 0;
@@ -95,7 +149,7 @@ public:
       symbol <<= 1;
       if( code >= range ) { code -= range; symbol |= 1; }
       if( range <= 0x00FFFFFFU )			// normalize
-        { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+        { range <<= 8; code = (code << 8) | get_byte(); }
       }
     return symbol;
     }
@@ -118,7 +172,7 @@ public:
       symbol = 1;
       }
     if( range <= 0x00FFFFFFU )				// normalize
-      { range <<= 8; code = (code << 8) | std::getc( stdin ); }
+      { range <<= 8; code = (code << 8) | get_byte(); }
     return symbol;
     }
 
@@ -164,12 +218,11 @@ public:
   int decode_len( Len_model & lm, const int pos_state )
     {
     if( decode_bit( lm.choice1 ) == 0 )
-      return min_match_len +
-             decode_tree( lm.bm_low[pos_state], len_low_bits );
+      return decode_tree( lm.bm_low[pos_state], len_low_bits );	// zero-based: the visible callers add min_match_len
     if( decode_bit( lm.choice2 ) == 0 )
-      return min_match_len + len_low_symbols +
+      return len_low_symbols +
              decode_tree( lm.bm_mid[pos_state], len_mid_bits );
-    return min_match_len + len_low_symbols + len_mid_symbols +
+    return len_low_symbols + len_mid_symbols +
            decode_tree( lm.bm_high, len_high_bits );
     }
   };
@@ -189,8 +242,8 @@ class LZ_decoder
 
   uint8_t get_byte( const unsigned distance ) const
     {
-    int i = pos - distance - 1;
-    if( i < 0 ) i += dictionary_size;
+    unsigned i = pos - distance - 1;	// unsigned arithmetic: wraps around when pos <= distance
+    if( pos <= distance ) i += dictionary_size;	// undo the wrap (presumably distance < dictionary_size -- TODO confirm)
     return buffer[i];
     }
 
@@ -220,32 +273,6 @@ public:
   };
 
 
-class CRC32
-  {
-  uint32_t data[256];		// Table of CRCs of all 8-bit messages.
-
-public:
-  CRC32()
-    {
-    for( unsigned n = 0; n < 256; ++n )
-      {
-      unsigned c = n;
-      for( int k = 0; k < 8; ++k )
-        { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
-      data[n] = c;
-      }
-    }
-
-  void update( uint32_t & crc, const uint8_t * buffer, const int size ) const
-    {
-    for( int i = 0; i < size; ++i )
-      crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
-    }
-  };
-
-const CRC32 crc32;
-
-
 void LZ_decoder::flush_data()
   {
   if( pos > stream_pos )
@@ -322,13 +349,13 @@ bool LZ_decoder::decode_member()		// Returns false if error
           rep1 = rep0;
           rep0 = distance;
           }
-        len = rdec.decode_len( rep_len_model, pos_state );
+        len = min_match_len + rdec.decode_len( rep_len_model, pos_state );
         state.set_rep();
         }
       else
         {
         rep3 = rep2; rep2 = rep1; rep1 = rep0;
-        len = rdec.decode_len( match_len_model, pos_state );
+        len = min_match_len + rdec.decode_len( match_len_model, pos_state );
         const int dis_state = std::min( len - min_match_len, max_dis_states - 1 );
         const int dis_slot =
           rdec.decode_tree( bm_dis_slot[dis_state], dis_slot_bits );
@@ -361,3 +388,72 @@ bool LZ_decoder::decode_member()		// Returns false if error
     }
   return false;
   }
+
+
+// Decompresses each lzip member read from stdin to stdout (exit status: see top of file).
+int main( const int argc, const char * const argv[] )
+  {
+  if( argc > 1 )			// any argument at all: print help and exit
+    {
+    std::printf( "Lzd %s - Educational decompressor for lzip files.\n",
+                 PROGVERSION );
+    std::printf( "Study the source to learn how a lzip decompressor works.\n"
+                 "See the lzip manual for an explanation of the code.\n"
+                 "It is not safe to use lzd for any real work.\n"
+                 "\nUsage: %s < file.lz > file\n", argv[0] );
+    std::printf( "Lzd decompresses from standard input to standard output.\n"
+                 "\nCopyright (C) 2013 Antonio Diaz Diaz.\n"
+                 "This is free software: you are free to change and redistribute it.\n"
+                 "There is NO WARRANTY, to the extent permitted by law.\n"
+                 "Report bugs to lzip-bug@nongnu.org\n"
+                 "Lzip home page: http://www.nongnu.org/lzip/lzip.html\n" );
+    return 0;
+    }
+  // On these platforms put both streams in binary mode (no text-mode translation).
+#if defined(__MSVCRT__) || defined(__OS2__)
+  setmode( STDIN_FILENO, O_BINARY );
+  setmode( STDOUT_FILENO, O_BINARY );
+#endif
+  // One iteration per member; ends at EOF or at data not starting with the magic.
+  for( bool first_member = true; ; first_member = false )
+    {
+    File_header header;
+    for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin );
+    if( std::feof( stdin ) || std::memcmp( header, "LZIP", 4 ) != 0 )
+      {
+      if( first_member )		// nothing decoded yet: the input is not lzip
+        { std::fprintf( stderr, "Bad magic number (file not in lzip format)\n" );
+          return 2; }
+      break;				// after a good member: stop quietly
+      }
+    if( header[4] != 1 )
+      {
+      std::fprintf( stderr, "Version %d member format not supported.\n",
+                    header[4] );
+      return 2;
+      }
+    unsigned dict_size = 1U << ( header[5] & 0x1F );	// bits 0-4: log2 of base size; unsigned as the count may be 31
+    dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );	// bits 5-7: sixteenths to subtract
+    if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
+      { std::fprintf( stderr, "Invalid dictionary size in member header\n" );
+        return 2; }
+    // decoder.crc() and decoder.data_position() are checked against the trailer below.
+    LZ_decoder decoder( dict_size );
+    if( !decoder.decode_member() )
+      { std::fprintf( stderr, "Data error\n" ); return 2; }
+    // Only the CRC and data size are verified; bytes 12-19 (member size) are ignored.
+    File_trailer trailer;
+    for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
+    unsigned crc = 0;			// trailer fields are stored low byte first
+    for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
+    unsigned long long data_size = 0;
+    for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
+    if( crc != decoder.crc() || data_size != decoder.data_position() )
+      { std::fprintf( stderr, "CRC error\n" ); return 2; }
+    }
+  // Close stdout explicitly so that a failure to write buffered output is reported.
+  if( std::fclose( stdout ) != 0 )
+    { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) );
+      return 1; }
+  return 0;
+  }