1 files changed, 98 insertions, 73 deletions
diff --git a/lzd.cc b/lzd.cc
index 503bc36..3cf3f13 100644
--- a/lzd.cc
+++ b/lzd.cc
@@ -1,25 +1,25 @@
-/*  Lzd - Educational decompressor for the lzip format
-    Copyright (C) 2013-2019 Antonio Diaz Diaz.
+/* Lzd - Educational decompressor for the lzip format
+   Copyright (C) 2013-2024 Antonio Diaz Diaz.
 
-    This program is free software. Redistribution and use in source and
-    binary forms, with or without modification, are permitted provided
-    that the following conditions are met:
+   This program is free software. Redistribution and use in source and
+   binary forms, with or without modification, are permitted provided
+   that the following conditions are met:
 
-    1. Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
+   1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions, and the following disclaimer.
 
-    2. Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in the
-    documentation and/or other materials provided with the distribution.
+   2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
 
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */
 /*
-    Exit status: 0 for a normal exit, 1 for environmental problems
-    (file not found, invalid flags, I/O errors, etc), 2 to indicate a
-    corrupt or invalid input file.
+   Exit status: 0 for a normal exit, 1 for environmental problems
+   (file not found, invalid command-line options, I/O errors, etc.), 2 to
+   indicate a corrupt or invalid input file.
 */
 
 #include <algorithm>
@@ -29,7 +29,7 @@
 #include <cstring>
 #include <stdint.h>
 #include <unistd.h>
-#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__)
+#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__
 #include <fcntl.h>
 #include <io.h>
 #endif
@@ -47,7 +47,7 @@ public:
 
   void set_char()
     {
-    static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
+    const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
     st = next[st];
     }
   void set_match()     { st = ( st < 7 ) ? 7 : 10; }
@@ -69,7 +69,7 @@ enum {
   dis_slot_bits = 6,
   start_dis_model = 4,
   end_dis_model = 14,
-  modeled_distances = 1 << (end_dis_model / 2),		// 128
+  modeled_distances = 1 << ( end_dis_model / 2 ),	// 128
   dis_align_bits = 4,
   dis_align_size = 1 << dis_align_bits,
 
@@ -130,25 +130,31 @@ public:
 const CRC32 crc32;
 
 
-typedef uint8_t Lzip_header[6];	// 0-3 magic, 4 version, 5 coded_dict_size
-
-typedef uint8_t Lzip_trailer[20];
+enum { header_size = 6, trailer_size = 20 };
+typedef uint8_t Lzip_header[header_size]; // 0-3 magic bytes
+					  //   4 version
+					  //   5 coded dictionary size
+typedef uint8_t Lzip_trailer[trailer_size];
 			//  0-3  CRC32 of the uncompressed data
 			//  4-11 size of the uncompressed data
 			// 12-19 member size including header and trailer
 
 class Range_decoder
   {
+  unsigned long long member_pos;
   uint32_t code;
   uint32_t range;
 
 public:
-  Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
+  Range_decoder()
+    : member_pos( header_size ), code( 0 ), range( 0xFFFFFFFFU )
     {
-    for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte();
+    get_byte();			// discard first byte of the LZMA stream
+    for( int i = 0; i < 4; ++i ) code = ( code << 8 ) | get_byte();
     }
 
-  uint8_t get_byte() { return std::getc( stdin ); }
+  uint8_t get_byte() { ++member_pos; return std::getc( stdin ); }
+  unsigned long long member_position() const { return member_pos; }
 
   unsigned decode( const int num_bits )
     {
@@ -159,30 +165,31 @@ public:
       symbol <<= 1;
       if( code >= range ) { code -= range; symbol |= 1; }
       if( range <= 0x00FFFFFFU )			// normalize
-        { range <<= 8; code = (code << 8) | get_byte(); }
+        { range <<= 8; code = ( code << 8 ) | get_byte(); }
       }
     return symbol;
     }
 
-  unsigned decode_bit( Bit_model & bm )
+  bool decode_bit( Bit_model & bm )
     {
-    unsigned symbol;
+    bool symbol;
     const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
     if( code < bound )
       {
       range = bound;
-      bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
+      bm.probability +=
+        ( bit_model_total - bm.probability ) >> bit_model_move_bits;
       symbol = 0;
       }
     else
       {
-      range -= bound;
       code -= bound;
+      range -= bound;
       bm.probability -= bm.probability >> bit_model_move_bits;
       symbol = 1;
       }
     if( range <= 0x00FFFFFFU )				// normalize
-      { range <<= 8; code = (code << 8) | get_byte(); }
+      { range <<= 8; code = ( code << 8 ) | get_byte(); }
     return symbol;
     }
 
@@ -191,7 +198,7 @@ public:
     unsigned symbol = 1;
     for( int i = 0; i < num_bits; ++i )
       symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
-    return symbol - (1 << num_bits);
+    return symbol - ( 1 << num_bits );
     }
 
   unsigned decode_tree_reversed( Bit_model bm[], const int num_bits )
@@ -211,8 +218,8 @@ public:
     unsigned symbol = 1;
     for( int i = 7; i >= 0; --i )
       {
-      const unsigned match_bit = ( match_byte >> i ) & 1;
-      const unsigned bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] );
+      const bool match_bit = ( match_byte >> i ) & 1;
+      const bool bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] );
       symbol = ( symbol << 1 ) | bit;
       if( match_bit != bit )
         {
@@ -227,11 +234,12 @@ public:
   unsigned decode_len( Len_model & lm, const int pos_state )
     {
     if( decode_bit( lm.choice1 ) == 0 )
-      return decode_tree( lm.bm_low[pos_state], len_low_bits );
+      return min_match_len +
+             decode_tree( lm.bm_low[pos_state], len_low_bits );
     if( decode_bit( lm.choice2 ) == 0 )
-      return len_low_symbols +
+      return min_match_len + len_low_symbols +
              decode_tree( lm.bm_mid[pos_state], len_mid_bits );
-    return len_low_symbols + len_mid_symbols +
+    return min_match_len + len_low_symbols + len_mid_symbols +
            decode_tree( lm.bm_high, len_high_bits );
     }
   };
@@ -278,7 +286,11 @@ public:
   ~LZ_decoder() { delete[] buffer; }
 
   unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
-  unsigned long long data_position() const { return partial_data_pos + pos; }
+  unsigned long long data_position() const
+    { return partial_data_pos + pos; }
+  uint8_t get_byte() { return rdec.get_byte(); }
+  unsigned long long member_position() const
+    { return rdec.member_position(); }
 
   bool decode_member();
   };
@@ -290,7 +302,6 @@ void LZ_decoder::flush_data()
     {
     const unsigned size = pos - stream_pos;
     crc32.update_buf( crc_, buffer + stream_pos, size );
-    errno = 0;
     if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
       { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
         std::exit( 1 ); }
@@ -301,7 +312,7 @@ void LZ_decoder::flush_data()
   }
 
 
-bool LZ_decoder::decode_member()		// Returns false if error
+bool LZ_decoder::decode_member()	// Return false if error
   {
   Bit_model bm_literal[1<<literal_context_bits][0x300];
   Bit_model bm_match[State::states][pos_states];
@@ -363,12 +374,12 @@ bool LZ_decoder::decode_member()		// Returns false if error
         rep0 = distance;
         }
       state.set_rep();
-      len = min_match_len + rdec.decode_len( rep_len_model, pos_state );
+      len = rdec.decode_len( rep_len_model, pos_state );
       }
     else					// match
       {
       rep3 = rep2; rep2 = rep1; rep1 = rep0;
-      len = min_match_len + rdec.decode_len( match_len_model, pos_state );
+      len = rdec.decode_len( match_len_model, pos_state );
       const int len_state = std::min( len - min_match_len, len_states - 1 );
       rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits );
       if( rep0 >= start_dis_model )
@@ -381,12 +392,13 @@ bool LZ_decoder::decode_member()		// Returns false if error
                                              direct_bits );
         else
           {
-          rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
+          rep0 +=
+            rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
           rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
           if( rep0 == 0xFFFFFFFFU )		// marker found
             {
             flush_data();
-            return ( len == min_match_len );	// End Of Stream marker
+            return len == min_match_len;	// End Of Stream marker
             }
           }
         }
@@ -403,61 +415,74 @@ bool LZ_decoder::decode_member()		// Returns false if error
 
 int main( const int argc, const char * const argv[] )
   {
-  if( argc > 1 )
+  if( argc > 2 || ( argc == 2 && std::strcmp( argv[1], "-d" ) != 0 ) )
     {
-    std::printf( "Lzd %s - Educational decompressor for the lzip format.\n",
-                 PROGVERSION );
-    std::printf( "Study the source to learn how a lzip decompressor works.\n"
-                 "See the lzip manual for an explanation of the code.\n"
-                 "It is not safe to use lzd for any real work.\n"
-                 "\nUsage: %s < file.lz > file\n", argv[0] );
-    std::printf( "Lzd decompresses from standard input to standard output.\n"
-                 "\nCopyright (C) 2019 Antonio Diaz Diaz.\n"
-                 "This is free software: you are free to change and redistribute it.\n"
-                 "There is NO WARRANTY, to the extent permitted by law.\n"
-                 "Report bugs to lzip-bug@nongnu.org\n"
-                 "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" );
+    std::printf(
+      "Lzd %s - Educational decompressor for the lzip format.\n"
+      "Study the source code to learn how a lzip decompressor works.\n"
+      "See the lzip manual for an explanation of the code.\n"
+      "\nUsage: %s [-d] < file.lz > file\n"
+      "Lzd decompresses from standard input to standard output.\n"
+      "\nCopyright (C) 2024 Antonio Diaz Diaz.\n"
+      "License 2-clause BSD.\n"
+      "This is free software: you are free to change and redistribute "
+      "it.\nThere is NO WARRANTY, to the extent permitted by law.\n"
+      "Report bugs to lzip-bug@nongnu.org\n"
+      "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n",
+      PROGVERSION, argv[0] );
     return 0;
     }
 
-#if defined(__MSVCRT__) || defined(__OS2__) || defined(__DJGPP__)
+#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__
   setmode( STDIN_FILENO, O_BINARY );
   setmode( STDOUT_FILENO, O_BINARY );
 #endif
 
   for( bool first_member = true; ; first_member = false )
     {
-    Lzip_header header;				// verify header
-    for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin );
+    Lzip_header header;				// check header
+    for( int i = 0; i < header_size; ++i ) header[i] = std::getc( stdin );
     if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 )
       {
       if( first_member )
-        { std::fputs( "Bad magic number (file not in lzip format).\n", stderr );
-          return 2; }
-      break;
+        { std::fputs( "Bad magic number (file not in lzip format).\n",
+                      stderr ); return 2; }
+      break;					// ignore trailing data
       }
     unsigned dict_size = 1 << ( header[5] & 0x1F );
     dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
     if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
-      { std::fputs( "Invalid dictionary size in member header.\n", stderr );
-        return 2; }
+      { std::fputs( "Invalid dictionary size in member header.\n",
+                    stderr ); return 2; }
 
     LZ_decoder decoder( dict_size );		// decode LZMA stream
     if( !decoder.decode_member() )
       { std::fputs( "Data error\n", stderr ); return 2; }
 
-    Lzip_trailer trailer;			// verify trailer
-    for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
+    Lzip_trailer trailer;			// check trailer
+    for( int i = 0; i < trailer_size; ++i ) trailer[i] = decoder.get_byte();
+    int retval = 0;
     unsigned crc = 0;
-    for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
+    for( int i = 3; i >= 0; --i ) crc = ( crc << 8 ) + trailer[i];
+    if( crc != decoder.crc() )
+      { std::fputs( "CRC mismatch\n", stderr ); retval = 2; }
+
     unsigned long long data_size = 0;
-    for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
-    if( crc != decoder.crc() || data_size != decoder.data_position() )
-      { std::fputs( "CRC error\n", stderr ); return 2; }
+    for( int i = 11; i >= 4; --i )
+      data_size = ( data_size << 8 ) + trailer[i];
+    if( data_size != decoder.data_position() )
+      { std::fputs( "Data size mismatch\n", stderr ); retval = 2; }
+
+    unsigned long long member_size = 0;
+    for( int i = 19; i >= 12; --i )
+      member_size = ( member_size << 8 ) + trailer[i];
+    if( member_size != decoder.member_position() )
+      { std::fputs( "Member size mismatch\n", stderr ); retval = 2; }
+    if( retval ) return retval;
     }
 
   if( std::fclose( stdout ) != 0 )
-    { std::fprintf( stderr, "Error closing stdout: %s\n", std::strerror( errno ) );
-      return 1; }
+    { std::fprintf( stderr, "Error closing stdout: %s\n",
+                    std::strerror( errno ) ); return 1; }
   return 0;
   }