1 files changed, 534 insertions, 0 deletions
diff --git a/zcmp.cc b/zcmp.cc
new file mode 100644
index 0000000..5336a13
--- /dev/null
+++ b/zcmp.cc
@@ -0,0 +1,534 @@
+/* Zcmp - decompress and compare two files byte by byte
+   Copyright (C) 2010-2024 Antonio Diaz Diaz.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <algorithm>
+#include <cctype>
+#include <cerrno>
+#include <climits>
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#if defined __MSVCRT__ || defined __OS2__
+#include <io.h>
+#endif
+
+#include "arg_parser.h"
+#include "rc.h"
+#include "zutils.h"
+
+#ifndef LLONG_MAX
+#define LLONG_MAX 0x7FFFFFFFFFFFFFFFLL
+#endif
+
+
+namespace {
+
+#include "zcmpdiff.cc"
+
+void show_help()	// write the usage text below to standard output
+  {
+  std::printf( "zcmp compares two files and, if they differ, writes to standard output the\n"
+               "first byte and line number where they differ. Bytes and lines are numbered\n"
+               "starting with 1. A hyphen '-' used as a file argument means standard input.\n"
+               "If any file given is compressed, its decompressed content is used. Compressed\n"
+               "files are decompressed on the fly; no temporary files are created.\n"
+               "\nThe formats supported are bzip2, gzip, lzip, xz, and zstd.\n"
+               "\nUsage: zcmp [options] file1 [file2]\n"
+               "\nzcmp compares file1 to file2. The standard input is used only if file1 or\n"
+               "file2 refers to standard input. If file2 is omitted zcmp tries to compare\n"
+               "file1 with the corresponding uncompressed file (if file1 is compressed), and\n"
+               "then with the corresponding compressed files of the remaining formats until\n"
+               "one is found.\n"
+               "\nExit status is 0 if inputs are identical, 1 if different, 2 if trouble.\n"
+               "\nOptions:\n"
+               "  -h, --help                        display this help and exit\n"
+               "  -V, --version                     output version information and exit\n"
+               "  -b, --print-bytes                 print differing bytes\n"
+               "  -H, --hexadecimal                 print hexadecimal values instead of octal\n"
+               "  -i, --ignore-initial=<n>[:<n2>]   ignore differences in the first <n> bytes\n"
+               "  -l, --list                        list position, value of all differing bytes\n"
+               "  -M, --format=<list>               process only the formats in <list>\n"
+               "  -n, --bytes=<n>                   compare at most <n> bytes\n"
+               "  -N, --no-rcfile                   don't read runtime configuration file\n"
+               "  -O, --force-format=[<f1>][,<f2>]  force one or both input formats\n"
+               "  -q, --quiet, --silent             suppress diagnostics written to stderr\n"
+               "  -s, --script                      suppress messages about file differences\n"
+               "  -v, --verbose                     verbose mode (opposite of --quiet)\n"
+               "      --bz2=<command>               set compressor and options for bzip2 format\n"
+               "      --gz=<command>                set compressor and options for gzip format\n"
+               "      --lz=<command>                set compressor and options for lzip format\n"
+               "      --xz=<command>                set compressor and options for xz format\n"
+               "      --zst=<command>               set compressor and options for zstd format\n"
+               "\nValid formats for options '-M' and '-O' are 'bz2', 'gz', 'lz', 'xz', 'zst',\n"
+               "and 'un' for uncompressed.\n"
+               "\nByte counts given as arguments to options may be expressed in decimal,\n"
+               "hexadecimal, or octal (using the same syntax as integer constants in C++),\n"
+               "and may be followed by a multiplier: k = kB = 10^3 = 1000,\n"
+               "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc.\n" );
+  show_help_addr();	// defined elsewhere; presumably prints where to report bugs / home page (confirm)
+  }
+
+
+/* Return num as text in one of 8 static buffers used in rotation (for printf).
+   Beyond +-1024, a multiple of 1024 (else of 1000) is shortened with a binary
+   (else SI) prefix. 5 or more digits are separated in groups of 3 using '_'. */
+const char * format_num3( long long num )
+  {
+  enum { ring_size = 8, slot_size = 4 * sizeof num, max_prefixes = 10 };
+  static const char si_letters[] = "kMGTPEZYRQ";
+  static const char bin_letters[] = "KMGTPEZYRQ";
+  static char ring[ring_size][slot_size];	// each result lasts 8 calls
+  static int next_slot = 0;
+
+  char * out = ring[next_slot] + slot_size - 1;	// written right to left
+  next_slot = ( next_slot + 1 ) % ring_size;
+  *out = 0;					// terminator
+  const bool minus = ( num < 0 );
+  if( num > 1024 || num < -1024 )
+    {
+    int steps = 0;			// divisions done; selects the prefix letter
+    while( steps < max_prefixes && num % 1024 == 0 ) { num /= 1024; ++steps; }
+    const bool binary = ( steps > 0 );	// SI is tried only if binary failed
+    while( !binary && steps < max_prefixes && num % 1000 == 0 )
+      { num /= 1000; ++steps; }
+    if( binary ) *(--out) = 'i';
+    if( steps > 0 ) *(--out) = ( binary ? bin_letters : si_letters )[steps-1];
+    }
+  const bool grouped = ( num >= 10000 || num <= -10000 );
+  for( int digits = 1; ; ++digits )
+    {
+    const int d = num % 10; num /= 10;		// d <= 0 when num is negative
+    *(--out) = '0' + ( ( d < 0 ) ? -d : d ); if( num == 0 ) break;
+    if( grouped && digits % 3 == 0 ) *(--out) = '_';
+    }
+  if( minus ) *(--out) = '-';
+  return out;
+  }
+
+
+/* Parse the number at the start of arg. Recognized formats: <num>k[B],
+   <num>Ki[B], <num>[MGTPEZYRQ][i][B]. If tailp is nonzero, *tailp is set to
+   the first unparsed character. On a missing or negative number, a bad
+   multiplier, or a result outside [llimit,ulimit], reports it and exits (2). */
+long long getnum( const char * const arg, const char * const option_name,
+                  const char ** const tailp = 0,
+                  const long long llimit = 0,
+                  const long long ulimit = LLONG_MAX )
+  {
+  char * tail; errno = 0;		// strtoll reports overflow through errno
+  long long result = strtoll( arg, &tail, 0 );
+  if( tail == arg )
+    { show_option_error( arg, "Bad or missing numerical argument in",
+                         option_name ); std::exit( 2 ); }
+  if( result < 0 ) errno = ERANGE;
+  // <cctype> functions need an unsigned char value; plain char may be negative
+  if( !errno && tail[0] && std::isalpha( (unsigned char)tail[0] ) )
+    {
+    const unsigned char ch = *tail++;
+    int factor;
+    bool bsuf;					// 'B' suffix is present
+    if( tail[0] == 'i' ) { ++tail; factor = 1024; } else factor = 1000;
+    if( tail[0] == 'B' ) { ++tail; bsuf = true; } else bsuf = false;
+    int exponent = -1;				// -1 = bad multiplier
+    switch( ch )
+      {
+      case 'Q': exponent = 10; break;
+      case 'R': exponent = 9; break;
+      case 'Y': exponent = 8; break;
+      case 'Z': exponent = 7; break;
+      case 'E': exponent = 6; break;
+      case 'P': exponent = 5; break;
+      case 'T': exponent = 4; break;
+      case 'G': exponent = 3; break;
+      case 'M': exponent = 2; break;
+      case 'K': if( factor == 1024 ) exponent = 1; break;
+      case 'k': if( factor == 1000 ) exponent = 1; break;
+      case 'B': if( factor == 1000 && !bsuf ) exponent = 0; break;
+      }
+    if( exponent < 0 )
+      { show_option_error( arg, "Bad multiplier in numerical argument of",
+                           option_name ); std::exit( 2 ); }
+    for( int i = 0; i < exponent; ++i )		// multiply without overflowing
+      if( ulimit / factor >= result ) result *= factor;
+      else { errno = ERANGE; break; }
+    }
+  if( !errno && ( result < llimit || result > ulimit ) ) errno = ERANGE;
+  if( errno )
+    {
+    if( verbosity >= 0 )
+      std::fprintf( stderr, "%s: '%s': Value out of limits [%s,%s] in "
+                    "option '%s'.\n", program_name, arg, format_num3( llimit ),
+                    format_num3( ulimit ), option_name );
+    std::exit( 2 );
+    }
+  if( tailp ) *tailp = tail;
+  return result;
+  }
+
+
+void parse_ignore_initial( const char * const arg, const char * const pn,
+                           long long ignore_initial[2] )  // parse <n>[:<n2>]
+  {
+  const char * rest;			// text left after the first number
+  ignore_initial[1] = ignore_initial[0] = getnum( arg, pn, &rest );
+  if( *rest == 0 ) return;		// lone <n> is used for both entries
+  if( *rest != ':' && *rest != ',' )	// ',' is accepted as separator too
+    { show_option_error( arg, "Missing colon in", pn ); std::exit( 2 ); }
+  ignore_initial[1] = getnum( rest + 1, pn );	// <n2> for the second entry
+  }
+
+
+/* Read and discard up to ignore_initial bytes from infd. Return false if
+   readblock comes up short with errno set; return true otherwise, even
+   if the input ends before ignore_initial bytes have been read. */
+bool skip_ignore_initial( const long long ignore_initial, const int infd )
+  {
+  enum { chunk_size = 4096 };
+  uint8_t scratch[chunk_size];			// the bytes read are discarded
+
+  for( long long pending = ignore_initial; pending > 0; )
+    {
+    const int wanted = ( pending < chunk_size ) ? pending : chunk_size;
+    const int got = readblock( infd, scratch, wanted );
+    if( got != wanted && errno ) return false;	// read error
+    if( got < wanted ) break;			// EOF
+    pending -= got;
+    }
+  return true;
+  }
+
+
+/* Put into buf the NUL-terminated visible form of byte c, quoted like
+   'cat -t' does: "M-" for bytes >= 128, then '^' and a printable character
+   for control codes. buf must have room for 5 chars (e.g. "M-^?"). */
+void sprintc( char * const buf, unsigned char c )
+  {
+  char * out = buf;
+
+  if( c >= 128 )				// high bit set
+    { *out++ = 'M'; *out++ = '-'; c -= 128; }
+  if( c < 32 || c == 127 )			// control code, e.g. 9 gives "^I"
+    { *out++ = '^';
+      c = ( c == 127 ) ? '?' : c + 64; }
+  *out++ = c;
+  *out = 0;
+  }
+
+
+/* Return the length of the common prefix of buffer0 and buffer1. If
+   line_numberp is nonzero, add the newlines in that prefix to *line_numberp.
+   No length is passed; the buffers must differ somewhere to end the scan. */
+int block_compare( const uint8_t * const buffer0,
+                   const uint8_t * const buffer1,
+                   unsigned long long * const line_numberp )
+  {
+  int pos = 0;				// index of the byte pair being compared
+
+  if( !line_numberp )			// no line counting wanted
+    { while( buffer0[pos] == buffer1[pos] ) ++pos; return pos; }
+  int newlines = 0;
+  for( ; buffer0[pos] == buffer1[pos]; ++pos )
+    if( buffer0[pos] == '\n' ) ++newlines;
+  *line_numberp += newlines;
+  return pos;
+  }
+
+
+int cmp( const long long max_size, const int infd[2],	// max_size < 0 means no byte limit
+         const std::string filenames[2], bool finished[2],	// finished[i] is set when a read from input i comes up short
+         const bool hexadecimal, const bool list, const bool print_bytes,	// select what is printed and how
+         const bool scripted )	// returns 0 if no difference was found, 1 if the inputs differ, 2 on read error
+  {	// Compare the data read from infd[0] and infd[1] in blocks of up to buffer_size bytes.
+  const int buffer_size = 4096;
+  unsigned long long byte_number = 1;
+  unsigned long long line_number = 1;
+  // remaining number of bytes to compare; never decremented if max_size < 0
+  long long rest = ( max_size >= 0 ) ? max_size : buffer_size;
+  // buffers with space for sentinels at the end
+  uint8_t * const buffer0 = new uint8_t[2*(buffer_size+1)];
+  uint8_t * const buffer1 = buffer0 + buffer_size + 1;	// second half of the same allocation
+  uint8_t * buffer[2];
+  buffer[0] = buffer0; buffer[1] = buffer1;
+  int retval = 0;
+  bool empty[2] = { true, true };	// true until some data is read from that input
+
+  while( rest > 0 )
+    {
+    const int size = std::min( (long long)buffer_size, rest );
+    if( max_size >= 0 ) rest -= size;
+    int rd[2];			// number of bytes read from each file
+    for( int i = 0; i < 2; ++i )
+      {
+      rd[i] = readblock( infd[i], buffer[i], size );
+      if( rd[i] != size && errno )	// short read with errno set is a read error
+        { show_file_error( filenames[i].c_str(), "Read error", errno );
+          retval = 2; goto done; }
+      if( rd[i] > 0 ) empty[i] = false;
+      }
+    for( int i = 0; i < 2; ++i )
+      if( rd[i] < size ) finished[i] = true;
+
+    const int min_rd = std::min( rd[0], rd[1] );
+    buffer0[min_rd] = 0;		// sentinels for the block compare
+    buffer1[min_rd] = 1;		// they differ, so the scan stops at min_rd at the latest
+
+    int first_diff = block_compare( buffer0, buffer1, list ? 0 : &line_number );
+    byte_number += first_diff;	// now the 1-based number of byte 'first_diff' of this block
+
+    if( first_diff < min_rd )
+      {
+      retval = 1;				// difference found
+      if( scripted ) break;			// status only
+      if( !list )				// show first difference
+        {
+        if( !print_bytes )
+          std::printf( "%s %s differ: byte %llu, line %llu\n",
+                       filenames[0].c_str(), filenames[1].c_str(),
+                       byte_number, line_number );
+        else
+          {
+          const unsigned char c0 = buffer0[first_diff];
+          const unsigned char c1 = buffer1[first_diff];
+          char buf0[5], buf1[5];
+          sprintc( buf0, c0 ); sprintc( buf1, c1 );
+          std::printf( hexadecimal ?
+                       "%s %s differ: byte %llu, line %llu is %02X %s %02X %s\n" :
+                       "%s %s differ: byte %llu, line %llu is %3o %s %3o %s\n",
+                       filenames[0].c_str(), filenames[1].c_str(),
+                       byte_number, line_number, c0, buf0, c1, buf1 );
+          }
+        std::fflush( stdout );
+        break;
+        }
+      else			// list ; show all differences
+        {
+        for( ; first_diff < min_rd; ++byte_number, ++first_diff )	// byte_number advances in step with first_diff
+          {
+          const unsigned char c0 = buffer0[first_diff];
+          const unsigned char c1 = buffer1[first_diff];
+          if( c0 != c1 )
+            {
+            if( !print_bytes )
+              std::printf( hexadecimal ? "%llu %02X %02X\n" : "%llu %3o %3o\n",
+                           byte_number, c0, c1 );
+            else
+              {
+              char buf0[5], buf1[5];
+              sprintc( buf0, c0 ); sprintc( buf1, c1 );
+              std::printf( hexadecimal ? "%llu %02X %-4s %02X %s\n" :
+                           "%llu %3o %-4s %3o %s\n",
+                           byte_number, c0, buf0, c1, buf1 );
+              }
+            }
+          }
+        std::fflush( stdout );
+        }
+      }
+
+    if( rd[0] != rd[1] )	// one input supplied fewer bytes than the other
+      {
+      const int i = rd[1] < rd[0];	// index of the shorter input
+      if( verbosity >= 0 )
+        std::fprintf( stderr, empty[i] ?
+                      "%s: EOF on %s which is empty\n" : list ?
+                      "%s: EOF on %s after byte %llu\n" :
+                      "%s: EOF on %s after byte %llu, in line %llu\n",
+                      program_name, filenames[i].c_str(),
+                      byte_number - 1, line_number );
+      retval = 1; break;
+      }
+    if( min_rd != buffer_size ) break;	// less than a full block was read or requested
+    }
+done:
+  delete[] buffer0;	// releases buffer1 too (same allocation)
+  return retval;
+  }
+
+} // end namespace
+
+
+int main( const int argc, const char * const argv[] )	// returns 0 or 1 as given by cmp(), 2 on any error
+  {
+  enum { bz2_opt = 256, gz_opt, lz_opt, xz_opt, zst_opt };	// codes of the long-only options
+  // number of initial bytes ignored for each file
+  long long ignore_initial[2] = { 0, 0 };
+  long long max_size = -1;			// < 0 means unlimited size
+  int format_types[2] = { -1, -1 };		// < 0 means undefined
+  bool hexadecimal = false;
+  bool list = false;		// list position, value of all differing bytes
+  bool print_bytes = false;	// print differing bytes
+  bool scripted = false;	// suppress messages about file differences
+  program_name = "zcmp";
+  invocation_name = ( argc > 0 ) ? argv[0] : program_name;
+
+  const Arg_parser::Option options[] =
+    {
+    { 'b', "print-bytes",    Arg_parser::no  },
+    { 'h', "help",           Arg_parser::no  },
+    { 'H', "hexadecimal",    Arg_parser::no  },
+    { 'i', "ignore-initial", Arg_parser::yes },
+    { 'l', "list",           Arg_parser::no  },
+    { 'M', "format",         Arg_parser::yes },
+    { 'n', "bytes",          Arg_parser::yes },
+    { 'N', "no-rcfile",      Arg_parser::no  },
+    { 'O', "force-format",   Arg_parser::yes },
+    { 'q', "quiet",          Arg_parser::no  },
+    { 'q', "silent",         Arg_parser::no  },
+    { 's', "script",         Arg_parser::no  },
+    { 'v', "verbose",        Arg_parser::no  },
+    { 'V', "version",        Arg_parser::no  },
+    { bz2_opt, "bz2",        Arg_parser::yes },
+    { gz_opt,  "gz",         Arg_parser::yes },
+    { lz_opt,  "lz",         Arg_parser::yes },
+    { xz_opt,  "xz",         Arg_parser::yes },
+    { zst_opt, "zst",        Arg_parser::yes },
+    {  0,   0,               Arg_parser::no  } };
+
+  const Arg_parser parser( argc, argv, options );
+  if( parser.error().size() )				// bad option
+    { show_error( parser.error().c_str(), 0, true ); return 2; }
+
+  maybe_process_config_file( parser );	// defined elsewhere; presumably honors -N (confirm)
+
+  int argind = 0;
+  for( ; argind < parser.arguments(); ++argind )
+    {
+    const int code = parser.code( argind );
+    if( !code ) break;					// no more options
+    const char * const pn = parser.parsed_name( argind ).c_str();	// NOTE(review): safe only if parsed_name returns a reference (confirm)
+    const std::string & sarg = parser.argument( argind );
+    const char * const arg = sarg.c_str();
+    switch( code )
+      {
+      case 'b': print_bytes = true; break;
+      case 'h': show_help(); return 0;
+      case 'H': hexadecimal = true; break;
+      case 'i': parse_ignore_initial( arg, pn, ignore_initial ); break;
+      case 'l': list = true; break;
+      case 'M': parse_format_list( sarg, pn ); break;
+      case 'n': max_size = getnum( arg, pn ); break;
+      case 'N': break;	// nothing to do in this loop
+      case 'O': parse_format_types2( sarg, pn, format_types ); break;
+      case 'q': verbosity = -1; break;
+      case 's': scripted = true; break;
+      case 'v': if( verbosity < 4 ) ++verbosity; break;
+      case 'V': show_version(); return 0;
+      case bz2_opt: parse_compressor( sarg, pn, fmt_bz2 ); break;
+      case gz_opt: parse_compressor( sarg, pn, fmt_gz ); break;
+      case lz_opt: parse_compressor( sarg, pn, fmt_lz ); break;
+      case xz_opt: parse_compressor( sarg, pn, fmt_xz ); break;
+      case zst_opt: parse_compressor( sarg, pn, fmt_zst ); break;
+      default: internal_error( "uncaught option." );
+      }
+    } // end process options
+
+#if defined __MSVCRT__ || defined __OS2__
+  setmode( STDIN_FILENO, O_BINARY );
+  setmode( STDOUT_FILENO, O_BINARY );
+#endif
+
+  const int files = parser.arguments() - argind;	// number of non-option arguments
+  if( files < 1 ) { show_error( "No files given.", 0, true ); return 2; }
+  if( files > 2 ) { show_error( "Too many files.", 0, true ); return 2; }
+
+  std::string filenames[2];		// file names of the two input files
+  filenames[0] = parser.argument( argind );
+  if( files == 2 ) filenames[1] = parser.argument( argind + 1 );
+
+  int infd[2];				// file descriptors of the two files
+  infd[0] = ( filenames[0] == "-" ) ?
+    STDIN_FILENO : open_instream( filenames[0] );
+  if( infd[0] < 0 ) return 2;
+
+  if( files == 2 )
+    {
+    if( check_identical( filenames[0].c_str(), filenames[1].c_str() ) )	// presumably true if both names are the same file (confirm)
+      {
+      if( ignore_initial[0] == ignore_initial[1] ) return 0;	// same data, same offset: nothing to compare
+      else { show_error( "Can't compare parts of same file." ); return 2; }
+      }
+    infd[1] = ( filenames[1] == "-" ) ?
+      STDIN_FILENO : open_instream( filenames[1] );
+    if( infd[1] < 0 ) return 2;
+    }
+  else	// only file1 given; look for a counterpart file
+    {
+    if( filenames[0] == "-" )
+      { show_error( "Missing operand after '-'.", 0, true ); return 2; }
+    if( format_types[0] >= 0 || format_types[1] >= 0 )
+      { show_error( "Two files must be given when format is specified.", 0, true );
+        return 2; }
+    filenames[1] = filenames[0];
+    infd[1] = open_other_instream( filenames[1] );	// NOTE(review): presumably updates filenames[1] to the file found (confirm)
+    if( infd[1] < 0 )
+      {
+      if( verbosity >= 0 )
+        std::fprintf( stderr, "%s: Can't find file to compare with '%s'.\n",
+                      program_name, filenames[0].c_str() );
+      show_error( 0, 0, true ); return 2;
+      }
+    }
+
+  int old_infd[2];		// copy of file descriptors of the two files
+  old_infd[0] = infd[0]; old_infd[1] = infd[1];
+  Children children[2];
+  if( !set_data_feeder( filenames[0], &infd[0], children[0], format_types[0] ) ||	// infd[i] is passed by address; presumably replaced (confirm)
+      !set_data_feeder( filenames[1], &infd[1], children[1], format_types[1] ) )
+    return 2;
+
+  for( int i = 0; i < 2; ++i )
+    if( !skip_ignore_initial( ignore_initial[i], infd[i] ) )
+      {
+      show_file_error( filenames[i].c_str(),
+                       "Read error skipping initial bytes", errno );
+      return 2;
+      }
+
+  bool finished[2] = { false, false };	// set by cmp for each input that came up short
+  int retval = cmp( max_size, infd, filenames, finished, hexadecimal, list,
+                    print_bytes, scripted );
+
+  for( int i = 0; i < 2; ++i )
+    if( !good_status( children[i], finished[i] ) ) retval = 2;
+
+  for( int i = 0; i < 2; ++i )
+    {
+    if( close( infd[i] ) != 0 )
+      { show_close_error(); retval = 2; }
+    if( filenames[i] != "-" && close( old_infd[i] ) != 0 )	// standard input is not closed here
+      {
+      show_file_error( filenames[i].c_str(), "Error closing input file", errno );
+      retval = 2;
+      }
+    }
+  if( std::fclose( stdout ) != 0 )	// detect errors writing the buffered output
+    {
+    show_error( "Error closing stdout", errno );
+    retval = 2;
+    }
+
+  return retval;
+  }