1 files changed, 502 insertions, 0 deletions
diff --git a/zcmp.cc b/zcmp.cc
new file mode 100644
index 0000000..caa7852
--- /dev/null
+++ b/zcmp.cc
@@ -0,0 +1,502 @@
+/*  Zcmp - decompress and compare two files byte by byte
+    Copyright (C) 2010 Antonio Diaz Diaz.
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <cctype>
+#include <cerrno>
+#include <climits>
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#if defined(__MSVCRT__) || defined(__OS2__)
+#include <io.h>
+#endif
+
+#include "arg_parser.h"
+#include "zutils.h"
+
+#if CHAR_BIT != 8
+#error "Environments where CHAR_BIT != 8 are not supported."
+#endif
+
+#ifndef LLONG_MAX
+#define LLONG_MAX  0x7FFFFFFFFFFFFFFFLL
+#endif
+#ifndef LLONG_MIN
+#define LLONG_MIN  (-LLONG_MAX - 1LL)
+#endif
+#ifndef ULLONG_MAX
+#define ULLONG_MAX 0xFFFFFFFFFFFFFFFFULL
+#endif
+
+
+namespace {
+
+// Extra flag OR-ed into every open() call in this file: O_BINARY where the
+// platform defines it, 0 otherwise.
+const int o_binary =
+#ifdef O_BINARY
+  O_BINARY;
+#else
+  0;
+#endif
+
+// One entry per recognized compressed-file extension: 'from' is the
+// extension looked for at the end of a file name, 'to' is the text that
+// replaces it. The table is terminated by an entry whose 'from' is null.
+struct Known_extension
+  {
+  const char * from;
+  const char * to;
+  };
+
+const Known_extension known_extensions[] =
+  {
+  { ".bz2",  ""     },
+  { ".tbz",  ".tar" },
+  { ".tbz2", ".tar" },
+  { ".gz",   ""     },
+  { ".tgz",  ".tar" },
+  { ".lz",   ""     },
+  { ".tlz",  ".tar" },
+  { ".xz",   ""     },
+  { ".txz",  ".tar" },
+  { 0,       0      }
+  };
+
+
+// Write the program description, usage synopsis and option list to
+// standard output, then call show_help_addr().
+void show_help() throw()
+  {
+  // The whole text is a single literal; it contains no '%' characters.
+  const char * const help_text =
+    "Zcmp compares two files (\"-\" means standard input), and if they\n"
+    "differ, tells the first byte and line number where they differ. Bytes\n"
+    "and lines are numbered starting with 1. If any given file is compressed,\n"
+    "its uncompressed content is used. Compressed files are uncompressed on\n"
+    "the fly; no temporary files are created.\n"
+    "The supported compressors are bzip2, gzip, lzip and xz.\n"
+    "\nUsage: zcmp [options] file1 [file2]\n"
+    "\nCompares <file1> to <file2>. If <file2> is omitted zcmp tries the\n"
+    "following:\n"
+    "If <file1> is compressed, compares <file1> to the file with the\n"
+    "corresponding decompressed file name (removes the extension from\n"
+    "<file1>).\n"
+    "If <file1> is not compressed, compares <file1> to the uncompressed\n"
+    "contents of <file1>.[bz2|gz|lz|xz] (the first one that is found).\n"
+    "If no suitable file is found, compares <file1> to data read from\n"
+    "standard input.\n"
+    "\nExit status is 0 if inputs are identical, 1 if different, 2 if trouble.\n"
+    "\nOptions:\n"
+    "  -h, --help                       display this help and exit\n"
+    "  -V, --version                    output version information and exit\n"
+    "  -b, --print-bytes                print differing bytes\n"
+    "  -i, --ignore-initial=<n>[,<n2>]  ignore differences in the first <n> bytes\n"
+    "  -l, --list                       list position, value of all differing bytes\n"
+    "  -n, --bytes=<n>                  compare at most <n> bytes\n"
+    "  -q, --quiet                      suppress all messages\n"
+    "  -s, --silent                     (same as --quiet)\n"
+    "  -v, --verbose                    verbose mode (same as --list)\n"
+    "Numbers may be followed by a multiplier: k = kB = 10^3 = 1000,\n"
+    "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc...\n";
+  std::fputs( help_text, stdout );
+  show_help_addr();
+  }
+
+
+// Parse the integer at the start of 'ptr' and return it.
+//
+// The digits are read by strtoll with base 0. They may be followed by a
+// multiplier letter (k, M, G, T, P, E, Z, Y for powers of 1000; the same
+// letters followed by 'i', with 'K' instead of 'k', for powers of 1024)
+// and an optional 'B', as described in the help text ("k = kB",
+// "Ki = KiB").
+//
+// tailp   if not null, receives a pointer to the first character after
+//         the number and its multiplier, so that a caller can go on
+//         parsing a list such as "1k,2k".
+// llimit  smallest value accepted.
+// ulimit  largest value accepted.
+//
+// Does not return on error: a missing number, an unknown multiplier or a
+// value out of range is reported through show_error and the program
+// exits with status 2.
+long long getnum( const char * const ptr, const char ** const tailp = 0,
+                  const long long llimit = LLONG_MIN + 1,
+                  const long long ulimit = LLONG_MAX ) throw()
+  {
+  errno = 0;
+  char * tail;
+  long long result = strtoll( ptr, &tail, 0 );
+  if( tail == ptr )
+    {
+    show_error( "Bad or missing numerical argument.", 0, true );
+    std::exit( 2 );
+    }
+
+  // isalpha is undefined for negative values, so never pass a plain char
+  if( !errno && tail[0] &&
+      std::isalpha( static_cast< unsigned char >( tail[0] ) ) )
+    {
+    const bool binary = ( tail[1] == 'i' );
+    const int factor = binary ? 1024 : 1000;
+    int exponent = 0;
+    bool bad_multiplier = false;
+    switch( tail[0] )
+      {
+      case 'Y': exponent = 8; break;
+      case 'Z': exponent = 7; break;
+      case 'E': exponent = 6; break;
+      case 'P': exponent = 5; break;
+      case 'T': exponent = 4; break;
+      case 'G': exponent = 3; break;
+      case 'M': exponent = 2; break;
+      case 'K': if( binary ) exponent = 1; else bad_multiplier = true;
+                break;
+      case 'k': if( !binary ) exponent = 1; else bad_multiplier = true;
+                break;
+      default : bad_multiplier = true;
+      }
+    if( bad_multiplier )
+      {
+      show_error( "Bad multiplier in numerical argument.", 0, true );
+      std::exit( 2 );
+      }
+    const long long max_before_mult = LLONG_MAX / factor;
+    for( int i = 0; i < exponent; ++i )
+      {
+      // two-sided range test; llabs( LLONG_MIN ) would be undefined
+      if( result <= max_before_mult && result >= -max_before_mult )
+        result *= factor;
+      else { errno = ERANGE; break; }
+      }
+    // Step over the multiplier so that *tailp points past the whole
+    // number; otherwise the caller sees the multiplier letter instead of
+    // the separator that follows it.
+    tail += binary ? 2 : 1;
+    if( tail[0] == 'B' ) ++tail;
+    }
+  if( !errno && ( result < llimit || result > ulimit ) ) errno = ERANGE;
+  if( errno )
+    {
+    show_error( "Numerical argument out of limits." );
+    std::exit( 2 );
+    }
+  if( tailp ) *tailp = tail;
+  return result;
+  }
+
+
+// Open 'input_filename' for reading (with o_binary added to the flags).
+// Returns the descriptor returned by open(). When that is negative and
+// verbosity is not negative, a diagnostic naming the file and the errno
+// text is written to stderr first.
+int open_instream( const std::string & input_filename ) throw()
+  {
+  const char * const path = input_filename.c_str();
+  const int fd = open( path, O_RDONLY | o_binary );
+  if( fd >= 0 || verbosity < 0 ) return fd;	// success, or quiet mode
+  std::fprintf( stderr, "%s: Can't open input file `%s': %s.\n",
+                util_name, path, std::strerror( errno ) );
+  return fd;
+  }
+
+
+// Find and open the counterpart of the file called 'name'.
+//
+// If 'name' ends in one of known_extensions[].from, the extension is
+// replaced by the matching 'to' text and that file is opened; 'name' is
+// rewritten even when the open fails. Otherwise each entry of
+// simple_extensions is appended to 'name' in turn and the first file that
+// opens is used; 'name' is updated only on success.
+//
+// Returns the descriptor, or a negative value if nothing could be opened.
+// No diagnostic is printed.
+int open_other_instream( std::string & name ) throw()
+  {
+  // compressed name given: look for the uncompressed version
+  for( int i = 0; known_extensions[i].from; ++i )
+    {
+    const std::string ext( known_extensions[i].from );
+    if( name.size() <= ext.size() ) continue;
+    const std::string::size_type stem_len = name.size() - ext.size();
+    if( name.compare( stem_len, ext.size(), ext ) != 0 ) continue;
+    name.erase( stem_len );
+    name.append( known_extensions[i].to );
+    return open( name.c_str(), O_RDONLY | o_binary );
+    }
+  // uncompressed name given: look for a compressed version
+  for( int i = 0; simple_extensions[i]; ++i )
+    {
+    std::string candidate( name );
+    candidate += simple_extensions[i];
+    const int fd = open( candidate.c_str(), O_RDONLY | o_binary );
+    if( fd < 0 ) continue;
+    name = candidate;
+    return fd;
+    }
+  return -1;
+  }
+
+
+// Return true if 'name1' and 'name2' designate the same file: either the
+// two strings are equal, or both names can be stat()ed and report the
+// same device and inode numbers. Return false if either stat() fails.
+bool check_identical( const char * const name1, const char * const name2 ) throw()
+  {
+  if( std::strcmp( name1, name2 ) == 0 ) return true;
+  struct stat st1, st2;
+  if( stat( name1, &st1 ) != 0 ) return false;
+  if( stat( name2, &st2 ) != 0 ) return false;
+  return st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino;
+  }
+
+
+// Parse the argument of --ignore-initial into ignore_initial[0..1].
+// 'arg' holds one non-negative number, optionally followed by ',' or ':'
+// and a second non-negative number. With a single number both elements
+// receive the same value. The numbers are read with getnum.
+void parse_ignore_initial( const char * const arg, long long ignore_initial[2] )
+  {
+  const char * rest;
+  const long long first = getnum( arg, &rest, 0 );
+  ignore_initial[0] = first;
+  const bool has_second = ( *rest == ',' || *rest == ':' );
+  ignore_initial[1] = has_second ? getnum( rest + 1, 0, 0 ) : first;
+  }
+
+
+// Read and discard the first 'ignore_initial' bytes from 'infd'.
+// Returns false if readblock delivers fewer bytes than requested while
+// errno is set. Returns true otherwise, including when the input ends
+// before 'ignore_initial' bytes have been read.
+bool skip_ignore_initial( const long long ignore_initial, const int infd )
+  {
+  enum { buffer_size = 4096 };
+  uint8_t discard[buffer_size];
+  for( long long pending = ignore_initial; pending > 0; )
+    {
+    const int chunk = ( pending < buffer_size ) ? (int)pending : (int)buffer_size;
+    const int got = readblock( infd, discard, chunk );
+    if( got != chunk && errno ) return false;	// read error
+    if( got < chunk ) break;			// end of input
+    pending -= got;
+    }
+  return true;
+  }
+
+
+// Store in 'buf' a printable, null-terminated rendering of the byte 'c':
+// bytes >= 128 get the prefix "M-" and are then treated as c - 128,
+// bytes < 32 are written as '^' followed by c + 64, and 127 is written
+// as "^?". Any other byte is copied unchanged.
+// 'buf' needs room for 5 chars (the longest result is "M-^?" plus null).
+void sprintc( char * const buf, unsigned char c )
+  {
+  char * out = buf;
+
+  if( c >= 128 ) { *out++ = 'M'; *out++ = '-'; c -= 128; }
+  if( c < 32 ) { *out++ = '^'; c += 64; }
+  else if( c == 127 ) { *out++ = '^'; c = '?'; }
+  *out++ = c;
+  *out = 0;
+  }
+
+
+// Return the offset of the first position at which 'buffer0' and
+// 'buffer1' hold different bytes.
+// The scan has no length limit: the caller must guarantee that the two
+// buffers differ somewhere (cmp does so by writing sentinel bytes).
+// When verbosity is 0, the number of newline bytes preceding that
+// position is added to *line_numberp; otherwise *line_numberp is left
+// untouched.
+int block_compare( const uint8_t * const buffer0,
+                   const uint8_t * const buffer1,
+                   long long * line_numberp )
+  {
+  int pos = 0;
+  if( verbosity != 0 )
+    while( buffer0[pos] == buffer1[pos] ) ++pos;
+  else
+    {
+    int newlines = 0;
+    for( ; buffer0[pos] == buffer1[pos]; ++pos )
+      if( buffer0[pos] == '\n' ) ++newlines;
+    *line_numberp += newlines;
+    }
+  return pos;
+  }
+
+
+// Compare, block by block, at most 'max_size' bytes read from infd[0]
+// and infd[1].
+//
+// filenames    names used in the messages.
+// print_bytes  if true, the differing bytes are shown next to their
+//              octal values.
+//
+// What is reported depends on verbosity: if negative, nothing is
+// printed; if 0, only the first difference is printed (with its byte and
+// line number); if positive, every differing byte is listed.
+//
+// Returns 0 if no difference was found, 1 if the inputs differ or one of
+// them ends before the other, 2 if a read error occurred.
+int cmp( const long long max_size, const int infd[2],
+         const std::string filenames[2], const bool print_bytes )
+  {
+  enum { buffer_size = 4096 };
+  long long byte_number = 1;
+  long long line_number = 1;
+  long long rest = max_size;	// remaining number of bytes to compare
+  // Both buffers, each with space for a sentinel at the end, live in one
+  // vector: it frees the memory on every return path and zero-fills it,
+  // so setting the sentinels never reads an uninitialized byte.
+  std::vector< uint8_t > storage( 2 * ( buffer_size + 1 ) );
+  uint8_t * const buffer0 = &storage[0];
+  uint8_t * const buffer1 = buffer0 + buffer_size + 1;
+  uint8_t * buffer[2];
+  buffer[0] = buffer0; buffer[1] = buffer1;
+  int different = 0;
+
+  while( rest > 0 )
+    {
+    const int size = ( rest < buffer_size ) ? (int)rest : (int)buffer_size;
+    int rd[2];			// number of bytes read from each file
+    for( int i = 0; i < 2; ++i )
+      {
+      rd[i] = readblock( infd[i], buffer[i], size );
+      if( rd[i] != size && errno )
+        {
+        if( verbosity >= 0 )
+          std::fprintf( stderr, "%s: Error reading file `%s': %s.\n",
+                        util_name, filenames[i].c_str(), std::strerror( errno ) );
+        return 2;
+        }
+      }
+    rest -= size;
+
+    // Sentinels for the block compare: force a mismatch just past the
+    // data read from each file so that block_compare always stops.
+    buffer0[rd[0]] = ~buffer1[rd[0]];
+    buffer1[rd[1]] = ~buffer0[rd[1]];
+
+    int first_diff = block_compare( buffer0, buffer1, &line_number );
+    byte_number += first_diff;
+    const int min_rd = ( rd[0] < rd[1] ) ? rd[0] : rd[1];
+
+    if( first_diff < min_rd )		// a real difference, not a sentinel
+      {
+      if( verbosity < 0 ) return 1;		// return status only
+      if( verbosity == 0 )			// show first difference
+        {
+        if( !print_bytes )
+          std::printf( "%s %s differ: byte %lld, line %lld\n",
+                       filenames[0].c_str(), filenames[1].c_str(),
+                       byte_number, line_number );
+        else
+          {
+          const unsigned char c0 = buffer0[first_diff];
+          const unsigned char c1 = buffer1[first_diff];
+          char buf0[5], buf1[5];
+          sprintc( buf0, c0 ); sprintc( buf1, c1 );
+          std::printf( "%s %s differ: byte %lld, line %lld is %3o %s %3o %s\n",
+                       filenames[0].c_str(), filenames[1].c_str(),
+                       byte_number, line_number, c0, buf0, c1, buf1 );
+          }
+        return 1;
+        }
+      else			// verbosity > 0 ; show all differences
+        {
+        different = 1;
+        // walk the rest of the block; byte_number ends at the block end
+        for( ; first_diff < min_rd; ++byte_number, ++first_diff )
+          {
+          const unsigned char c0 = buffer0[first_diff];
+          const unsigned char c1 = buffer1[first_diff];
+          if( c0 != c1 )
+            {
+            if( !print_bytes )
+              std::printf( "%lld %3o %3o\n", byte_number, c0, c1 );
+            else
+              {
+              char buf0[5], buf1[5];
+              sprintc( buf0, c0 ); sprintc( buf1, c1 );
+              std::printf( "%lld %3o %-4s %3o %s\n",
+                           byte_number, c0, buf0, c1, buf1 );
+              }
+            }
+          }
+        }
+      }
+
+    if( rd[0] != rd[1] )		// one input ended before the other
+      {
+      if( verbosity >= 0 )
+        std::fprintf( stderr, "%s: EOF on %s\n",
+                      util_name, filenames[rd[1]<rd[0]].c_str() );
+      return 1;
+      }
+    if( min_rd != buffer_size ) break;	// short block: nothing more to read
+    }
+
+  return different;
+  }
+
+} // end namespace
+
+
+// Program entry point: parse the options, open the two inputs, attach a
+// data feeder to each, skip the requested initial bytes, compare, and
+// close everything.
+// Returns the result of cmp (0, 1 or 2), or 2 if anything else fails.
+int main( const int argc, const char * const argv[] )
+  {
+  // number of initial bytes ignored for each file
+  long long ignore_initial[2] = { 0, 0 };
+  long long max_size = LLONG_MAX;
+  bool print_bytes = false;
+  invocation_name = argv[0];
+  util_name = "zcmp";
+
+  const Arg_parser::Option options[] =
+    {
+    { 'b', "print-bytes",    Arg_parser::no  },
+    { 'h', "help",           Arg_parser::no  },
+    { 'i', "ignore-initial", Arg_parser::yes },
+    { 'l', "list",           Arg_parser::no  },
+    { 'n', "bytes",          Arg_parser::yes },
+    { 'q', "quiet",          Arg_parser::no  },
+    { 's', "silent",         Arg_parser::no  },
+    { 'v', "verbose",        Arg_parser::no  },
+    { 'V', "version",        Arg_parser::no  },
+    {  0 ,  0,               Arg_parser::no  } };
+
+  const Arg_parser parser( argc, argv, options );
+  if( parser.error().size() > 0 )			// bad option
+    {
+    show_error( parser.error().c_str(), 0, true );
+    return 2;
+    }
+
+  // process options; argind is left at the first non-option argument
+  int argind = 0;
+  for( ; argind < parser.arguments(); ++argind )
+    {
+    const int code = parser.code( argind );
+    if( code == 0 ) break;				// no more options
+    const char * const arg = parser.argument( argind ).c_str();
+    switch( code )
+      {
+      case 'b': print_bytes = true; break;
+      case 'h': show_help(); return 0;
+      case 'i': parse_ignore_initial( arg, ignore_initial ); break;
+      case 'l':
+      case 'v': verbosity = 1; break;
+      case 'n': max_size = getnum( arg, 0, 0 ); break;
+      case 'q':
+      case 's': verbosity = -1; break;
+      case 'V': show_version( "Zcmp" ); return 0;
+      default : internal_error( "uncaught option" );
+      }
+    }
+
+#if defined(__MSVCRT__) || defined(__OS2__)
+  _setmode( STDIN_FILENO, O_BINARY );
+  _setmode( STDOUT_FILENO, O_BINARY );
+#endif
+
+  const int files = parser.arguments() - argind;
+  if( files <= 0 )
+    { show_error( "No files given.", 0, true ); return 2; }
+  if( files > 2 )
+    { show_error( "Too many files.", 0, true ); return 2; }
+
+  std::string filenames[2];		// file names of the two input files
+  filenames[0] = parser.argument( argind );
+  if( files == 2 ) filenames[1] = parser.argument( argind + 1 );
+
+  int infd[2];				// file descriptors of the two files
+  const bool first_is_stdin = ( filenames[0] == "-" );
+  if( first_is_stdin ) infd[0] = STDIN_FILENO;
+  else infd[0] = open_instream( filenames[0] );
+  if( infd[0] < 0 ) return 2;
+
+  // Both operands are the same input: a lone "-" (standard input would be
+  // compared to itself), or two names designating one file.
+  const bool same_input = ( files == 1 ) ? first_is_stdin :
+    check_identical( filenames[0].c_str(), filenames[1].c_str() );
+  if( same_input )
+    {
+    if( ignore_initial[0] != ignore_initial[1] )
+      { show_error( "Can't compare parts of same file." ); return 2; }
+    return 0;
+    }
+
+  if( files == 1 )
+    {
+    // look for the counterpart of file1; fall back to standard input
+    filenames[1] = filenames[0];
+    infd[1] = open_other_instream( filenames[1] );
+    if( infd[1] < 0 ) { infd[1] = STDIN_FILENO; filenames[1] = "-"; }
+    }
+  else
+    {
+    if( filenames[1] == "-" ) infd[1] = STDIN_FILENO;
+    else infd[1] = open_instream( filenames[1] );
+    if( infd[1] < 0 ) return 2;
+    }
+
+  // Keep the descriptors opened above: set_data_feeder is given pointers
+  // to infd[i] and pid[i] and may overwrite them.
+  const int old_infd[2] = { infd[0], infd[1] };
+  pid_t pid[2];
+  for( int i = 0; i < 2; ++i )
+    if( !set_data_feeder( &infd[i], &pid[i] ) ) return 2;
+
+  for( int i = 0; i < 2; ++i )
+    {
+    if( skip_ignore_initial( ignore_initial[i], infd[i] ) ) continue;
+    if( verbosity >= 0 )
+      std::fprintf( stderr, "%s: Can't skip initial bytes from file `%s': %s.\n",
+                    util_name, filenames[i].c_str(), std::strerror( errno ) );
+    return 2;
+    }
+
+  int retval = cmp( max_size, infd, filenames, print_bytes );
+
+  // A failed feeder turns an "identical" result into trouble. The second
+  // child is waited for only if the first one did not fail.
+  bool feeder_failed = false;
+  if( pid[0] && wait_for_child( pid[0], "data feeder" ) != 0 )
+    feeder_failed = true;
+  else if( pid[1] && wait_for_child( pid[1], "data feeder" ) != 0 )
+    feeder_failed = true;
+  if( feeder_failed && retval == 0 ) retval = 2;
+
+  for( int i = 0; i < 2; ++i )
+    {
+    if( close( infd[i] ) != 0 )
+      { show_error( "Can't close output of data feeder", errno ); retval = 2; }
+    if( filenames[i] == "-" ) continue;	// standard input is not closed here
+    if( close( old_infd[i] ) != 0 )
+      {
+      if( verbosity >= 0 )
+        std::fprintf( stderr, "%s: Can't close input file `%s': %s.\n",
+                      util_name, filenames[i].c_str(), std::strerror( errno ) );
+      retval = 2;
+      }
+    }
+  if( std::fclose( stdout ) != 0 )
+    {
+    show_error( "Can't close stdout", errno );
+    retval = 2;
+    }
+
+  return retval;
+  }