diff options
Diffstat (limited to 'zcmp.cc')
-rw-r--r-- | zcmp.cc | 502 |
1 files changed, 502 insertions, 0 deletions
@@ -0,0 +1,502 @@
/* Zcmp - decompress and compare two files byte by byte
   Copyright (C) 2010 Antonio Diaz Diaz.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#define _FILE_OFFSET_BITS 64

#include <algorithm>
#include <cctype>
#include <cerrno>
#include <climits>
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/stat.h>
#if defined(__MSVCRT__) || defined(__OS2__)
#include <io.h>
#endif

#include "arg_parser.h"
#include "zutils.h"

#if CHAR_BIT != 8
#error "Environments where CHAR_BIT != 8 are not supported."
+#endif + +#ifndef LLONG_MAX +#define LLONG_MAX 0x7FFFFFFFFFFFFFFFLL +#endif +#ifndef LLONG_MIN +#define LLONG_MIN (-LLONG_MAX - 1LL) +#endif +#ifndef ULLONG_MAX +#define ULLONG_MAX 0xFFFFFFFFFFFFFFFFULL +#endif + + +namespace { + +#ifdef O_BINARY +const int o_binary = O_BINARY; +#else +const int o_binary = 0; +#endif + +struct { const char * from; const char * to; } const known_extensions[] = { + { ".bz2", "" }, + { ".tbz", ".tar" }, + { ".tbz2", ".tar" }, + { ".gz", "" }, + { ".tgz", ".tar" }, + { ".lz", "" }, + { ".tlz", ".tar" }, + { ".xz", "" }, + { ".txz", ".tar" }, + { 0, 0 } }; + + +void show_help() throw() + { + std::printf( "Zcmp compares two files (\"-\" means standard input), and if they\n" ); + std::printf( "differ, tells the first byte and line number where they differ. Bytes\n" ); + std::printf( "and lines are numbered starting with 1. If any given file is compressed,\n" ); + std::printf( "its uncompressed content is used. Compressed files are uncompressed on\n" ); + std::printf( "the fly; no temporary files are created.\n" ); + std::printf( "The supported compressors are bzip2, gzip, lzip and xz.\n" ); + std::printf( "\nUsage: zcmp [options] file1 [file2]\n" ); + std::printf( "\nCompares <file1> to <file2>. 
If <file2> is omitted zcmp tries the\n" ); + std::printf( "following:\n" ); + std::printf( "If <file1> is compressed, compares <file1> to the file with the\n" ); + std::printf( "corresponding decompressed file name (removes the extension from\n" ); + std::printf( "<file1>).\n" ); + std::printf( "If <file1> is not compressed, compares <file1> to the uncompressed\n" ); + std::printf( "contents of <file1>.[bz2|gz|lz|xz] (the first one that is found).\n" ); + std::printf( "If no suitable file is found, compares <file1> to data read from\n" ); + std::printf( "standard input.\n" ); + std::printf( "\nExit status is 0 if inputs are identical, 1 if different, 2 if trouble.\n" ); + std::printf( "\nOptions:\n" ); + std::printf( " -h, --help display this help and exit\n" ); + std::printf( " -V, --version output version information and exit\n" ); + std::printf( " -b, --print-bytes print differing bytes\n" ); + std::printf( " -i, --ignore-initial=<n>[,<n2>] ignore differences in the first <n> bytes\n" ); + std::printf( " -l, --list list position, value of all differing bytes\n" ); + std::printf( " -n, --bytes=<n> compare at most <n> bytes\n" ); + std::printf( " -q, --quiet suppress all messages\n" ); + std::printf( " -s, --silent (same as --quiet)\n" ); + std::printf( " -v, --verbose verbose mode (same as --list)\n" ); + std::printf( "Numbers may be followed by a multiplier: k = kB = 10^3 = 1000,\n" ); + std::printf( "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc...\n" ); + show_help_addr(); + } + + +long long getnum( const char * const ptr, const char ** const tailp = 0, + const long long llimit = LLONG_MIN + 1, + const long long ulimit = LLONG_MAX ) throw() + { + errno = 0; + char * tail; + long long result = strtoll( ptr, &tail, 0 ); + if( tail == ptr ) + { + show_error( "Bad or missing numerical argument.", 0, true ); + std::exit( 2 ); + } + + if( !errno && tail[0] && std::isalpha( tail[0] ) ) + { + int factor = ( tail[1] == 'i' ) ? 
1024 : 1000; + int exponent = 0; + bool bad_multiplier = false; + switch( tail[0] ) + { + case 'Y': exponent = 8; break; + case 'Z': exponent = 7; break; + case 'E': exponent = 6; break; + case 'P': exponent = 5; break; + case 'T': exponent = 4; break; + case 'G': exponent = 3; break; + case 'M': exponent = 2; break; + case 'K': if( factor == 1024 ) exponent = 1; else bad_multiplier = true; + break; + case 'k': if( factor == 1000 ) exponent = 1; else bad_multiplier = true; + break; + default : bad_multiplier = true; + } + if( bad_multiplier ) + { + show_error( "Bad multiplier in numerical argument.", 0, true ); + std::exit( 2 ); + } + for( int i = 0; i < exponent; ++i ) + { + if( LLONG_MAX / factor >= llabs( result ) ) result *= factor; + else { errno = ERANGE; break; } + } + } + if( !errno && ( result < llimit || result > ulimit ) ) errno = ERANGE; + if( errno ) + { + show_error( "Numerical argument out of limits." ); + std::exit( 2 ); + } + if( tailp ) *tailp = tail; + return result; + } + + +int open_instream( const std::string & input_filename ) throw() + { + int infd = open( input_filename.c_str(), O_RDONLY | o_binary ); + if( infd < 0 && verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't open input file `%s': %s.\n", + util_name, input_filename.c_str(), std::strerror( errno ) ); + return infd; + } + + +int open_other_instream( std::string & name ) throw() + { + for( int i = 0; known_extensions[i].from; ++i ) + { // search uncompressed version + const std::string from( known_extensions[i].from ); + if( name.size() > from.size() && + name.compare( name.size() - from.size(), from.size(), from ) == 0 ) + { + name.resize( name.size() - from.size() ); + name += known_extensions[i].to; + return open( name.c_str(), O_RDONLY | o_binary ); + } + } + for( int i = 0; simple_extensions[i]; ++i ) + { // search compressed version + const std::string s( name + simple_extensions[i] ); + const int infd = open( s.c_str(), O_RDONLY | o_binary ); + if( infd >= 0 ) { name = s; 
return infd; } + } + return -1; + } + + +bool check_identical( const char * const name1, const char * const name2 ) throw() + { + if( !std::strcmp( name1, name2 ) ) return true; + struct stat stat1, stat2; + if( stat( name1, &stat1 ) || stat( name2, &stat2 ) ) return false; + return ( stat1.st_ino == stat2.st_ino && stat1.st_dev == stat2.st_dev ); + } + + +void parse_ignore_initial( const char * const arg, long long ignore_initial[2] ) + { + const char * tail; + ignore_initial[0] = getnum( arg, &tail, 0 ); + if( *tail == ',' || *tail == ':' ) + ignore_initial[1] = getnum( ++tail, 0, 0 ); + else ignore_initial[1] = ignore_initial[0]; + } + + +bool skip_ignore_initial( const long long ignore_initial, const int infd ) + { + if( ignore_initial > 0 ) + { + enum { buffer_size = 4096 }; + long long rest = ignore_initial; + uint8_t buffer[buffer_size]; + while( rest > 0 ) + { + const int size = std::min( rest, (long long)buffer_size ); + const int rd = readblock( infd, buffer, size ); + if( rd != size && errno ) return false; + if( rd < size ) break; + rest -= rd; + } + } + return true; + } + + +// Put into buf the unsigned char c, making unprintable bytes +// visible by quoting like cat -t does. 
+void sprintc( char * const buf, unsigned char c ) + { + int i = 0; + + if( c < 32 || c >= 127 ) + { + if( c >= 128 ) { c -= 128; buf[i++] = 'M'; buf[i++] = '-'; } + if( c < 32 ) { c += 64; buf[i++] = '^'; } + else if( c == 127 ) { c = '?'; buf[i++] = '^'; } + } + buf[i++] = c; + buf[i++] = 0; + } + + +int block_compare( const uint8_t * const buffer0, + const uint8_t * const buffer1, + long long * line_numberp ) + { + const uint8_t * p0 = buffer0; + const uint8_t * p1 = buffer1; + if( verbosity == 0 ) + { + int nl_count = 0; + while( *p0 == *p1 ) + { if( *p0 == '\n' ) { ++nl_count; } ++p0; ++p1; } + *line_numberp += nl_count; + } + else while( *p0 == *p1 ) { ++p0; ++p1; } + return p0 - buffer0; + } + + +int cmp( const long long max_size, const int infd[2], + const std::string filenames[2], const bool print_bytes ) + { + enum { buffer_size = 4096 }; + long long byte_number = 1; + long long line_number = 1; + long long rest = max_size; // remaining number of bytes to compare + // buffers with space for sentinels at the end + uint8_t * const buffer0 = new uint8_t[2*(buffer_size+1)]; + uint8_t * const buffer1 = buffer0 + buffer_size + 1; + uint8_t * buffer[2]; + buffer[0] = buffer0; buffer[1] = buffer1; + int different = 0; + + while( rest > 0 ) + { + const int size = std::min( rest, (long long)buffer_size ); + int rd[2]; // number of bytes read from each file + for( int i = 0; i < 2; ++i ) + { + rd[i] = readblock( infd[i], buffer[i], size ); + if( rd[i] != size && errno ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Error reading file `%s': %s.\n", + util_name, filenames[i].c_str(), std::strerror( errno ) ); + return 2; + } + } + rest -= size; + + buffer0[rd[0]] = ~buffer1[rd[0]]; // sentinels for the block compare + buffer1[rd[1]] = ~buffer0[rd[1]]; + + int first_diff = block_compare( buffer0, buffer1, &line_number ); + byte_number += first_diff; + const int min_rd = std::min( rd[0], rd[1] ); + + if( first_diff < min_rd ) + { + if( verbosity < 0 ) return 
1; // return status only + if( verbosity == 0 ) // show first difference + { + if( !print_bytes ) + std::printf( "%s %s differ: byte %lld, line %lld\n", + filenames[0].c_str(), filenames[1].c_str(), + byte_number, line_number ); + else + { + const unsigned char c0 = buffer0[first_diff]; + const unsigned char c1 = buffer1[first_diff]; + char buf0[5], buf1[5]; + sprintc( buf0, c0 ); sprintc( buf1, c1 ); + std::printf( "%s %s differ: byte %lld, line %lld is %3o %s %3o %s\n", + filenames[0].c_str(), filenames[1].c_str(), + byte_number, line_number, c0, buf0, c1, buf1 ); + } + return 1; + } + else // verbosity > 0 ; show all differences + { + different = 1; + for( ; first_diff < min_rd; ++byte_number, ++first_diff ) + { + const unsigned char c0 = buffer0[first_diff]; + const unsigned char c1 = buffer1[first_diff]; + if( c0 != c1 ) + { + if( !print_bytes ) + std::printf( "%lld %3o %3o\n", byte_number, c0, c1 ); + else + { + char buf0[5], buf1[5]; + sprintc( buf0, c0 ); sprintc( buf1, c1 ); + std::printf( "%lld %3o %-4s %3o %s\n", + byte_number, c0, buf0, c1, buf1 ); + } + } + } + } + } + + if( rd[0] != rd[1] ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: EOF on %s\n", + util_name, filenames[rd[1]<rd[0]].c_str() ); + return 1; + } + if( min_rd != buffer_size ) break; + } + + delete[] buffer0; + return different; + } + +} // end namespace + + +int main( const int argc, const char * const argv[] ) + { + // number of initial bytes ignored for each file + long long ignore_initial[2] = { 0, 0 }; + long long max_size = LLONG_MAX; + bool print_bytes = false; + invocation_name = argv[0]; + util_name = "zcmp"; + + const Arg_parser::Option options[] = + { + { 'b', "print-bytes", Arg_parser::no }, + { 'h', "help", Arg_parser::no }, + { 'i', "ignore-initial", Arg_parser::yes }, + { 'l', "list", Arg_parser::no }, + { 'n', "bytes", Arg_parser::yes }, + { 'q', "quiet", Arg_parser::no }, + { 's', "silent", Arg_parser::no }, + { 'v', "verbose", Arg_parser::no }, + { 'V', 
"version", Arg_parser::no }, + { 0 , 0, Arg_parser::no } }; + + const Arg_parser parser( argc, argv, options ); + if( parser.error().size() ) // bad option + { show_error( parser.error().c_str(), 0, true ); return 2; } + + int argind = 0; + for( ; argind < parser.arguments(); ++argind ) + { + const int code = parser.code( argind ); + if( !code ) break; // no more options + const char * const arg = parser.argument( argind ).c_str(); + switch( code ) + { + case 'b': print_bytes = true; break; + case 'h': show_help(); return 0; + case 'i': parse_ignore_initial( arg, ignore_initial ); break; + case 'l': verbosity = 1; break; + case 'n': max_size = getnum( arg, 0, 0 ); break; + case 'q': + case 's': verbosity = -1; break; + case 'v': verbosity = 1; break; + case 'V': show_version( "Zcmp" ); return 0; + default : internal_error( "uncaught option" ); + } + } // end process options + +#if defined(__MSVCRT__) || defined(__OS2__) + _setmode( STDIN_FILENO, O_BINARY ); + _setmode( STDOUT_FILENO, O_BINARY ); +#endif + + if( argind >= parser.arguments() ) + { show_error( "No files given.", 0, true ); return 2; } + if( argind + 2 < parser.arguments() ) + { show_error( "Too many files.", 0, true ); return 2; } + + const int files = parser.arguments() - argind; + std::string filenames[2]; // file names of the two input files + filenames[0] = parser.argument( argind ); + if( files == 2 ) filenames[1] = parser.argument( argind + 1 ); + + int infd[2]; // file descriptors of the two files + infd[0] = ( filenames[0] == "-" ) ? + STDIN_FILENO : open_instream( filenames[0] ); + if( infd[0] < 0 ) return 2; + + if( ( files == 1 && filenames[0] == "-" ) || + ( files == 2 && check_identical( filenames[0].c_str(), + filenames[1].c_str() ) ) ) + { + if( ignore_initial[0] == ignore_initial[1] ) return 0; + else { show_error( "Can't compare parts of same file." ); return 2; } + } + + if( files == 2 ) + { + infd[1] = ( filenames[1] == "-" ) ? 
+ STDIN_FILENO : open_instream( filenames[1] ); + if( infd[1] < 0 ) return 2; + } + else + { + filenames[1] = filenames[0]; + infd[1] = open_other_instream( filenames[1] ); + if( infd[1] < 0 ) { infd[1] = STDIN_FILENO; filenames[1] = "-"; } + } + + int old_infd[2]; // copy of file descriptors of the two files + old_infd[0] = infd[0]; old_infd[1] = infd[1]; + pid_t pid[2]; + if( !set_data_feeder( &infd[0], &pid[0] ) || + !set_data_feeder( &infd[1], &pid[1] ) ) + return 2; + + for( int i = 0; i < 2; ++i ) + if( !skip_ignore_initial( ignore_initial[i], infd[i] ) ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't skip initial bytes from file `%s': %s.\n", + util_name, filenames[i].c_str(), std::strerror( errno ) ); + return 2; + } + + int retval = cmp( max_size, infd, filenames, print_bytes ); + + if( ( ( pid[0] && wait_for_child( pid[0], "data feeder" ) != 0 ) || + ( pid[1] && wait_for_child( pid[1], "data feeder" ) != 0 ) ) && + retval == 0 ) + retval = 2; + + for( int i = 0; i < 2; ++i ) + { + if( close( infd[i] ) != 0 ) + { show_error( "Can't close output of data feeder", errno ); retval = 2; } + if( filenames[i] != "-" && close( old_infd[i] ) != 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't close input file `%s': %s.\n", + util_name, filenames[i].c_str(), std::strerror( errno ) ); + retval = 2; + } + } + if( std::fclose( stdout ) != 0 ) + { + show_error( "Can't close stdout", errno ); + retval = 2; + } + + return retval; + } |