diff options
Diffstat (limited to '')
-rw-r--r-- | zcmp.cc | 534 |
1 files changed, 534 insertions, 0 deletions
@@ -0,0 +1,534 @@ +/* Zcmp - decompress and compare two files byte by byte + Copyright (C) 2010-2024 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#define _FILE_OFFSET_BITS 64 + +#include <algorithm> +#include <cctype> +#include <cerrno> +#include <climits> +#include <csignal> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include <fcntl.h> +#include <stdint.h> +#include <unistd.h> +#include <sys/stat.h> +#if defined __MSVCRT__ || defined __OS2__ +#include <io.h> +#endif + +#include "arg_parser.h" +#include "rc.h" +#include "zutils.h" + +#ifndef LLONG_MAX +#define LLONG_MAX 0x7FFFFFFFFFFFFFFFLL +#endif + + +namespace { + +#include "zcmpdiff.cc" + +void show_help() + { + std::printf( "zcmp compares two files and, if they differ, writes to standard output the\n" + "first byte and line number where they differ. Bytes and lines are numbered\n" + "starting with 1. A hyphen '-' used as a file argument means standard input.\n" + "If any file given is compressed, its decompressed content is used. Compressed\n" + "files are decompressed on the fly; no temporary files are created.\n" + "\nThe formats supported are bzip2, gzip, lzip, xz, and zstd.\n" + "\nUsage: zcmp [options] file1 [file2]\n" + "\nzcmp compares file1 to file2. The standard input is used only if file1 or\n" + "file2 refers to standard input. If file2 is omitted zcmp tries to compare\n" + "file1 with the corresponding uncompressed file (if file1 is compressed), and\n" + "then with the corresponding compressed files of the remaining formats until\n" + "one is found.\n" + "\nExit status is 0 if inputs are identical, 1 if different, 2 if trouble.\n" + "\nOptions:\n" + " -h, --help display this help and exit\n" + " -V, --version output version information and exit\n" + " -b, --print-bytes print differing bytes\n" + " -H, --hexadecimal print hexadecimal values instead of octal\n" + " -i, --ignore-initial=<n>[:<n2>] ignore differences in the first <n> bytes\n" + " -l, --list list position, value of all differing bytes\n" + " -M, --format=<list> process only the formats in <list>\n" + " -n, --bytes=<n> compare at most <n> bytes\n" + " -N, --no-rcfile don't read runtime configuration file\n" + " -O, --force-format=[<f1>][,<f2>] force one or both input formats\n" + " -q, --quiet, --silent suppress diagnostics written to stderr\n" + " -s, --script suppress messages about file differences\n" + " -v, --verbose verbose mode (opposite of --quiet)\n" + " --bz2=<command> set compressor and options for bzip2 format\n" + " --gz=<command> set compressor and options for gzip format\n" + " --lz=<command> set compressor and options for lzip format\n" + " --xz=<command> set compressor and options for xz format\n" + " --zst=<command> set compressor and options for zstd format\n" + "\nValid formats for options '-M' and '-O' are 'bz2', 'gz', 'lz', 'xz', 'zst',\n" + "and 'un' for uncompressed.\n" + "\nByte counts given as arguments to options may be expressed in decimal,\n" + "hexadecimal, or octal (using the same syntax as integer constants in C++),\n" + "and may be followed by a multiplier: k = kB = 10^3 = 1000,\n" + "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc.\n" ); + show_help_addr(); + } + + +// separate numbers of 5 or more digits in groups of 3 digits using '_' +const char * format_num3( long long num ) + { + enum { buffers = 8, bufsize = 4 * sizeof num, n = 10 }; + const char * const si_prefix = "kMGTPEZYRQ"; + const char * const binary_prefix = "KMGTPEZYRQ"; + static char buffer[buffers][bufsize]; // circle of static buffers for printf + static int current = 0; + + char * const buf = buffer[current++]; current %= buffers; + char * p = buf + bufsize - 1; // fill the buffer backwards + *p = 0; // terminator + const bool negative = num < 0; + if( num > 1024 || num < -1024 ) + { + char prefix = 0; // try binary first, then si + for( int i = 0; i < n && num != 0 && num % 1024 == 0; ++i ) + { num /= 1024; prefix = binary_prefix[i]; } + if( prefix ) *(--p) = 'i'; + else + for( int i = 0; i < n && num != 0 && num % 1000 == 0; ++i ) + { num /= 1000; prefix = si_prefix[i]; } + if( prefix ) *(--p) = prefix; + } + const bool split = num >= 10000 || num <= -10000; + + for( int i = 0; ; ) + { + const long long onum = num; num /= 10; + *(--p) = llabs( onum - ( 10 * num ) ) + '0'; if( num == 0 ) break; + if( split && ++i >= 3 ) { i = 0; *(--p) = '_'; } + } + if( negative ) *(--p) = '-'; + return p; + } + + +// Recognized formats: <num>k[B], <num>Ki[B], <num>[MGTPEZYRQ][i][B] +long long getnum( const char * const arg, const char * const option_name, + const char ** const tailp = 0, + const long long llimit = 0, + const long long ulimit = LLONG_MAX ) + { + char * tail; + errno = 0; + long long result = strtoll( arg, &tail, 0 ); + if( tail == arg ) + { show_option_error( arg, "Bad or missing numerical argument in", + option_name ); std::exit( 2 ); } + if( result < 0 ) errno = ERANGE; + + if( !errno && tail[0] && std::isalpha( tail[0] ) ) + { + const unsigned char ch = *tail++; + int factor; + bool bsuf; // 'B' suffix is present + if( tail[0] == 'i' ) { ++tail; factor = 1024; } else factor = 1000; + if( tail[0] == 'B' ) { ++tail; bsuf = true; } else bsuf = false; + int exponent = -1; // -1 = bad multiplier + switch( ch ) + { + case 'Q': exponent = 10; break; + case 'R': exponent = 9; break; + case 'Y': exponent = 8; break; + case 'Z': exponent = 7; break; + case 'E': exponent = 6; break; + case 'P': exponent = 5; break; + case 'T': exponent = 4; break; + case 'G': exponent = 3; break; + case 'M': exponent = 2; break; + case 'K': if( factor == 1024 ) exponent = 1; break; + case 'k': if( factor == 1000 ) exponent = 1; break; + case 'B': if( factor == 1000 && !bsuf ) exponent = 0; break; + } + if( exponent < 0 ) + { show_option_error( arg, "Bad multiplier in numerical argument of", + option_name ); std::exit( 2 ); } + for( int i = 0; i < exponent; ++i ) + { + if( ulimit / factor >= result ) result *= factor; + else { errno = ERANGE; break; } + } + } + if( !errno && ( result < llimit || result > ulimit ) ) errno = ERANGE; + if( errno ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: '%s': Value out of limits [%s,%s] in " + "option '%s'.\n", program_name, arg, format_num3( llimit ), + format_num3( ulimit ), option_name ); + std::exit( 2 ); + } + if( tailp ) *tailp = tail; + return result; + } + + +void parse_ignore_initial( const char * const arg, const char * const pn, + long long ignore_initial[2] ) + { + const char * tail; + ignore_initial[0] = getnum( arg, pn, &tail ); + if( *tail == ':' || *tail == ',' ) + ignore_initial[1] = getnum( ++tail, pn ); + else if( *tail == 0 ) ignore_initial[1] = ignore_initial[0]; + else { show_option_error( arg, "Missing colon in", pn ); std::exit( 2 ); } + } + + +bool skip_ignore_initial( const long long ignore_initial, const int infd ) + { + if( ignore_initial > 0 ) + { + const int buffer_size = 4096; + long long rest = ignore_initial; + uint8_t buffer[buffer_size]; + while( rest > 0 ) + { + const int size = std::min( rest, (long long)buffer_size ); + const int rd = readblock( infd, buffer, size ); + if( rd != size && errno ) return false; + if( rd < size ) break; // EOF + rest -= rd; + } + } + return true; + } + + +/* Put into buf the unsigned char c, making unprintable bytes visible by + quoting like cat -t does. */ +void sprintc( char * const buf, unsigned char c ) + { + int i = 0; + + if( c < 32 || c >= 127 ) + { + if( c >= 128 ) { c -= 128; buf[i++] = 'M'; buf[i++] = '-'; } + if( c < 32 ) { c += 64; buf[i++] = '^'; } + else if( c == 127 ) { c = '?'; buf[i++] = '^'; } + } + buf[i++] = c; + buf[i++] = 0; + } + + +int block_compare( const uint8_t * const buffer0, + const uint8_t * const buffer1, + unsigned long long * const line_numberp ) + { + const uint8_t * p0 = buffer0; + const uint8_t * p1 = buffer1; + + if( line_numberp ) + { + int nl_count = 0; + while( *p0 == *p1 ) + { if( *p0 == '\n' ) { ++nl_count; } ++p0; ++p1; } + *line_numberp += nl_count; + } + else while( *p0 == *p1 ) { ++p0; ++p1; } + return p0 - buffer0; + } + + +int cmp( const long long max_size, const int infd[2], + const std::string filenames[2], bool finished[2], + const bool hexadecimal, const bool list, const bool print_bytes, + const bool scripted ) + { + const int buffer_size = 4096; + unsigned long long byte_number = 1; + unsigned long long line_number = 1; + // remaining number of bytes to compare + long long rest = ( max_size >= 0 ) ? max_size : buffer_size; + // buffers with space for sentinels at the end + uint8_t * const buffer0 = new uint8_t[2*(buffer_size+1)]; + uint8_t * const buffer1 = buffer0 + buffer_size + 1; + uint8_t * buffer[2]; + buffer[0] = buffer0; buffer[1] = buffer1; + int retval = 0; + bool empty[2] = { true, true }; + + while( rest > 0 ) + { + const int size = std::min( (long long)buffer_size, rest ); + if( max_size >= 0 ) rest -= size; + int rd[2]; // number of bytes read from each file + for( int i = 0; i < 2; ++i ) + { + rd[i] = readblock( infd[i], buffer[i], size ); + if( rd[i] != size && errno ) + { show_file_error( filenames[i].c_str(), "Read error", errno ); + retval = 2; goto done; } + if( rd[i] > 0 ) empty[i] = false; + } + for( int i = 0; i < 2; ++i ) + if( rd[i] < size ) finished[i] = true; + + const int min_rd = std::min( rd[0], rd[1] ); + buffer0[min_rd] = 0; // sentinels for the block compare + buffer1[min_rd] = 1; + + int first_diff = block_compare( buffer0, buffer1, list ? 0 : &line_number ); + byte_number += first_diff; + + if( first_diff < min_rd ) + { + retval = 1; // difference found + if( scripted ) break; // status only + if( !list ) // show first difference + { + if( !print_bytes ) + std::printf( "%s %s differ: byte %llu, line %llu\n", + filenames[0].c_str(), filenames[1].c_str(), + byte_number, line_number ); + else + { + const unsigned char c0 = buffer0[first_diff]; + const unsigned char c1 = buffer1[first_diff]; + char buf0[5], buf1[5]; + sprintc( buf0, c0 ); sprintc( buf1, c1 ); + std::printf( hexadecimal ? + "%s %s differ: byte %llu, line %llu is %02X %s %02X %s\n" : + "%s %s differ: byte %llu, line %llu is %3o %s %3o %s\n", + filenames[0].c_str(), filenames[1].c_str(), + byte_number, line_number, c0, buf0, c1, buf1 ); + } + std::fflush( stdout ); + break; + } + else // list ; show all differences + { + for( ; first_diff < min_rd; ++byte_number, ++first_diff ) + { + const unsigned char c0 = buffer0[first_diff]; + const unsigned char c1 = buffer1[first_diff]; + if( c0 != c1 ) + { + if( !print_bytes ) + std::printf( hexadecimal ? "%llu %02X %02X\n" : "%llu %3o %3o\n", + byte_number, c0, c1 ); + else + { + char buf0[5], buf1[5]; + sprintc( buf0, c0 ); sprintc( buf1, c1 ); + std::printf( hexadecimal ? "%llu %02X %-4s %02X %s\n" : + "%llu %3o %-4s %3o %s\n", + byte_number, c0, buf0, c1, buf1 ); + } + } + } + std::fflush( stdout ); + } + } + + if( rd[0] != rd[1] ) + { + const int i = rd[1] < rd[0]; + if( verbosity >= 0 ) + std::fprintf( stderr, empty[i] ? + "%s: EOF on %s which is empty\n" : list ? + "%s: EOF on %s after byte %llu\n" : + "%s: EOF on %s after byte %llu, in line %llu\n", + program_name, filenames[i].c_str(), + byte_number - 1, line_number ); + retval = 1; break; + } + if( min_rd != buffer_size ) break; + } +done: + delete[] buffer0; + return retval; + } + +} // end namespace + + +int main( const int argc, const char * const argv[] ) + { + enum { bz2_opt = 256, gz_opt, lz_opt, xz_opt, zst_opt }; + // number of initial bytes ignored for each file + long long ignore_initial[2] = { 0, 0 }; + long long max_size = -1; // < 0 means unlimited size + int format_types[2] = { -1, -1 }; // < 0 means undefined + bool hexadecimal = false; + bool list = false; // list position, value of all differing bytes + bool print_bytes = false; // print differing bytes + bool scripted = false; // suppress messages about file differences + program_name = "zcmp"; + invocation_name = ( argc > 0 ) ? argv[0] : program_name; + + const Arg_parser::Option options[] = + { + { 'b', "print-bytes", Arg_parser::no }, + { 'h', "help", Arg_parser::no }, + { 'H', "hexadecimal", Arg_parser::no }, + { 'i', "ignore-initial", Arg_parser::yes }, + { 'l', "list", Arg_parser::no }, + { 'M', "format", Arg_parser::yes }, + { 'n', "bytes", Arg_parser::yes }, + { 'N', "no-rcfile", Arg_parser::no }, + { 'O', "force-format", Arg_parser::yes }, + { 'q', "quiet", Arg_parser::no }, + { 'q', "silent", Arg_parser::no }, + { 's', "script", Arg_parser::no }, + { 'v', "verbose", Arg_parser::no }, + { 'V', "version", Arg_parser::no }, + { bz2_opt, "bz2", Arg_parser::yes }, + { gz_opt, "gz", Arg_parser::yes }, + { lz_opt, "lz", Arg_parser::yes }, + { xz_opt, "xz", Arg_parser::yes }, + { zst_opt, "zst", Arg_parser::yes }, + { 0, 0, Arg_parser::no } }; + + const Arg_parser parser( argc, argv, options ); + if( parser.error().size() ) // bad option + { show_error( parser.error().c_str(), 0, true ); return 2; } + + maybe_process_config_file( parser ); + + int argind = 0; + for( ; argind < parser.arguments(); ++argind ) + { + const int code = parser.code( argind ); + if( !code ) break; // no more options + const char * const pn = parser.parsed_name( argind ).c_str(); + const std::string & sarg = parser.argument( argind ); + const char * const arg = sarg.c_str(); + switch( code ) + { + case 'b': print_bytes = true; break; + case 'h': show_help(); return 0; + case 'H': hexadecimal = true; break; + case 'i': parse_ignore_initial( arg, pn, ignore_initial ); break; + case 'l': list = true; break; + case 'M': parse_format_list( sarg, pn ); break; + case 'n': max_size = getnum( arg, pn ); break; + case 'N': break; + case 'O': parse_format_types2( sarg, pn, format_types ); break; + case 'q': verbosity = -1; break; + case 's': scripted = true; break; + case 'v': if( verbosity < 4 ) ++verbosity; break; + case 'V': show_version(); return 0; + case bz2_opt: parse_compressor( sarg, pn, fmt_bz2 ); break; + case gz_opt: parse_compressor( sarg, pn, fmt_gz ); break; + case lz_opt: parse_compressor( sarg, pn, fmt_lz ); break; + case xz_opt: parse_compressor( sarg, pn, fmt_xz ); break; + case zst_opt: parse_compressor( sarg, pn, fmt_zst ); break; + default: internal_error( "uncaught option." ); + } + } // end process options + +#if defined __MSVCRT__ || defined __OS2__ + setmode( STDIN_FILENO, O_BINARY ); + setmode( STDOUT_FILENO, O_BINARY ); +#endif + + const int files = parser.arguments() - argind; + if( files < 1 ) { show_error( "No files given.", 0, true ); return 2; } + if( files > 2 ) { show_error( "Too many files.", 0, true ); return 2; } + + std::string filenames[2]; // file names of the two input files + filenames[0] = parser.argument( argind ); + if( files == 2 ) filenames[1] = parser.argument( argind + 1 ); + + int infd[2]; // file descriptors of the two files + infd[0] = ( filenames[0] == "-" ) ? + STDIN_FILENO : open_instream( filenames[0] ); + if( infd[0] < 0 ) return 2; + + if( files == 2 ) + { + if( check_identical( filenames[0].c_str(), filenames[1].c_str() ) ) + { + if( ignore_initial[0] == ignore_initial[1] ) return 0; + else { show_error( "Can't compare parts of same file." ); return 2; } + } + infd[1] = ( filenames[1] == "-" ) ? + STDIN_FILENO : open_instream( filenames[1] ); + if( infd[1] < 0 ) return 2; + } + else + { + if( filenames[0] == "-" ) + { show_error( "Missing operand after '-'.", 0, true ); return 2; } + if( format_types[0] >= 0 || format_types[1] >= 0 ) + { show_error( "Two files must be given when format is specified.", 0, true ); + return 2; } + filenames[1] = filenames[0]; + infd[1] = open_other_instream( filenames[1] ); + if( infd[1] < 0 ) + { + if( verbosity >= 0 ) + std::fprintf( stderr, "%s: Can't find file to compare with '%s'.\n", + program_name, filenames[0].c_str() ); + show_error( 0, 0, true ); return 2; + } + } + + int old_infd[2]; // copy of file descriptors of the two files + old_infd[0] = infd[0]; old_infd[1] = infd[1]; + Children children[2]; + if( !set_data_feeder( filenames[0], &infd[0], children[0], format_types[0] ) || + !set_data_feeder( filenames[1], &infd[1], children[1], format_types[1] ) ) + return 2; + + for( int i = 0; i < 2; ++i ) + if( !skip_ignore_initial( ignore_initial[i], infd[i] ) ) + { + show_file_error( filenames[i].c_str(), + "Read error skipping initial bytes", errno ); + return 2; + } + + bool finished[2] = { false, false }; + int retval = cmp( max_size, infd, filenames, finished, hexadecimal, list, + print_bytes, scripted ); + + for( int i = 0; i < 2; ++i ) + if( !good_status( children[i], finished[i] ) ) retval = 2; + + for( int i = 0; i < 2; ++i ) + { + if( close( infd[i] ) != 0 ) + { show_close_error(); retval = 2; } + if( filenames[i] != "-" && close( old_infd[i] ) != 0 ) + { + show_file_error( filenames[i].c_str(), "Error closing input file", errno ); + retval = 2; + } + } + if( std::fclose( stdout ) != 0 ) + { + show_error( "Error closing stdout", errno ); + retval = 2; + } + + return retval; + } |