summary refs log tree commit diff stats
path: root/zgrep.cc
diff options
context:
space:
mode:
Diffstat (limited to 'zgrep.cc')
-rw-r--r-- zgrep.cc 417
1 files changed, 417 insertions, 0 deletions
diff --git a/zgrep.cc b/zgrep.cc
new file mode 100644
index 0000000..8f4bc9d
--- /dev/null
+++ b/zgrep.cc
@@ -0,0 +1,417 @@
+/* Zgrep - search compressed files for a regular expression
+ Copyright (C) 2010-2024 Antonio Diaz Diaz.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <cerrno>
+#include <climits>
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <list>
+#include <string>
+#include <vector>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#if defined __MSVCRT__ || defined __OS2__
+#include <io.h>
+#endif
+
+#include "arg_parser.h"
+#include "rc.h"
+#include "zutils.h"
+
+
+namespace {
+
+#include "recursive.cc"
+#include "zcatgrep.cc"
+
+// Print the usage text of zgrep to standard output, followed by whatever
+// show_help_addr() adds.
+void show_help()
+  {
+  // The text contains no conversion specifications, so it can be written
+  // verbatim as one string.
+  static const char * const help_text =
+    "zgrep is a front end to the program grep that allows transparent search\n"
+    "on any combination of compressed and uncompressed files. If any file\n"
+    "given is compressed, its decompressed content is used. If a file given\n"
+    "does not exist, and its name does not end with one of the known\n"
+    "extensions, zgrep tries the compressed file names corresponding to the\n"
+    "formats supported until one is found. If a file fails to decompress, zgrep\n"
+    "continues searching the rest of the files.\n"
+    "\nIf a file is specified as '-', data are read from standard input,\n"
+    "decompressed if needed, and fed to grep. Data read from standard input\n"
+    "must be of the same type; all uncompressed or all in the same\n"
+    "compressed format.\n"
+    "\nIf no files are specified, recursive searches examine the current\n"
+    "working directory, and nonrecursive searches read standard input.\n"
+    "\n'zgrep --verbose -V' prints the version of the grep program used.\n"
+    "\nThe formats supported are bzip2, gzip, lzip, xz, and zstd.\n"
+    "\nUsage: zgrep [options] <pattern> [files]\n"
+    "\nExit status is 0 if match, 1 if no match, 2 if trouble.\n"
+    "Some options only work if the grep program used supports them.\n"
+    "\nOptions:\n"
+    " --help display this help and exit\n"
+    " -V, --version output version information and exit\n"
+    " -a, --text treat all files as text\n"
+    " -A, --after-context=<n> print <n> lines of trailing context\n"
+    " -b, --byte-offset print the byte offset of each line\n"
+    " -B, --before-context=<n> print <n> lines of leading context\n"
+    " -c, --count only print a count of matching lines per file\n"
+    " -C, --context=<n> print <n> lines of output context\n"
+    " --color[=<when>] show matched strings in color\n"
+    " -e, --regexp=<pattern> use <pattern> as the pattern to match\n"
+    " -E, --extended-regexp <pattern> is an extended regular expression\n"
+    " -f, --file=<file> obtain patterns from <file>\n"
+    " -F, --fixed-strings <pattern> is a set of newline-separated strings\n"
+    " -G, --basic-regexp <pattern> is a basic regular expression (default)\n"
+    " -h, --no-filename suppress the prefixing file name on output\n"
+    " -H, --with-filename print the file name for each match\n"
+    " -i, --ignore-case ignore case distinctions\n"
+    " -I ignore binary files\n"
+    " -l, --files-with-matches only print names of files containing matches\n"
+    " -L, --files-without-match only print names of files containing no matches\n"
+    " --label=<label> use <label> as file name for standard input\n"
+    " --line-buffered flush output on every line\n"
+    " -m, --max-count=<n> stop after <n> matches\n"
+    " -M, --format=<list> process only the formats in <list>\n"
+    " -n, --line-number print the line number of each line\n"
+    " -N, --no-rcfile don't read runtime configuration file\n"
+    " -o, --only-matching show only the part of a line matching <pattern>\n"
+    " -O, --force-format=<fmt> force the input format\n"
+    " -P, --perl-regexp <pattern> is a Perl regular expression\n"
+    " -q, --quiet, --silent suppress all messages\n"
+    " -r, --recursive operate recursively on directories\n"
+    " -R, --dereference-recursive recursively follow symbolic links\n"
+    " -s, --no-messages suppress error messages\n"
+    " -T, --initial-tab make tabs line up (if needed)\n"
+    " -U, --binary don't strip CR characters at EOL (DOS/Windows)\n"
+    " -v, --invert-match select non-matching lines\n"
+    " --verbose verbose mode (show error messages)\n"
+    " -w, --word-regexp match only whole words\n"
+    " -x, --line-regexp match only whole lines\n"
+    " -Z, --null print 0 byte (ASCII NUL) after file name\n"
+    " --bz2=<command> set compressor and options for bzip2 format\n"
+    " --gz=<command> set compressor and options for gzip format\n"
+    " --lz=<command> set compressor and options for lzip format\n"
+    " --xz=<command> set compressor and options for xz format\n"
+    " --zst=<command> set compressor and options for zstd format\n"
+    "\nValid formats for options '-M' and '-O' are 'bz2', 'gz', 'lz', 'xz', 'zst',\n"
+    "and 'un' for uncompressed.\n"
+    "\nNumbers may be followed by a multiplier: k = kB = 10^3 = 1000,\n"
+    "Ki = KiB = 2^10 = 1024, M = 10^6, Mi = 2^20, G = 10^9, Gi = 2^30, etc...\n";
+
+  std::fputs( help_text, stdout );
+  show_help_addr();
+  }
+
+
+/* Feed one input to GREP and relay grep's output to standard output.
+
+ 'infd' and 'format_index' are handed to set_data_feeder(), which receives a
+ pointer to 'infd' and the 'children' object (NOTE(review): presumably so that
+ it can swap in the read end of a decompressor pipe and record the helper
+ processes; it is defined elsewhere, confirm there). GREP is then forked
+ with 'infd' as its standard input and a new pipe as its standard output,
+ and started with 'grep_args' as its arguments.
+
+ When 'list_mode' is 0 the parent copies everything read from grep to
+ stdout; if 'show_name' is set, every line is prefixed with 'input_filename'
+ followed by ':' (or a NUL byte if 'z_null'), plus a tab if 'initial_tab' is
+ set and the prefix length is not a multiple of 8. When 'list_mode' is
+ nonzero grep's output is discarded and only 'input_filename' is printed,
+ terminated by '\n' (or NUL if 'z_null'): for list_mode == 1 when the final
+ status is 0, for any other nonzero value when the final status is not 0.
+
+ Returns the value of wait_for_child() for grep, replaced by 2 if
+ good_status() fails; also returns 2 on pipe, fork, read, write or close
+ errors detected here.
+*/
+int zgrep_file( int infd, const int format_index,
+ const std::string & input_filename,
+ const std::vector< const char * > & grep_args,
+ const int list_mode, const bool initial_tab,
+ const bool line_buffered, const bool show_name,
+ const bool z_null )
+ {
+ Children children;
+ if( !set_data_feeder( input_filename, &infd, children, format_index ) )
+ return 2;
+ int fda[2]; // pipe from grep
+ if( pipe( fda ) < 0 )
+ { show_error( "Can't create pipe", errno ); return 2; }
+ const pid_t grep_pid = fork();
+ if( grep_pid == 0 ) // child (grep)
+ {
+ // Wire infd to stdin and the pipe's write end to stdout, then drop the
+ // now redundant descriptors. Any failure falls through to the error exit.
+ if( dup2( infd, STDIN_FILENO ) >= 0 &&
+ dup2( fda[1], STDOUT_FILENO ) >= 0 &&
+ close( infd ) == 0 && close( fda[0] ) == 0 && close( fda[1] ) == 0 )
+ {
+ // argv = { GREP, grep_args..., null terminator }
+ const char ** const argv = new const char *[grep_args.size()+2];
+ argv[0] = GREP;
+ for( unsigned i = 0; i < grep_args.size(); ++i )
+ argv[i+1] = grep_args[i];
+ argv[grep_args.size()+1] = 0;
+ execvp( argv[0], (char **)argv );
+ }
+ // Reached only if a dup2/close failed or execvp returned.
+ show_exec_error( GREP );
+ _exit( 2 );
+ }
+ // NOTE(review): both ends of fda[] are still open when returning here.
+ if( grep_pid < 0 ) // parent
+ { show_fork_error( GREP ); return 2; }
+
+ // The parent only reads from grep; close its copy of the write end.
+ close( fda[1] );
+ enum { buffer_size = 256 };
+ uint8_t buffer[buffer_size];
+ bool line_begin = true; // next byte printed starts a new output line
+ bool at_eof = false;
+ // NOTE(review): the early returns inside this loop leave fda[0] and infd
+ // open and do not wait for grep.
+ while( !at_eof )
+ {
+ int size;
+ bool error = false;
+ // Line-buffered mode reads one byte at a time and stops at each newline,
+ // so that every line can be written out as soon as it is complete.
+ if( line_buffered )
+ for( size = 0; size < buffer_size; )
+ { if( readblock( fda[0], buffer + size, 1 ) == 1 )
+ { ++size; if( buffer[size-1] == '\n' ) break; }
+ else { at_eof = true; if( errno ) { error = true; } break; } }
+ else
+ // A short read ends the loop; a nonzero errno marks it as an error
+ // (NOTE(review): relies on readblock(), defined elsewhere, leaving errno
+ // at 0 on plain end-of-file; confirm).
+ { size = readblock( fda[0], buffer, buffer_size );
+ if( size < buffer_size ) { at_eof = true; if( errno ) error = true; } }
+ if( error )
+ { std::fflush( stdout ); show_error( "Read error", errno ); return 2; }
+ // In list mode grep's own output is read but not copied.
+ if( size > 0 && !list_mode )
+ {
+ if( show_name ) // print the file name for each match
+ for( int i = 0; i < size; ++i )
+ {
+ if( line_begin )
+ { line_begin = false;
+ // 'len' is the number of characters printf wrote for the prefix.
+ const int len = std::printf( "%s%c", input_filename.c_str(),
+ z_null ? 0 : ':' );
+ if( initial_tab && len > 0 && len % 8 ) putchar( '\t' ); }
+ putchar( buffer[i] );
+ if( buffer[i] == '\n' )
+ { line_begin = true; if( line_buffered ) std::fflush( stdout ); }
+ }
+ else if( std::fwrite( buffer, 1, size, stdout ) != (unsigned)size )
+ { std::fflush( stdout ); show_error( "Write error", errno ); return 2; }
+ }
+ }
+ std::fflush( stdout );
+
+ int retval = wait_for_child( grep_pid, GREP );
+
+ // Any failure reported by good_status() for 'children' overrides grep's
+ // status (the meaning of its second argument is defined elsewhere).
+ if( !good_status( children, retval == 1 ) ) retval = 2;
+
+ // List mode: list_mode == 1 prints the name when retval is 0, any other
+ // nonzero list_mode prints it when retval is not 0.
+ if( list_mode && (retval == 0) == (list_mode == 1) )
+ { std::printf( "%s%c", input_filename.c_str(), z_null ? 0 : '\n' );
+ std::fflush( stdout ); }
+ if( close( infd ) != 0 )
+ { show_close_error(); return 2; }
+ if( close( fda[0] ) != 0 )
+ { show_close_error( GREP ); return 2; }
+ return retval;
+ }
+
+} // end namespace
+
+
+/* Entry point of zgrep.
+
+   Parses the command line, collecting in 'grep_args' the options that are
+   forwarded to GREP and keeping locally the ones zgrep acts on itself, then
+   runs zgrep_file() on every name produced by next_filename().
+
+   Exit status: 0 if some file matched, 1 if none matched, 2 on trouble.
+   An error seen along the way turns the result into 2 unless a match was
+   found and messages are suppressed (verbosity < 0).
+*/
+int main( const int argc, const char * const argv[] )
+  {
+  enum { help_opt = 256, verbose_opt, color_opt, label_opt, linebuf_opt,
+         bz2_opt, gz_opt, lz_opt, xz_opt, zst_opt };
+  int format_index = -1;        // undefined
+  int list_mode = 0;            // 1 = list matches, -1 = list non-matches
+  int recursive = 0;            // 1 = '-r', 2 = '-R'
+  int show_name = -1;           // tri-state bool
+  bool initial_tab = false;
+  bool line_buffered = false;
+  bool no_messages = false;
+  bool z_null = false;          // for '-Z, --null'
+  std::list< std::string > filenames;
+  std::vector< const char * > grep_args;    // args to grep, maybe empty
+  std::string color_option;                 // additional args to grep
+  std::string label_option;
+  const char * label = "(standard input)";  // prefix for standard input
+  program_name = "zgrep";
+  invocation_name = ( argc > 0 ) ? argv[0] : program_name;
+
+  const Arg_parser::Option options[] =
+    {
+    { 'a', "text",                  Arg_parser::no  },    // grep GNU
+    { 'A', "after-context",         Arg_parser::yes },    // grep GNU
+    { 'b', "byte-offset",           Arg_parser::no  },    // grep GNU
+    { 'B', "before-context",        Arg_parser::yes },    // grep GNU
+    { 'c', "count",                 Arg_parser::no  },    // grep
+    { 'C', "context",               Arg_parser::yes },    // grep GNU
+    { 'e', "regexp",                Arg_parser::yes },    // grep
+    { 'E', "extended-regexp",       Arg_parser::no  },    // grep
+    // The name must be exactly "file" (no trailing space), as documented in
+    // the help text; otherwise '--file' is not an exact match for this entry
+    // and is only a prefix shared with the 'files-*' options below.
+    { 'f', "file",                  Arg_parser::yes },    // grep
+    { 'F', "fixed-strings",         Arg_parser::no  },    // grep
+    { 'G', "basic-regexp",          Arg_parser::no  },    // grep GNU
+    { 'h', "no-filename",           Arg_parser::no  },    // grep GNU
+    { 'H', "with-filename",         Arg_parser::no  },    // grep GNU
+    { 'i', "ignore-case",           Arg_parser::no  },    // grep
+    { 'I', 0,                       Arg_parser::no  },    // grep GNU
+    { 'l', "files-with-matches",    Arg_parser::no  },    // grep
+    { 'L', "files-without-match",   Arg_parser::no  },    // grep GNU
+    { 'm', "max-count",             Arg_parser::yes },    // grep GNU
+    { 'M', "format",                Arg_parser::yes },
+    { 'n', "line-number",           Arg_parser::no  },    // grep
+    { 'N', "no-rcfile",             Arg_parser::no  },
+    { 'o', "only-matching",         Arg_parser::no  },    // grep
+    { 'O', "force-format",          Arg_parser::yes },
+    { 'P', "perl-regexp",           Arg_parser::no  },    // grep GNU
+    { 'q', "quiet",                 Arg_parser::no  },
+    { 'q', "silent",                Arg_parser::no  },
+    { 'r', "recursive",             Arg_parser::no  },
+    { 'R', "dereference-recursive", Arg_parser::no  },
+    { 's', "no-messages",           Arg_parser::no  },    // grep
+    { 'T', "initial-tab",           Arg_parser::no  },    // grep GNU
+    { 'U', "binary",                Arg_parser::no  },    // grep GNU
+    { 'v', "invert-match",          Arg_parser::no  },    // grep
+    { 'V', "version",               Arg_parser::no  },
+    { 'w', "word-regexp",           Arg_parser::no  },    // grep GNU
+    { 'x', "line-regexp",           Arg_parser::no  },    // grep
+    { 'Z', "null",                  Arg_parser::no  },    // grep GNU
+    { help_opt,    "help",          Arg_parser::no  },
+    { verbose_opt, "verbose",       Arg_parser::no  },
+    { color_opt,   "color",         Arg_parser::maybe },
+    { label_opt,   "label",         Arg_parser::yes },
+    { linebuf_opt, "line-buffered", Arg_parser::no  },
+    { bz2_opt,     "bz2",           Arg_parser::yes },
+    { gz_opt,      "gz",            Arg_parser::yes },
+    { lz_opt,      "lz",            Arg_parser::yes },
+    { xz_opt,      "xz",            Arg_parser::yes },
+    { zst_opt,     "zst",           Arg_parser::yes },
+    { 0, 0,                         Arg_parser::no  } };
+
+  const Arg_parser parser( argc, argv, options );
+  if( parser.error().size() )                       // bad option
+    { show_error( parser.error().c_str(), 0, true ); return 2; }
+
+  maybe_process_config_file( parser );
+
+  // Process the options. The 'const char *' values pushed into 'grep_args'
+  // point into strings handed out by 'parser' (this assumes
+  // parser.argument() returns a reference to storage owned by 'parser',
+  // which stays alive until main returns).
+  int argind = 0;
+  bool pattern_found = false;
+  for( ; argind < parser.arguments(); ++argind )
+    {
+    const int code = parser.code( argind );
+    if( !code ) break;                              // no more options
+    const char * const pn = parser.parsed_name( argind ).c_str();
+    const std::string & sarg = parser.argument( argind );
+    const char * const arg = sarg.c_str();
+    switch( code )
+      {
+      case 'a': grep_args.push_back( "-a" ); break;
+      case 'A': grep_args.push_back( "-A" ); grep_args.push_back( arg ); break;
+      case 'b': grep_args.push_back( "-b" ); break;
+      case 'B': grep_args.push_back( "-B" ); grep_args.push_back( arg ); break;
+      case 'c': grep_args.push_back( "-c" ); break;
+      case 'C': grep_args.push_back( "-C" ); grep_args.push_back( arg ); break;
+      case 'e': grep_args.push_back( "-e" ); grep_args.push_back( arg );
+                pattern_found = true; break;
+      case 'E': grep_args.push_back( "-E" ); break;
+      case 'f': grep_args.push_back( "-f" ); grep_args.push_back( arg );
+                pattern_found = true; break;
+      case 'F': grep_args.push_back( "-F" ); break;
+      case 'G': grep_args.push_back( "-G" ); break;
+      // File name prefixes are printed by zgrep_file(), so '-h' and '-H'
+      // are not forwarded to grep.
+      case 'h': show_name = false; break;
+      case 'H': show_name = true; break;
+      case 'i': grep_args.push_back( "-i" ); break;
+      case 'I': grep_args.push_back( "-I" ); break;
+      case 'l': grep_args.push_back( "-l" ); list_mode = 1; break;
+      case 'L': grep_args.push_back( "-L" ); list_mode = -1; break;
+      case 'm': grep_args.push_back( "-m" ); grep_args.push_back( arg ); break;
+      case 'M': parse_format_list( sarg, pn ); break;
+      case 'n': grep_args.push_back( "-n" ); break;
+      // Nothing to do here for '-N' (NOTE(review): presumably it is acted on
+      // by maybe_process_config_file() above; confirm).
+      case 'N': break;
+      case 'o': grep_args.push_back( "-o" ); break;
+      case 'O': format_index = parse_format_type( sarg, pn ); break;
+      case 'P': grep_args.push_back( "-P" ); break;
+      case 'q': grep_args.push_back( "-q" ); verbosity = -1; break;
+      case 'r': recursive = 1; break;
+      case 'R': recursive = 2; break;
+      case 's': grep_args.push_back( "-s" ); no_messages = true; break;
+      case 'T': grep_args.push_back( "-T" ); initial_tab = true; break;
+      case 'U': grep_args.push_back( "-U" ); break;
+      case 'v': grep_args.push_back( "-v" ); break;
+      case 'V': show_version( GREP " --version" ); return 0;
+      case 'w': grep_args.push_back( "-w" ); break;
+      case 'x': grep_args.push_back( "-x" ); break;
+      case 'Z': z_null = true; break;
+      case help_opt: show_help(); return 0;
+      case verbose_opt: no_messages = false; if( verbosity < 4 ) ++verbosity;
+                        break;
+      case color_opt: color_option = "--color";
+        if( !sarg.empty() ) { color_option += '='; color_option += sarg; }
+        break;
+      case label_opt: label_option = "--label="; label_option += sarg;
+                      label = arg; break;
+      case linebuf_opt: grep_args.push_back( "--line-buffered" );
+                        line_buffered = true; break;
+      case bz2_opt: parse_compressor( sarg, pn, fmt_bz2 ); break;
+      case gz_opt: parse_compressor( sarg, pn, fmt_gz ); break;
+      case lz_opt: parse_compressor( sarg, pn, fmt_lz ); break;
+      case xz_opt: parse_compressor( sarg, pn, fmt_xz ); break;
+      case zst_opt: parse_compressor( sarg, pn, fmt_zst ); break;
+      default: internal_error( "uncaught option." );
+      }
+    } // end process options
+
+  if( !color_option.empty() )           // push the last value set
+    grep_args.push_back( color_option.c_str() );
+  if( !label_option.empty() )           // for "Binary file <label> matches"
+    grep_args.push_back( label_option.c_str() );
+
+#if defined __MSVCRT__ || defined __OS2__
+  setmode( STDIN_FILENO, O_BINARY );
+  setmode( STDOUT_FILENO, O_BINARY );
+#endif
+
+  // Without '-e' or '-f', the first non-option argument is the pattern. It
+  // is protected with '-e' if it begins with '-', so that grep does not take
+  // it for an option.
+  if( !pattern_found )
+    {
+    if( argind >= parser.arguments() )
+      { show_error( "Pattern not found." ); return 2; }
+    const std::string & pat = parser.argument( argind++ );
+    if( pat.size() && pat[0] == '-' ) grep_args.push_back( "-e" );
+    grep_args.push_back( pat.c_str() );
+    }
+
+  // The remaining arguments are file names.
+  for( ; argind < parser.arguments(); ++argind )
+    filenames.push_back( parser.argument( argind ) );
+
+  // Default input: current directory if recursive, else standard input.
+  if( filenames.empty() ) filenames.push_back( recursive ? "." : "-" );
+
+  // Unless '-h' or '-H' was given, show names when more than one file may
+  // be searched.
+  if( show_name < 0 ) show_name = ( filenames.size() != 1 || recursive );
+
+  std::string input_filename;
+  int retval = 1;               // no match found yet
+  bool error = false;
+  bool stdin_used = false;
+  while( next_filename( filenames, input_filename, error, recursive,
+                        false, no_messages ) )
+    {
+    int infd;
+    // "." is treated here as the name of standard input (NOTE(review):
+    // presumably next_filename(), defined elsewhere, reports '-' this way;
+    // confirm). Standard input is searched only once and is shown as 'label'.
+    if( input_filename == "." )
+      {
+      if( stdin_used ) continue; else stdin_used = true;
+      infd = STDIN_FILENO; input_filename = label;
+      }
+    else
+      {
+      infd = open_instream( input_filename, format_index < 0, no_messages );
+      if( infd < 0 ) { error = true; continue; }
+      }
+
+    const int tmp = zgrep_file( infd, format_index, input_filename, grep_args,
+                                list_mode, initial_tab, line_buffered,
+                                show_name, z_null );
+    // A match (0) always wins; trouble (2) only replaces "no match" (1).
+    if( tmp == 0 || ( tmp == 2 && retval == 1 ) ) retval = tmp;
+
+    // NOTE(review): zgrep_file() also closes its own by-value copy of
+    // 'infd'; this close is only valid if set_data_feeder() replaced that
+    // copy with a different descriptor; confirm.
+    if( close( infd ) != 0 )
+      { show_file_error( input_filename.c_str(), "Error closing input file",
+                         errno ); error = true; }
+    // In quiet mode the first match settles the result.
+    if( retval == 0 && verbosity < 0 ) break;
+    }
+
+  if( std::fclose( stdout ) != 0 )
+    {
+    show_error( "Error closing stdout", errno );
+    error = true;
+    }
+  // An error turns the result into 2, except when a match was found and
+  // messages are suppressed.
+  if( error && ( retval != 0 || verbosity >= 0 ) ) retval = 2;
+  return retval;
+  }