diff options
Diffstat (limited to 'src/grep/src/pcresearch.c')
-rw-r--r-- | src/grep/src/pcresearch.c | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/src/grep/src/pcresearch.c b/src/grep/src/pcresearch.c new file mode 100644 index 0000000..37f7e40 --- /dev/null +++ b/src/grep/src/pcresearch.c @@ -0,0 +1,352 @@ +/* pcresearch.c - searching subroutines using PCRE for grep. + Copyright 2000, 2007, 2009-2021 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1992 by Mike Haertel. */ + +#include <config.h> +#include "search.h" +#include "die.h" + +#include <pcre.h> + +/* This must be at least 2; everything after that is for performance + in pcre_exec. */ +enum { NSUB = 300 }; + +#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION +# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#endif +#ifndef PCRE_STUDY_JIT_COMPILE +# define PCRE_STUDY_JIT_COMPILE 0 +#endif +#ifndef PCRE_STUDY_EXTRA_NEEDED +# define PCRE_STUDY_EXTRA_NEEDED 0 +#endif + +struct pcre_comp +{ + /* Compiled internal form of a Perl regular expression. */ + pcre *cre; + + /* Additional information about the pattern. */ + pcre_extra *extra; + +#if PCRE_STUDY_JIT_COMPILE + /* The JIT stack and its maximum size. */ + pcre_jit_stack *jit_stack; + int jit_stack_size; +#endif + + /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty + string matches when that flag is used. */ + int empty_match[2]; +}; + + +/* Match the already-compiled PCRE pattern against the data in SUBJECT, + of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with + options OPTIONS, and storing resulting matches into SUB. Return + the (nonnegative) match location or a (negative) error number. */ +static int +jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes, + int search_offset, int options, int *sub) +{ + while (true) + { + int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes, + search_offset, options, sub, NSUB); + +#if PCRE_STUDY_JIT_COMPILE + if (e == PCRE_ERROR_JIT_STACKLIMIT + && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2) + { + int old_size = pc->jit_stack_size; + int new_size = pc->jit_stack_size = old_size * 2; + if (pc->jit_stack) + pcre_jit_stack_free (pc->jit_stack); + pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size); + if (!pc->jit_stack) + die (EXIT_TROUBLE, 0, + _("failed to allocate memory for the PCRE JIT stack")); + pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack); + continue; + } +#endif + +#if PCRE_EXTRA_MATCH_LIMIT_RECURSION + if (e == PCRE_ERROR_RECURSIONLIMIT + && (PCRE_STUDY_EXTRA_NEEDED || pc->extra)) + { + unsigned long lim + = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION + ? pc->extra->match_limit_recursion + : 0); + if (lim <= ULONG_MAX / 2) + { + pc->extra->match_limit_recursion = lim ? 2 * lim : (1 << 24) - 1; + pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; + continue; + } + } +#endif + + return e; + } +} + +/* Compile the -P style PATTERN, containing SIZE bytes that are + followed by '\n'. Return a description of the compiled pattern. */ + +void * +Pcompile (char *pattern, size_t size, reg_syntax_t ignored, bool exact) +{ + int e; + char const *ep; + static char const wprefix[] = "(?<!\\w)(?:"; + static char const wsuffix[] = ")(?!\\w)"; + static char const xprefix[] = "^(?:"; + static char const xsuffix[] = ")$"; + int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1, + sizeof xprefix - 1 + sizeof xsuffix - 1); + char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4); + int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0); + char *patlim = pattern + size; + char *n = re; + char const *p; + char const *pnul; + struct pcre_comp *pc = xcalloc (1, sizeof (*pc)); + + if (localeinfo.multibyte) + { + if (! localeinfo.using_utf8) + die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); + flags |= PCRE_UTF8; + } + + /* FIXME: Remove this restriction. */ + if (rawmemchr (pattern, '\n') != patlim) + die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); + + *n = '\0'; + if (match_words) + strcpy (n, wprefix); + if (match_lines) + strcpy (n, xprefix); + n += strlen (n); + + /* The PCRE interface doesn't allow NUL bytes in the pattern, so + replace each NUL byte in the pattern with the four characters + "\000", removing a preceding backslash if there are an odd + number of backslashes before the NUL. */ + *patlim = '\0'; + for (p = pattern; (pnul = p + strlen (p)) < patlim; p = pnul + 1) + { + memcpy (n, p, pnul - p); + n += pnul - p; + for (p = pnul; pattern < p && p[-1] == '\\'; p--) + continue; + n -= (pnul - p) & 1; + strcpy (n, "\\000"); + n += 4; + } + memcpy (n, p, patlim - p + 1); + n += patlim - p; + *patlim = '\n'; + + if (match_words) + strcpy (n, wsuffix); + if (match_lines) + strcpy (n, xsuffix); + + pc->cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); + if (!pc->cre) + die (EXIT_TROUBLE, 0, "%s", ep); + + int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE; + pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep); + if (ep) + die (EXIT_TROUBLE, 0, "%s", ep); + +#if PCRE_STUDY_JIT_COMPILE + if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e)) + die (EXIT_TROUBLE, 0, _("internal error (should never happen)")); + + /* The PCRE documentation says that a 32 KiB stack is the default. */ + if (e) + pc->jit_stack_size = 32 << 10; +#endif + + free (re); + + int sub[NSUB]; + pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0, + PCRE_NOTBOL, sub, NSUB); + pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub, + NSUB); + + return pc; +} + +size_t +Pexecute (void *vcp, char const *buf, size_t size, size_t *match_size, + char const *start_ptr) +{ + int sub[NSUB]; + char const *p = start_ptr ? start_ptr : buf; + bool bol = p[-1] == eolbyte; + char const *line_start = buf; + int e = PCRE_ERROR_NOMATCH; + char const *line_end; + struct pcre_comp *pc = vcp; + + /* The search address to pass to pcre_exec. This is the start of + the buffer, or just past the most-recently discovered encoding + error or line end. */ + char const *subject = buf; + + do + { + /* Search line by line. Although this code formerly used + PCRE_MULTILINE for performance, the performance wasn't always + better and the correctness issues were too puzzling. See + Bug#22655. */ + line_end = rawmemchr (p, eolbyte); + if (INT_MAX < line_end - p) + die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); + + for (;;) + { + /* Skip past bytes that are easily determined to be encoding + errors, treating them as data that cannot match. This is + faster than having pcre_exec check them. */ + while (localeinfo.sbclen[to_uchar (*p)] == -1) + { + p++; + subject = p; + bol = false; + } + + int search_offset = p - subject; + + /* Check for an empty match; this is faster than letting + pcre_exec do it. */ + if (p == line_end) + { + sub[0] = sub[1] = search_offset; + e = pc->empty_match[bol]; + break; + } + + int options = 0; + if (!bol) + options |= PCRE_NOTBOL; + + e = jit_exec (pc, subject, line_end - subject, search_offset, + options, sub); + if (e != PCRE_ERROR_BADUTF8) + break; + int valid_bytes = sub[0]; + + if (search_offset <= valid_bytes) + { + /* Try to match the string before the encoding error. */ + if (valid_bytes == 0) + { + /* Handle the empty-match case specially, for speed. + This optimization is valid if VALID_BYTES is zero, + which means SEARCH_OFFSET is also zero. */ + sub[1] = 0; + e = pc->empty_match[bol]; + } + else + e = jit_exec (pc, subject, valid_bytes, search_offset, + options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub); + + if (e != PCRE_ERROR_NOMATCH) + break; + + /* Treat the encoding error as data that cannot match. */ + p = subject + valid_bytes + 1; + bol = false; + } + + subject += valid_bytes + 1; + } + + if (e != PCRE_ERROR_NOMATCH) + break; + bol = true; + p = subject = line_start = line_end + 1; + } + while (p < buf + size); + + if (e <= 0) + { + switch (e) + { + case PCRE_ERROR_NOMATCH: + break; + + case PCRE_ERROR_NOMEMORY: + die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ()); + +#if PCRE_STUDY_JIT_COMPILE + case PCRE_ERROR_JIT_STACKLIMIT: + die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"), + input_filename ()); +#endif + + case PCRE_ERROR_MATCHLIMIT: + die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"), + input_filename ()); + + case PCRE_ERROR_RECURSIONLIMIT: + die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"), + input_filename ()); + + default: + /* For now, we lump all remaining PCRE failures into this basket. + If anyone cares to provide sample grep usage that can trigger + particular PCRE errors, we can add to the list (above) of more + detailed diagnostics. */ + die (EXIT_TROUBLE, 0, _("%s: internal PCRE error: %d"), + input_filename (), e); + } + + return -1; + } + else + { + char const *matchbeg = subject + sub[0]; + char const *matchend = subject + sub[1]; + char const *beg; + char const *end; + if (start_ptr) + { + beg = matchbeg; + end = matchend; + } + else + { + beg = line_start; + end = line_end + 1; + } + *match_size = end - beg; + return beg - buf; + } +} |