diff options
Diffstat (limited to 'src/grep/src/dfasearch.c')
-rw-r--r-- | src/grep/src/dfasearch.c | 590 |
1 files changed, 590 insertions, 0 deletions
diff --git a/src/grep/src/dfasearch.c b/src/grep/src/dfasearch.c new file mode 100644 index 0000000..d6afa8d --- /dev/null +++ b/src/grep/src/dfasearch.c @@ -0,0 +1,590 @@ +/* dfasearch.c - searching subroutines using dfa and regex for grep. + Copyright 1992, 1998, 2000, 2007, 2009-2021 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1992 by Mike Haertel. */ + +#include <config.h> +#include "intprops.h" +#include "search.h" +#include "die.h" +#include <error.h> + +struct dfa_comp +{ + /* KWset compiled pattern. For GEAcompile, we compile + a list of strings, at least one of which is known to occur in + any string matching the regexp. */ + kwset_t kwset; + + /* DFA compiled regexp. */ + struct dfa *dfa; + + /* Regex compiled regexps. */ + struct re_pattern_buffer *patterns; + size_t pcount; + struct re_registers regs; + + /* Number of compiled fixed strings known to exactly match the regexp. + If kwsexec returns < kwset_exact_matches, then we don't need to + call the regexp matcher at all. */ + ptrdiff_t kwset_exact_matches; + + bool begline; +}; + +void +dfaerror (char const *mesg) +{ + die (EXIT_TROUBLE, 0, "%s", mesg); +} + +/* For now, the sole dfawarn-eliciting condition (use of a regexp + like '[:lower:]') is unequivocally an error, so treat it as such, + when possible. */ +void +dfawarn (char const *mesg) +{ + if (!getenv ("POSIXLY_CORRECT")) + dfaerror (mesg); +} + +/* If the DFA turns out to have some set of fixed strings one of + which must occur in the match, then we build a kwset matcher + to find those strings, and thus quickly filter out impossible + matches. */ +static void +kwsmusts (struct dfa_comp *dc) +{ + struct dfamust *dm = dfamust (dc->dfa); + if (!dm) + return; + dc->kwset = kwsinit (false); + if (dm->exact) + { + /* Prepare a substring whose presence implies a match. + The kwset matcher will return the index of the matching + string that it chooses. */ + ++dc->kwset_exact_matches; + ptrdiff_t old_len = strlen (dm->must); + ptrdiff_t new_len = old_len + dm->begline + dm->endline; + char *must = xmalloc (new_len); + char *mp = must; + *mp = eolbyte; + mp += dm->begline; + dc->begline |= dm->begline; + memcpy (mp, dm->must, old_len); + if (dm->endline) + mp[old_len] = eolbyte; + kwsincr (dc->kwset, must, new_len); + free (must); + } + else + { + /* Otherwise, filtering with this substring should help reduce the + search space, but we'll still have to use the regexp matcher. */ + kwsincr (dc->kwset, dm->must, strlen (dm->must)); + } + kwsprep (dc->kwset); + dfamustfree (dm); +} + +/* Return true if KEYS, of length LEN, might contain a back-reference. + Return false if KEYS cannot contain a back-reference. + BS_SAFE is true of encodings where a backslash cannot appear as the + last byte of a multibyte character. */ +static bool _GL_ATTRIBUTE_PURE +possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe) +{ + /* Normally a backslash, but in an unsafe encoding this is a non-char + value so that the comparison below always fails, because if there + are two adjacent '\' bytes, the first might be the last byte of a + multibyte character. */ + int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1; + + /* This code can return true even if KEYS lacks a back-reference, for + patterns like [\2], or for encodings where '\' appears as the last + byte of a multibyte character. However, false alarms should be + rare and do not affect correctness. */ + + /* Do not look for a backslash in the pattern's last byte, since it + can't be part of a back-reference and this streamlines the code. */ + len--; + + if (0 <= len) + { + char const *lim = keys + len; + for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++) + { + if ('1' <= p[1] && p[1] <= '9') + return true; + if (p[1] == second_backslash) + { + p++; + if (p == lim) + break; + } + } + } + return false; +} + +static bool +regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len, + ptrdiff_t pcount, ptrdiff_t lineno, reg_syntax_t syntax_bits, + bool syntax_only) +{ + struct re_pattern_buffer pat0; + struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount]; + pat->buffer = NULL; + pat->allocated = 0; + + /* Do not use a fastmap with -i, to work around glibc Bug#20381. */ + pat->fastmap = (syntax_only | match_icase) ? NULL : xmalloc (UCHAR_MAX + 1); + + pat->translate = NULL; + + if (syntax_only) + re_set_syntax (syntax_bits | RE_NO_SUB); + else + re_set_syntax (syntax_bits); + + char const *err = re_compile_pattern (p, len, pat); + if (!err) + return true; + + /* Emit a filename:lineno: prefix for patterns taken from files. */ + size_t pat_lineno; + char const *pat_filename + = lineno < 0 ? "" : pattern_file_name (lineno, &pat_lineno); + + if (*pat_filename == '\0') + error (0, 0, "%s", err); + else + error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err); + + return false; +} + +/* Compile PATTERN, containing SIZE bytes that are followed by '\n'. + SYNTAX_BITS specifies whether PATTERN uses style -G, -E, or -A. + Return a description of the compiled pattern. */ + +void * +GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits, + bool exact) +{ + char *motif; + struct dfa_comp *dc = xcalloc (1, sizeof (*dc)); + + dc->dfa = dfaalloc (); + + if (match_icase) + syntax_bits |= RE_ICASE; + int dfaopts = eolbyte ? 0 : DFA_EOL_NUL; + dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts); + bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8; + + /* For GNU regex, pass the patterns separately to detect errors like + "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and + this should be a syntax error. The same for backref, where the + backref should be local to each pattern. */ + char const *p = pattern; + char const *patlim = pattern + size; + bool compilation_failed = false; + + dc->patterns = xmalloc (sizeof *dc->patterns); + dc->patterns++; + dc->pcount = 0; + size_t palloc = 1; + + char const *prev = pattern; + + /* Buffer containing back-reference-free patterns. */ + char *buf = NULL; + ptrdiff_t buflen = 0; + size_t bufalloc = 0; + + ptrdiff_t lineno = 0; + + do + { + char const *sep = rawmemchr (p, '\n'); + ptrdiff_t len = sep - p; + + bool backref = possible_backrefs_in_pattern (p, len, bs_safe); + + if (backref && prev < p) + { + ptrdiff_t prevlen = p - prev; + while (bufalloc < buflen + prevlen) + buf = x2realloc (buf, &bufalloc); + memcpy (buf + buflen, prev, prevlen); + buflen += prevlen; + } + + /* Ensure room for at least two more patterns. The extra one is + for the regex_compile that may be executed after this loop + exits, and its (unused) slot is patterns[-1] until then. */ + while (palloc <= dc->pcount + 1) + { + dc->patterns = x2nrealloc (dc->patterns - 1, &palloc, + sizeof *dc->patterns); + dc->patterns++; + } + + re_set_syntax (syntax_bits); + + if (!regex_compile (dc, p, len, dc->pcount, lineno, syntax_bits, + !backref)) + compilation_failed = true; + + p = sep + 1; + lineno++; + + if (backref) + { + dc->pcount++; + prev = p; + } + } + while (p <= patlim); + + if (compilation_failed) + exit (EXIT_TROUBLE); + + if (prev <= patlim) + { + if (pattern < prev) + { + ptrdiff_t prevlen = patlim - prev; + buf = xrealloc (buf, buflen + prevlen); + memcpy (buf + buflen, prev, prevlen); + buflen += prevlen; + } + else + { + buf = pattern; + buflen = size; + } + } + + /* In the match_words and match_lines cases, we use a different pattern + for the DFA matcher that will quickly throw out cases that won't work. + Then if DFA succeeds we do some hairy stuff using the regex matcher + to decide whether the match should really count. */ + if (match_words || match_lines) + { + static char const line_beg_no_bk[] = "^("; + static char const line_end_no_bk[] = ")$"; + static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])("; + static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)"; + static char const line_beg_bk[] = "^\\("; + static char const line_end_bk[] = "\\)$"; + static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\("; + static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)"; + int bk = !(syntax_bits & RE_NO_BK_PARENS); + char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk); + + strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk) + : (bk ? word_beg_bk : word_beg_no_bk)); + size_t total = strlen (n); + memcpy (n + total, pattern, size); + total += size; + strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk) + : (bk ? word_end_bk : word_end_no_bk)); + total += strlen (n + total); + pattern = motif = n; + size = total; + } + else + motif = NULL; + + dfaparse (pattern, size, dc->dfa); + kwsmusts (dc); + dfacomp (NULL, 0, dc->dfa, 1); + + if (buf != NULL) + { + if (exact || !dfasupported (dc->dfa)) + { + dc->patterns--; + dc->pcount++; + + if (!regex_compile (dc, buf, buflen, 0, -1, syntax_bits, false)) + abort (); + } + + if (buf != pattern) + free (buf); + } + + free (motif); + + return dc; +} + +size_t +EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size, + char const *start_ptr) +{ + char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start; + char eol = eolbyte; + regoff_t start; + size_t len, best_len; + struct kwsmatch kwsm; + size_t i; + struct dfa_comp *dc = vdc; + struct dfa *superset = dfasuperset (dc->dfa); + bool dfafast = dfaisfast (dc->dfa); + + mb_start = buf; + buflim = buf + size; + + for (beg = end = buf; end < buflim; beg = end) + { + end = buflim; + + if (!start_ptr) + { + char const *next_beg, *dfa_beg = beg; + ptrdiff_t count = 0; + bool exact_kwset_match = false; + bool backref = false; + + /* Try matching with KWset, if it's defined. */ + if (dc->kwset) + { + char const *prev_beg; + + /* Find a possible match using the KWset matcher. */ + ptrdiff_t offset = kwsexec (dc->kwset, beg - dc->begline, + buflim - beg + dc->begline, + &kwsm, true); + if (offset < 0) + return offset; + match = beg + offset; + prev_beg = beg; + + /* Narrow down to the line containing the possible match. */ + beg = memrchr (buf, eol, match - buf); + beg = beg ? beg + 1 : buf; + dfa_beg = beg; + + /* Determine the end pointer to give the DFA next. Typically + this is after the first newline after MATCH; but if the KWset + match is not exact, the DFA is fast, and the offset from + PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the + greater of the latter two values; this temporarily prefers + the DFA to KWset. */ + exact_kwset_match = kwsm.index < dc->kwset_exact_matches; + if (exact_kwset_match || !dfafast + || MAX (16, match - beg) < (match - prev_beg) >> 2) + { + end = rawmemchr (match, eol); + end++; + } + else if (MAX (16, match - beg) < (buflim - prev_beg) >> 2) + { + end = rawmemchr (prev_beg + 4 * MAX (16, match - beg), eol); + end++; + } + else + end = buflim; + + if (exact_kwset_match) + { + if (!localeinfo.multibyte | localeinfo.using_utf8) + goto success; + if (mb_start < beg) + mb_start = beg; + if (mb_goback (&mb_start, NULL, match, buflim) == 0) + goto success; + /* The matched line starts in the middle of a multibyte + character. Perform the DFA search starting from the + beginning of the next character. */ + dfa_beg = mb_start; + } + } + + /* Try matching with the superset of DFA, if it's defined. */ + if (superset && !exact_kwset_match) + { + /* Keep using the superset while it reports multiline + potential matches; this is more likely to be fast + than falling back to KWset would be. */ + next_beg = dfaexec (superset, dfa_beg, (char *) end, 0, + &count, NULL); + if (next_beg == NULL || next_beg == end) + continue; + + /* Narrow down to the line we've found. */ + if (count != 0) + { + beg = memrchr (buf, eol, next_beg - buf); + beg++; + dfa_beg = beg; + } + end = rawmemchr (next_beg, eol); + end++; + + count = 0; + } + + /* Try matching with DFA. */ + next_beg = dfaexec (dc->dfa, dfa_beg, (char *) end, 0, &count, + &backref); + + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == end) + continue; + + /* Narrow down to the line we've found. */ + if (count != 0) + { + beg = memrchr (buf, eol, next_beg - buf); + beg++; + } + end = rawmemchr (next_beg, eol); + end++; + + /* Successful, no back-references encountered! */ + if (!backref) + goto success; + ptr = beg; + } + else + { + /* We are looking for the leftmost (then longest) exact match. + We will go through the outer loop only once. */ + ptr = start_ptr; + } + + /* If the "line" is longer than the maximum regexp offset, + die as if we've run out of memory. */ + if (TYPE_MAXIMUM (regoff_t) < end - beg - 1) + xalloc_die (); + + /* Run the possible match through Regex. */ + best_match = end; + best_len = 0; + for (i = 0; i < dc->pcount; i++) + { + dc->patterns[i].not_eol = 0; + dc->patterns[i].newline_anchor = eolbyte == '\n'; + start = re_search (&dc->patterns[i], beg, end - beg - 1, + ptr - beg, end - ptr - 1, &dc->regs); + if (start < -1) + xalloc_die (); + else if (0 <= start) + { + len = dc->regs.end[0] - start; + match = beg + start; + if (match > best_match) + continue; + if (start_ptr && !match_words) + goto assess_pattern_match; + if ((!match_lines && !match_words) + || (match_lines && len == end - ptr - 1)) + { + match = ptr; + len = end - ptr; + goto assess_pattern_match; + } + /* If -w and not -x, check whether the match aligns with + word boundaries. Do this iteratively because: + (a) the line may contain more than one occurrence of the + pattern, and + (b) Several alternatives in the pattern might be valid at a + given point, and we may need to consider a shorter one to + find a word boundary. */ + if (!match_lines && match_words) + while (match <= best_match) + { + regoff_t shorter_len = 0; + if (! wordchar_next (match + len, end - 1) + && ! wordchar_prev (beg, match, end - 1)) + goto assess_pattern_match; + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + dc->patterns[i].not_eol = 1; + shorter_len = re_match (&dc->patterns[i], beg, + match + len - ptr, match - beg, + &dc->regs); + if (shorter_len < -1) + xalloc_die (); + } + if (0 < shorter_len) + len = shorter_len; + else + { + /* Try looking further on. */ + if (match == end - 1) + break; + match++; + dc->patterns[i].not_eol = 0; + start = re_search (&dc->patterns[i], beg, end - beg - 1, + match - beg, end - match - 1, + &dc->regs); + if (start < 0) + { + if (start < -1) + xalloc_die (); + break; + } + len = dc->regs.end[0] - start; + match = beg + start; + } + } /* while (match <= best_match) */ + continue; + assess_pattern_match: + if (!start_ptr) + { + /* Good enough for a non-exact match. + No need to look at further patterns, if any. */ + goto success; + } + if (match < best_match || (match == best_match && len > best_len)) + { + /* Best exact match: leftmost, then longest. */ + best_match = match; + best_len = len; + } + } /* if re_search >= 0 */ + } /* for Regex patterns. */ + if (best_match < end) + { + /* We have found an exact match. We were just + waiting for the best one (leftmost then longest). */ + beg = best_match; + len = best_len; + goto success_in_len; + } + } /* for (beg = end ..) */ + + return -1; + + success: + len = end - beg; + success_in_len:; + size_t off = beg - buf; + *match_size = len; + return off; +} |