1 files changed, 220 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/meta/reverse_inner.rs b/third_party/rust/regex-automata/src/meta/reverse_inner.rs
new file mode 100644
index 0000000000..3d78779f6f
--- /dev/null
+++ b/third_party/rust/regex-automata/src/meta/reverse_inner.rs
@@ -0,0 +1,220 @@
+/*!
+A module dedicated to plucking inner literals out of a regex pattern, and
+then constructing a prefilter for them. We also include a regex pattern
+"prefix" that corresponds to the bits of the regex that need to match before
+the literals do. The reverse inner optimization then proceeds by looking for
+matches of the inner literal(s), and then doing a reverse search of the prefix
+from the start of the literal match to find the overall start position of the
+match.
+
+The essential invariant we want to uphold here is that the literals we return
+reflect a set where *at least* one of them must match in order for the overall
+regex to match. We also need to maintain the invariant that the regex prefix
+returned corresponds to the entirety of the regex up until the literals we
+return.
+
+This somewhat limits what we can do. That is, if we a regex like
+`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
+from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
+we have a regex like \d+@!|\w+%%`, then we get kind of stymied. Technically,
+we could still extract `{@!, %%}`, and it is true that at least of them must
+match. But then, what is our regex prefix? Again, in theory, that could be
+`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
+matches, and `\w+` only matches when `%%` matches.
+
+All of that is technically possible to do, but it seemingly requires a lot of
+sophistication and machinery. Probably the way to tackle that is with some kind
+of formalism and approach this problem more generally.
+
+For now, the code below basically just looks for a top-level concatenation.
+And if it can find one, it looks for literals in each of the direct child
+sub-expressions of that concatenation. If some good ones are found, we return
+those and a concatenation of the Hir expressions seen up to that point.
+*/
+
+use alloc::vec::Vec;
+
+use regex_syntax::hir::{self, literal, Hir, HirKind};
+
+use crate::{util::prefilter::Prefilter, MatchKind};
+
+/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
+/// one was found, then a concatenation of the HIR expressions that precede it
+/// is returned.
+///
+/// The idea here is that the prefilter returned can be used to find candidate
+/// matches. And then the HIR returned can be used to build a reverse regex
+/// matcher, which will find the start of the candidate match. Finally, the
+/// match still has to be confirmed with a normal anchored forward scan to find
+/// the end position of the match.
+///
+/// Note that this assumes leftmost-first match semantics, so callers must
+/// not call this otherwise.
+pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
+    if hirs.len() != 1 {
+        debug!(
+            "skipping reverse inner optimization since it only \
+		 	 supports 1 pattern, {} were given",
+            hirs.len(),
+        );
+        return None;
+    }
+    let mut concat = match top_concat(hirs[0]) {
+        Some(concat) => concat,
+        None => {
+            debug!(
+                "skipping reverse inner optimization because a top-level \
+		 	     concatenation could not found",
+            );
+            return None;
+        }
+    };
+    // We skip the first HIR because if it did have a prefix prefilter in it,
+    // we probably wouldn't be here looking for an inner prefilter.
+    for i in 1..concat.len() {
+        let hir = &concat[i];
+        let pre = match prefilter(hir) {
+            None => continue,
+            Some(pre) => pre,
+        };
+        // Even if we got a prefilter, if it isn't consider "fast," then we
+        // probably don't want to bother with it. Namely, since the reverse
+        // inner optimization requires some overhead, it likely only makes
+        // sense if the prefilter scan itself is (believed) to be much faster
+        // than the regex engine.
+        if !pre.is_fast() {
+            debug!(
+                "skipping extracted inner prefilter because \
+				 it probably isn't fast"
+            );
+            continue;
+        }
+        let concat_suffix = Hir::concat(concat.split_off(i));
+        let concat_prefix = Hir::concat(concat);
+        // Look for a prefilter again. Why? Because above we only looked for
+        // a prefilter on the individual 'hir', but we might be able to find
+        // something better and more discriminatory by looking at the entire
+        // suffix. We don't do this above to avoid making this loop worst case
+        // quadratic in the length of 'concat'.
+        let pre2 = match prefilter(&concat_suffix) {
+            None => pre,
+            Some(pre2) => {
+                if pre2.is_fast() {
+                    pre2
+                } else {
+                    pre
+                }
+            }
+        };
+        return Some((concat_prefix, pre2));
+    }
+    debug!(
+        "skipping reverse inner optimization because a top-level \
+	     sub-expression with a fast prefilter could not be found"
+    );
+    None
+}
+
+/// Attempt to extract a prefilter from an HIR expression.
+///
+/// We do a little massaging here to do our best that the prefilter we get out
+/// of this is *probably* fast. Basically, the false positive rate has a much
+/// higher impact for things like the reverse inner optimization because more
+/// work needs to potentially be done for each candidate match.
+///
+/// Note that this assumes leftmost-first match semantics, so callers must
+/// not call this otherwise.
+fn prefilter(hir: &Hir) -> Option<Prefilter> {
+    let mut extractor = literal::Extractor::new();
+    extractor.kind(literal::ExtractKind::Prefix);
+    let mut prefixes = extractor.extract(hir);
+    debug!(
+        "inner prefixes (len={:?}) extracted before optimization: {:?}",
+        prefixes.len(),
+        prefixes
+    );
+    // Since these are inner literals, we know they cannot be exact. But the
+    // extractor doesn't know this. We mark them as inexact because this might
+    // impact literal optimization. Namely, optimization weights "all literals
+    // are exact" as very high, because it presumes that any match results in
+    // an overall match. But of course, that is not the case here.
+    //
+    // In practice, this avoids plucking out a ASCII-only \s as an alternation
+    // of single-byte whitespace characters.
+    prefixes.make_inexact();
+    prefixes.optimize_for_prefix_by_preference();
+    debug!(
+        "inner prefixes (len={:?}) extracted after optimization: {:?}",
+        prefixes.len(),
+        prefixes
+    );
+    prefixes
+        .literals()
+        .and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
+}
+
+/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
+/// try to return one even if it's embedded in a capturing group, but is
+/// otherwise pretty conservative in what is returned.
+///
+/// The HIR returned is a complete copy of the concat with all capturing
+/// groups removed. In effect, the concat returned is "flattened" with respect
+/// to capturing groups. This makes the detection logic above for prefixes
+/// a bit simpler, and it works because 1) capturing groups never influence
+/// whether a match occurs or not and 2) capturing groups are not used when
+/// doing the reverse inner search to find the start of the match.
+fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
+    loop {
+        hir = match hir.kind() {
+            HirKind::Empty
+            | HirKind::Literal(_)
+            | HirKind::Class(_)
+            | HirKind::Look(_)
+            | HirKind::Repetition(_)
+            | HirKind::Alternation(_) => return None,
+            HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
+            HirKind::Concat(ref subs) => {
+                // We are careful to only do the flattening/copy when we know
+                // we have a "top level" concat we can inspect. This avoids
+                // doing extra work in cases where we definitely won't use it.
+                // (This might still be wasted work if we can't go on to find
+                // some literals to extract.)
+                let concat =
+                    Hir::concat(subs.iter().map(|h| flatten(h)).collect());
+                return match concat.into_kind() {
+                    HirKind::Concat(xs) => Some(xs),
+                    // It is actually possible for this case to occur, because
+                    // 'Hir::concat' might simplify the expression to the point
+                    // that concatenations are actually removed. One wonders
+                    // whether this leads to other cases where we should be
+                    // extracting literals, but in theory, I believe if we do
+                    // get here, then it means that a "real" prefilter failed
+                    // to be extracted and we should probably leave well enough
+                    // alone. (A "real" prefilter is unbothered by "top-level
+                    // concats" and "capturing groups.")
+                    _ => return None,
+                };
+            }
+        };
+    }
+}
+
+/// Returns a copy of the given HIR but with all capturing groups removed.
+fn flatten(hir: &Hir) -> Hir {
+    match hir.kind() {
+        HirKind::Empty => Hir::empty(),
+        HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
+        HirKind::Class(ref x) => Hir::class(x.clone()),
+        HirKind::Look(ref x) => Hir::look(x.clone()),
+        HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
+        // This is the interesting case. We just drop the group information
+        // entirely and use the child HIR itself.
+        HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub),
+        HirKind::Alternation(ref xs) => {
+            Hir::alternation(xs.iter().map(|x| flatten(x)).collect())
+        }
+        HirKind::Concat(ref xs) => {
+            Hir::concat(xs.iter().map(|x| flatten(x)).collect())
+        }
+    }
+}