author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit    26a029d407be480d791972afb5975cf62c9360a6
tree      f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/aho-corasick
parent    Initial commit.
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/aho-corasick')
-rw-r--r-- third_party/rust/aho-corasick/.cargo-checksum.json | 1
-rw-r--r-- third_party/rust/aho-corasick/COPYING | 3
-rw-r--r-- third_party/rust/aho-corasick/Cargo.toml | 74
-rw-r--r-- third_party/rust/aho-corasick/DESIGN.md | 481
-rw-r--r-- third_party/rust/aho-corasick/LICENSE-MIT | 21
-rw-r--r-- third_party/rust/aho-corasick/README.md | 174
-rw-r--r-- third_party/rust/aho-corasick/UNLICENSE | 24
-rw-r--r-- third_party/rust/aho-corasick/rustfmt.toml | 2
-rw-r--r-- third_party/rust/aho-corasick/src/ahocorasick.rs | 2789
-rw-r--r-- third_party/rust/aho-corasick/src/automaton.rs | 1608
-rw-r--r-- third_party/rust/aho-corasick/src/dfa.rs | 814
-rw-r--r-- third_party/rust/aho-corasick/src/lib.rs | 326
-rw-r--r-- third_party/rust/aho-corasick/src/macros.rs | 18
-rw-r--r-- third_party/rust/aho-corasick/src/nfa/contiguous.rs | 1141
-rw-r--r-- third_party/rust/aho-corasick/src/nfa/mod.rs | 40
-rw-r--r-- third_party/rust/aho-corasick/src/nfa/noncontiguous.rs | 1762
-rw-r--r-- third_party/rust/aho-corasick/src/packed/api.rs | 687
-rw-r--r-- third_party/rust/aho-corasick/src/packed/ext.rs | 39
-rw-r--r-- third_party/rust/aho-corasick/src/packed/mod.rs | 120
-rw-r--r-- third_party/rust/aho-corasick/src/packed/pattern.rs | 480
-rw-r--r-- third_party/rust/aho-corasick/src/packed/rabinkarp.rs | 168
-rw-r--r-- third_party/rust/aho-corasick/src/packed/teddy/README.md | 386
-rw-r--r-- third_party/rust/aho-corasick/src/packed/teddy/builder.rs | 780
-rw-r--r-- third_party/rust/aho-corasick/src/packed/teddy/generic.rs | 1382
-rw-r--r-- third_party/rust/aho-corasick/src/packed/teddy/mod.rs | 9
-rw-r--r-- third_party/rust/aho-corasick/src/packed/tests.rs | 583
-rw-r--r-- third_party/rust/aho-corasick/src/packed/vector.rs | 1750
-rw-r--r-- third_party/rust/aho-corasick/src/tests.rs | 1664
-rw-r--r-- third_party/rust/aho-corasick/src/transducer.rs | 270
-rw-r--r-- third_party/rust/aho-corasick/src/util/alphabet.rs | 409
-rw-r--r-- third_party/rust/aho-corasick/src/util/buffer.rs | 124
-rw-r--r-- third_party/rust/aho-corasick/src/util/byte_frequencies.rs | 258
-rw-r--r-- third_party/rust/aho-corasick/src/util/debug.rs | 26
-rw-r--r-- third_party/rust/aho-corasick/src/util/error.rs | 259
-rw-r--r-- third_party/rust/aho-corasick/src/util/int.rs | 284
-rw-r--r-- third_party/rust/aho-corasick/src/util/mod.rs | 12
-rw-r--r-- third_party/rust/aho-corasick/src/util/prefilter.rs | 924
-rw-r--r-- third_party/rust/aho-corasick/src/util/primitives.rs | 759
-rw-r--r-- third_party/rust/aho-corasick/src/util/remapper.rs | 214
-rw-r--r-- third_party/rust/aho-corasick/src/util/search.rs | 1148
-rw-r--r-- third_party/rust/aho-corasick/src/util/special.rs | 42
41 files changed, 22055 insertions, 0 deletions
diff --git a/third_party/rust/aho-corasick/.cargo-checksum.json b/third_party/rust/aho-corasick/.cargo-checksum.json
new file mode 100644
index 0000000000..233f8202c8
--- /dev/null
+++ b/third_party/rust/aho-corasick/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"55608b09e18c96a0c245d8af2546e63bb4576fa378c1f2ce38c7909f3b225007","DESIGN.md":"59c960e1b73b1d7fb41e4df6c0c1b1fcf44dd2ebc8a349597a7d0595f8cb5130","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"afc4d559a98cf190029af0bf320fc0022725e349cd2a303aac860254e28f3c53","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"c699c07df70be45c666e128509ad571a7649d2073e4ae16ac1efd6793c9c6890","src/automaton.rs":"22258a3e118672413119f8f543a9b912cce954e63524575c0ebfdf9011f9c2dd","src/dfa.rs":"c495d615545981e1d0a4174bf0a6ab87fd81c30c7d4527acf2d64a56323bdc36","src/lib.rs":"2a92d5c5e930f2d306508802e8a929135e1f41c9f5f8deda8f7eb98947179dd2","src/macros.rs":"c6c52ae05b24433cffaca7b78b3645d797862c5d5feffddf9f54909095ed6e05","src/nfa/contiguous.rs":"aeb6ee5fd80eea04decbc4b46aa27d1ab270b78d416a644da25b7934f009ee66","src/nfa/mod.rs":"ee7b3109774d14bbad5239c16bb980dd6b8185ec136d94fbaf2f0dc27d5ffa15","src/nfa/noncontiguous.rs":"de94f02b04efd8744fb096759a8897c22012b0e0ca3ace161fd87c71befefe04","src/packed/api.rs":"160d3b10823316f7b0924e13c3afd222c8a7db5c0a00432401f311ef27d6a1b7","src/packed/ext.rs":"66be06fde8558429da23a290584d4b9fae665bf64c2578db4fe5f5f3ee864869","src/packed/mod.rs":"0020cd6f07ba5c8955923a9516d7f758864260eda53a6b6f629131c45ddeec62","src/packed/pattern.rs":"1e3a289a730c141fc30b295811e372d046c6619c7fd670308299b889a06c7673","src/packed/rabinkarp.rs":"403146eb1d838a84601d171393542340513cd1ee7ff750f2372161dd47746586","src/packed/teddy/README.md":"3a43194b64e221543d885176aba3beb1224a927385a20eca842daf6b0ea2f342","src/packed/teddy/builder.rs":"720735ea6c7ff92b081426513e6e82feed24a922849297bb538d28f7b8129f81","src/packed/teddy/generic.rs":"ea252ab05b32cea7dd9d71e332071d243db7dd0362e049252a27e5881ba2bf39","src/packed/teddy/mod.rs":"17d741f7e2fb9dbac5ba7d1bd4542cf1e35e9f146ace728e23fe6bbed20028b2","src/packed/tests.rs":"8e2f56eb3890ed3876ecb47d3121996e416563127b6430110d7b516df3f83b4b","src/packed/vector.rs":"6e0400422de015e181c758ef3a4ff517fc8d0481b078a82de00f6e29e9d2e1c8","src/tests.rs":"c68192ab97b6161d0d6ee96fefd80cc7d14e4486ddcd8d1f82b5c92432c24ed5","src/transducer.rs":"02daa33a5d6dac41dcfd67f51df7c0d4a91c5131c781fb54c4de3520c585a6e1","src/util/alphabet.rs":"6dc22658a38deddc0279892035b18870d4585069e35ba7c7e649a24509acfbcc","src/util/buffer.rs":"f9e37f662c46c6ecd734458dedbe76c3bb0e84a93b6b0117c0d4ad3042413891","src/util/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/util/debug.rs":"ab301ad59aa912529cb97233a54a05914dd3cb2ec43e6fec7334170b97ac5998","src/util/error.rs":"ecccd60e7406305023efcc6adcc826eeeb083ab8f7fbfe3d97469438cd4c4e5c","src/util/int.rs":"4ab6dbdba10027ddec2af63a9b28ce4eee30ded0daa5d8eb068b2b55542b6039","src/util/mod.rs":"7ab28d11323ecdbd982087f32eb8bceeee84f1a2583f3aae27039c36d58cf12c","src/util/prefilter.rs":"9fa4498f18bf70478b1996c1a013698b626d15f119aa81dbc536673c9f045718","src/util/primitives.rs":"f89f3fa1d8db4e37de9ca767c6d05e346404837cade6d063bba68972fafa610b","src/util/remapper.rs":"9f12d911583a325c11806eeceb46d0dfec863cfcfa241aed84d31af73da746e5","src/util/search.rs":"6af803e08b8b8c8a33db100623f1621b0d741616524ce40893d8316897f27ffe","src/util/special.rs":"7d2f9cb9dd9771f59816e829b2d96b1239996f32939ba98764e121696c52b146"},"package":"0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430f
a9661f457981503dd5bf0"}
\ No newline at end of file
diff --git a/third_party/rust/aho-corasick/COPYING b/third_party/rust/aho-corasick/COPYING
new file mode 100644
index 0000000000..bb9c20a094
--- /dev/null
+++ b/third_party/rust/aho-corasick/COPYING
@@ -0,0 +1,3 @@
+This project is dual-licensed under the Unlicense and MIT licenses.
+
+You may use this code under the terms of either license.
diff --git a/third_party/rust/aho-corasick/Cargo.toml b/third_party/rust/aho-corasick/Cargo.toml
new file mode 100644
index 0000000000..f2ebca9b9b
--- /dev/null
+++ b/third_party/rust/aho-corasick/Cargo.toml
@@ -0,0 +1,74 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+rust-version = "1.60.0"
+name = "aho-corasick"
+version = "1.1.0"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+exclude = [
+ "/aho-corasick-debug",
+ "/benchmarks",
+ "/tmp",
+]
+autotests = false
+description = "Fast multiple substring searching."
+homepage = "https://github.com/BurntSushi/aho-corasick"
+readme = "README.md"
+keywords = [
+ "string",
+ "search",
+ "text",
+ "pattern",
+ "multi",
+]
+categories = ["text-processing"]
+license = "Unlicense OR MIT"
+repository = "https://github.com/BurntSushi/aho-corasick"
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = [
+ "--cfg",
+ "docsrs",
+ "--generate-link-to-definition",
+]
+
+[profile.bench]
+debug = 2
+
+[profile.release]
+debug = 2
+
+[lib]
+name = "aho_corasick"
+
+[dependencies.log]
+version = "0.4.17"
+optional = true
+
+[dependencies.memchr]
+version = "2.4.0"
+optional = true
+default-features = false
+
+[dev-dependencies.doc-comment]
+version = "0.3.3"
+
+[features]
+default = [
+ "std",
+ "perf-literal",
+]
+logging = ["dep:log"]
+perf-literal = ["dep:memchr"]
+std = ["memchr?/std"]
diff --git a/third_party/rust/aho-corasick/DESIGN.md b/third_party/rust/aho-corasick/DESIGN.md
new file mode 100644
index 0000000000..f911f0c3ad
--- /dev/null
+++ b/third_party/rust/aho-corasick/DESIGN.md
@@ -0,0 +1,481 @@
+This document describes the internal design of this crate, which is an object
+lesson in what happens when you take a fairly simple old algorithm like
+Aho-Corasick and make it fast and production ready.
+
+The target audience of this document is Rust programmers who have some
+familiarity with string searching; however, one does not need to know the
+Aho-Corasick algorithm in order to read this (it is explained below). One
+should, however, know what a trie is. (If you don't, go read its Wikipedia
+article.)
+
+The center-piece of this crate is an implementation of Aho-Corasick. On its
+own, Aho-Corasick isn't that complicated. The complex pieces come from the
+different variants of Aho-Corasick implemented in this crate. Specifically,
+they are:
+
+* Aho-Corasick as a noncontiguous NFA. States have their transitions
+ represented sparsely, and each state puts its transitions in its own separate
+ allocation. Hence the name "noncontiguous."
+* Aho-Corasick as a contiguous NFA. This NFA uses a single allocation to
+ represent the transitions of all states. That is, transitions are laid out
+ contiguously in memory. Moreover, states near the starting state are
+ represented densely, such that finding the next state ID takes a constant
+ number of instructions.
+* Aho-Corasick as a DFA. In this case, all states are represented densely in
+ a transition table that uses one allocation.
+* Supporting "standard" match semantics, along with its overlapping variant,
+ in addition to leftmost-first and leftmost-longest semantics. The "standard"
+ semantics are typically what you see in a textbook description of
+ Aho-Corasick. However, Aho-Corasick is also useful as an optimization in
+ regex engines, which often use leftmost-first or leftmost-longest semantics.
+ Thus, it is useful to implement those semantics here. The "standard" and
+ "leftmost" search algorithms are subtly different, and also require slightly
+ different construction algorithms.
+* Support for ASCII case insensitive matching.
+* Support for accelerating searches when the patterns all start with a small
+ number of fixed bytes. Or alternatively, when the patterns all contain a
+ small number of rare bytes. (Searching for these bytes uses SIMD vectorized
+ code courtesy of `memchr`.)
+* Transparent support for alternative SIMD vectorized search routines for
+ a smaller number of literals, such as the Teddy algorithm. We call these
+ "packed" search routines because they use SIMD. They can often be an order of
+ magnitude faster than just Aho-Corasick, but don't scale as well.
+* Support for searching streams. This can reuse most of the underlying code,
+ but does require careful buffering support.
+* Support for anchored searches, which permit efficient "is prefix" checks for
+ a large number of patterns.
+
+When you combine all of this together along with trying to make everything as
+fast as possible, what you end up with is entirely too much code with too much
+`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead,
+we will explain it.
+
+
+# Basics
+
+The fundamental problem this crate is trying to solve is to determine the
+occurrences of possibly many patterns in a haystack. The naive way to solve
+this is to look for a match for each pattern at each position in the haystack:
+
+    for i in 0..haystack.len():
+        for p in patterns.iter():
+            if haystack[i..].starts_with(p.bytes()):
+                return Match(p.id(), i, i + p.bytes().len())
+
+Those four lines are effectively all this crate does. The problem with those
+four lines is that they are very slow, especially when you're searching for a
+large number of patterns.
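+
+For reference, that same naive routine written as runnable Rust might look
+like this (a sketch independent of this crate's API; the returned tuple of
+`(pattern index, start, end)` simply stands in for a match type):
+
+    fn naive_find(
+        patterns: &[&[u8]],
+        haystack: &[u8],
+    ) -> Option<(usize, usize, usize)> {
+        for i in 0..haystack.len() {
+            for (id, p) in patterns.iter().enumerate() {
+                if haystack[i..].starts_with(p) {
+                    return Some((id, i, i + p.len()));
+                }
+            }
+        }
+        None
+    }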
+
+While there are many different algorithms available to solve this, a popular
+one is Aho-Corasick. It's a common solution because it's not too hard to
+implement, scales quite well even when searching for thousands of patterns and
+is generally pretty fast. Aho-Corasick does well here because, regardless of
+the number of patterns you're searching for, it always visits each byte in the
+haystack exactly once. This means, generally speaking, adding more patterns to
+an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
+this is not true, since a larger automaton will make less effective use of the
+CPU's cache.)
+
+Aho-Corasick can be succinctly described as a trie with state transitions
+between some of the nodes that efficiently instruct the search algorithm to
+try matching alternative keys in the trie. The trick is that these state
+transitions are arranged such that each byte of input needs to be inspected
+only once. These state transitions are typically called "failure transitions,"
+because they instruct the searcher (the thing traversing the automaton while
+reading from the haystack) what to do when a byte in the haystack does not
+correspond to a valid transition in the current state of the trie.
+
+More formally, a failure transition points to a state in the automaton that may
+lead to a match whose prefix is a proper suffix of the path traversed through
+the trie so far. (If no such proper suffix exists, then the failure transition
+points back to the start state of the trie, effectively restarting the search.)
+This is perhaps simpler to explain pictorially. For example, let's say we built
+an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The
+trie looks like this:
+
+        a - S1 - b - S2 - c - S3 - d - S4*
+       /
+    S0 - c - S5 - e - S6 - f - S7*
+
+where states marked with a `*` are match states (meaning, the search algorithm
+should stop and report a match to the caller).
+
+So given this trie, it should be somewhat straightforward to see how it can
+be used to determine whether any particular haystack *starts* with either
+`abcd` or `cef`. It's easy to express this in code:
+
+    fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool {
+        let mut state_id = trie.start();
+        // If the empty pattern is in trie, then state_id is a match state.
+        if trie.is_match(state_id) {
+            return true;
+        }
+        for (i, &b) in haystack.iter().enumerate() {
+            state_id = match trie.next_state(state_id, b) {
+                Some(id) => id,
+                // If there was no transition for this state and byte, then
+                // we know the haystack does not start with one of the
+                // patterns in our trie.
+                None => return false,
+            };
+            if trie.is_match(state_id) {
+                return true;
+            }
+        }
+        false
+    }
+
+And that's pretty much it. All we do is move through the trie starting with the
+bytes at the beginning of the haystack. If we find ourselves in a position
+where we can't move, or if we've looked through the entire haystack without
+seeing a match state, then we know the haystack does not start with any of the
+patterns in the trie.
+
+The meat of the Aho-Corasick algorithm is in how we add failure transitions to
+our trie to keep searching efficient. Specifically, it permits us to not only
+check whether a haystack *starts* with any one of a number of patterns, but
+rather, whether the haystack contains any of a number of patterns *anywhere* in
+the haystack.
+
+As mentioned before, a failure transition connects a proper suffix of the path
+traversed through the trie so far with a path that leads to a match that has a
+prefix corresponding to that proper suffix. So in our case, for patterns `abcd`
+and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from
+the diagram above) from `S3` upon seeing that the byte following `c` is not
+`d`. Namely, the proper suffix in this example is `c`, which is a prefix of
+`cef`. So the modified diagram looks like this:
+
+
+        a - S1 - b - S2 - c - S3 - d - S4*
+       /                     /
+      /       ---------------
+     /       /
+    S0 - c - S5 - e - S6 - f - S7*
+
+One thing that isn't shown in this diagram is that *all* states have a failure
+transition, but only `S3` has a *non-trivial* failure transition. That is, all
+other states have a failure transition back to the start state. So if our
+haystack was `abzabcd`, then the searcher would transition back to `S0` after
+seeing `z`, which effectively restarts the search. (Because there is no pattern
+in our trie that has a prefix of `bz` or `z`.)
+
+The code for traversing this *automaton* or *finite state machine* (it is no
+longer just a trie) is not that much different from the `has_prefix` code
+above:
+
+    fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool {
+        let mut state_id = fsm.start();
+        // If the empty pattern is in fsm, then state_id is a match state.
+        if fsm.is_match(state_id) {
+            return true;
+        }
+        for (i, &b) in haystack.iter().enumerate() {
+            // While the diagram above doesn't show this, we may wind up
+            // needing to follow multiple failure transitions before we land
+            // on a state in which we can advance. Therefore, when searching
+            // for the next state, we need to loop until we don't see a
+            // failure transition.
+            //
+            // This loop terminates because the start state has no empty
+            // transitions. Every transition from the start state either
+            // points to another state, or loops back to the start state.
+            loop {
+                match fsm.next_state(state_id, b) {
+                    Some(id) => {
+                        state_id = id;
+                        break;
+                    }
+                    // Unlike our code above, if there was no transition for
+                    // this state, then we don't quit. Instead, we look for
+                    // this state's failure transition and follow that
+                    // instead.
+                    None => {
+                        state_id = fsm.next_fail_state(state_id);
+                    }
+                };
+            }
+            if fsm.is_match(state_id) {
+                return true;
+            }
+        }
+        false
+    }
+
+Other than the complication around traversing failure transitions, this code
+is still roughly "traverse the automaton with bytes from the haystack, and quit
+when a match is seen."
+
+And that concludes our section on the basics. While we didn't go deep into how
+the automaton is built (see `src/nfa/noncontiguous.rs`, which has detailed
+comments about that), the basic structure of Aho-Corasick should be reasonably
+clear.
+
+
+# NFAs and DFAs
+
+There are generally two types of finite automata: non-deterministic finite
+automata (NFA) and deterministic finite automata (DFA). The difference between
+them is, principally, that an NFA can be in multiple states at once. This is
+typically accomplished by things called _epsilon_ transitions, where one could
+move to a new state without consuming any bytes from the input. (The other
+mechanism by which NFAs can be in more than one state is where the same byte in
+a particular state transitions to multiple distinct states.) In contrast, a DFA
+can only ever be in one state at a time. A DFA has no epsilon transitions, and
+for any given state, a byte transitions to at most one other state.
+
+By this formulation, the Aho-Corasick automaton described in the previous
+section is an NFA. This is because failure transitions are, effectively,
+epsilon transitions. That is, whenever the automaton is in state `S`, it is
+actually in the set of states that are reachable by recursively following
+failure transitions from `S` until you reach the start state. (This means
+that, for example, the start state is always active since the start state is
+reachable via failure transitions from any state in the automaton.)
+
+NFAs have a lot of nice properties. They tend to be easier to construct, and
+also tend to use less memory. However, their primary downside is that they are
+typically slower to execute a search with. For example, the code above showing
+how to search with an Aho-Corasick automaton needs to potentially iterate
+through many failure transitions for every byte of input. While this is a
+fairly small amount of overhead, this can add up, especially if the automaton
+has a lot of overlapping patterns with a lot of failure transitions.
+
+A DFA's search code, by contrast, looks like this:
+
+    fn contains(dfa: &DFA, haystack: &[u8]) -> bool {
+        let mut state_id = dfa.start();
+        // If the empty pattern is in dfa, then state_id is a match state.
+        if dfa.is_match(state_id) {
+            return true;
+        }
+        for (i, &b) in haystack.iter().enumerate() {
+            // An Aho-Corasick DFA *never* has a missing state that requires
+            // failure transitions to be followed. One byte of input advances
+            // the automaton by one state. Always.
+            state_id = dfa.next_state(state_id, b);
+            if dfa.is_match(state_id) {
+                return true;
+            }
+        }
+        false
+    }
+
+The search logic here is much simpler than for the NFA, and this tends to
+translate into significant performance benefits as well, since there's a lot
+less work being done for each byte in the haystack. How is this accomplished?
+It's done by pre-following all failure transitions for all states for all bytes
+in the alphabet, and then building a single state transition table. Building
+this DFA can be much more costly than building the NFA, and use much more
+memory, but the better performance can be worth it.
+
+Users of this crate can actually choose between using one of two possible NFAs
+(noncontiguous or contiguous) or a DFA. By default, a contiguous NFA is used
+in most circumstances, but if the number of patterns is small enough, a DFA
+will be used. A contiguous NFA is chosen because it uses orders of magnitude
+less memory than a DFA, takes only a little longer to build than a
+noncontiguous NFA and usually gets pretty close to the search speed of a DFA.
+(Callers can override this automatic selection via the
+`AhoCorasickBuilder::kind` configuration.)
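+
+For callers who want to force a particular automaton, the builder exposes this
+choice directly. A small sketch using the `AhoCorasickBuilder::kind` option
+and the `AhoCorasickKind` enum from this crate's public API (see the crate
+documentation for the precise signatures):
+
+    use aho_corasick::{AhoCorasick, AhoCorasickKind};
+
+    // Force a DFA instead of letting the crate choose automatically.
+    let ac = AhoCorasick::builder()
+        .kind(Some(AhoCorasickKind::DFA))
+        .build(&["foo", "bar", "baz"])
+        .unwrap();
+    assert_eq!(AhoCorasickKind::DFA, ac.kind());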
+
+
+# More DFA tricks
+
+As described in the previous section, one of the downsides of using a DFA
+is that it uses more memory and can take longer to build. One small way of
+mitigating these concerns is to map the alphabet used by the automaton into
+a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
+one element for each possible value that fits into a byte. However, in many
+cases, one does not need the full alphabet. For example, if all patterns in an
+Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
+bytes. As far as the automaton is concerned, the remaining 204 bytes are
+indistinguishable from one another: they will never discriminate between a
+match or a non-match. Therefore, in cases like that, the alphabet can be shrunk
+to just 53 elements. One for each ASCII letter, and then another to serve as a
+placeholder for every other unused byte.
+
+In practice, this library doesn't quite compute the optimal set of equivalence
+classes, but it's close enough in most cases. The key idea is that this then
+allows the transition table for the DFA to be potentially much smaller. The
+downside of doing this, however, is that since the transition table is defined
+in terms of this smaller alphabet space, every byte in the haystack must be
+re-mapped to this smaller space. This requires an additional 256-byte table.
+In practice, this can lead to a small search time hit, but it can be difficult
+to measure. Moreover, it can sometimes lead to faster search times for bigger
+automata, since it could be the difference between more parts of the automaton
+staying in the CPU cache or not.
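+
+To make the idea concrete, here is a minimal, hypothetical sketch of applying
+a byte class map during a transition lookup. This is illustrative only and is
+not this crate's actual representation; all names here are invented for the
+example:
+
+    // A 256-entry table mapping every possible byte to its equivalence
+    // class. If the patterns only contain ASCII letters, then all other
+    // bytes can share a single "everything else" class.
+    struct ByteClasses([u8; 256]);
+
+    impl ByteClasses {
+        fn get(&self, byte: u8) -> usize {
+            usize::from(self.0[usize::from(byte)])
+        }
+    }
+
+    // A dense transition lookup in class space: each state has
+    // `num_classes` columns instead of 256.
+    fn next_state(
+        transitions: &[u32],
+        num_classes: usize,
+        classes: &ByteClasses,
+        state_index: usize,
+        byte: u8,
+    ) -> usize {
+        transitions[state_index * num_classes + classes.get(byte)] as usize
+    }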
+
+One other trick for DFAs employed by this crate is the notion of premultiplying
+state identifiers. Specifically, the normal way to compute the next transition
+in a DFA is via the following (assuming that the transition table is laid out
+sequentially in memory, in row-major order, where the rows are states):
+
+    next_state_id = dfa.transitions[current_state_id * 256 + current_byte]
+
+However, since the value `256` is a fixed constant, we can actually premultiply
+the state identifiers in the table when we build the table initially. Then, the
+next transition computation simply becomes:
+
+    next_state_id = dfa.transitions[current_state_id + current_byte]
+
+This doesn't seem like much, but when this is being executed for every byte of
+input that you're searching, saving that extra multiplication instruction can
+add up.
+
+The same optimization works even when equivalence classes are enabled, as
+described above. The only difference is that the premultiplication is by the
+total number of equivalence classes instead of 256.
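+
+Continuing the hypothetical sketch above, premultiplication only changes what
+is stored as a state's identifier (`state_index * num_classes`), so the
+per-byte lookup loses its multiplication:
+
+    // `state_id` is already `state_index * num_classes`, so the lookup is
+    // just an add and a load.
+    fn next_state_premultiplied(
+        transitions: &[u32],
+        classes: &ByteClasses,
+        state_id: usize,
+        byte: u8,
+    ) -> usize {
+        transitions[state_id + classes.get(byte)] as usize
+    }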
+
+There isn't much downside to premultiplying state identifiers, other than it
+imposes a smaller limit on the total number of states in the DFA. Namely, with
+premultiplied state identifiers, you run out of room in your state identifier
+representation more rapidly than if the identifiers are just state indices.
+
+Both equivalence classes and premultiplication are always enabled. There is an
+`AhoCorasickBuilder::byte_classes` configuration, but disabling this just makes
+it so there are always 256 equivalence classes, i.e., every class corresponds
+to precisely one byte. When it's disabled, the equivalence class map itself is
+still used. Disabling it is primarily useful when debugging the underlying
+automaton, since the automaton can be easier to comprehend when it uses actual
+byte values for its transitions instead of equivalence classes.
+
+
+# Match semantics
+
+One of the more interesting things about this implementation of Aho-Corasick
+that (as far as this author knows) separates it from other implementations, is
+that it natively supports leftmost-first and leftmost-longest match semantics.
+Briefly, match semantics refer to the decision procedure by which searching
+will disambiguate matches when there are multiple to choose from:
+
+* **standard** match semantics emits matches as soon as they are detected by
+ the automaton. This is typically equivalent to the textbook non-overlapping
+ formulation of Aho-Corasick.
+* **leftmost-first** match semantics means that 1) the next match is the match
+ starting at the leftmost position and 2) among multiple matches starting at
+ the same leftmost position, the match corresponding to the pattern provided
+ first by the caller is reported.
+* **leftmost-longest** is like leftmost-first, except when there are multiple
+ matches starting at the same leftmost position, the pattern corresponding to
+ the longest match is returned.
+
+(The crate API documentation discusses these differences, with examples, in
+more depth on the `MatchKind` type.)
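+
+As a quick illustration using this crate's public API (the same example
+appears in the crate documentation), searching `abcd` for the patterns `b`,
+`abc` and `abcd` yields a different first match under each semantic:
+
+    use aho_corasick::{AhoCorasick, MatchKind};
+
+    let patterns = &["b", "abc", "abcd"];
+    let haystack = "abcd";
+
+    // Standard semantics (the default): report the first match seen.
+    let ac = AhoCorasick::new(patterns).unwrap();
+    let mat = ac.find(haystack).unwrap();
+    assert_eq!("b", &haystack[mat.start()..mat.end()]);
+
+    // Leftmost-first: among matches starting at the same position, the
+    // pattern given first by the caller wins ('abc' is listed before 'abcd').
+    let ac = AhoCorasick::builder()
+        .match_kind(MatchKind::LeftmostFirst)
+        .build(patterns)
+        .unwrap();
+    let mat = ac.find(haystack).unwrap();
+    assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+
+    // Leftmost-longest: the longest match at that position wins.
+    let ac = AhoCorasick::builder()
+        .match_kind(MatchKind::LeftmostLongest)
+        .build(patterns)
+        .unwrap();
+    let mat = ac.find(haystack).unwrap();
+    assert_eq!("abcd", &haystack[mat.start()..mat.end()]);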
+
+The reason why supporting these match semantics is important is because it
+gives the user more control over the match procedure. For example,
+leftmost-first permits users to implement match priority by simply putting the
+higher priority patterns first. Leftmost-longest, on the other hand, permits
+finding the longest possible match, which might be useful when trying to find
+words matching a dictionary. Additionally, regex engines often want to use
+Aho-Corasick as an optimization when searching for an alternation of literals.
+In order to preserve correct match semantics, regex engines typically can't use
+the standard textbook definition directly, since regex engines will implement
+either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics.
+
+Supporting leftmost semantics requires a couple key changes:
+
+* Constructing the Aho-Corasick automaton changes a bit in both how the trie is
+ constructed and how failure transitions are found. Namely, only a subset
+ of the failure transitions are added. Specifically, only the failure
+ transitions that either do not occur after a match or do occur after a match
+ but preserve that match are kept. (More details on this can be found in
+ `src/nfa/noncontiguous.rs`.)
+* The search algorithm changes slightly. Since we are looking for the leftmost
+ match, we cannot quit as soon as a match is detected. Instead, after a match
+ is detected, we must keep searching until either the end of the input or
+ until a dead state is seen. (Dead states are not used for standard match
+ semantics. Dead states mean that searching should stop after a match has been
+ found.)
+
+Most other implementations of Aho-Corasick do support leftmost match semantics,
+but they do it with more overhead at search time, or even worse, with a queue
+of matches and sophisticated hijinks to disambiguate the matches. While our
+construction algorithm becomes a bit more complicated, the correct match
+semantics fall out from the structure of the automaton itself.
+
+
+# Overlapping matches
+
+One of the nice properties of an Aho-Corasick automaton is that it can report
+all possible matches, even when they overlap with one another. In this mode,
+the match semantics don't matter, since all possible matches are reported.
+Overlapping searches work just like regular searches, except the state
+identifier at which the previous search left off is carried over to the next
+search, so that it can pick up where it left off. If there are additional
+matches at that state, then they are reported before resuming the search.
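+
+A small example of this using the crate's `find_overlapping_iter` routine
+(which, per below, requires the default standard match semantics):
+
+    use aho_corasick::AhoCorasick;
+
+    let ac = AhoCorasick::new(&["a", "ab", "abc"]).unwrap();
+    let matches: Vec<(usize, usize)> = ac
+        .find_overlapping_iter("abc")
+        .map(|m| (m.start(), m.end()))
+        .collect();
+    // Every pattern matches, even though the matches overlap.
+    assert_eq!(matches, vec![(0, 1), (0, 2), (0, 3)]);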
+
+Enabling leftmost-first or leftmost-longest match semantics causes the
+automaton to use a subset of all failure transitions, which means that
+overlapping searches cannot be used. Therefore, if leftmost match semantics are
+used, attempting to do an overlapping search will return an error (or panic
+when using the infallible APIs). Thus, to get overlapping searches, the caller
+must use the default standard match semantics. This behavior was chosen because
+there are only two alternatives, which were deemed worse:
+
+* Compile two automatons internally, one for standard semantics and one for
+ the semantics requested by the caller (if not standard).
+* Create a new type, distinct from the `AhoCorasick` type, which has different
+ capabilities based on the configuration options.
+
+The first is untenable because of the amount of memory used by the automaton.
+The second increases the complexity of the API too much by adding too many
+types that do similar things. It is conceptually much simpler to keep all
+searching isolated to a single type.
+
+
+# Stream searching
+
+Since Aho-Corasick is an automaton, it is possible to do partial searches on
+parts of the haystack, and then resume that search on subsequent pieces
+of the haystack. This is useful when the haystack you're trying to search is
+not stored contiguously in memory, or if one does not want to read the entire
+haystack into memory at once.
+
+Currently, only standard semantics are supported for stream searching. This is
+some of the more complicated code in this crate, and is something I would very
+much like to improve. In particular, it currently has the restriction that it
+must buffer at least enough of the haystack in memory in order to fit the
+longest possible match. The difficulty in getting stream searching right is
+that the implementation choices (such as the buffer size) often impact what the
+API looks like and what it's allowed to do.
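+
+From the API side, the idea is that anything implementing `std::io::Read` can
+be searched incrementally. A sketch using the crate's `stream_find_iter`
+routine (a byte slice stands in for, say, a file here):
+
+    use aho_corasick::AhoCorasick;
+
+    let ac = AhoCorasick::new(&["bar"]).unwrap();
+    let rdr: &[u8] = b"foo bar baz";
+    let mut spans = vec![];
+    for result in ac.stream_find_iter(rdr) {
+        // Each item is an io::Result<Match>, since reading can fail.
+        let mat = result.expect("I/O error while searching the stream");
+        spans.push((mat.start(), mat.end()));
+    }
+    assert_eq!(spans, vec![(4, 7)]);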
+
+
+# Prefilters
+
+In some cases, Aho-Corasick is not the fastest way to find matches of
+multiple patterns. Sometimes, the search can be accelerated using highly
+optimized SIMD routines. For example, consider searching the following
+patterns:
+
+    Sherlock
+    Moriarty
+    Watson
+
+It is plausible that it would be much faster to quickly look for occurrences of
+the leading bytes, `S`, `M` or `W`, before trying to start searching via the
+automaton. Indeed, this is exactly what this crate will do.
+
+When there are more than three distinct starting bytes, this crate will
+look for three distinct bytes occurring at any position in the patterns, while
+preferring bytes that are heuristically determined to be rare over others. For
+example:
+
+    Abuzz
+    Sanchez
+    Vasquez
+    Topaz
+    Waltz
+
+Here, we have more than 3 distinct starting bytes, but all of the patterns
+contain `z`, which is typically a rare byte. In this case, the prefilter will
+scan for `z`, back up a bit, and then execute the Aho-Corasick automaton.
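+
+A rough sketch of that idea using the `memchr` crate (the real prefilter
+machinery lives in `src/util/prefilter.rs` and is considerably more involved):
+
+    use memchr::memchr;
+
+    // Scan for a rare byte and report the position at which the full
+    // Aho-Corasick search should resume. Backing up by the maximum pattern
+    // length ensures a match containing the rare byte is not skipped.
+    fn next_candidate(
+        haystack: &[u8],
+        at: usize,
+        rare_byte: u8,
+        max_pattern_len: usize,
+    ) -> Option<usize> {
+        let found = at + memchr(rare_byte, &haystack[at..])?;
+        Some(found.saturating_sub(max_pattern_len.saturating_sub(1)))
+    }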
+
+If all of that fails, then a packed multiple substring algorithm will be
+attempted. Currently, the only algorithm available for this is Teddy, but more
+may be added in the future. Teddy is unlike the above prefilters in that it
+confirms its own matches, so when Teddy is active, it might not be necessary
+for Aho-Corasick to run at all. However, the current Teddy implementation
+only works on `x86_64` when SSSE3 or AVX2 are available, or on `aarch64`
+(using NEON), and moreover, only works _well_ when there are a small number
+of patterns (say, less than 100). Teddy also requires the haystack to be of a
+certain length (more than 16-34 bytes). When the haystack is shorter than that,
+Rabin-Karp is used instead. (See `src/packed/rabinkarp.rs`.)
+
+There is a more thorough description of Teddy at
+[`src/packed/teddy/README.md`](src/packed/teddy/README.md).
diff --git a/third_party/rust/aho-corasick/LICENSE-MIT b/third_party/rust/aho-corasick/LICENSE-MIT
new file mode 100644
index 0000000000..3b0a5dc09c
--- /dev/null
+++ b/third_party/rust/aho-corasick/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/third_party/rust/aho-corasick/README.md b/third_party/rust/aho-corasick/README.md
new file mode 100644
index 0000000000..c0f525fdc6
--- /dev/null
+++ b/third_party/rust/aho-corasick/README.md
@@ -0,0 +1,174 @@
+aho-corasick
+============
+A library for finding occurrences of many patterns at once with SIMD
+acceleration in some cases. This library provides multiple pattern
+search principally through an implementation of the
+[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
+which builds a finite state machine for executing searches in linear time.
+Features include case insensitive matching, overlapping matches, fast searching
+via SIMD and optional full DFA construction and search & replace in streams.
+
+[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
+[![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick)
+
+Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
+
+
+### Documentation
+
+https://docs.rs/aho-corasick
+
+
+### Usage
+
+Run `cargo add aho-corasick` to automatically add this crate as a dependency
+in your `Cargo.toml` file.
+
+
+### Example: basic searching
+
+This example shows how to search for occurrences of multiple patterns
+simultaneously. Each match includes the pattern that matched along with the
+byte offsets of the match.
+
+```rust
+use aho_corasick::{AhoCorasick, PatternID};
+
+let patterns = &["apple", "maple", "Snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::new(patterns).unwrap();
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+    matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+    (PatternID::must(1), 13, 18),
+    (PatternID::must(0), 28, 33),
+    (PatternID::must(2), 43, 50),
+]);
+```
+
+
+### Example: ASCII case insensitivity
+
+This is like the previous example, but matches `Snapple` case insensitively
+using `AhoCorasickBuilder`:
+
+```rust
+use aho_corasick::{AhoCorasick, PatternID};
+
+let patterns = &["apple", "maple", "snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::builder()
+    .ascii_case_insensitive(true)
+    .build(patterns)
+    .unwrap();
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+    matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+    (PatternID::must(1), 13, 18),
+    (PatternID::must(0), 28, 33),
+    (PatternID::must(2), 43, 50),
+]);
+```
+
+
+### Example: replacing matches in a stream
+
+This example shows how to execute a search and replace on a stream without
+loading the entire stream into memory first.
+
+```rust,ignore
+use aho_corasick::AhoCorasick;
+
+let patterns = &["fox", "brown", "quick"];
+let replace_with = &["sloth", "grey", "slow"];
+
+// In a real example, these might be `std::fs::File`s instead. All you need to
+// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
+let rdr = "The quick brown fox.";
+let mut wtr = vec![];
+
+let ac = AhoCorasick::new(patterns).unwrap();
+ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)
+    .expect("stream_replace_all failed");
+assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
+```
+
+
+### Example: finding the leftmost first match
+
+In the textbook description of Aho-Corasick, its formulation is typically
+structured such that it reports all possible matches, even when they overlap
+with another. In many cases, overlapping matches may not be desired, such as
+the case of finding all successive non-overlapping matches like you might with
+a standard regular expression.
+
+Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
+this doesn't always work in the expected way, since it will report matches as
+soon as they are seen. For example, consider matching the regex `Samwise|Sam`
+against the text `Samwise`. Most regex engines (that are Perl-like, or
+non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
+algorithm modified for reporting non-overlapping matches will report `Sam`.
+
+A novel contribution of this library is the ability to change the match
+semantics of Aho-Corasick (without additional search time overhead) such that
+`Samwise` is reported instead. For example, here's the standard approach:
+
+```rust
+use aho_corasick::AhoCorasick;
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::new(patterns).unwrap();
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+```
+
+And now here's the leftmost-first version, which matches how a Perl-like
+regex will work:
+
+```rust
+use aho_corasick::{AhoCorasick, MatchKind};
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::builder()
+    .match_kind(MatchKind::LeftmostFirst)
+    .build(patterns)
+    .unwrap();
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```
+
+In addition to leftmost-first semantics, this library also supports
+leftmost-longest semantics, which match the POSIX behavior of a regular
+expression alternation. See `MatchKind` in the docs for more details.
+
+
+### Minimum Rust version policy
+
+This crate's minimum supported `rustc` version is `1.60.0`.
+
+The current policy is that the minimum Rust version required to use this crate
+can be increased in minor version updates. For example, if `crate 1.0` requires
+Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
+1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
+version of Rust.
+
+In general, this crate will be conservative with respect to the minimum
+supported version of Rust.
+
+
+### FFI bindings
+
+* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/)
+  is a Python wrapper for this library.
+* [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go
+  wrapper for this library.
diff --git a/third_party/rust/aho-corasick/UNLICENSE b/third_party/rust/aho-corasick/UNLICENSE
new file mode 100644
index 0000000000..68a49daad8
--- /dev/null
+++ b/third_party/rust/aho-corasick/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/third_party/rust/aho-corasick/rustfmt.toml b/third_party/rust/aho-corasick/rustfmt.toml
new file mode 100644
index 0000000000..aa37a218b9
--- /dev/null
+++ b/third_party/rust/aho-corasick/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/third_party/rust/aho-corasick/src/ahocorasick.rs b/third_party/rust/aho-corasick/src/ahocorasick.rs
new file mode 100644
index 0000000000..2947627704
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/ahocorasick.rs
@@ -0,0 +1,2789 @@
+use core::{
+    fmt::Debug,
+    panic::{RefUnwindSafe, UnwindSafe},
+};
+
+use alloc::{string::String, sync::Arc, vec::Vec};
+
+use crate::{
+    automaton::{self, Automaton, OverlappingState},
+    dfa,
+    nfa::{contiguous, noncontiguous},
+    util::{
+        error::{BuildError, MatchError},
+        prefilter::Prefilter,
+        primitives::{PatternID, StateID},
+        search::{Anchored, Input, Match, MatchKind, StartKind},
+    },
+};
+
+/// An automaton for searching multiple strings in linear time.
+///
+/// The `AhoCorasick` type supports a few basic ways of constructing an
+/// automaton, with the default being [`AhoCorasick::new`]. However, there
+/// are a fair number of configurable options that can be set by using
+/// [`AhoCorasickBuilder`] instead. Such options include, but are not limited
+/// to, how matches are determined, simple case insensitivity, whether to use a
+/// DFA or not and various knobs for controlling the space-vs-time trade offs
+/// taken when building the automaton.
+///
+/// # Resource usage
+///
+/// Aho-Corasick automatons are always constructed in `O(p)` time, where
+/// `p` is the combined length of all patterns being searched. With that
+/// said, building an automaton can be fairly costly because of high constant
+/// factors, particularly when enabling the [DFA](AhoCorasickKind::DFA) option
+/// with [`AhoCorasickBuilder::kind`]. For this reason, it's generally a good
+/// idea to build an automaton once and reuse it as much as possible.
+///
+/// Aho-Corasick automatons can also use a fair bit of memory. To get
+/// a concrete idea of how much memory is being used, try using the
+/// [`AhoCorasick::memory_usage`] method.
+///
+/// To give a quick idea of the differences between Aho-Corasick
+/// implementations and their resource usage, here's a sample of construction
+/// times and heap memory used after building an automaton from 100,000
+/// randomly selected titles from Wikipedia:
+///
+/// * 99MB for a [`noncontiguous::NFA`] in 240ms.
+/// * 21MB for a [`contiguous::NFA`] in 275ms.
+/// * 1.6GB for a [`dfa::DFA`] in 1.88s.
+///
+/// (Note that the memory usage above reflects the size of each automaton and
+/// not peak memory usage. For example, building a contiguous NFA requires
+/// first building a noncontiguous NFA. Once the contiguous NFA is built, the
+/// noncontiguous NFA is freed.)
+///
+/// This experiment very strongly argues that a contiguous NFA is often the
+/// best balance in terms of resource usage. It takes a little longer to build,
+/// but its memory usage is quite small. Its search speed (not listed) is
+/// also often faster than a noncontiguous NFA, but a little slower than a
+/// DFA. Indeed, when no specific [`AhoCorasickKind`] is used (which is the
+/// default), a contiguous NFA is used in most cases.
+///
+/// The only "catch" to using a contiguous NFA is that, because of its variety
+/// of compression tricks, it may not be able to support automatons as large as
+/// what the noncontiguous NFA supports. In which case, building a contiguous
+/// NFA will fail and (by default) `AhoCorasick` will automatically fall
+/// back to a noncontiguous NFA. (This typically only happens when building
+/// automatons from millions of patterns.) Otherwise, the small additional time
+/// for building a contiguous NFA is almost certainly worth it.
+///
+/// # Cloning
+///
+/// The `AhoCorasick` type uses thread safe reference counting internally. It
+/// is guaranteed that it is cheap to clone.
+///
+/// # Search configuration
+///
+/// Most of the search routines accept anything that can be cheaply converted
+/// to an [`Input`]. This includes `&[u8]`, `&str` and `Input` itself.
+///
+/// # Construction failure
+///
+/// It is generally possible for building an Aho-Corasick automaton to fail.
+/// Construction can fail in generally one way: when the inputs provided are
+/// too big. Whether that's a pattern that is too long, too many patterns
+/// or some combination of both. A first approximation for the scale at which
+/// construction can fail is somewhere around "millions of patterns."
+///
+/// For that reason, if you're building an Aho-Corasick automaton from
+/// untrusted input (or input that doesn't have any reasonable bounds on its
+/// size), then it is strongly recommended to handle the possibility of an
+/// error.
+///
+/// If you're constructing an Aho-Corasick automaton from static or trusted
+/// data, then it is likely acceptable to panic (by calling `unwrap()` or
+/// `expect()`) if construction fails.
+///
+/// # Fallibility
+///
+/// The `AhoCorasick` type provides a number of methods for searching, as one
+/// might expect. Depending on how the Aho-Corasick automaton was built and
+/// depending on the search configuration, it is possible for a search to
+/// return an error. Since an error is _never_ dependent on the actual contents
+/// of the haystack, this type provides both infallible and fallible methods
+/// for searching. The infallible methods panic if an error occurs, and can be
+/// used for convenience and when you know the search will never return an
+/// error.
+///
+/// For example, the [`AhoCorasick::find_iter`] method is the infallible
+/// version of the [`AhoCorasick::try_find_iter`] method.
+///
+/// Examples of errors that can occur:
+///
+/// * Running a search that requires [`MatchKind::Standard`] semantics (such
+/// as a stream or overlapping search) with an automaton that was built with
+/// [`MatchKind::LeftmostFirst`] or [`MatchKind::LeftmostLongest`] semantics.
+/// * Running an anchored search with an automaton that only supports
+/// unanchored searches. (By default, `AhoCorasick` only supports unanchored
+/// searches. But this can be toggled with [`AhoCorasickBuilder::start_kind`].)
+/// * Running an unanchored search with an automaton that only supports
+/// anchored searches.
+///
+/// The common thread between the different types of errors is that they are
+/// all rooted in the automaton construction and search configurations. If
+/// those configurations are a static property of your program, then it is
+/// reasonable to call infallible routines since you know an error will never
+/// occur. And if one _does_ occur, then it's a bug in your program.
+///
+/// To re-iterate, if the patterns, build or search configuration come from
+/// user or untrusted data, then you should handle errors at build or search
+/// time. If only the haystack comes from user or untrusted data, then there
+/// should be no need to handle errors anywhere and it is generally encouraged
+/// to `unwrap()` (or `expect()`) both build and search time calls.
+///
+/// # Examples
+///
+/// This example shows how to search for occurrences of multiple patterns
+/// simultaneously in a case insensitive fashion. Each match includes the
+/// pattern that matched along with the byte offsets of the match.
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, PatternID};
+///
+/// let patterns = &["apple", "maple", "snapple"];
+/// let haystack = "Nobody likes maple in their apple flavored Snapple.";
+///
+/// let ac = AhoCorasick::builder()
+/// .ascii_case_insensitive(true)
+/// .build(patterns)
+/// .unwrap();
+/// let mut matches = vec![];
+/// for mat in ac.find_iter(haystack) {
+/// matches.push((mat.pattern(), mat.start(), mat.end()));
+/// }
+/// assert_eq!(matches, vec![
+/// (PatternID::must(1), 13, 18),
+/// (PatternID::must(0), 28, 33),
+/// (PatternID::must(2), 43, 50),
+/// ]);
+/// ```
+///
+/// This example shows how to replace matches with some other string:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let patterns = &["fox", "brown", "quick"];
+/// let haystack = "The quick brown fox.";
+/// let replace_with = &["sloth", "grey", "slow"];
+///
+/// let ac = AhoCorasick::new(patterns).unwrap();
+/// let result = ac.replace_all(haystack, replace_with);
+/// assert_eq!(result, "The slow grey sloth.");
+/// ```
+#[derive(Clone)]
+pub struct AhoCorasick {
+ /// The underlying Aho-Corasick automaton. It's one of
+ /// nfa::noncontiguous::NFA, nfa::contiguous::NFA or dfa::DFA.
+ aut: Arc<dyn AcAutomaton>,
+ /// The specific Aho-Corasick kind chosen. This makes it possible to
+ /// inspect any `AhoCorasick` and know what kind of search strategy it
+ /// uses.
+ kind: AhoCorasickKind,
+ /// The start kind of this automaton as configured by the caller.
+ ///
+ /// We don't really *need* to put this here, since the underlying automaton
+ /// will correctly return errors if the caller requests an unsupported
+ /// search type. But we do keep this here for API behavior consistency.
+ /// Namely, the NFAs in this crate support both unanchored and anchored
+ /// searches unconditionally. There's no way to disable one or the other.
+ /// They always both work. But the DFA in this crate specifically only
+ /// supports both unanchored and anchored searches if it's configured to
+ /// do so. Why? Because for the DFA, supporting both essentially requires
+ /// two copies of the transition table: one generated by following failure
+ /// transitions from the original NFA and one generated by not following
+ /// those failure transitions.
+ ///
+ /// So why record the start kind here? Well, consider what happens
+ /// when no specific 'AhoCorasickKind' is selected by the caller and
+ /// 'StartKind::Unanchored' is used (both are the default). It *might*
+ /// result in using a DFA or it might pick an NFA. If it picks an NFA, the
+ /// caller would then be able to run anchored searches, even though the
+ /// caller only asked for support for unanchored searches. Maybe that's
+ /// fine, but what if the DFA was chosen instead? Oops, the caller would
+ /// get an error.
+ ///
+ /// Basically, it seems bad to return an error or not based on some
+ /// internal implementation choice. So we smooth things out and ensure
+ /// anchored searches *always* report an error when only unanchored support
+ /// was asked for (and vice versa), even if the underlying automaton
+ /// supports it.
+ start_kind: StartKind,
+}
+
+/// Convenience constructors for an Aho-Corasick searcher. To configure the
+/// searcher, use an [`AhoCorasickBuilder`] instead.
+impl AhoCorasick {
+ /// Create a new Aho-Corasick automaton using the default configuration.
+ ///
+ /// The default configuration optimizes for less space usage, but at the
+ /// expense of longer search times. To change the configuration, use
+ /// [`AhoCorasickBuilder`].
+ ///
+ /// This uses the default [`MatchKind::Standard`] match semantics, which
+ /// reports a match as soon as it is found. This corresponds to the
+ /// standard match semantics supported by textbook descriptions of the
+ /// Aho-Corasick algorithm.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, PatternID};
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap();
+ /// assert_eq!(
+ /// Some(PatternID::must(1)),
+ /// ac.find("xxx bar xxx").map(|m| m.pattern()),
+ /// );
+ /// ```
+ pub fn new<I, P>(patterns: I) -> Result<AhoCorasick, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ AhoCorasickBuilder::new().build(patterns)
+ }
+
+ /// A convenience method for returning a new Aho-Corasick builder.
+ ///
+ /// This usually permits one to just import the `AhoCorasick` type.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Match, MatchKind};
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(&["samwise", "sam"])
+ /// .unwrap();
+ /// assert_eq!(Some(Match::must(0, 0..7)), ac.find("samwise"));
+ /// ```
+ pub fn builder() -> AhoCorasickBuilder {
+ AhoCorasickBuilder::new()
+ }
+}
+
+/// Infallible search routines. These APIs panic when the underlying search
+/// would otherwise fail. Infallible routines are useful because the errors are
+/// a result of both search-time configuration and what configuration is used
+/// to build the Aho-Corasick searcher. Both of these things are not usually
+/// the result of user input, and thus, an error is typically indicative of a
+/// programmer error. In cases where callers want errors instead of panics, use
+/// the corresponding `try` method in the section below.
+impl AhoCorasick {
+ /// Returns true if and only if this automaton matches the haystack at any
+ /// position.
+ ///
+ /// `input` may be any type that is cheaply convertible to an `Input`. This
+ /// includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// Aside from convenience, when `AhoCorasick` was built with
+ /// leftmost-first or leftmost-longest semantics, this might result in a
+ /// search that visits less of the haystack than [`AhoCorasick::find`]
+ /// would otherwise. (For standard semantics, matches are always
+ /// immediately returned once they are seen, so there is no way for this to
+ /// do less work in that case.)
+ ///
+ /// Note that there is no corresponding fallible routine for this method.
+ /// If you need a fallible version of this, then [`AhoCorasick::try_find`]
+ /// can be used with [`Input::earliest`] enabled.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[
+ /// "foo", "bar", "quux", "baz",
+ /// ]).unwrap();
+ /// assert!(ac.is_match("xxx bar xxx"));
+ /// assert!(!ac.is_match("xxx qux xxx"));
+ /// ```
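+ ///
+ /// As a sketch of the fallible alternative mentioned above (using
+ /// [`AhoCorasick::try_find`] with [`Input::earliest`] enabled, since there
+ /// is no dedicated `try_is_match` method):
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input};
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
+ /// let input = Input::new("xxx bar xxx").earliest(true);
+ /// assert!(ac.try_find(input)?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```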
+ pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
+ self.aut
+ .try_find(&input.into().earliest(true))
+ .expect("AhoCorasick::try_find is not expected to fail")
+ .is_some()
+ }
+
+ /// Returns the location of the first match according to the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// `input` may be any type that is cheaply convertible to an `Input`. This
+ /// includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// This is the infallible version of [`AhoCorasick::try_find`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_find`] would return an error.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Now with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// And finally, leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// ```
+ ///
+ /// # Example: configuring a search
+ ///
+ /// Because this method accepts anything that can be turned into an
+ /// [`Input`], it's possible to provide an `Input` directly in order to
+ /// configure the search. In this example, we show how to use the
+ /// `earliest` option to force the search to return as soon as it knows
+ /// a match has occurred.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(Input::new(haystack).earliest(true))
+ /// .expect("should have a match");
+ /// // The correct leftmost-longest match here is 'abcd', but since we
+ /// // told the search to quit as soon as it knows a match has occurred,
+ /// // we get a different match back.
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
+ self.try_find(input)
+ .expect("AhoCorasick::try_find is not expected to fail")
+ }
+
+ /// Returns the location of the first overlapping match in the given
+ /// input with respect to the current state of the underlying searcher.
+ ///
+ /// `input` may be any type that is cheaply convertible to an `Input`. This
+ /// includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// Overlapping searches do not report matches in their return value.
+ /// Instead, matches can be accessed via [`OverlappingState::get_match`]
+ /// after a search call.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_find_overlapping`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_find_overlapping`] would
+ /// return an error. For example, when the Aho-Corasick searcher
+ /// doesn't support overlapping searches. (Only searchers built with
+ /// [`MatchKind::Standard`] semantics support overlapping searches.)
+ ///
+ /// # Example
+ ///
+ /// This shows how we can repeatedly call an overlapping search without
+ /// ever needing to explicitly re-slice the haystack. Overlapping search
+ /// works this way because searches depend on state saved during the
+ /// previous search.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// automaton::OverlappingState,
+ /// AhoCorasick, Input, Match,
+ /// };
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match());
+ ///
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match());
+ ///
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match());
+ ///
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match());
+ ///
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match());
+ ///
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match());
+ ///
+ /// // No more matches to be found.
+ /// ac.find_overlapping(haystack, &mut state);
+ /// assert_eq!(None, state.get_match());
+ /// ```
+ pub fn find_overlapping<'h, I: Into<Input<'h>>>(
+ &self,
+ input: I,
+ state: &mut OverlappingState,
+ ) {
+ self.try_find_overlapping(input, state).expect(
+ "AhoCorasick::try_find_overlapping is not expected to fail",
+ )
+ }
+
+ /// Returns an iterator of non-overlapping matches, using the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// `input` may be any type that is cheaply convertible to an `Input`. This
+ /// includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// This is the infallible version of [`AhoCorasick::try_find_iter`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_find_iter`] would return an error.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage, with standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns)
+ /// .unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// ], matches);
+ /// ```
+ ///
+ /// Now with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(0),
+ /// ], matches);
+ /// ```
+ ///
+ /// And finally, leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .find_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(1),
+ /// ], matches);
+ /// ```
+ pub fn find_iter<'a, 'h, I: Into<Input<'h>>>(
+ &'a self,
+ input: I,
+ ) -> FindIter<'a, 'h> {
+ self.try_find_iter(input)
+ .expect("AhoCorasick::try_find_iter is not expected to fail")
+ }
+
+ /// Returns an iterator of overlapping matches. Stated differently, this
+ /// returns an iterator of all possible matches at every position.
+ ///
+ /// `input` may be any type that is cheaply convertible to an `Input`. This
+ /// includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_find_overlapping_iter`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_find_overlapping_iter`] would return
+ /// an error. For example, when the Aho-Corasick searcher is built with
+ /// either leftmost-first or leftmost-longest match semantics. Stated
+ /// differently, overlapping searches require one to build the searcher
+ /// with [`MatchKind::Standard`] (it is the default).
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .find_overlapping_iter(haystack)
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(2),
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// PatternID::must(0),
+ /// PatternID::must(1),
+ /// ], matches);
+ /// ```
+ pub fn find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>(
+ &'a self,
+ input: I,
+ ) -> FindOverlappingIter<'a, 'h> {
+ self.try_find_overlapping_iter(input).expect(
+ "AhoCorasick::try_find_overlapping_iter is not expected to fail",
+ )
+ }
+
+ /// Replace all matches with a corresponding value in the `replace_with`
+ /// slice given. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::find_iter`].
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// This is the infallible version of [`AhoCorasick::try_replace_all`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_replace_all`] would return an
+ /// error.
+ ///
+ /// This also panics when `replace_with.len()` does not equal
+ /// [`AhoCorasick::patterns_len`].
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
+ /// assert_eq!("x the z to the xage", result);
+ /// ```
+ pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
+ where
+ B: AsRef<str>,
+ {
+ self.try_replace_all(haystack, replace_with)
+ .expect("AhoCorasick::try_replace_all is not expected to fail")
+ }
+
+ /// Replace all matches using raw bytes with a corresponding value in the
+ /// `replace_with` slice given. Matches correspond to the same matches as
+ /// reported by [`AhoCorasick::find_iter`].
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_replace_all_bytes`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_replace_all_bytes`] would return an
+ /// error.
+ ///
+ /// This also panics when `replace_with.len()` does not equal
+ /// [`AhoCorasick::patterns_len`].
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
+ /// assert_eq!(b"x the z to the xage".to_vec(), result);
+ /// ```
+ pub fn replace_all_bytes<B>(
+ &self,
+ haystack: &[u8],
+ replace_with: &[B],
+ ) -> Vec<u8>
+ where
+ B: AsRef<[u8]>,
+ {
+ self.try_replace_all_bytes(haystack, replace_with)
+ .expect("AhoCorasick::try_replace_all_bytes should not fail")
+ }
+
+ /// Replace all matches using a closure called on each match.
+ /// Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::find_iter`].
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a string buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// Note that any matches with boundaries that don't fall on a valid UTF-8
+ /// boundary are silently skipped.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_replace_all_with`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_replace_all_with`] would return an
+ /// error.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mut result = String::new();
+ /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
+ /// dst.push_str(&mat.pattern().as_usize().to_string());
+ /// true
+ /// });
+ /// assert_eq!("0 the 2 to the 0age", result);
+ /// ```
+ ///
+ /// Stopping the replacement by returning `false` (continued from the
+ /// example above):
+ ///
+ /// ```
+ /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ /// # let patterns = &["append", "appendage", "app"];
+ /// # let haystack = "append the app to the appendage";
+ /// # let ac = AhoCorasick::builder()
+ /// # .match_kind(MatchKind::LeftmostFirst)
+ /// # .build(patterns)
+ /// # .unwrap();
+ /// let mut result = String::new();
+ /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
+ /// dst.push_str(&mat.pattern().as_usize().to_string());
+ /// mat.pattern() != PatternID::must(2)
+ /// });
+ /// assert_eq!("0 the 2 to the appendage", result);
+ /// ```
+ pub fn replace_all_with<F>(
+ &self,
+ haystack: &str,
+ dst: &mut String,
+ replace_with: F,
+ ) where
+ F: FnMut(&Match, &str, &mut String) -> bool,
+ {
+ self.try_replace_all_with(haystack, dst, replace_with)
+ .expect("AhoCorasick::try_replace_all_with should not fail")
+ }
+
+ /// Replace all matches using raw bytes with a closure called on each
+ /// match. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::find_iter`].
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a byte buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_replace_all_with_bytes`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_replace_all_with_bytes`] would
+ /// return an error.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mut result = vec![];
+ /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+ /// dst.extend(mat.pattern().as_usize().to_string().bytes());
+ /// true
+ /// });
+ /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
+ /// ```
+ ///
+ /// Stopping the replacement by returning `false` (continued from the
+ /// example above):
+ ///
+ /// ```
+ /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ /// # let patterns = &["append", "appendage", "app"];
+ /// # let haystack = b"append the app to the appendage";
+ /// # let ac = AhoCorasick::builder()
+ /// # .match_kind(MatchKind::LeftmostFirst)
+ /// # .build(patterns)
+ /// # .unwrap();
+ /// let mut result = vec![];
+ /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+ /// dst.extend(mat.pattern().as_usize().to_string().bytes());
+ /// mat.pattern() != PatternID::must(2)
+ /// });
+ /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
+ /// ```
+ pub fn replace_all_with_bytes<F>(
+ &self,
+ haystack: &[u8],
+ dst: &mut Vec<u8>,
+ replace_with: F,
+ ) where
+ F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+ {
+ self.try_replace_all_with_bytes(haystack, dst, replace_with)
+ .expect("AhoCorasick::try_replace_all_with_bytes should not fail")
+ }
+
+ /// Returns an iterator of non-overlapping matches in the given
+ /// stream. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::find_iter`].
+ ///
+ /// The matches yielded by this iterator use absolute position offsets in
+ /// the stream given, where the first byte has index `0`. Matches are
+ /// yielded until the stream is exhausted.
+ ///
+ /// Each item yielded by the iterator is a `Result<Match,
+ /// std::io::Error>`, where an error is yielded if there was a problem
+ /// reading from the reader given.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_stream_find_iter`]. Note that both methods return
+ /// iterators that produce `Result` values. The difference is that this
+ /// routine panics if _construction_ of the iterator failed. The `Result`
+ /// values yielded by the iterator come from whether the given reader returns
+ /// an error or not during the search.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_stream_find_iter`] would return
+ /// an error. For example, when the Aho-Corasick searcher doesn't support
+ /// stream searches. (Only searchers built with [`MatchKind::Standard`]
+ /// semantics support stream searches.)
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut matches = vec![];
+ /// for result in ac.stream_find_iter(haystack.as_bytes()) {
+ /// let mat = result?;
+ /// matches.push(mat.pattern());
+ /// }
+ /// assert_eq!(vec![
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// ], matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "std")]
+ pub fn stream_find_iter<'a, R: std::io::Read>(
+ &'a self,
+ rdr: R,
+ ) -> StreamFindIter<'a, R> {
+ self.try_stream_find_iter(rdr)
+ .expect("AhoCorasick::try_stream_find_iter should not fail")
+ }
+}
+
+/// Fallible search routines. These APIs return an error in cases where the
+/// infallible routines would panic.
+impl AhoCorasick {
+ /// Returns the location of the first match according to the match
+ /// semantics that this automaton was constructed with, and according
+ /// to the given `Input` configuration.
+ ///
+ /// This is the fallible version of [`AhoCorasick::find`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the given `Input` configuration.
+ ///
+ /// For example, if the Aho-Corasick searcher only supports anchored
+ /// searches or only supports unanchored searches, then providing an
+ /// `Input` that requests an anchored (or unanchored) search when it isn't
+ /// supported would result in an error.
+ ///
+ /// # Example: leftmost-first searching
+ ///
+ /// Basic usage with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind, Input};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "foo abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.try_find(haystack)?.expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: anchored leftmost-first searching
+ ///
+ /// This shows how to anchor the search, so that even if the haystack
+ /// contains a match somewhere, a match won't be reported unless one can
+ /// be found that starts at the beginning of the search:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "foo abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .start_kind(StartKind::Anchored)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ /// assert_eq!(None, ac.try_find(input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// If the beginning of the search is changed to where a match begins, then
+ /// it will be found:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "foo abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .start_kind(StartKind::Anchored)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).range(4..).anchored(Anchored::Yes);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: earliest leftmost-first searching
+ ///
+ /// This shows how to run an "earliest" search even when the Aho-Corasick
+ /// searcher was compiled with leftmost-first match semantics. In this
+ /// case, the search is stopped as soon as it is known that a match has
+ /// occurred, even if it doesn't correspond to the leftmost-first match.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "foo abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).earliest(true);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_find<'h, I: Into<Input<'h>>>(
+ &self,
+ input: I,
+ ) -> Result<Option<Match>, MatchError> {
+ let input = input.into();
+ enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
+ self.aut.try_find(&input)
+ }
+
+ /// Returns the location of the first overlapping match in the given
+ /// input with respect to the current state of the underlying searcher.
+ ///
+ /// Overlapping searches do not report matches in their return value.
+ /// Instead, matches can be accessed via [`OverlappingState::get_match`]
+ /// after a search call.
+ ///
+ /// This is the fallible version of [`AhoCorasick::find_overlapping`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the given `Input` configuration or if overlapping search is not
+ /// supported.
+ ///
+ /// One example is that only Aho-Corasick searchers built with
+ /// [`MatchKind::Standard`] semantics support overlapping searches. Using
+ /// any other match semantics will result in this returning an error.
+ ///
+ /// # Example: basic usage
+ ///
+ /// This shows how we can repeatedly call an overlapping search without
+ /// ever needing to explicitly re-slice the haystack. Overlapping search
+ /// works this way because searches depend on state saved during the
+ /// previous search.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// automaton::OverlappingState,
+ /// AhoCorasick, Input, Match,
+ /// };
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match());
+ ///
+ /// // No more matches to be found.
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(None, state.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: implementing your own overlapping iteration
+ ///
+ /// The previous example can be easily adapted to implement your own
+ /// iteration by repeatedly calling `try_find_overlapping` until either
+ /// an error occurs or no more matches are reported.
+ ///
+ /// This is effectively equivalent to the iterator returned by
+ /// [`AhoCorasick::try_find_overlapping_iter`], with the only difference
+ /// being that the iterator checks for errors before construction and
+ /// absolves the caller of needing to check for errors on every search
+ /// call. (Indeed, if the first `try_find_overlapping` call succeeds and
+ /// the same `Input` is given to subsequent calls, then all subsequent
+ /// calls are guaranteed to succeed.)
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// automaton::OverlappingState,
+ /// AhoCorasick, Input, Match,
+ /// };
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ ///
+ /// loop {
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// let mat = match state.get_match() {
+ /// None => break,
+ /// Some(mat) => mat,
+ /// };
+ /// matches.push(mat);
+ /// }
+ /// let expected = vec![
+ /// Match::must(2, 0..3),
+ /// Match::must(0, 0..6),
+ /// Match::must(2, 11..14),
+ /// Match::must(2, 22..25),
+ /// Match::must(0, 22..28),
+ /// Match::must(1, 22..31),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: anchored iteration
+ ///
+ /// The previous example can also be adapted to implement
+ /// iteration over all anchored matches. In particular,
+ /// [`AhoCorasick::try_find_overlapping_iter`] does not support this
+ /// because it isn't totally clear what the match semantics ought to be.
+ ///
+ /// In this example, we will find all overlapping matches that start at
+ /// the beginning of our search.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// automaton::OverlappingState,
+ /// AhoCorasick, Anchored, Input, Match, StartKind,
+ /// };
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .start_kind(StartKind::Anchored)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ ///
+ /// loop {
+ /// ac.try_find_overlapping(input.clone(), &mut state)?;
+ /// let mat = match state.get_match() {
+ /// None => break,
+ /// Some(mat) => mat,
+ /// };
+ /// matches.push(mat);
+ /// }
+ /// let expected = vec![
+ /// Match::must(2, 0..3),
+ /// Match::must(0, 0..6),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_find_overlapping<'h, I: Into<Input<'h>>>(
+ &self,
+ input: I,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ let input = input.into();
+ enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
+ self.aut.try_find_overlapping(&input, state)
+ }
+
+ /// Returns an iterator of non-overlapping matches, using the match
+ /// semantics that this automaton was constructed with.
+ ///
+ /// This is the fallible version of [`AhoCorasick::find_iter`].
+ ///
+ /// Note that the error returned by this method occurs during construction
+ /// of the iterator. The iterator itself yields `Match` values. That is,
+ /// once the iterator is constructed, the iteration itself will never
+ /// report an error.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the given `Input` configuration.
+ ///
+ /// For example, if the Aho-Corasick searcher only supports anchored
+ /// searches or only supports unanchored searches, then providing an
+ /// `Input` that requests an anchored (or unanchored) search when it isn't
+ /// supported would result in an error.
+ ///
+ /// # Example: leftmost-first searching
+ ///
+ /// Basic usage with leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .try_find_iter(Input::new(haystack))?
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(0),
+ /// ], matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: anchored leftmost-first searching
+ ///
+ /// This shows how to anchor the search, such that all matches must begin
+ /// at the starting location of the search. For an iterator, an anchored
+ /// search implies that all matches are adjacent.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// AhoCorasick, Anchored, Input, MatchKind, PatternID, StartKind,
+ /// };
+ ///
+ /// let patterns = &["foo", "bar", "quux"];
+ /// let haystack = "fooquuxbar foo";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .start_kind(StartKind::Anchored)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .try_find_iter(Input::new(haystack).anchored(Anchored::Yes))?
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(1),
+ /// // The final 'foo' is not found because it is not adjacent to the
+ /// // 'bar' match. It needs to be adjacent because our search is
+ /// // anchored.
+ /// ], matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_find_iter<'a, 'h, I: Into<Input<'h>>>(
+ &'a self,
+ input: I,
+ ) -> Result<FindIter<'a, 'h>, MatchError> {
+ let input = input.into();
+ enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
+ Ok(FindIter(self.aut.try_find_iter(input)?))
+ }
+
+ /// Returns an iterator of overlapping matches.
+ ///
+ /// This is the fallible version of [`AhoCorasick::find_overlapping_iter`].
+ ///
+ /// Note that the error returned by this method occurs during construction
+ /// of the iterator. The iterator itself yields `Match` values. That is,
+ /// once the iterator is constructed, the iteration itself will never
+ /// report an error.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the given `Input` configuration or does not support overlapping
+ /// searches.
+ ///
+ /// One example is that only Aho-Corasick searchers built with
+ /// [`MatchKind::Standard`] semantics support overlapping searches. Using
+ /// any other match semantics will result in this returning an error.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .try_find_overlapping_iter(Input::new(haystack))?
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(2),
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// PatternID::must(0),
+ /// PatternID::must(1),
+ /// ], matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: anchored overlapping search returns an error
+ ///
+ /// It isn't clear what the match semantics for anchored overlapping
+ /// iterators *ought* to be, so currently an error is returned. Callers
+ /// may use [`AhoCorasick::try_find_overlapping`] to implement their own
+ /// semantics if desired.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Anchored, Input, StartKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "appendappendage app";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .start_kind(StartKind::Anchored)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ /// assert!(ac.try_find_overlapping_iter(input).is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>(
+ &'a self,
+ input: I,
+ ) -> Result<FindOverlappingIter<'a, 'h>, MatchError> {
+ let input = input.into();
+ enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
+ Ok(FindOverlappingIter(self.aut.try_find_overlapping_iter(input)?))
+ }
+
+ /// Replace all matches with a corresponding value in the `replace_with`
+ /// slice given. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal
+ /// [`AhoCorasick::patterns_len`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this replacement routine always does an unanchored search.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let result = ac.try_replace_all(haystack, &["x", "y", "z"])?;
+ /// assert_eq!("x the z to the xage", result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_replace_all<B>(
+ &self,
+ haystack: &str,
+ replace_with: &[B],
+ ) -> Result<String, MatchError>
+ where
+ B: AsRef<str>,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)?;
+ self.aut.try_replace_all(haystack, replace_with)
+ }
+
+ /// Replace all matches using raw bytes with a corresponding value in the
+ /// `replace_with` slice given. Matches correspond to the same matches as
+ /// reported by [`AhoCorasick::try_find_iter`].
+ ///
+ /// Replacements are determined by the index of the matching pattern.
+ /// For example, if the pattern with index `2` is found, then it is
+ /// replaced by `replace_with[2]`.
+ ///
+ /// This is the fallible version of [`AhoCorasick::replace_all_bytes`].
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal
+ /// [`AhoCorasick::patterns_len`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this replacement routine always does an unanchored search.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let result = ac.try_replace_all_bytes(haystack, &["x", "y", "z"])?;
+ /// assert_eq!(b"x the z to the xage".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_replace_all_bytes<B>(
+ &self,
+ haystack: &[u8],
+ replace_with: &[B],
+ ) -> Result<Vec<u8>, MatchError>
+ where
+ B: AsRef<[u8]>,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)?;
+ self.aut.try_replace_all_bytes(haystack, replace_with)
+ }
+
+ /// Replace all matches using a closure called on each match.
+ /// Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a string buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// Note that any matches with boundaries that don't fall on a valid UTF-8
+ /// boundary are silently skipped.
+ ///
+ /// This is the fallible version of [`AhoCorasick::replace_all_with`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this replacement routine always does an unanchored search.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mut result = String::new();
+ /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| {
+ /// dst.push_str(&mat.pattern().as_usize().to_string());
+ /// true
+ /// })?;
+ /// assert_eq!("0 the 2 to the 0age", result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Stopping the replacement by returning `false` (continued from the
+ /// example above):
+ ///
+ /// ```
+ /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ /// # let patterns = &["append", "appendage", "app"];
+ /// # let haystack = "append the app to the appendage";
+ /// # let ac = AhoCorasick::builder()
+ /// # .match_kind(MatchKind::LeftmostFirst)
+ /// # .build(patterns)
+ /// # .unwrap();
+ /// let mut result = String::new();
+ /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| {
+ /// dst.push_str(&mat.pattern().as_usize().to_string());
+ /// mat.pattern() != PatternID::must(2)
+ /// })?;
+ /// assert_eq!("0 the 2 to the appendage", result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_replace_all_with<F>(
+ &self,
+ haystack: &str,
+ dst: &mut String,
+ replace_with: F,
+ ) -> Result<(), MatchError>
+ where
+ F: FnMut(&Match, &str, &mut String) -> bool,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)?;
+ self.aut.try_replace_all_with(haystack, dst, replace_with)
+ }
+
+ /// Replace all matches using raw bytes with a closure called on each
+ /// match. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and a byte buffer with which to write the replaced text
+ /// (if any). If the closure returns `true`, then it continues to the next
+ /// match. If the closure returns `false`, then searching is stopped.
+ ///
+ /// This is the fallible version of
+ /// [`AhoCorasick::replace_all_with_bytes`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this replacement routine always does an unanchored search.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = b"append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mut result = vec![];
+ /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+ /// dst.extend(mat.pattern().as_usize().to_string().bytes());
+ /// true
+ /// })?;
+ /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Stopping the replacement by returning `false` (continued from the
+ /// example above):
+ ///
+ /// ```
+ /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ /// # let patterns = &["append", "appendage", "app"];
+ /// # let haystack = b"append the app to the appendage";
+ /// # let ac = AhoCorasick::builder()
+ /// # .match_kind(MatchKind::LeftmostFirst)
+ /// # .build(patterns)
+ /// # .unwrap();
+ /// let mut result = vec![];
+ /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+ /// dst.extend(mat.pattern().as_usize().to_string().bytes());
+ /// mat.pattern() != PatternID::must(2)
+ /// })?;
+ /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_replace_all_with_bytes<F>(
+ &self,
+ haystack: &[u8],
+ dst: &mut Vec<u8>,
+ replace_with: F,
+ ) -> Result<(), MatchError>
+ where
+ F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)?;
+ self.aut.try_replace_all_with_bytes(haystack, dst, replace_with)
+ }
+
+ /// Returns an iterator of non-overlapping matches in the given
+ /// stream. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// The matches yielded by this iterator use absolute position offsets in
+ /// the stream given, where the first byte has index `0`. Matches are
+ /// yielded until the stream is exhausted.
+ ///
+ /// Each item yielded by the iterator is a `Result<Match,
+ /// std::io::Error>`, where an error is yielded if there was a problem
+ /// reading from the reader given.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible.
+ ///
+ /// This is the fallible version of [`AhoCorasick::stream_find_iter`].
+ /// Note that both methods return iterators that produce `Result` values.
+ /// The difference is that this routine returns an error if _construction_
+ /// of the iterator failed. The `Result` values yielded by the iterator
+ /// come from whether the given reader returns an error or not during the
+ /// search.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this stream searching routine always does an unanchored search.
+ ///
+ /// This also returns an error if the searcher does not support stream
+ /// searches. Only searchers built with [`MatchKind::Standard`] semantics
+ /// support stream searches.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut matches = vec![];
+ /// for result in ac.try_stream_find_iter(haystack.as_bytes())? {
+ /// let mat = result?;
+ /// matches.push(mat.pattern());
+ /// }
+ /// assert_eq!(vec![
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// PatternID::must(2),
+ /// ], matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "std")]
+ pub fn try_stream_find_iter<'a, R: std::io::Read>(
+ &'a self,
+ rdr: R,
+ ) -> Result<StreamFindIter<'a, R>, MatchError> {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)?;
+ self.aut.try_stream_find_iter(rdr).map(StreamFindIter)
+ }
+
+ /// Search for and replace all matches of this automaton in
+ /// the given reader, and write the replacements to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// Replacements are determined by the index of the matching pattern. For
+ /// example, if the pattern with index `2` is found, then it is replaced by
+ /// `replace_with[2]`.
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Note that there is currently no infallible version of this routine.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal
+ /// [`AhoCorasick::patterns_len`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this stream searching routine always does an unanchored search.
+ ///
+ /// This also returns an error if the searcher does not support stream
+ /// searches. Only searchers built with [`MatchKind::Standard`] semantics
+ /// support stream searches.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ /// let replace_with = &["sloth", "grey", "slow"];
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut result = vec![];
+ /// ac.try_stream_replace_all(
+ /// haystack.as_bytes(),
+ /// &mut result,
+ /// replace_with,
+ /// )?;
+ /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "std")]
+ pub fn try_stream_replace_all<R, W, B>(
+ &self,
+ rdr: R,
+ wtr: W,
+ replace_with: &[B],
+ ) -> Result<(), std::io::Error>
+ where
+ R: std::io::Read,
+ W: std::io::Write,
+ B: AsRef<[u8]>,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)
+ .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+ self.aut.try_stream_replace_all(rdr, wtr, replace_with)
+ }
+
+ /// Search the given reader and replace all matches of this automaton
+ /// using the given closure. The result is written to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and the writer with which to write the replaced text (if any).
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Note that there is currently no infallible version of this routine.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this stream searching routine always does an unanchored search.
+ ///
+ /// This also returns an error if the searcher does not support stream
+ /// searches. Only searchers built with [`MatchKind::Standard`] semantics
+ /// support stream searches.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use std::io::Write;
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut result = vec![];
+ /// ac.try_stream_replace_all_with(
+ /// haystack.as_bytes(),
+ /// &mut result,
+ /// |mat, _, wtr| {
+ /// wtr.write_all(mat.pattern().as_usize().to_string().as_bytes())
+ /// },
+ /// )?;
+ /// assert_eq!(b"The 2 1 0.".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "std")]
+ pub fn try_stream_replace_all_with<R, W, F>(
+ &self,
+ rdr: R,
+ wtr: W,
+ replace_with: F,
+ ) -> Result<(), std::io::Error>
+ where
+ R: std::io::Read,
+ W: std::io::Write,
+ F: FnMut(&Match, &[u8], &mut W) -> Result<(), std::io::Error>,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)
+ .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+ self.aut.try_stream_replace_all_with(rdr, wtr, replace_with)
+ }
+}
+
+/// Routines for querying information about the Aho-Corasick automaton.
+impl AhoCorasick {
+ /// Returns the kind of the Aho-Corasick automaton used by this searcher.
+ ///
+ /// Knowing the Aho-Corasick kind is principally useful for diagnostic
+ /// purposes. In particular, if no specific kind was given to
+ /// [`AhoCorasickBuilder::kind`], then one is automatically chosen and
+ /// this routine will report which one.
+ ///
+ /// Note that the heuristics used for choosing which `AhoCorasickKind` to
+ /// use may be changed in a semver-compatible release.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, AhoCorasickKind};
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
+ /// // The specific Aho-Corasick kind chosen is not guaranteed!
+ /// assert_eq!(AhoCorasickKind::DFA, ac.kind());
+ /// ```
+ pub fn kind(&self) -> AhoCorasickKind {
+ self.kind
+ }
+
+ /// Returns the type of starting search configuration supported by this
+ /// Aho-Corasick automaton.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, StartKind};
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
+ /// assert_eq!(StartKind::Unanchored, ac.start_kind());
+ /// ```
+ pub fn start_kind(&self) -> StartKind {
+ self.start_kind
+ }
+
+ /// Returns the match kind used by this automaton.
+ ///
+ /// The match kind is important because it determines what kinds of
+ /// matches are returned. Also, some operations (such as overlapping
+ /// search and stream searching) are only supported when using the
+ /// [`MatchKind::Standard`] match kind.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
+ /// assert_eq!(MatchKind::Standard, ac.match_kind());
+ /// ```
+ pub fn match_kind(&self) -> MatchKind {
+ self.aut.match_kind()
+ }
+
+ /// Returns the length of the shortest pattern matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
+ /// assert_eq!(3, ac.min_pattern_len());
+ /// ```
+ ///
+ /// Note that an `AhoCorasick` automaton has a minimum length of `0` if
+ /// and only if it can match the empty string:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "", "quux", "baz"]).unwrap();
+ /// assert_eq!(0, ac.min_pattern_len());
+ /// ```
+ pub fn min_pattern_len(&self) -> usize {
+ self.aut.min_pattern_len()
+ }
+
+ /// Returns the length of the longest pattern matched by this automaton.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap();
+ /// assert_eq!(4, ac.max_pattern_len());
+ /// ```
+ pub fn max_pattern_len(&self) -> usize {
+ self.aut.max_pattern_len()
+ }
+
+ /// Return the total number of patterns matched by this automaton.
+ ///
+ /// This includes patterns that may never participate in a match. For
+ /// example, if [`MatchKind::LeftmostFirst`] match semantics are used, and
+ /// the patterns `Sam` and `Samwise` were used to build the automaton (in
+ /// that order), then `Samwise` can never participate in a match because
+ /// `Sam` will always take priority.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap();
+ /// assert_eq!(3, ac.patterns_len());
+ /// ```
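+ ///
+ /// As a sketch of the `Sam`/`Samwise` point above, the count includes a
+ /// pattern even when leftmost-first priority prevents it from ever
+ /// participating in a match:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind, PatternID};
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(&["Sam", "Samwise"])
+ /// .unwrap();
+ /// assert_eq!(2, ac.patterns_len());
+ /// // 'Sam' always takes priority, so 'Samwise' never matches.
+ /// assert_eq!(
+ /// Some(PatternID::must(0)),
+ /// ac.find("Samwise").map(|m| m.pattern()),
+ /// );
+ /// ```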
+ pub fn patterns_len(&self) -> usize {
+ self.aut.patterns_len()
+ }
+
+ /// Returns the approximate total amount of heap used by this automaton, in
+ /// units of bytes.
+ ///
+ /// # Examples
+ ///
+ /// This example shows the difference in heap usage between a few
+ /// configurations:
+ ///
+ /// ```
+ /// # if !cfg!(target_pointer_width = "64") { return; }
+ /// use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .kind(None) // default
+ /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
+ /// .unwrap();
+ /// assert_eq!(5_632, ac.memory_usage());
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .kind(None) // default
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
+ /// .unwrap();
+ /// assert_eq!(11_136, ac.memory_usage());
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
+ /// .unwrap();
+ /// assert_eq!(10_879, ac.memory_usage());
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .kind(Some(AhoCorasickKind::ContiguousNFA))
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
+ /// .unwrap();
+ /// assert_eq!(2_584, ac.memory_usage());
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .kind(Some(AhoCorasickKind::DFA))
+ /// .ascii_case_insensitive(true)
+ /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"])
+ /// .unwrap();
+ /// // While this shows the DFA being the biggest here by a small margin,
+ /// // don't let the difference fool you. With such a small number of
+ /// // patterns, the difference is small, but a bigger number of patterns
+ /// // will reveal that the rate of growth of the DFA is far bigger than
+ /// // the NFAs above. For a large number of patterns, it is easy for the
+ /// // DFA to take an order of magnitude more heap space (or more!).
+ /// assert_eq!(11_136, ac.memory_usage());
+ /// ```
+ pub fn memory_usage(&self) -> usize {
+ self.aut.memory_usage()
+ }
+}
+
+// We provide a manual debug impl so that we don't include the 'start_kind',
+// principally because it's kind of weird to do so and because it screws with
+// the carefully curated debug output for the underlying automaton.
+impl core::fmt::Debug for AhoCorasick {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("AhoCorasick").field(&self.aut).finish()
+ }
+}
+
+/// An iterator of non-overlapping matches in a particular haystack.
+///
+/// This iterator yields matches according to the [`MatchKind`] used by this
+/// automaton.
+///
+/// This iterator is constructed via the [`AhoCorasick::find_iter`] and
+/// [`AhoCorasick::try_find_iter`] methods.
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'h` refers to the lifetime of the haystack being searched.
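+///
+/// # Example
+///
+/// A brief usage sketch (the patterns and haystack here are only
+/// illustrative; see [`AhoCorasick::find_iter`] for the full examples):
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, PatternID};
+///
+/// let ac = AhoCorasick::new(&["apple", "maple"]).unwrap();
+/// let pattern_ids: Vec<PatternID> = ac
+///     .find_iter("Nobody likes maple in their apple flavored Snapple.")
+///     .map(|m| m.pattern())
+///     .collect();
+/// assert_eq!(pattern_ids, vec![
+///     PatternID::must(1),
+///     PatternID::must(0),
+///     PatternID::must(0),
+/// ]);
+/// ```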
+#[derive(Debug)]
+pub struct FindIter<'a, 'h>(automaton::FindIter<'a, 'h, Arc<dyn AcAutomaton>>);
+
+impl<'a, 'h> Iterator for FindIter<'a, 'h> {
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ self.0.next()
+ }
+}
+
+/// An iterator of overlapping matches in a particular haystack.
+///
+/// This iterator will report all possible matches in a particular haystack,
+/// even when the matches overlap.
+///
+/// This iterator is constructed via the [`AhoCorasick::find_overlapping_iter`]
+/// and [`AhoCorasick::try_find_overlapping_iter`] methods.
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'h` refers to the lifetime of the haystack being searched.
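+///
+/// # Example
+///
+/// A short sketch of collecting every overlapping match (the pattern set
+/// below is purely illustrative):
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&["append", "appendage", "app"]).unwrap();
+/// let matched: Vec<usize> = ac
+///     .find_overlapping_iter("append the app to the appendage")
+///     .map(|m| m.pattern().as_usize())
+///     .collect();
+/// assert_eq!(vec![2, 0, 2, 2, 0, 1], matched);
+/// ```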
+#[derive(Debug)]
+pub struct FindOverlappingIter<'a, 'h>(
+ automaton::FindOverlappingIter<'a, 'h, Arc<dyn AcAutomaton>>,
+);
+
+impl<'a, 'h> Iterator for FindOverlappingIter<'a, 'h> {
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ self.0.next()
+ }
+}
+
+/// An iterator that reports Aho-Corasick matches in a stream.
+///
+/// This iterator yields elements of type `Result<Match, std::io::Error>`,
+/// where an error is reported if there was a problem reading from the
+/// underlying stream. The iterator terminates only when the underlying stream
+/// reaches `EOF`.
+///
+/// This iterator is constructed via the [`AhoCorasick::stream_find_iter`] and
+/// [`AhoCorasick::try_stream_find_iter`] methods.
+///
+/// The type variable `R` refers to the `io::Read` stream that is being read
+/// from.
+///
+/// The lifetime `'a` refers to the lifetime of the corresponding
+/// [`AhoCorasick`] searcher.
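+///
+/// # Example
+///
+/// A minimal sketch using an in-memory `&[u8]` as the reader; any
+/// `std::io::Read` implementation works the same way:
+///
+/// ```
+/// use aho_corasick::AhoCorasick;
+///
+/// let ac = AhoCorasick::new(&["foo", "bar"]).unwrap();
+/// let mut count = 0;
+/// for result in ac.stream_find_iter(&b"foo bar baz foo"[..]) {
+///     let _mat = result?;
+///     count += 1;
+/// }
+/// assert_eq!(3, count);
+/// # Ok::<(), std::io::Error>(())
+/// ```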
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub struct StreamFindIter<'a, R>(
+ automaton::StreamFindIter<'a, Arc<dyn AcAutomaton>, R>,
+);
+
+#[cfg(feature = "std")]
+impl<'a, R: std::io::Read> Iterator for StreamFindIter<'a, R> {
+ type Item = Result<Match, std::io::Error>;
+
+ fn next(&mut self) -> Option<Result<Match, std::io::Error>> {
+ self.0.next()
+ }
+}
+
+/// A builder for configuring an Aho-Corasick automaton.
+///
+/// # Quick advice
+///
+/// * Use [`AhoCorasickBuilder::match_kind`] to configure your searcher
+/// with [`MatchKind::LeftmostFirst`] if you want to match how backtracking
+/// regex engines execute searches for `pat1|pat2|..|patN`. Use
+/// [`MatchKind::LeftmostLongest`] if you want to match how POSIX regex engines
+/// do it.
+/// * If you need an anchored search, use [`AhoCorasickBuilder::start_kind`] to
+/// set the [`StartKind::Anchored`] mode since [`StartKind::Unanchored`] is the
+/// default. Or just use [`StartKind::Both`] to support both types of searches.
+/// * You might want to use [`AhoCorasickBuilder::kind`] to set your searcher
+/// to always use a [`AhoCorasickKind::DFA`] if search speed is critical and
+/// memory usage isn't a concern. Otherwise, not setting a kind will probably
+/// make the right choice for you. Beware that if you use [`StartKind::Both`]
+/// to build a searcher that supports both unanchored and anchored searches
+/// _and_ you set [`AhoCorasickKind::DFA`], then the DFA will essentially be
+/// duplicated to support both simultaneously. This results in very high memory
+/// usage.
+/// * For all other options, their defaults are almost certainly what you want.
+#[derive(Clone, Debug, Default)]
+pub struct AhoCorasickBuilder {
+ nfa_noncontiguous: noncontiguous::Builder,
+ nfa_contiguous: contiguous::Builder,
+ dfa: dfa::Builder,
+ kind: Option<AhoCorasickKind>,
+ start_kind: StartKind,
+}
+
+impl AhoCorasickBuilder {
+ /// Create a new builder for configuring an Aho-Corasick automaton.
+ ///
+ /// The builder provides a way to configure a number of things, including
+ /// ASCII case insensitivity and what kind of match semantics are used.
+ pub fn new() -> AhoCorasickBuilder {
+ AhoCorasickBuilder::default()
+ }
+
+ /// Build an Aho-Corasick automaton using the configuration set on this
+ /// builder.
+ ///
+ /// A builder may be reused to create more automatons.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasickBuilder, PatternID};
+ ///
+ /// let patterns = &["foo", "bar", "baz"];
+ /// let ac = AhoCorasickBuilder::new().build(patterns).unwrap();
+ /// assert_eq!(
+ /// Some(PatternID::must(1)),
+ /// ac.find("xxx bar xxx").map(|m| m.pattern()),
+ /// );
+ /// ```
+ pub fn build<I, P>(&self, patterns: I) -> Result<AhoCorasick, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ let nfa = self.nfa_noncontiguous.build(patterns)?;
+ let (aut, kind): (Arc<dyn AcAutomaton>, AhoCorasickKind) =
+ match self.kind {
+ None => {
+ debug!(
+ "asked for automatic Aho-Corasick implementation, \
+ criteria: <patterns: {:?}, max pattern len: {:?}, \
+ start kind: {:?}>",
+ nfa.patterns_len(),
+ nfa.max_pattern_len(),
+ self.start_kind,
+ );
+ self.build_auto(nfa)
+ }
+ Some(AhoCorasickKind::NoncontiguousNFA) => {
+ debug!("forcefully chose noncontiguous NFA");
+ (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA)
+ }
+ Some(AhoCorasickKind::ContiguousNFA) => {
+ debug!("forcefully chose contiguous NFA");
+ let cnfa =
+ self.nfa_contiguous.build_from_noncontiguous(&nfa)?;
+ (Arc::new(cnfa), AhoCorasickKind::ContiguousNFA)
+ }
+ Some(AhoCorasickKind::DFA) => {
+ debug!("forcefully chose DFA");
+ let dfa = self.dfa.build_from_noncontiguous(&nfa)?;
+ (Arc::new(dfa), AhoCorasickKind::DFA)
+ }
+ };
+ Ok(AhoCorasick { aut, kind, start_kind: self.start_kind })
+ }
+
+ /// Implements the automatic selection logic for the Aho-Corasick
+ /// implementation to use. Since all Aho-Corasick automatons are built
+ /// from a non-contiguous NFA, the caller is responsible for building
+ /// that first.
+ fn build_auto(
+ &self,
+ nfa: noncontiguous::NFA,
+ ) -> (Arc<dyn AcAutomaton>, AhoCorasickKind) {
+ // We try to build a DFA if we have a very small number of patterns,
+ // otherwise the memory usage just gets too crazy. We also only do it
+ // when the start kind is unanchored or anchored, but not both, because
+ // both implies two full copies of the transition table.
+ let try_dfa = !matches!(self.start_kind, StartKind::Both)
+ && nfa.patterns_len() <= 100;
+ if try_dfa {
+ match self.dfa.build_from_noncontiguous(&nfa) {
+ Ok(dfa) => {
+ debug!("chose a DFA");
+ return (Arc::new(dfa), AhoCorasickKind::DFA);
+ }
+ Err(_err) => {
+ debug!(
+ "failed to build DFA, trying something else: {}",
+ _err
+ );
+ }
+ }
+ }
+ // We basically always want a contiguous NFA when the limited
+ // circumstances in which we use a DFA do not apply. It is quite fast
+ // and has excellent memory usage. The only way we don't use it is if
+ // there are so many states that it can't fit in a contiguous NFA.
+ // And the only way to know that is to try to build it. Building a
+ // contiguous NFA is mostly just reshuffling data from a noncontiguous
+ // NFA, so it isn't too expensive, especially relative to building a
+ // noncontiguous NFA in the first place.
+ match self.nfa_contiguous.build_from_noncontiguous(&nfa) {
+ Ok(nfa) => {
+ debug!("chose contiguous NFA");
+ return (Arc::new(nfa), AhoCorasickKind::ContiguousNFA);
+ }
+ #[allow(unused_variables)] // unused when 'logging' is disabled
+ Err(_err) => {
+ debug!(
+ "failed to build contiguous NFA, \
+ trying something else: {}",
+ _err
+ );
+ }
+ }
+ debug!("chose non-contiguous NFA");
+ (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA)
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::Standard`], which corresponds to the match
+ /// semantics supported by the standard textbook description of the
+ /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
+ /// are found. Moreover, this is the only way to get overlapping matches
+ /// or do stream searching.
+ ///
+ /// The other kinds of match semantics that are supported are
+ /// [`MatchKind::LeftmostFirst`] and [`MatchKind::LeftmostLongest`]. The
+ /// former corresponds to the match you would get if you were to try to
+ /// match each pattern at each position in the haystack in the same order
+ /// that you give to the automaton. That is, it returns the leftmost match
+ /// corresponding to the earliest pattern given to the automaton. The
+ /// latter corresponds to finding the longest possible match among all
+ /// leftmost matches.
+ ///
+ /// For more details on match semantics, see the [documentation for
+ /// `MatchKind`](MatchKind).
+ ///
+ /// Note that setting this to [`MatchKind::LeftmostFirst`] or
+ /// [`MatchKind::LeftmostLongest`] will cause some search routines on
+ /// [`AhoCorasick`] to return an error (or panic if you're using the
+ /// infallible API). Notably, this includes stream and overlapping
+ /// searches.
+ ///
+ /// # Examples
+ ///
+ /// In these examples, we demonstrate the differences between match
+ /// semantics for a particular set of patterns in a specific order:
+ /// `b`, `abc`, `abcd`.
+ ///
+ /// Standard semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::Standard) // default, not necessary
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Leftmost-first semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
+ /// ```
+ ///
+ /// Leftmost-longest semantics:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostLongest)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let mat = ac.find(haystack).expect("should have a match");
+ /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
+ /// ```
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
+ self.nfa_noncontiguous.match_kind(kind);
+ self.nfa_contiguous.match_kind(kind);
+ self.dfa.match_kind(kind);
+ self
+ }
+
+ /// Sets the starting state configuration for the automaton.
+ ///
+ /// Every Aho-Corasick automaton is capable of having two start states: one
+ /// that is used for unanchored searches and one that is used for anchored
+ /// searches. Some automatons, like the NFAs, support this with almost zero
+ /// additional cost. Other automatons, like the DFA, require two copies of
+ /// the underlying transition table to support both simultaneously.
+ ///
+ /// Because there may be an added non-trivial cost to supporting both, it
+ /// is possible to configure which starting state configuration is needed.
+ ///
+ /// Indeed, since anchored searches tend to be somewhat more rare,
+ /// _only_ unanchored searches are supported by default. Thus,
+ /// [`StartKind::Unanchored`] is the default.
+ ///
+ /// Note that when this is set to [`StartKind::Unanchored`], then
+ /// running an anchored search will result in an error (or a panic
+ /// if using the infallible APIs). Similarly, when this is set to
+ /// [`StartKind::Anchored`], then running an unanchored search will
+ /// result in an error (or a panic if using the infallible APIs). When
+ /// [`StartKind::Both`] is used, then both unanchored and anchored searches
+ /// are always supported.
+ ///
+ /// Also note that even if an `AhoCorasick` searcher is using an NFA
+ /// internally (which always supports both unanchored and anchored
+ /// searches), an error will still be reported for a search that isn't
+ /// supported by the configuration set via this method. This means,
+ /// for example, that an error is never dependent on which internal
+ /// implementation of Aho-Corasick is used.
+ ///
+ /// # Example: anchored search
+ ///
+ /// This shows how to build a searcher that only supports anchored
+ /// searches:
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// AhoCorasick, Anchored, Input, Match, MatchKind, StartKind,
+ /// };
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .start_kind(StartKind::Anchored)
+ /// .build(&["b", "abc", "abcd"])
+ /// .unwrap();
+ ///
+ /// // An unanchored search is not supported! An error here is guaranteed
+ /// // given the configuration above regardless of which kind of
+ /// // Aho-Corasick implementation ends up being used internally.
+ /// let input = Input::new("foo abcd").anchored(Anchored::No);
+ /// assert!(ac.try_find(input).is_err());
+ ///
+ /// let input = Input::new("foo abcd").anchored(Anchored::Yes);
+ /// assert_eq!(None, ac.try_find(input)?);
+ ///
+ /// let input = Input::new("abcd").anchored(Anchored::Yes);
+ /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: unanchored and anchored searches
+ ///
+ /// This shows how to build a searcher that supports both unanchored and
+ /// anchored searches:
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// AhoCorasick, Anchored, Input, Match, MatchKind, StartKind,
+ /// };
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .start_kind(StartKind::Both)
+ /// .build(&["b", "abc", "abcd"])
+ /// .unwrap();
+ ///
+ /// let input = Input::new("foo abcd").anchored(Anchored::No);
+ /// assert_eq!(Some(Match::must(1, 4..7)), ac.try_find(input)?);
+ ///
+ /// let input = Input::new("foo abcd").anchored(Anchored::Yes);
+ /// assert_eq!(None, ac.try_find(input)?);
+ ///
+ /// let input = Input::new("abcd").anchored(Anchored::Yes);
+ /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn start_kind(&mut self, kind: StartKind) -> &mut AhoCorasickBuilder {
+ self.dfa.start_kind(kind);
+ self.start_kind = kind;
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// When this option is enabled, searching will be performed without
+ /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
+ ///
+ /// Enabling this option does not change the search algorithm, but it may
+ /// increase the size of the automaton.
+ ///
+ /// **NOTE:** It is unlikely that support for Unicode case folding will
+ /// be added in the future. The ASCII case works via a simple hack to the
+ /// underlying automaton, but full Unicode handling requires a fair bit of
+ /// sophistication. If you do need Unicode handling, you might consider
+ /// using the [`regex` crate](https://docs.rs/regex) or the lower level
+ /// [`regex-automata` crate](https://docs.rs/regex-automata).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let patterns = &["FOO", "bAr", "BaZ"];
+ /// let haystack = "foo bar baz";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .ascii_case_insensitive(true)
+ /// .build(patterns)
+ /// .unwrap();
+ /// assert_eq!(3, ac.find_iter(haystack).count());
+ /// ```
+ pub fn ascii_case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut AhoCorasickBuilder {
+ self.nfa_noncontiguous.ascii_case_insensitive(yes);
+ self.nfa_contiguous.ascii_case_insensitive(yes);
+ self.dfa.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Choose the type of underlying automaton to use.
+ ///
+ /// Currently, there are four choices:
+ ///
+ /// * [`AhoCorasickKind::NoncontiguousNFA`] instructs the searcher to
+ /// use a [`noncontiguous::NFA`]. A noncontiguous NFA is the fastest to
+ /// be built, has moderate memory usage and is typically the slowest to
+ /// execute a search.
+ /// * [`AhoCorasickKind::ContiguousNFA`] instructs the searcher to use a
+ /// [`contiguous::NFA`]. A contiguous NFA is a little slower to build than
+ /// a noncontiguous NFA, has excellent memory usage and is typically a
+ /// little slower than a DFA for a search.
+ /// * [`AhoCorasickKind::DFA`] instructs the searcher to use a
+ /// [`dfa::DFA`]. A DFA is very slow to build, uses exorbitant amounts of
+ /// memory, but will typically execute searches the fastest.
+ /// * `None` (the default) instructs the searcher to choose the "best"
+ /// Aho-Corasick implementation. This choice is typically based primarily
+ /// on the number of patterns.
+ ///
+ /// Setting this configuration does not change the time complexity for
+ /// constructing the Aho-Corasick automaton (which is `O(p)` where `p`
+ /// is the total number of patterns being compiled). Setting this to
+ /// [`AhoCorasickKind::DFA`] does however reduce the time complexity of
+ /// non-overlapping searches from `O(n + p)` to `O(n)`, where `n` is the
+ /// length of the haystack.
+ ///
+ /// In general, you should probably stick to the default unless you have
+ /// some kind of reason to use a specific Aho-Corasick implementation. For
+ /// example, you might choose `AhoCorasickKind::DFA` if you don't care
+ /// about memory usage and want the fastest possible search times.
+ ///
+ /// Setting this guarantees that the searcher returned uses the chosen
+ /// implementation. If that implementation could not be constructed, then
+ /// an error will be returned. In contrast, when `None` is used, it is
+ /// possible for it to attempt to construct, for example, a contiguous
+ /// NFA and have it fail. In which case, it will fall back to using a
+ /// noncontiguous NFA.
+ ///
+ /// If `None` is given, then one may use [`AhoCorasick::kind`] to determine
+ /// which Aho-Corasick implementation was chosen.
+ ///
+ /// Note that the heuristics used for choosing which `AhoCorasickKind`
+ /// to use may be changed in a semver compatible release.
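+ ///
+ /// # Examples
+ ///
+ /// A short sketch of forcing a particular implementation. The assertion
+ /// below only holds because a kind was explicitly requested; with `None`,
+ /// the reported kind is not guaranteed:
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, AhoCorasickKind};
+ ///
+ /// let ac = AhoCorasick::builder()
+ ///     .kind(Some(AhoCorasickKind::ContiguousNFA))
+ ///     .build(&["foo", "bar", "baz"])
+ ///     .unwrap();
+ /// assert_eq!(AhoCorasickKind::ContiguousNFA, ac.kind());
+ /// ```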
+ pub fn kind(
+ &mut self,
+ kind: Option<AhoCorasickKind>,
+ ) -> &mut AhoCorasickBuilder {
+ self.kind = kind;
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// When enabled, searching will attempt to quickly skip to match
+ /// candidates using specialized literal search routines. A prefilter
+ /// cannot always be used, and is generally treated as a heuristic. It
+ /// can be useful to disable this if the prefilter is observed to be
+ /// sub-optimal for a particular workload.
+ ///
+ /// Currently, prefilters are typically only active when building searchers
+ /// with a small (less than 100) number of patterns.
+ ///
+ /// This is enabled by default.
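+ ///
+ /// # Examples
+ ///
+ /// A minimal sketch of disabling the prefilter. Search results are
+ /// unchanged; only the potential for acceleration is given up:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::builder()
+ ///     .prefilter(false)
+ ///     .build(&["foo", "bar", "baz"])
+ ///     .unwrap();
+ /// assert_eq!(1, ac.find_iter("xxx bar xxx").count());
+ /// ```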
+ pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_noncontiguous.prefilter(yes);
+ self.nfa_contiguous.prefilter(yes);
+ self.dfa.prefilter(yes);
+ self
+ }
+
+ /// Set the limit on how many states use a dense representation for their
+ /// transitions. Other states will generally use a sparse representation.
+ ///
+ /// A dense representation uses more memory but is generally faster, since
+ /// the next transition in a dense representation can be computed in a
+ /// constant number of instructions. A sparse representation uses less
+ /// memory but is generally slower, since the next transition in a sparse
+ /// representation requires executing a variable number of instructions.
+ ///
+ /// This setting is only used when an Aho-Corasick implementation is used
+ /// that supports the dense versus sparse representation trade-off. Not all
+ /// do.
+ ///
+ /// This limit is expressed in terms of the depth of a state, i.e., the
+ /// number of transitions from the starting state of the automaton. The
+ /// idea is that most of the time searching will be spent near the starting
+ /// state of the automaton, so states near the start state should use a
+ /// dense representation. States further away from the start state would
+ /// then use a sparse representation.
+ ///
+ /// By default, this is set to a low but non-zero number. Setting this to
+ /// `0` is almost never what you want, since it is likely to make searches
+ /// very slow due to the start state itself being forced to use a sparse
+ /// representation. However, it is unlikely that increasing this number
+ /// will help things much, since the most active states have a small depth.
+ /// More to the point, the memory usage increases superlinearly as this
+ /// number increases.
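+ ///
+ /// # Examples
+ ///
+ /// An illustrative sketch of lowering the dense depth. The exact impact on
+ /// memory usage and search speed depends on the patterns and on which
+ /// implementation is chosen:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::builder()
+ ///     .dense_depth(2)
+ ///     .build(&["foo", "bar", "baz"])
+ ///     .unwrap();
+ /// assert_eq!(1, ac.find_iter("xxx bar xxx").count());
+ /// ```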
+ pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
+ self.nfa_noncontiguous.dense_depth(depth);
+ self.nfa_contiguous.dense_depth(depth);
+ self
+ }
+
+ /// A debug setting for whether to attempt to shrink the size of the
+ /// automaton's alphabet or not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging the underlying automaton.
+ ///
+ /// When enabled, some (but not all) Aho-Corasick automatons will use a map
+ /// from all possible bytes to their corresponding equivalence class. Each
+ /// equivalence class represents a set of bytes that does not discriminate
+ /// between a match and a non-match in the automaton.
+ ///
+ /// The advantage of this map is that the size of the transition table can
+ /// be reduced drastically from `#states * 256 * sizeof(u32)` to
+ /// `#states * k * sizeof(u32)` where `k` is the number of equivalence
+ /// classes (rounded up to the nearest power of 2). As a result, total
+ /// space usage can decrease substantially. Moreover, since a smaller
+ /// alphabet is used, automaton compilation becomes faster as well.
+ ///
+ /// **WARNING:** This is only useful for debugging automatons. Disabling
+ /// this does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
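+ ///
+ /// # Examples
+ ///
+ /// A sketch of disabling byte classes, which is only interesting when
+ /// inspecting the `Debug` output of the automaton. Search results are
+ /// identical either way:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::builder()
+ ///     .byte_classes(false)
+ ///     .build(&["foo", "bar", "baz"])
+ ///     .unwrap();
+ /// assert_eq!(1, ac.find_iter("xxx bar xxx").count());
+ /// ```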
+ pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_contiguous.byte_classes(yes);
+ self.dfa.byte_classes(yes);
+ self
+ }
+}
+
+/// The type of Aho-Corasick implementation to use in an [`AhoCorasick`]
+/// searcher.
+///
+/// This is principally used as an input to the
+/// [`AhoCorasickBuilder::kind`] method. Its documentation goes into more
+/// detail about each choice.
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum AhoCorasickKind {
+ /// Use a noncontiguous NFA.
+ NoncontiguousNFA,
+ /// Use a contiguous NFA.
+ ContiguousNFA,
+ /// Use a DFA. Warning: DFAs typically use a large amount of memory.
+ DFA,
+}
+
+/// A trait that effectively gives us practical dynamic dispatch over anything
+/// that impls `Automaton`, but without needing to add a bunch of bounds to
+/// the core `Automaton` trait. Basically, we provide all of the marker traits
+/// that our automatons have, in addition to `Debug` impls and requiring that
+/// there is no borrowed data. Without these, the main `AhoCorasick` type would
+/// not be able to meaningfully impl `Debug` or the marker traits without also
+/// requiring that all impls of `Automaton` do so, which would not be great.
+trait AcAutomaton:
+ Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
+{
+}
+
+impl<A> AcAutomaton for A where
+ A: Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
+{
+}
+
+impl crate::automaton::private::Sealed for Arc<dyn AcAutomaton> {}
+
+// I'm not sure why this trait impl shows up in the docs, as the AcAutomaton
+// trait is not exported. So we forcefully hide it.
+//
+// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits
+// its safety properties.
+#[doc(hidden)]
+unsafe impl Automaton for Arc<dyn AcAutomaton> {
+ #[inline(always)]
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
+ (**self).start_state(anchored)
+ }
+
+ #[inline(always)]
+ fn next_state(
+ &self,
+ anchored: Anchored,
+ sid: StateID,
+ byte: u8,
+ ) -> StateID {
+ (**self).next_state(anchored, sid, byte)
+ }
+
+ #[inline(always)]
+ fn is_special(&self, sid: StateID) -> bool {
+ (**self).is_special(sid)
+ }
+
+ #[inline(always)]
+ fn is_dead(&self, sid: StateID) -> bool {
+ (**self).is_dead(sid)
+ }
+
+ #[inline(always)]
+ fn is_match(&self, sid: StateID) -> bool {
+ (**self).is_match(sid)
+ }
+
+ #[inline(always)]
+ fn is_start(&self, sid: StateID) -> bool {
+ (**self).is_start(sid)
+ }
+
+ #[inline(always)]
+ fn match_kind(&self) -> MatchKind {
+ (**self).match_kind()
+ }
+
+ #[inline(always)]
+ fn match_len(&self, sid: StateID) -> usize {
+ (**self).match_len(sid)
+ }
+
+ #[inline(always)]
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
+ (**self).match_pattern(sid, index)
+ }
+
+ #[inline(always)]
+ fn patterns_len(&self) -> usize {
+ (**self).patterns_len()
+ }
+
+ #[inline(always)]
+ fn pattern_len(&self, pid: PatternID) -> usize {
+ (**self).pattern_len(pid)
+ }
+
+ #[inline(always)]
+ fn min_pattern_len(&self) -> usize {
+ (**self).min_pattern_len()
+ }
+
+ #[inline(always)]
+ fn max_pattern_len(&self) -> usize {
+ (**self).max_pattern_len()
+ }
+
+ #[inline(always)]
+ fn memory_usage(&self) -> usize {
+ (**self).memory_usage()
+ }
+
+ #[inline(always)]
+ fn prefilter(&self) -> Option<&Prefilter> {
+ (**self).prefilter()
+ }
+
+ // Even though 'try_find' and 'try_find_overlapping' each have their
+ // own default impls, we explicitly define them here to fix a perf bug.
+ // Without these explicit definitions, the default impl will wind up using
+ // dynamic dispatch for all 'Automaton' method calls, including things like
+ // 'next_state' that absolutely must get inlined or else perf is trashed.
+ // Defining them explicitly here like this still requires dynamic dispatch
+ // to call 'try_find' itself, but all uses of 'Automaton' within 'try_find'
+ // are monomorphized.
+ //
+ // We don't need to explicitly impl any other methods, I think, because
+ // they are all implemented themselves in terms of 'try_find' and
+ // 'try_find_overlapping'. We still might wind up with an extra virtual
+ // call here or there, but that's okay since it's outside of any perf
+ // critical areas.
+
+ #[inline(always)]
+ fn try_find(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
+ (**self).try_find(input)
+ }
+
+ #[inline(always)]
+ fn try_find_overlapping(
+ &self,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ (**self).try_find_overlapping(input, state)
+ }
+}
+
+/// Returns an error if the start state configuration does not support the
+/// desired search configuration. See the internal 'AhoCorasick::start_kind'
+/// field docs for more details.
+fn enforce_anchored_consistency(
+ have: StartKind,
+ want: Anchored,
+) -> Result<(), MatchError> {
+ match have {
+ StartKind::Both => Ok(()),
+ StartKind::Unanchored if !want.is_anchored() => Ok(()),
+ StartKind::Unanchored => Err(MatchError::invalid_input_anchored()),
+ StartKind::Anchored if want.is_anchored() => Ok(()),
+ StartKind::Anchored => Err(MatchError::invalid_input_unanchored()),
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/automaton.rs b/third_party/rust/aho-corasick/src/automaton.rs
new file mode 100644
index 0000000000..c41dc6e1db
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/automaton.rs
@@ -0,0 +1,1608 @@
+/*!
+Provides the [`Automaton`] trait for abstracting over Aho-Corasick automata.
+
+The `Automaton` trait provides a way to write generic code over any
+Aho-Corasick automaton. It also provides access to lower level APIs that
+permit walking the state transitions of an Aho-Corasick automaton manually.
+*/
+
+use alloc::{string::String, vec::Vec};
+
+use crate::util::{
+ error::MatchError,
+ primitives::PatternID,
+ search::{Anchored, Input, Match, MatchKind, Span},
+};
+
+pub use crate::util::{
+ prefilter::{Candidate, Prefilter},
+ primitives::{StateID, StateIDError},
+};
+
+/// We seal the `Automaton` trait for now. It's a big trait, and it's
+/// conceivable that I might want to add new required methods, and sealing the
+/// trait permits doing that in a backwards compatible fashion. On the other
+/// hand, if you have a solid use case for implementing the trait yourself,
+/// please file an issue and we can discuss it. This was *mostly* done as a
+/// conservative step.
+pub(crate) mod private {
+ pub trait Sealed {}
+}
+impl private::Sealed for crate::nfa::noncontiguous::NFA {}
+impl private::Sealed for crate::nfa::contiguous::NFA {}
+impl private::Sealed for crate::dfa::DFA {}
+
+impl<'a, T: private::Sealed + ?Sized> private::Sealed for &'a T {}
+
+/// A trait that abstracts over Aho-Corasick automata.
+///
+/// This trait primarily exists for niche use cases such as:
+///
+/// * Using an NFA or DFA directly, bypassing the top-level
+/// [`AhoCorasick`](crate::AhoCorasick) searcher. Currently, these include
+/// [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA),
+/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) and
+/// [`dfa::DFA`](crate::dfa::DFA).
+/// * Implementing your own custom search routine by walking the automaton
+/// yourself. This might be useful for implementing search on non-contiguous
+/// strings or streams.
+///
+/// For most use cases, it is not expected that users will need
+/// to use or even know about this trait. Indeed, the top level
+/// [`AhoCorasick`](crate::AhoCorasick) searcher does not expose any details
+/// about this trait, nor does it implement it itself.
+///
+/// Note that this trait defines a number of default methods, such as
+/// [`Automaton::try_find`] and [`Automaton::try_find_iter`], which implement
+/// higher level search routines in terms of the lower level automata API.
+///
+/// # Sealed
+///
+/// Currently, this trait is sealed. That means users of this crate can write
+/// generic routines over this trait but cannot implement it themselves. This
+/// restriction may be lifted in the future, but sealing the trait permits
+/// adding new required methods in a backwards compatible fashion.
+///
+/// # Special states
+///
+/// This trait encodes a notion of "special" states in an automaton. Namely,
+/// a state is treated as special if it is a dead, match or start state:
+///
+/// * A dead state is a state that cannot be left once entered. All transitions
+/// on a dead state lead back to itself. The dead state is meant to be treated
+/// as a sentinel indicating that the search should stop and return a match if
+/// one has been found, and nothing otherwise.
+/// * A match state is a state that indicates one or more patterns have
+/// matched. Depending on the [`MatchKind`] of the automaton, a search may
+/// stop once a match is seen, or it may continue looking for matches until
+/// it enters a dead state or sees the end of the haystack.
+/// * A start state is a state that a search begins in. It is useful to know
+/// when a search enters a start state because it may mean that a prefilter can
+/// be used to skip ahead and quickly look for candidate matches. Unlike dead
+/// and match states, it is never necessary to explicitly handle start states
+/// for correctness. Indeed, in this crate, implementations of `Automaton`
+/// will only treat start states as "special" when a prefilter is enabled and
+/// active. Otherwise, treating them as special has no purpose and winds up
+/// slowing down the overall search because it results in ping-ponging between
+/// the main state transition and the "special" state logic.
+///
+/// Since checking whether a state is special by doing three different
+/// checks would be too expensive inside a fast search loop, the
+/// [`Automaton::is_special`] method is provided for quickly checking whether
+/// the state is special. The `Automaton::is_dead`, `Automaton::is_match` and
+/// `Automaton::is_start` predicates can then be used to determine which kind
+/// of special state it is.
+///
+/// # Panics
+///
+/// Most of the APIs on this trait may panic or give incorrect results
+/// if invalid inputs are given to them. For example, `Automaton::next_state`
+/// has unspecified behavior if the state ID given to it is not a valid
+/// state ID for the underlying automaton. Valid state IDs can only be
+/// retrieved in one of two ways: calling `Automaton::start_state` or calling
+/// `Automaton::next_state` with a valid state ID.
+///
+/// # Safety
+///
+/// This trait is not safe to implement so that code may rely on the
+/// correctness of implementations of this trait to avoid undefined behavior.
+/// The primary correctness guarantees are:
+///
+/// * `Automaton::start_state` always returns a valid state ID or an error or
+/// panics.
+/// * `Automaton::next_state`, when given a valid state ID, always returns
+/// a valid state ID for all values of `anchored` and `byte`, or otherwise
+/// panics.
+///
+/// In general, the rest of the methods on `Automaton` need to uphold their
+/// contracts as well. For example, `Automaton::is_dead` should only return
+/// true if the given state ID is actually a dead state.
+///
+/// Note that currently this crate does not rely on the safety property defined
+/// here to avoid undefined behavior. Instead, this was done to make it
+/// _possible_ to do in the future.
+///
+/// # Example
+///
+/// This example shows how one might implement a basic but correct search
+/// routine. We keep things simple by not using prefilters or worrying about
+/// anchored searches, but do make sure our search is correct for all possible
+/// [`MatchKind`] semantics. (The comments in the code below note the parts
+/// that are needed to support certain `MatchKind` semantics.)
+///
+/// ```
+/// use aho_corasick::{
+/// automaton::Automaton,
+/// nfa::noncontiguous::NFA,
+/// Anchored, Match, MatchError, MatchKind,
+/// };
+///
+/// // Run an unanchored search for 'aut' in 'haystack'. Return the first match
+/// // seen according to the automaton's match semantics. This returns an error
+/// // if the given automaton does not support unanchored searches.
+/// fn find<A: Automaton>(
+/// aut: A,
+/// haystack: &[u8],
+/// ) -> Result<Option<Match>, MatchError> {
+/// let mut sid = aut.start_state(Anchored::No)?;
+/// let mut at = 0;
+/// let mut mat = None;
+/// let get_match = |sid, at| {
+/// let pid = aut.match_pattern(sid, 0);
+/// let len = aut.pattern_len(pid);
+/// Match::new(pid, (at - len)..at)
+/// };
+/// // Start states can be match states!
+/// if aut.is_match(sid) {
+/// mat = Some(get_match(sid, at));
+/// // Standard semantics require matches to be reported as soon as
+/// // they're seen. Otherwise, we continue until we see a dead state
+/// // or the end of the haystack.
+/// if matches!(aut.match_kind(), MatchKind::Standard) {
+/// return Ok(mat);
+/// }
+/// }
+/// while at < haystack.len() {
+/// sid = aut.next_state(Anchored::No, sid, haystack[at]);
+/// if aut.is_special(sid) {
+/// if aut.is_dead(sid) {
+/// return Ok(mat);
+/// } else if aut.is_match(sid) {
+/// mat = Some(get_match(sid, at + 1));
+/// // As above, standard semantics require that we return
+/// // immediately once a match is found.
+/// if matches!(aut.match_kind(), MatchKind::Standard) {
+/// return Ok(mat);
+/// }
+/// }
+/// }
+/// at += 1;
+/// }
+/// Ok(mat)
+/// }
+///
+/// // Show that it works for standard searches.
+/// let nfa = NFA::new(&["samwise", "sam"]).unwrap();
+/// assert_eq!(Some(Match::must(1, 0..3)), find(&nfa, b"samwise")?);
+///
+/// // But also works when using leftmost-first. Notice how the match result
+/// // has changed!
+/// let nfa = NFA::builder()
+/// .match_kind(MatchKind::LeftmostFirst)
+/// .build(&["samwise", "sam"])
+/// .unwrap();
+/// assert_eq!(Some(Match::must(0, 0..7)), find(&nfa, b"samwise")?);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub unsafe trait Automaton: private::Sealed {
+ /// Returns the starting state for the given anchor mode.
+ ///
+ /// Upon success, the state ID returned is guaranteed to be valid for
+ /// this automaton.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when the given search configuration is not
+ /// supported by the underlying automaton. For example, if the underlying
+ /// automaton only supports unanchored searches but the given configuration
+ /// was set to an anchored search, then this must return an error.
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError>;
+
+ /// Performs a state transition from `sid` for `byte` and returns the next
+ /// state.
+ ///
+ /// `anchored` should be [`Anchored::Yes`] when executing an anchored
+ /// search and [`Anchored::No`] otherwise. For some implementations of
+ /// `Automaton`, it is required to know whether the search is anchored
+ /// or not in order to avoid following failure transitions. Other
+ /// implementations may ignore `anchored` altogether and depend on
+ /// `Automaton::start_state` returning a state that walks a different path
+ /// through the automaton depending on whether the search is anchored or
+ /// not.
+ ///
+ /// # Panics
+ ///
+ /// This routine may panic or return incorrect results when the given state
+ /// ID is invalid. A state ID is valid if and only if:
+ ///
+ /// 1. It came from a call to `Automaton::start_state`, or
+ /// 2. It came from a previous call to `Automaton::next_state` with a
+ /// valid state ID.
+ ///
+ /// Implementations must treat all possible values of `byte` as valid.
+ ///
+ /// Implementations may panic on unsupported values of `anchored`, but are
+ /// not required to do so.
+ fn next_state(
+ &self,
+ anchored: Anchored,
+ sid: StateID,
+ byte: u8,
+ ) -> StateID;
+
+ /// Returns true if the given ID represents a "special" state. A special
+ /// state is a dead, match or start state.
+ ///
+ /// Note that implementations may choose to return false when the given ID
+/// corresponds to a start state. Namely, it is always correct to treat start
+ /// states as non-special. Implementations must return true for states that
+ /// are dead or contain matches.
+ ///
+ /// This has unspecified behavior when given an invalid state ID.
+ fn is_special(&self, sid: StateID) -> bool;
+
+ /// Returns true if the given ID represents a dead state.
+ ///
+ /// A dead state is a type of "sink" in a finite state machine. It
+ /// corresponds to a state whose transitions all loop back to itself. That
+ /// is, once entered, it can never be left. In practice, it serves as a
+ /// sentinel indicating that the search should terminate.
+ ///
+ /// This has unspecified behavior when given an invalid state ID.
+ fn is_dead(&self, sid: StateID) -> bool;
+
+ /// Returns true if the given ID represents a match state.
+ ///
+ /// A match state is always associated with one or more pattern IDs that
+ /// matched at the position in the haystack when the match state was
+ /// entered. When a match state is entered, the match semantics dictate
+ /// whether it should be returned immediately (for `MatchKind::Standard`)
+ /// or if the search should continue (for `MatchKind::LeftmostFirst` and
+ /// `MatchKind::LeftmostLongest`) until a dead state is seen or the end of
+ /// the haystack has been reached.
+ ///
+ /// This has unspecified behavior when given an invalid state ID.
+ fn is_match(&self, sid: StateID) -> bool;
+
+ /// Returns true if the given ID represents a start state.
+ ///
+ /// While it is never incorrect to ignore start states during a search
+ /// (except for the start of the search of course), knowing whether one has
+ /// entered a start state can be useful for certain classes of performance
+ /// optimizations. For example, if one is in a start state, it may be legal
+ /// to try to skip ahead and look for match candidates more quickly than
+ /// would otherwise be accomplished by walking the automaton.
+ ///
+ /// Implementations of `Automaton` in this crate "unspecialize" start
+ /// states when a prefilter is not active or enabled. In this case, it
+ /// is possible for `Automaton::is_special(sid)` to return false while
+ /// `Automaton::is_start(sid)` returns true.
+ ///
+ /// This has unspecified behavior when given an invalid state ID.
+ fn is_start(&self, sid: StateID) -> bool;
+
+ /// Returns the match semantics that this automaton was built with.
+ fn match_kind(&self) -> MatchKind;
+
+ /// Returns the total number of matches for the given state ID.
+ ///
+ /// This has unspecified behavior if the given ID does not refer to a match
+ /// state.
+ fn match_len(&self, sid: StateID) -> usize;
+
+ /// Returns the pattern ID for the match state given by `sid` at the
+ /// `index` given.
+ ///
+ /// Typically, `index` is only ever greater than `0` when implementing an
+ /// overlapping search. Otherwise, it's likely that your search only cares
+ /// about reporting the first pattern ID in a match state.
+ ///
+ /// This has unspecified behavior if the given ID does not refer to a match
+ /// state, or if the index is greater than or equal to the total number of
+ /// matches in this match state.
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID;
+
+ /// Returns the total number of patterns compiled into this automaton.
+ fn patterns_len(&self) -> usize;
+
+ /// Returns the length of the pattern for the given ID.
+ ///
+ /// This has unspecified behavior when given an invalid pattern
+ /// ID. A pattern ID is valid if and only if it is less than
+ /// `Automaton::patterns_len`.
+ fn pattern_len(&self, pid: PatternID) -> usize;
+
+ /// Returns the length, in bytes, of the shortest pattern in this
+ /// automaton.
+ fn min_pattern_len(&self) -> usize;
+
+ /// Returns the length, in bytes, of the longest pattern in this automaton.
+ fn max_pattern_len(&self) -> usize;
+
+ /// Returns the heap memory usage, in bytes, used by this automaton.
+ fn memory_usage(&self) -> usize;
+
+ /// Returns a prefilter, if available, that can be used to accelerate
+ /// searches for this automaton.
+ ///
+ /// The typical way this is used is when the start state is entered during
+ /// a search. When that happens, one can use a prefilter to skip ahead and
+ /// look for candidate matches without having to walk the automaton on the
+ /// bytes between candidates.
+ ///
+ /// Typically a prefilter is only available when there are a small (<100)
+ /// number of patterns built into the automaton.
+ fn prefilter(&self) -> Option<&Prefilter>;
+
+ /// Executes a non-overlapping search with this automaton using the given
+ /// configuration.
+ ///
+ /// See
+ /// [`AhoCorasick::try_find`](crate::AhoCorasick::try_find)
+ /// for more documentation and examples.
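+ ///
+ /// # Example
+ ///
+ /// A small sketch of calling this directly on one of the concrete
+ /// automatons in this crate (a noncontiguous NFA with the default
+ /// standard match semantics):
+ ///
+ /// ```
+ /// use aho_corasick::{
+ ///     automaton::Automaton, nfa::noncontiguous::NFA, Input, Match,
+ /// };
+ ///
+ /// let nfa = NFA::new(&["samwise", "sam"]).unwrap();
+ /// let mat = nfa.try_find(&Input::new("samwise"))?;
+ /// assert_eq!(Some(Match::must(1, 0..3)), mat);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```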
+ fn try_find(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
+ try_find_fwd(&self, input)
+ }
+
+ /// Executes an overlapping search with this automaton using the given
+ /// configuration.
+ ///
+ /// See
+ /// [`AhoCorasick::try_find_overlapping`](crate::AhoCorasick::try_find_overlapping)
+ /// for more documentation and examples.
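+ ///
+ /// # Example
+ ///
+ /// A rough sketch of driving an overlapping search by hand with a
+ /// noncontiguous NFA; the pattern set here is purely illustrative:
+ ///
+ /// ```
+ /// use aho_corasick::{
+ ///     automaton::{Automaton, OverlappingState},
+ ///     nfa::noncontiguous::NFA,
+ ///     Input,
+ /// };
+ ///
+ /// let nfa = NFA::new(&["app", "append"]).unwrap();
+ /// let input = Input::new("append");
+ /// let mut state = OverlappingState::start();
+ /// let mut count = 0;
+ /// loop {
+ ///     nfa.try_find_overlapping(&input, &mut state)?;
+ ///     if state.get_match().is_none() {
+ ///         break;
+ ///     }
+ ///     count += 1;
+ /// }
+ /// // "app" at 0..3 and "append" at 0..6 both match.
+ /// assert_eq!(2, count);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```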
+ fn try_find_overlapping(
+ &self,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ try_find_overlapping_fwd(&self, input, state)
+ }
+
+ /// Returns an iterator of non-overlapping matches with this automaton
+ /// using the given configuration.
+ ///
+ /// See
+ /// [`AhoCorasick::try_find_iter`](crate::AhoCorasick::try_find_iter)
+ /// for more documentation and examples.
+ fn try_find_iter<'a, 'h>(
+ &'a self,
+ input: Input<'h>,
+ ) -> Result<FindIter<'a, 'h, Self>, MatchError>
+ where
+ Self: Sized,
+ {
+ FindIter::new(self, input)
+ }
+
+ /// Returns an iterator of overlapping matches with this automaton
+ /// using the given configuration.
+ ///
+ /// See
+ /// [`AhoCorasick::try_find_overlapping_iter`](crate::AhoCorasick::try_find_overlapping_iter)
+ /// for more documentation and examples.
+ fn try_find_overlapping_iter<'a, 'h>(
+ &'a self,
+ input: Input<'h>,
+ ) -> Result<FindOverlappingIter<'a, 'h, Self>, MatchError>
+ where
+ Self: Sized,
+ {
+ if !self.match_kind().is_standard() {
+ return Err(MatchError::unsupported_overlapping(
+ self.match_kind(),
+ ));
+ }
+ // We might consider lifting this restriction. The reason why I added
+ // it was to ban the combination of "anchored search" and "overlapping
+ // iteration." The match semantics aren't totally clear in that case.
+ // Should we allow *any* matches that are adjacent to *any* previous
+ // match? Or only following the most recent one? Or only matches
+ // that start at the beginning of the search? We might also elect to
+ // just keep this restriction in place, as callers should be able to
+ // implement it themselves if they want to.
+ if input.get_anchored().is_anchored() {
+ return Err(MatchError::invalid_input_anchored());
+ }
+ let _ = self.start_state(input.get_anchored())?;
+ let state = OverlappingState::start();
+ Ok(FindOverlappingIter { aut: self, input, state })
+ }
+
+ /// Replaces all non-overlapping matches in `haystack` with
+ /// strings from `replace_with` depending on the pattern that
+ /// matched. The `replace_with` slice must have length equal to
+ /// `Automaton::patterns_len`.
+ ///
+ /// See
+ /// [`AhoCorasick::try_replace_all`](crate::AhoCorasick::try_replace_all)
+ /// for more documentation and examples.
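+ ///
+ /// # Example
+ ///
+ /// A small sketch using a noncontiguous NFA directly; the patterns and
+ /// replacements are illustrative only:
+ ///
+ /// ```
+ /// use aho_corasick::{automaton::Automaton, nfa::noncontiguous::NFA};
+ ///
+ /// let nfa = NFA::new(&["fox", "dog"]).unwrap();
+ /// let result = nfa.try_replace_all("the quick brown fox", &["cat", "wolf"])?;
+ /// assert_eq!("the quick brown cat", result);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```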
+ fn try_replace_all<B>(
+ &self,
+ haystack: &str,
+ replace_with: &[B],
+ ) -> Result<String, MatchError>
+ where
+ Self: Sized,
+ B: AsRef<str>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.patterns_len(),
+ "replace_all requires a replacement for every pattern \
+ in the automaton"
+ );
+ let mut dst = String::with_capacity(haystack.len());
+ self.try_replace_all_with(haystack, &mut dst, |mat, _, dst| {
+ dst.push_str(replace_with[mat.pattern()].as_ref());
+ true
+ })?;
+ Ok(dst)
+ }
+
+ /// Replaces all non-overlapping matches in `haystack` with
+ /// strings from `replace_with` depending on the pattern that
+ /// matched. The `replace_with` slice must have length equal to
+ /// `Automaton::patterns_len`.
+ ///
+ /// See
+ /// [`AhoCorasick::try_replace_all_bytes`](crate::AhoCorasick::try_replace_all_bytes)
+ /// for more documentation and examples.
+ fn try_replace_all_bytes<B>(
+ &self,
+ haystack: &[u8],
+ replace_with: &[B],
+ ) -> Result<Vec<u8>, MatchError>
+ where
+ Self: Sized,
+ B: AsRef<[u8]>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.patterns_len(),
+ "replace_all requires a replacement for every pattern \
+ in the automaton"
+ );
+ let mut dst = Vec::with_capacity(haystack.len());
+ self.try_replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
+ dst.extend(replace_with[mat.pattern()].as_ref());
+ true
+ })?;
+ Ok(dst)
+ }
+
+ /// Replaces all non-overlapping matches in `haystack` by calling the
+ /// `replace_with` closure given.
+ ///
+ /// See
+ /// [`AhoCorasick::try_replace_all_with`](crate::AhoCorasick::try_replace_all_with)
+ /// for more documentation and examples.
+ fn try_replace_all_with<F>(
+ &self,
+ haystack: &str,
+ dst: &mut String,
+ mut replace_with: F,
+ ) -> Result<(), MatchError>
+ where
+ Self: Sized,
+ F: FnMut(&Match, &str, &mut String) -> bool,
+ {
+ let mut last_match = 0;
+ for m in self.try_find_iter(Input::new(haystack))? {
+ // Since there are no restrictions on what kinds of patterns are
+ // in an Aho-Corasick automaton, we might get matches that split
+ // a codepoint, or even matches of a partial codepoint. When that
+ // happens, we just skip the match.
+ if !haystack.is_char_boundary(m.start())
+ || !haystack.is_char_boundary(m.end())
+ {
+ continue;
+ }
+ dst.push_str(&haystack[last_match..m.start()]);
+ last_match = m.end();
+ if !replace_with(&m, &haystack[m.start()..m.end()], dst) {
+ break;
+ };
+ }
+ dst.push_str(&haystack[last_match..]);
+ Ok(())
+ }
+
+ /// Replaces all non-overlapping matches in `haystack` by calling the
+ /// `replace_with` closure given.
+ ///
+ /// See
+ /// [`AhoCorasick::try_replace_all_with_bytes`](crate::AhoCorasick::try_replace_all_with_bytes)
+ /// for more documentation and examples.
+ fn try_replace_all_with_bytes<F>(
+ &self,
+ haystack: &[u8],
+ dst: &mut Vec<u8>,
+ mut replace_with: F,
+ ) -> Result<(), MatchError>
+ where
+ Self: Sized,
+ F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+ {
+ let mut last_match = 0;
+ for m in self.try_find_iter(Input::new(haystack))? {
+ dst.extend(&haystack[last_match..m.start()]);
+ last_match = m.end();
+ if !replace_with(&m, &haystack[m.start()..m.end()], dst) {
+ break;
+ };
+ }
+ dst.extend(&haystack[last_match..]);
+ Ok(())
+ }
+
+ /// Returns an iterator of non-overlapping matches with this automaton
+ /// from the stream given.
+ ///
+ /// See
+ /// [`AhoCorasick::try_stream_find_iter`](crate::AhoCorasick::try_stream_find_iter)
+ /// for more documentation and examples.
+ #[cfg(feature = "std")]
+ fn try_stream_find_iter<'a, R: std::io::Read>(
+ &'a self,
+ rdr: R,
+ ) -> Result<StreamFindIter<'a, Self, R>, MatchError>
+ where
+ Self: Sized,
+ {
+ Ok(StreamFindIter { it: StreamChunkIter::new(self, rdr)? })
+ }
+
+ /// Replaces all non-overlapping matches in `rdr` with strings from
+ /// `replace_with` depending on the pattern that matched, and writes the
+ /// result to `wtr`. The `replace_with` slice must have length equal to
+ /// `Automaton::patterns_len`.
+ ///
+ /// See
+ /// [`AhoCorasick::try_stream_replace_all`](crate::AhoCorasick::try_stream_replace_all)
+ /// for more documentation and examples.
+ #[cfg(feature = "std")]
+ fn try_stream_replace_all<R, W, B>(
+ &self,
+ rdr: R,
+ wtr: W,
+ replace_with: &[B],
+ ) -> std::io::Result<()>
+ where
+ Self: Sized,
+ R: std::io::Read,
+ W: std::io::Write,
+ B: AsRef<[u8]>,
+ {
+ assert_eq!(
+ replace_with.len(),
+ self.patterns_len(),
+ "streaming replace_all requires a replacement for every pattern \
+ in the automaton",
+ );
+ self.try_stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
+ wtr.write_all(replace_with[mat.pattern()].as_ref())
+ })
+ }
+
+ /// Replaces all non-overlapping matches in `rdr` by calling the
+ /// `replace_with` closure given and writing the result to `wtr`.
+ ///
+ /// See
+ /// [`AhoCorasick::try_stream_replace_all_with`](crate::AhoCorasick::try_stream_replace_all_with)
+ /// for more documentation and examples.
+ #[cfg(feature = "std")]
+ fn try_stream_replace_all_with<R, W, F>(
+ &self,
+ rdr: R,
+ mut wtr: W,
+ mut replace_with: F,
+ ) -> std::io::Result<()>
+ where
+ Self: Sized,
+ R: std::io::Read,
+ W: std::io::Write,
+ F: FnMut(&Match, &[u8], &mut W) -> std::io::Result<()>,
+ {
+ let mut it = StreamChunkIter::new(self, rdr).map_err(|e| {
+ let kind = std::io::ErrorKind::Other;
+ std::io::Error::new(kind, e)
+ })?;
+ while let Some(result) = it.next() {
+ let chunk = result?;
+ match chunk {
+ StreamChunk::NonMatch { bytes, .. } => {
+ wtr.write_all(bytes)?;
+ }
+ StreamChunk::Match { bytes, mat } => {
+ replace_with(&mat, bytes, &mut wtr)?;
+ }
+ }
+ }
+ Ok(())
+ }
+}
+
+// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits
+// its safety properties.
+unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
+ #[inline(always)]
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
+ (**self).start_state(anchored)
+ }
+
+ #[inline(always)]
+ fn next_state(
+ &self,
+ anchored: Anchored,
+ sid: StateID,
+ byte: u8,
+ ) -> StateID {
+ (**self).next_state(anchored, sid, byte)
+ }
+
+ #[inline(always)]
+ fn is_special(&self, sid: StateID) -> bool {
+ (**self).is_special(sid)
+ }
+
+ #[inline(always)]
+ fn is_dead(&self, sid: StateID) -> bool {
+ (**self).is_dead(sid)
+ }
+
+ #[inline(always)]
+ fn is_match(&self, sid: StateID) -> bool {
+ (**self).is_match(sid)
+ }
+
+ #[inline(always)]
+ fn is_start(&self, sid: StateID) -> bool {
+ (**self).is_start(sid)
+ }
+
+ #[inline(always)]
+ fn match_kind(&self) -> MatchKind {
+ (**self).match_kind()
+ }
+
+ #[inline(always)]
+ fn match_len(&self, sid: StateID) -> usize {
+ (**self).match_len(sid)
+ }
+
+ #[inline(always)]
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
+ (**self).match_pattern(sid, index)
+ }
+
+ #[inline(always)]
+ fn patterns_len(&self) -> usize {
+ (**self).patterns_len()
+ }
+
+ #[inline(always)]
+ fn pattern_len(&self, pid: PatternID) -> usize {
+ (**self).pattern_len(pid)
+ }
+
+ #[inline(always)]
+ fn min_pattern_len(&self) -> usize {
+ (**self).min_pattern_len()
+ }
+
+ #[inline(always)]
+ fn max_pattern_len(&self) -> usize {
+ (**self).max_pattern_len()
+ }
+
+ #[inline(always)]
+ fn memory_usage(&self) -> usize {
+ (**self).memory_usage()
+ }
+
+ #[inline(always)]
+ fn prefilter(&self) -> Option<&Prefilter> {
+ (**self).prefilter()
+ }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in and the current offset of the search
+/// in the haystack.
+///
+/// This type provides limited introspection capabilities. The only thing a
+/// caller can do is construct it and pass it around to permit search routines
+/// to use it to track state, and to ask whether a match has been found.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. That same state
+/// should be reused for subsequent searches on the same `Input`. The state
+/// given will advance through the haystack itself. Callers can detect the end
+/// of a search when neither an error nor a match is returned.
+///
+/// # Example
+///
+/// This example shows how to manually iterate over all overlapping matches. If
+/// you need this, you might consider using
+/// [`AhoCorasick::find_overlapping_iter`](crate::AhoCorasick::find_overlapping_iter)
+/// instead, but this shows how to correctly use an `OverlappingState`.
+///
+/// ```
+/// use aho_corasick::{
+/// automaton::OverlappingState,
+/// AhoCorasick, Input, Match,
+/// };
+///
+/// let patterns = &["append", "appendage", "app"];
+/// let haystack = "append the app to the appendage";
+///
+/// let ac = AhoCorasick::new(patterns).unwrap();
+/// let mut state = OverlappingState::start();
+/// let mut matches = vec![];
+///
+/// loop {
+/// ac.find_overlapping(haystack, &mut state);
+/// let mat = match state.get_match() {
+/// None => break,
+/// Some(mat) => mat,
+/// };
+/// matches.push(mat);
+/// }
+/// let expected = vec![
+/// Match::must(2, 0..3),
+/// Match::must(0, 0..6),
+/// Match::must(2, 11..14),
+/// Match::must(2, 22..25),
+/// Match::must(0, 22..28),
+/// Match::must(1, 22..31),
+/// ];
+/// assert_eq!(expected, matches);
+/// ```
+#[derive(Clone, Debug)]
+pub struct OverlappingState {
+ /// The match reported by the most recent overlapping search to use this
+ /// state.
+ ///
+ /// If a search does not find any matches, then it is expected to clear
+ /// this value.
+ mat: Option<Match>,
+ /// The state ID of the state at which the search was in when the call
+ /// terminated. When this is a match state, `last_match` must be set to a
+ /// non-None value.
+ ///
+ /// A `None` value indicates the start state of the corresponding
+ /// automaton. We cannot use the actual ID, since any one automaton may
+ /// have many start states, and which one is in use depends on search-time
+ /// factors (such as whether the search is anchored or not).
+ id: Option<StateID>,
+ /// The position of the search.
+ ///
+ /// When `id` is None (i.e., we are starting a search), this is set to
+ /// the beginning of the search as given by the caller regardless of its
+ /// current value. Subsequent calls to an overlapping search pick up at
+ /// this offset.
+ at: usize,
+ /// The index into the matching patterns of the next match to report if the
+ /// current state is a match state. Note that this may be 1 greater than
+ /// the total number of matches to report for the current match state. (In
+ /// which case, no more matches should be reported at the current position
+ /// and the search should advance to the next position.)
+ next_match_index: Option<usize>,
+}
+
+impl OverlappingState {
+ /// Create a new overlapping state that begins at the start state.
+ pub fn start() -> OverlappingState {
+ OverlappingState { mat: None, id: None, at: 0, next_match_index: None }
+ }
+
+ /// Return the match result of the most recent search to execute with this
+ /// state.
+ ///
+ /// Every search will clear this result automatically, such that if no
+ /// match is found, this will always correctly report `None`.
+ pub fn get_match(&self) -> Option<Match> {
+ self.mat
+ }
+}
+
+/// An iterator of non-overlapping matches in a particular haystack.
+///
+/// This iterator yields matches according to the [`MatchKind`] used by this
+/// automaton.
+///
+/// This iterator is constructed via the [`Automaton::try_find_iter`] method.
+///
+/// The type variable `A` refers to the implementation of the [`Automaton`]
+/// trait used to execute the search.
+///
+/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
+/// implementation.
+///
+/// The lifetime `'h` refers to the lifetime of the haystack being searched.
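+///
+/// # Example
+///
+/// An illustrative sketch (the patterns and haystack here are made up) of
+/// driving this iterator through the lower level [`Automaton`] API with a
+/// [`DFA`](crate::dfa::DFA):
+///
+/// ```
+/// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match};
+///
+/// let dfa = DFA::new(&["foo", "bar"]).unwrap();
+/// let matches: Vec<Match> =
+///     dfa.try_find_iter(Input::new("foo bar foo"))?.collect();
+/// assert_eq!(matches, vec![
+///     Match::must(0, 0..3),
+///     Match::must(1, 4..7),
+///     Match::must(0, 8..11),
+/// ]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```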
+#[derive(Debug)]
+pub struct FindIter<'a, 'h, A> {
+ /// The automaton used to drive the search.
+ aut: &'a A,
+ /// The input parameters to give to each search call.
+ ///
+ /// The start position of the search is mutated during iteration.
+ input: Input<'h>,
+ /// Records the end offset of the most recent match. This is necessary to
+ /// handle a corner case for preventing empty matches from overlapping with
+ /// the ending bounds of a prior match.
+ last_match_end: Option<usize>,
+}
+
+impl<'a, 'h, A: Automaton> FindIter<'a, 'h, A> {
+ /// Creates a new non-overlapping iterator. If the given automaton would
+ /// return an error on a search with the given input configuration, then
+ /// that error is returned here.
+ fn new(
+ aut: &'a A,
+ input: Input<'h>,
+ ) -> Result<FindIter<'a, 'h, A>, MatchError> {
+        // The only way this search can fail is if we cannot retrieve the
+        // start state, e.g., asking for an anchored search when only
+        // unanchored searches are supported.
+ let _ = aut.start_state(input.get_anchored())?;
+ Ok(FindIter { aut, input, last_match_end: None })
+ }
+
+ /// Executes a search and returns a match if one is found.
+ ///
+ /// This does not advance the input forward. It just executes a search
+ /// based on the current configuration/offsets.
+ fn search(&self) -> Option<Match> {
+ // The unwrap is OK here because we check at iterator construction time
+ // that no subsequent search call (using the same configuration) will
+ // ever return an error.
+ self.aut
+ .try_find(&self.input)
+ .expect("already checked that no match error can occur")
+ }
+
+ /// Handles the special case of an empty match by ensuring that 1) the
+ /// iterator always advances and 2) empty matches never overlap with other
+ /// matches.
+ ///
+ /// (1) is necessary because we principally make progress by setting the
+ /// starting location of the next search to the ending location of the last
+ /// match. But if a match is empty, then this results in a search that does
+ /// not advance and thus does not terminate.
+ ///
+ /// (2) is not strictly necessary, but makes intuitive sense and matches
+    /// the prevailing behavior of most general purpose regex engines.
+ /// (Obviously this crate isn't a regex engine, but we choose to match
+ /// their semantics.) The "intuitive sense" here is that we want to report
+ /// NON-overlapping matches. So for example, given the patterns 'a' and
+ /// '' (an empty string) against the haystack 'a', without the special
+ /// handling, you'd get the matches [0, 1) and [1, 1), where the latter
+ /// overlaps with the end bounds of the former.
+ ///
+ /// Note that we mark this cold and forcefully prevent inlining because
+ /// handling empty matches like this is extremely rare and does require
+ /// quite a bit of code, comparatively. Keeping this code out of the main
+ /// iterator function keeps it smaller and more amenable to inlining
+ /// itself.
+ #[cold]
+ #[inline(never)]
+ fn handle_overlapping_empty_match(
+ &mut self,
+ mut m: Match,
+ ) -> Option<Match> {
+ assert!(m.is_empty());
+ if Some(m.end()) == self.last_match_end {
+ self.input.set_start(self.input.start().checked_add(1).unwrap());
+ m = self.search()?;
+ }
+ Some(m)
+ }
+}
+
+impl<'a, 'h, A: Automaton> Iterator for FindIter<'a, 'h, A> {
+ type Item = Match;
+
+ #[inline(always)]
+ fn next(&mut self) -> Option<Match> {
+ let mut m = self.search()?;
+ if m.is_empty() {
+ m = self.handle_overlapping_empty_match(m)?;
+ }
+ self.input.set_start(m.end());
+ self.last_match_end = Some(m.end());
+ Some(m)
+ }
+}
+
+/// An iterator of overlapping matches in a particular haystack.
+///
+/// This iterator will report all possible matches in a particular haystack,
+/// even when the matches overlap.
+///
+/// This iterator is constructed via the
+/// [`Automaton::try_find_overlapping_iter`] method.
+///
+/// The type variable `A` refers to the implementation of the [`Automaton`]
+/// trait used to execute the search.
+///
+/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
+/// implementation.
+///
+/// The lifetime `'h` refers to the lifetime of the haystack being searched.
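+///
+/// # Example
+///
+/// An illustrative sketch (the patterns and haystack are made up) using
+/// [`Automaton::try_find_overlapping_iter`] directly; the higher level
+/// [`AhoCorasick::find_overlapping_iter`](crate::AhoCorasick::find_overlapping_iter)
+/// is usually more convenient:
+///
+/// ```
+/// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match};
+///
+/// let dfa = DFA::new(&["abcd", "b", "bc"]).unwrap();
+/// let matches: Vec<Match> = dfa
+///     .try_find_overlapping_iter(Input::new("abcd"))?
+///     .collect();
+/// assert_eq!(matches, vec![
+///     Match::must(1, 1..2),
+///     Match::must(2, 1..3),
+///     Match::must(0, 0..4),
+/// ]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```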
+#[derive(Debug)]
+pub struct FindOverlappingIter<'a, 'h, A> {
+ aut: &'a A,
+ input: Input<'h>,
+ state: OverlappingState,
+}
+
+impl<'a, 'h, A: Automaton> Iterator for FindOverlappingIter<'a, 'h, A> {
+ type Item = Match;
+
+ #[inline(always)]
+ fn next(&mut self) -> Option<Match> {
+ self.aut
+ .try_find_overlapping(&self.input, &mut self.state)
+ .expect("already checked that no match error can occur here");
+ self.state.get_match()
+ }
+}
+
+/// An iterator that reports matches in a stream.
+///
+/// This iterator yields elements of type `io::Result<Match>`, where an error
+/// is reported if there was a problem reading from the underlying stream.
+/// The iterator terminates only when the underlying stream reaches `EOF`.
+///
+/// This iterator is constructed via the [`Automaton::try_stream_find_iter`]
+/// method.
+///
+/// The type variable `A` refers to the implementation of the [`Automaton`]
+/// trait used to execute the search.
+///
+/// The type variable `R` refers to the `io::Read` stream that is being read
+/// from.
+///
+/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
+/// implementation.
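+///
+/// # Example
+///
+/// An illustrative sketch (the pattern and the in-memory "stream" are made
+/// up); any `std::io::Read` implementation may be used in place of the byte
+/// slice:
+///
+/// ```
+/// use aho_corasick::{automaton::Automaton, dfa::DFA, Match};
+///
+/// let dfa = DFA::new(&["foo"]).unwrap();
+/// let rdr = "foo bar foo".as_bytes();
+/// let mut matches = vec![];
+/// for result in dfa.try_stream_find_iter(rdr)? {
+///     matches.push(result?);
+/// }
+/// assert_eq!(matches, vec![Match::must(0, 0..3), Match::must(0, 8..11)]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```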
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub struct StreamFindIter<'a, A, R> {
+ it: StreamChunkIter<'a, A, R>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, A: Automaton, R: std::io::Read> Iterator
+ for StreamFindIter<'a, A, R>
+{
+ type Item = std::io::Result<Match>;
+
+ fn next(&mut self) -> Option<std::io::Result<Match>> {
+ loop {
+ match self.it.next() {
+ None => return None,
+ Some(Err(err)) => return Some(Err(err)),
+ Some(Ok(StreamChunk::NonMatch { .. })) => {}
+ Some(Ok(StreamChunk::Match { mat, .. })) => {
+ return Some(Ok(mat));
+ }
+ }
+ }
+ }
+}
+
+/// An iterator that reports matches in a stream.
+///
+/// (This doesn't actually implement the `Iterator` trait because it returns
+/// something with a lifetime attached to a buffer it owns, but that's OK. It
+/// still has a `next` method and is iterator-like enough to be fine.)
+///
+/// This iterator yields elements of type `io::Result<StreamChunk>`, where
+/// an error is reported if there was a problem reading from the underlying
+/// stream. The iterator terminates only when the underlying stream reaches
+/// `EOF`.
+///
+/// The idea here is that each chunk represents either a match or a non-match,
+/// and if you concatenated all of the chunks together, you'd reproduce the
+/// entire contents of the stream, byte-for-byte.
+///
+/// This chunk machinery is a bit complicated and it isn't strictly required
+/// for a stream searcher that just reports matches. But we do need something
+/// like this to deal with the "replacement" API, which needs to know which
+/// chunks it can copy and which it needs to replace.
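+///
+/// As an illustrative sketch: searching the stream `xfooy` for the single
+/// pattern `foo` yields three chunks in order: a `NonMatch` for `x`, a
+/// `Match` for `foo` and a `NonMatch` for `y`. Concatenating the chunk
+/// bytes reproduces `xfooy` exactly.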
+#[cfg(feature = "std")]
+#[derive(Debug)]
+struct StreamChunkIter<'a, A, R> {
+ /// The underlying automaton to do the search.
+ aut: &'a A,
+ /// The source of bytes we read from.
+ rdr: R,
+ /// A roll buffer for managing bytes from `rdr`. Basically, this is used
+ /// to handle the case of a match that is split by two different
+ /// calls to `rdr.read()`. This isn't strictly needed if all we needed to
+ /// do was report matches, but here we are reporting chunks of non-matches
+ /// and matches and in order to do that, we really just cannot treat our
+ /// stream as non-overlapping blocks of bytes. We need to permit some
+ /// overlap while we retain bytes from a previous `read` call in memory.
+ buf: crate::util::buffer::Buffer,
+ /// The unanchored starting state of this automaton.
+ start: StateID,
+ /// The state of the automaton.
+ sid: StateID,
+ /// The absolute position over the entire stream.
+ absolute_pos: usize,
+ /// The position we're currently at within `buf`.
+ buffer_pos: usize,
+ /// The buffer position of the end of the bytes that we last returned
+ /// to the caller. Basically, whenever we find a match, we look to see if
+ /// there is a difference between where the match started and the position
+ /// of the last byte we returned to the caller. If there's a difference,
+ /// then we need to return a 'NonMatch' chunk.
+ buffer_reported_pos: usize,
+}
+
+#[cfg(feature = "std")]
+impl<'a, A: Automaton, R: std::io::Read> StreamChunkIter<'a, A, R> {
+ fn new(
+ aut: &'a A,
+ rdr: R,
+ ) -> Result<StreamChunkIter<'a, A, R>, MatchError> {
+ // This restriction is a carry-over from older versions of this crate.
+ // I didn't have the bandwidth to think through how to handle, say,
+ // leftmost-first or leftmost-longest matching, but... it should be
+ // possible? The main problem is that once you see a match state in
+ // leftmost-first semantics, you can't just stop at that point and
+ // report a match. You have to keep going until you either hit a dead
+ // state or EOF. So how do you know when you'll hit a dead state? Well,
+ // you don't. With Aho-Corasick, I believe you can put a bound on it
+ // and say, "once a match has been seen, you'll need to scan forward at
+ // most N bytes" where N=aut.max_pattern_len().
+ //
+ // Which is fine, but it does mean that state about whether we're still
+ // looking for a dead state or not needs to persist across buffer
+ // refills. Which this code doesn't really handle. It does preserve
+ // *some* state across buffer refills, basically ensuring that a match
+ // span is always in memory.
+ if !aut.match_kind().is_standard() {
+ return Err(MatchError::unsupported_stream(aut.match_kind()));
+ }
+ // This is kind of a cop-out, but empty matches are SUPER annoying.
+ // If we know they can't happen (which is what we enforce here), then
+ // it makes a lot of logic much simpler. With that said, I'm open to
+ // supporting this case, but we need to define proper semantics for it
+ // first. It wasn't totally clear to me what it should do at the time
+ // of writing, so I decided to just be conservative.
+ //
+ // It also seems like a very weird case to support anyway. Why search a
+ // stream if you're just going to get a match at every position?
+ //
+ // ¯\_(ツ)_/¯
+ if aut.min_pattern_len() == 0 {
+ return Err(MatchError::unsupported_empty());
+ }
+ let start = aut.start_state(Anchored::No)?;
+ Ok(StreamChunkIter {
+ aut,
+ rdr,
+ buf: crate::util::buffer::Buffer::new(aut.max_pattern_len()),
+ start,
+ sid: start,
+ absolute_pos: 0,
+ buffer_pos: 0,
+ buffer_reported_pos: 0,
+ })
+ }
+
+ fn next(&mut self) -> Option<std::io::Result<StreamChunk>> {
+ // This code is pretty gnarly. It IS simpler than the equivalent code
+ // in the previous aho-corasick release, in part because we inline
+ // automaton traversal here and also in part because we have abdicated
+ // support for automatons that contain an empty pattern.
+ //
+ // I suspect this code could be made a bit simpler by designing a
+ // better buffer abstraction.
+ //
+ // But in general, this code is basically write-only. So you'll need
+ // to go through it step-by-step to grok it. One of the key bits of
+ // complexity is tracking a few different offsets. 'buffer_pos' is
+ // where we are in the buffer for search. 'buffer_reported_pos' is the
+ // position immediately following the last byte in the buffer that
+ // we've returned to the caller. And 'absolute_pos' is the overall
+ // current absolute position of the search in the entire stream, and
+ // this is what match spans are reported in terms of.
+ loop {
+ if self.aut.is_match(self.sid) {
+ let mat = self.get_match();
+ if let Some(r) = self.get_non_match_chunk(mat) {
+ self.buffer_reported_pos += r.len();
+ let bytes = &self.buf.buffer()[r];
+ return Some(Ok(StreamChunk::NonMatch { bytes }));
+ }
+ self.sid = self.start;
+ let r = self.get_match_chunk(mat);
+ self.buffer_reported_pos += r.len();
+ let bytes = &self.buf.buffer()[r];
+ return Some(Ok(StreamChunk::Match { bytes, mat }));
+ }
+ if self.buffer_pos >= self.buf.buffer().len() {
+ if let Some(r) = self.get_pre_roll_non_match_chunk() {
+ self.buffer_reported_pos += r.len();
+ let bytes = &self.buf.buffer()[r];
+ return Some(Ok(StreamChunk::NonMatch { bytes }));
+ }
+ if self.buf.buffer().len() >= self.buf.min_buffer_len() {
+ self.buffer_pos = self.buf.min_buffer_len();
+ self.buffer_reported_pos -=
+ self.buf.buffer().len() - self.buf.min_buffer_len();
+ self.buf.roll();
+ }
+ match self.buf.fill(&mut self.rdr) {
+ Err(err) => return Some(Err(err)),
+ Ok(true) => {}
+ Ok(false) => {
+ // We've hit EOF, but if there are still some
+ // unreported bytes remaining, return them now.
+ if let Some(r) = self.get_eof_non_match_chunk() {
+ self.buffer_reported_pos += r.len();
+ let bytes = &self.buf.buffer()[r];
+ return Some(Ok(StreamChunk::NonMatch { bytes }));
+ }
+ // We've reported everything!
+ return None;
+ }
+ }
+ }
+ let start = self.absolute_pos;
+ for &byte in self.buf.buffer()[self.buffer_pos..].iter() {
+ self.sid = self.aut.next_state(Anchored::No, self.sid, byte);
+ self.absolute_pos += 1;
+ if self.aut.is_match(self.sid) {
+ break;
+ }
+ }
+ self.buffer_pos += self.absolute_pos - start;
+ }
+ }
+
+ /// Return a match chunk for the given match. It is assumed that the match
+ /// ends at the current `buffer_pos`.
+ fn get_match_chunk(&self, mat: Match) -> core::ops::Range<usize> {
+ let start = self.buffer_pos - mat.len();
+ let end = self.buffer_pos;
+ start..end
+ }
+
+ /// Return a non-match chunk, if necessary, just before reporting a match.
+ /// This returns `None` if there is nothing to report. Otherwise, this
+ /// assumes that the given match ends at the current `buffer_pos`.
+ fn get_non_match_chunk(
+ &self,
+ mat: Match,
+ ) -> Option<core::ops::Range<usize>> {
+ let buffer_mat_start = self.buffer_pos - mat.len();
+ if buffer_mat_start > self.buffer_reported_pos {
+ let start = self.buffer_reported_pos;
+ let end = buffer_mat_start;
+ return Some(start..end);
+ }
+ None
+ }
+
+ /// Look for any bytes that should be reported as a non-match just before
+ /// rolling the buffer.
+ ///
+ /// Note that this only reports bytes up to `buffer.len() -
+ /// min_buffer_len`, as it's not possible to know whether the bytes
+ /// following that will participate in a match or not.
+ fn get_pre_roll_non_match_chunk(&self) -> Option<core::ops::Range<usize>> {
+ let end =
+ self.buf.buffer().len().saturating_sub(self.buf.min_buffer_len());
+ if self.buffer_reported_pos < end {
+ return Some(self.buffer_reported_pos..end);
+ }
+ None
+ }
+
+ /// Return any unreported bytes as a non-match up to the end of the buffer.
+ ///
+ /// This should only be called when the entire contents of the buffer have
+ /// been searched and EOF has been hit when trying to fill the buffer.
+ fn get_eof_non_match_chunk(&self) -> Option<core::ops::Range<usize>> {
+ if self.buffer_reported_pos < self.buf.buffer().len() {
+ return Some(self.buffer_reported_pos..self.buf.buffer().len());
+ }
+ None
+ }
+
+ /// Return the match at the current position for the current state.
+ ///
+ /// This panics if `self.aut.is_match(self.sid)` isn't true.
+ fn get_match(&self) -> Match {
+ get_match(self.aut, self.sid, 0, self.absolute_pos)
+ }
+}
+
+/// A single chunk yielded by the stream chunk iterator.
+///
+/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+enum StreamChunk<'r> {
+ /// A chunk that does not contain any matches.
+ NonMatch { bytes: &'r [u8] },
+ /// A chunk that precisely contains a match.
+ Match { bytes: &'r [u8], mat: Match },
+}
+
+#[inline(never)]
+pub(crate) fn try_find_fwd<A: Automaton + ?Sized>(
+ aut: &A,
+ input: &Input<'_>,
+) -> Result<Option<Match>, MatchError> {
+ if input.is_done() {
+ return Ok(None);
+ }
+ let earliest = aut.match_kind().is_standard() || input.get_earliest();
+ if input.get_anchored().is_anchored() {
+ try_find_fwd_imp(aut, input, None, Anchored::Yes, earliest)
+ } else if let Some(pre) = aut.prefilter() {
+ if earliest {
+ try_find_fwd_imp(aut, input, Some(pre), Anchored::No, true)
+ } else {
+ try_find_fwd_imp(aut, input, Some(pre), Anchored::No, false)
+ }
+ } else {
+ if earliest {
+ try_find_fwd_imp(aut, input, None, Anchored::No, true)
+ } else {
+ try_find_fwd_imp(aut, input, None, Anchored::No, false)
+ }
+ }
+}
+
+#[inline(always)]
+fn try_find_fwd_imp<A: Automaton + ?Sized>(
+ aut: &A,
+ input: &Input<'_>,
+ pre: Option<&Prefilter>,
+ anchored: Anchored,
+ earliest: bool,
+) -> Result<Option<Match>, MatchError> {
+ let mut sid = aut.start_state(input.get_anchored())?;
+ let mut at = input.start();
+ let mut mat = None;
+ if aut.is_match(sid) {
+ mat = Some(get_match(aut, sid, 0, at));
+ if earliest {
+ return Ok(mat);
+ }
+ }
+ if let Some(pre) = pre {
+ match pre.find_in(input.haystack(), input.get_span()) {
+ Candidate::None => return Ok(None),
+ Candidate::Match(m) => return Ok(Some(m)),
+ Candidate::PossibleStartOfMatch(i) => {
+ at = i;
+ }
+ }
+ }
+ while at < input.end() {
+ // I've tried unrolling this loop and eliding bounds checks, but no
+ // matter what I did, I could not observe a consistent improvement on
+ // any benchmark I could devise. (If someone wants to re-litigate this,
+        // the way to do it is to add a 'next_state_unchecked' method to the
+        // 'Automaton' trait with a default impl that uses 'next_state'. Then
+        // use 'aut.next_state_unchecked' here and implement it on DFA using
+        // unchecked slice index access.)
+ sid = aut.next_state(anchored, sid, input.haystack()[at]);
+ if aut.is_special(sid) {
+ if aut.is_dead(sid) {
+ return Ok(mat);
+ } else if aut.is_match(sid) {
+ // We use 'at + 1' here because the match state is entered
+ // at the last byte of the pattern. Since we use half-open
+ // intervals, the end of the range of the match is one past the
+ // last byte.
+ let m = get_match(aut, sid, 0, at + 1);
+ // For the automata in this crate, we make a size trade off
+ // where we reuse the same automaton for both anchored and
+ // unanchored searches. We achieve this, principally, by simply
+ // not following failure transitions while computing the next
+ // state. Instead, if we fail to find the next state, we return
+ // a dead state, which instructs the search to stop. (This
+ // is why 'next_state' needs to know whether the search is
+ // anchored or not.) In addition, we have different start
+ // states for anchored and unanchored searches. The latter has
+                // a self-loop whereas the former does not.
+ //
+ // In this way, we can use the same trie to execute both
+ // anchored and unanchored searches. There is a catch though.
+ // When building an Aho-Corasick automaton for unanchored
+ // searches, we copy matches from match states to other states
+ // (which would otherwise not be match states) if they are
+ // reachable via a failure transition. In the case of an
+ // anchored search, we *specifically* do not want to report
+ // these matches because they represent matches that start past
+ // the beginning of the search.
+ //
+ // Now we could tweak the automaton somehow to differentiate
+ // anchored from unanchored match states, but this would make
+ // 'aut.is_match' and potentially 'aut.is_special' slower. And
+ // also make the automaton itself more complex.
+ //
+ // Instead, we insert a special hack: if the search is
+ // anchored, we simply ignore matches that don't begin at
+ // the start of the search. This is not quite ideal, but we
+ // do specialize this function in such a way that unanchored
+ // searches don't pay for this additional branch. While this
+ // might cause a search to continue on for more than it
+ // otherwise optimally would, it will be no more than the
+ // longest pattern in the automaton. The reason for this is
+ // that we ensure we don't follow failure transitions during
+ // an anchored search. Combined with using a different anchored
+ // starting state with no self-loop, we guarantee that we'll
+ // at worst move through a number of transitions equal to the
+ // longest pattern.
+ //
+ // Now for DFAs, the whole point of them is to eliminate
+ // failure transitions entirely. So there is no way to say "if
+ // it's an anchored search don't follow failure transitions."
+ // Instead, we actually have to build two entirely separate
+ // automatons into the transition table. One with failure
+ // transitions built into it and another that is effectively
+ // just an encoding of the base trie into a transition table.
+ // DFAs still need this check though, because the match states
+ // still carry matches only reachable via a failure transition.
+ // Why? Because removing them seems difficult, although I
+ // haven't given it a lot of thought.
+ if !(anchored.is_anchored() && m.start() > input.start()) {
+ mat = Some(m);
+ if earliest {
+ return Ok(mat);
+ }
+ }
+ } else if let Some(pre) = pre {
+ // If we're here, we know it's a special state that is not a
+ // dead or a match state AND that a prefilter is active. Thus,
+ // it must be a start state.
+ debug_assert!(aut.is_start(sid));
+ // We don't care about 'Candidate::Match' here because if such
+ // a match were possible, it would have been returned above
+ // when we run the prefilter before walking the automaton.
+ let span = Span::from(at..input.end());
+ match pre.find_in(input.haystack(), span).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ if i > at {
+ at = i;
+ continue;
+ }
+ }
+ }
+ } else {
+ // When pre.is_none(), then starting states should not be
+ // treated as special. That is, without a prefilter, is_special
+ // should only return true when the state is a dead or a match
+ // state.
+ //
+ // It is possible to execute a search without a prefilter even
+ // when the underlying searcher has one: an anchored search.
+ // But in this case, the automaton makes it impossible to move
+ // back to the start state by construction, and thus, we should
+ // never reach this branch.
+ debug_assert!(false, "unreachable");
+ }
+ }
+ at += 1;
+ }
+ Ok(mat)
+}
+
+#[inline(never)]
+fn try_find_overlapping_fwd<A: Automaton + ?Sized>(
+ aut: &A,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ // Searching with a pattern ID is always anchored, so we should only ever
+ // use a prefilter when no pattern ID is given.
+ if aut.prefilter().is_some() && !input.get_anchored().is_anchored() {
+ let pre = aut.prefilter().unwrap();
+ try_find_overlapping_fwd_imp(aut, input, Some(pre), state)
+ } else {
+ try_find_overlapping_fwd_imp(aut, input, None, state)
+ }
+}
+
+#[inline(always)]
+fn try_find_overlapping_fwd_imp<A: Automaton + ?Sized>(
+ aut: &A,
+ input: &Input<'_>,
+ pre: Option<&Prefilter>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ let mut sid = match state.id {
+ None => {
+ let sid = aut.start_state(input.get_anchored())?;
+ // Handle the case where the start state is a match state. That is,
+ // the empty string is in our automaton. We report every match we
+ // can here before moving on and updating 'state.at' and 'state.id'
+ // to find more matches in other parts of the haystack.
+ if aut.is_match(sid) {
+ let i = state.next_match_index.unwrap_or(0);
+ let len = aut.match_len(sid);
+ if i < len {
+ state.next_match_index = Some(i + 1);
+ state.mat = Some(get_match(aut, sid, i, input.start()));
+ return Ok(());
+ }
+ }
+ state.at = input.start();
+ state.id = Some(sid);
+ state.next_match_index = None;
+ state.mat = None;
+ sid
+ }
+ Some(sid) => {
+ // If we still have matches left to report in this state then
+ // report them until we've exhausted them. Only after that do we
+ // advance to the next offset in the haystack.
+ if let Some(i) = state.next_match_index {
+ let len = aut.match_len(sid);
+ if i < len {
+ state.next_match_index = Some(i + 1);
+ state.mat = Some(get_match(aut, sid, i, state.at + 1));
+ return Ok(());
+ }
+ // Once we've reported all matches at a given position, we need
+ // to advance the search to the next position.
+ state.at += 1;
+ state.next_match_index = None;
+ state.mat = None;
+ }
+ sid
+ }
+ };
+ while state.at < input.end() {
+ sid = aut.next_state(
+ input.get_anchored(),
+ sid,
+ input.haystack()[state.at],
+ );
+ if aut.is_special(sid) {
+ state.id = Some(sid);
+ if aut.is_dead(sid) {
+ return Ok(());
+ } else if aut.is_match(sid) {
+ state.next_match_index = Some(1);
+ state.mat = Some(get_match(aut, sid, 0, state.at + 1));
+ return Ok(());
+ } else if let Some(pre) = pre {
+ // If we're here, we know it's a special state that is not a
+ // dead or a match state AND that a prefilter is active. Thus,
+ // it must be a start state.
+ debug_assert!(aut.is_start(sid));
+ let span = Span::from(state.at..input.end());
+ match pre.find_in(input.haystack(), span).into_option() {
+ None => return Ok(()),
+ Some(i) => {
+ if i > state.at {
+ state.at = i;
+ continue;
+ }
+ }
+ }
+ } else {
+ // When pre.is_none(), then starting states should not be
+ // treated as special. That is, without a prefilter, is_special
+ // should only return true when the state is a dead or a match
+ // state.
+ //
+ // ... except for one special case: in stream searching, we
+ // currently call overlapping search with a 'None' prefilter,
+ // regardless of whether one exists or not, because stream
+ // searching can't currently deal with prefilters correctly in
+ // all cases.
+ }
+ }
+ state.at += 1;
+ }
+ state.id = Some(sid);
+ Ok(())
+}
+
+#[inline(always)]
+fn get_match<A: Automaton + ?Sized>(
+ aut: &A,
+ sid: StateID,
+ index: usize,
+ at: usize,
+) -> Match {
+ let pid = aut.match_pattern(sid, index);
+ let len = aut.pattern_len(pid);
+ Match::new(pid, (at - len)..at)
+}
+
+/// Write a prefix "state" indicator for fmt::Debug impls. It always writes
+/// exactly two printable bytes to the given formatter.
+///
+/// Specifically, this tries to succinctly distinguish the different types of
+/// states: dead states, start states and match states. It even accounts for
+/// the possible overlappings of different state types. (The only possible
+/// overlapping is that of match and start states.)
+pub(crate) fn fmt_state_indicator<A: Automaton>(
+ f: &mut core::fmt::Formatter<'_>,
+ aut: A,
+ id: StateID,
+) -> core::fmt::Result {
+ if aut.is_dead(id) {
+ write!(f, "D ")?;
+ } else if aut.is_match(id) {
+ if aut.is_start(id) {
+ write!(f, "*>")?;
+ } else {
+ write!(f, "* ")?;
+ }
+ } else if aut.is_start(id) {
+ write!(f, " >")?;
+ } else {
+ write!(f, " ")?;
+ }
+ Ok(())
+}
+
+/// Return an iterator of transitions in a sparse format given an iterator
+/// of all explicitly defined transitions. The iterator yields ranges of
+/// transitions, such that any adjacent transitions mapped to the same
+/// state are combined into a single range.
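+///
+/// For example (an illustrative sketch): given the explicit transitions
+/// `(b'a', S1), (b'b', S1), (b'd', S2)`, this yields the ranges
+/// `(b'a', b'b', S1)` and `(b'd', b'd', S2)`.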
+pub(crate) fn sparse_transitions<'a>(
+ mut it: impl Iterator<Item = (u8, StateID)> + 'a,
+) -> impl Iterator<Item = (u8, u8, StateID)> + 'a {
+ let mut cur: Option<(u8, u8, StateID)> = None;
+ core::iter::from_fn(move || {
+ while let Some((class, next)) = it.next() {
+ let (prev_start, prev_end, prev_next) = match cur {
+ Some(x) => x,
+ None => {
+ cur = Some((class, class, next));
+ continue;
+ }
+ };
+ if prev_next == next {
+ cur = Some((prev_start, class, prev_next));
+ } else {
+ cur = Some((class, class, next));
+ return Some((prev_start, prev_end, prev_next));
+ }
+ }
+ if let Some((start, end, next)) = cur.take() {
+ return Some((start, end, next));
+ }
+ None
+ })
+}
diff --git a/third_party/rust/aho-corasick/src/dfa.rs b/third_party/rust/aho-corasick/src/dfa.rs
new file mode 100644
index 0000000000..f0370a6168
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/dfa.rs
@@ -0,0 +1,814 @@
+/*!
+Provides direct access to a DFA implementation of Aho-Corasick.
+
+This is a low-level API that generally only needs to be used in niche
+circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
+instead of a DFA directly. Using a `DFA` directly is typically only necessary
+when one needs access to the [`Automaton`] trait implementation.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ automaton::Automaton,
+ nfa::noncontiguous,
+ util::{
+ alphabet::ByteClasses,
+ error::{BuildError, MatchError},
+ int::{Usize, U32},
+ prefilter::Prefilter,
+ primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
+ search::{Anchored, MatchKind, StartKind},
+ special::Special,
+ },
+};
+
+/// A DFA implementation of Aho-Corasick.
+///
+/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
+/// this type directly. Using a `DFA` directly is typically only necessary when
+/// one needs access to the [`Automaton`] trait implementation.
+///
+/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
+/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
+/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
+///
+/// A DFA provides the best possible search performance (in this crate) via two
+/// mechanisms:
+///
+/// * All states use a dense representation for their transitions.
+/// * All failure transitions are pre-computed such that they are never
+/// explicitly handled at search time.
+///
+/// These two facts combined mean that every state transition is performed
+/// using a constant number of instructions. However, this comes at
+/// great cost. The memory usage of a DFA can be quite exorbitant.
+/// It is potentially multiple orders of magnitude greater than a
+/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
+/// a DFA will typically have better search speed than a `contiguous::NFA`, but
+/// not by orders of magnitude.
+///
+/// Unless you have a small number of patterns or memory usage is not a concern
+/// and search performance is critical, a DFA is usually not the best choice.
+///
+/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
+/// support both anchored and unanchored search configurations. Namely,
+/// since failure transitions are pre-computed, supporting both anchored
+/// and unanchored searches requires a duplication of the transition table,
+/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
+/// unconditionally support both anchored and unanchored searches because there
+/// is essentially no added cost for doing so.) It is for this reason that
+/// a DFA's support for anchored and unanchored searches can be configured
+/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
+/// searches.
+///
+/// # Example
+///
+/// This example shows how to build a `DFA` directly and use it to execute
+/// [`Automaton::try_find`]:
+///
+/// ```
+/// use aho_corasick::{
+/// automaton::Automaton,
+/// dfa::DFA,
+/// Input, Match,
+/// };
+///
+/// let patterns = &["b", "abc", "abcd"];
+/// let haystack = "abcd";
+///
+/// let dfa = DFA::new(patterns).unwrap();
+/// assert_eq!(
+///     Some(Match::must(0, 1..2)),
+///     dfa.try_find(&Input::new(haystack))?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// It is also possible to implement your own version of `try_find`. See the
+/// [`Automaton`] documentation for an example.
+#[derive(Clone)]
+pub struct DFA {
+ /// The DFA transition table. IDs in this table are pre-multiplied. So
+ /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
+ /// 2*stride, 3*stride, ...
+ trans: Vec<StateID>,
+ /// The matches for every match state in this DFA. This is first indexed by
+ /// state index (so that's `sid >> stride2`) and then by order in which the
+ /// matches are meant to occur.
+ matches: Vec<Vec<PatternID>>,
+ /// The amount of heap memory used, in bytes, by the inner Vecs of
+ /// 'matches'.
+ matches_memory_usage: usize,
+ /// The length of each pattern. This is used to compute the start offset
+ /// of a match.
+ pattern_lens: Vec<SmallIndex>,
+ /// A prefilter for accelerating searches, if one exists.
+ prefilter: Option<Prefilter>,
+ /// The match semantics built into this DFA.
+ match_kind: MatchKind,
+ /// The total number of states in this DFA.
+ state_len: usize,
+ /// The alphabet size, or total number of equivalence classes, for this
+ /// DFA. Note that the actual number of transitions in each state is
+ /// stride=2^stride2, where stride is the smallest power of 2 greater than
+ /// or equal to alphabet_len. We do things this way so that we can use
+ /// bitshifting to go from a state ID to an index into 'matches'.
+ alphabet_len: usize,
+ /// The exponent with a base 2, such that stride=2^stride2. Given a state
+ /// index 'i', its state identifier is 'i << stride2'. Given a state
+ /// identifier 'sid', its state index is 'sid >> stride2'.
+ stride2: usize,
+ /// The equivalence classes for this DFA. All transitions are defined on
+ /// equivalence classes and not on the 256 distinct byte values.
+ byte_classes: ByteClasses,
+ /// The length of the shortest pattern in this automaton.
+ min_pattern_len: usize,
+ /// The length of the longest pattern in this automaton.
+ max_pattern_len: usize,
+ /// The information required to deduce which states are "special" in this
+ /// DFA.
+ special: Special,
+}
+
+impl DFA {
+ /// Create a new Aho-Corasick DFA using the default configuration.
+ ///
+ /// Use a [`Builder`] if you want to change the configuration.
+ pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ DFA::builder().build(patterns)
+ }
+
+ /// A convenience method for returning a new Aho-Corasick DFA builder.
+ ///
+ /// This usually permits one to just import the `DFA` type.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+impl DFA {
+ /// A sentinel state ID indicating that a search should stop once it has
+ /// entered this state. When a search stops, it returns a match if one has
+ /// been found, otherwise no match. A DFA always has an actual dead state
+ /// at this ID.
+ ///
+ /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
+ /// Namely, the whole point of a DFA is that the FAIL state is completely
+ /// compiled away. That is, DFA construction involves pre-computing the
+ /// failure transitions everywhere, such that failure transitions are no
+ /// longer used at search time. This, combined with its uniformly dense
+ /// representation, are the two most important factors in why it's faster
+ /// than the NFAs in this crate.
+ const DEAD: StateID = StateID::new_unchecked(0);
+
+ /// Adds the given pattern IDs as matches to the given state and also
+ /// records the added memory usage.
+ fn set_matches(
+ &mut self,
+ sid: StateID,
+ pids: impl Iterator<Item = PatternID>,
+ ) {
+ let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
+ let mut at_least_one = false;
+ for pid in pids {
+ self.matches[index].push(pid);
+ self.matches_memory_usage += PatternID::SIZE;
+ at_least_one = true;
+ }
+ assert!(at_least_one, "match state must have non-empty pids");
+ }
+}
+
+// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
+// returns a valid state ID given a valid state ID. We otherwise claim that
+// all other methods are correct as well.
+unsafe impl Automaton for DFA {
+ #[inline(always)]
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
+ // Either of the start state IDs can be DEAD, in which case, support
+ // for that type of search is not provided by this DFA. Which start
+ // state IDs are inactive depends on the 'StartKind' configuration at
+ // DFA construction time.
+ match anchored {
+ Anchored::No => {
+ let start = self.special.start_unanchored_id;
+ if start == DFA::DEAD {
+ Err(MatchError::invalid_input_unanchored())
+ } else {
+ Ok(start)
+ }
+ }
+ Anchored::Yes => {
+ let start = self.special.start_anchored_id;
+ if start == DFA::DEAD {
+ Err(MatchError::invalid_input_anchored())
+ } else {
+ Ok(start)
+ }
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn next_state(
+ &self,
+ _anchored: Anchored,
+ sid: StateID,
+ byte: u8,
+ ) -> StateID {
+ let class = self.byte_classes.get(byte);
+ self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
+ }
+
+ #[inline(always)]
+ fn is_special(&self, sid: StateID) -> bool {
+ sid <= self.special.max_special_id
+ }
+
+ #[inline(always)]
+ fn is_dead(&self, sid: StateID) -> bool {
+ sid == DFA::DEAD
+ }
+
+ #[inline(always)]
+ fn is_match(&self, sid: StateID) -> bool {
+ !self.is_dead(sid) && sid <= self.special.max_match_id
+ }
+
+ #[inline(always)]
+ fn is_start(&self, sid: StateID) -> bool {
+ sid == self.special.start_unanchored_id
+ || sid == self.special.start_anchored_id
+ }
+
+ #[inline(always)]
+ fn match_kind(&self) -> MatchKind {
+ self.match_kind
+ }
+
+ #[inline(always)]
+ fn patterns_len(&self) -> usize {
+ self.pattern_lens.len()
+ }
+
+ #[inline(always)]
+ fn pattern_len(&self, pid: PatternID) -> usize {
+ self.pattern_lens[pid].as_usize()
+ }
+
+ #[inline(always)]
+ fn min_pattern_len(&self) -> usize {
+ self.min_pattern_len
+ }
+
+ #[inline(always)]
+ fn max_pattern_len(&self) -> usize {
+ self.max_pattern_len
+ }
+
+ #[inline(always)]
+ fn match_len(&self, sid: StateID) -> usize {
+ debug_assert!(self.is_match(sid));
+ let offset = (sid.as_usize() >> self.stride2) - 2;
+ self.matches[offset].len()
+ }
+
+ #[inline(always)]
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
+ debug_assert!(self.is_match(sid));
+ let offset = (sid.as_usize() >> self.stride2) - 2;
+ self.matches[offset][index]
+ }
+
+ #[inline(always)]
+ fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ (self.trans.len() * size_of::<u32>())
+ + (self.matches.len() * size_of::<Vec<PatternID>>())
+ + self.matches_memory_usage
+ + (self.pattern_lens.len() * size_of::<SmallIndex>())
+ + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
+ }
+
+ #[inline(always)]
+ fn prefilter(&self) -> Option<&Prefilter> {
+ self.prefilter.as_ref()
+ }
+}
+
+impl core::fmt::Debug for DFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::{
+ automaton::{fmt_state_indicator, sparse_transitions},
+ util::debug::DebugByte,
+ };
+
+ writeln!(f, "dfa::DFA(")?;
+ for index in 0..self.state_len {
+ let sid = StateID::new_unchecked(index << self.stride2);
+ // While we do currently include the FAIL state in the transition
+ // table (to simplify construction), it is never actually used. It
+ // poses problems with the code below because it gets treated as
+ // a match state incidentally when it is, of course, not. So we
+ // special case it. The fail state is always the first state after
+ // the dead state.
+ //
+ // If the construction is changed to remove the fail state (it
+ // probably should be), then this special case should be updated.
+ if index == 1 {
+ writeln!(f, "F {:06}:", sid.as_usize())?;
+ continue;
+ }
+ fmt_state_indicator(f, self, sid)?;
+ write!(f, "{:06}: ", sid.as_usize())?;
+
+ let it = (0..self.byte_classes.alphabet_len()).map(|class| {
+ (class.as_u8(), self.trans[sid.as_usize() + class])
+ });
+ for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(
+ f,
+ "{:?} => {:?}",
+ DebugByte(start),
+ next.as_usize()
+ )?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize()
+ )?;
+ }
+ }
+ write!(f, "\n")?;
+ if self.is_match(sid) {
+ write!(f, " matches: ")?;
+ for i in 0..self.match_len(sid) {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ let pid = self.match_pattern(sid, i);
+ write!(f, "{}", pid.as_usize())?;
+ }
+ write!(f, "\n")?;
+ }
+ }
+ writeln!(f, "match kind: {:?}", self.match_kind)?;
+ writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
+ writeln!(f, "state length: {:?}", self.state_len)?;
+ writeln!(f, "pattern length: {:?}", self.patterns_len())?;
+ writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
+ writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
+ writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
+ writeln!(f, "stride: {:?}", 1 << self.stride2)?;
+ writeln!(f, "byte classes: {:?}", self.byte_classes)?;
+ writeln!(f, "memory usage: {:?}", self.memory_usage())?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// A builder for configuring an Aho-Corasick DFA.
+///
+/// This builder has a subset of the options available to a
+/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
+/// their behavior is identical.
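+///
+/// # Example
+///
+/// An illustrative sketch (the patterns and haystack are made up) of
+/// configuring leftmost-longest match semantics through this builder:
+///
+/// ```
+/// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match, MatchKind};
+///
+/// let dfa = DFA::builder()
+///     .match_kind(MatchKind::LeftmostLongest)
+///     .build(&["ab", "abcd"])
+///     .unwrap();
+/// assert_eq!(
+///     Some(Match::must(1, 0..4)),
+///     dfa.try_find(&Input::new("abcd"))?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```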
+#[derive(Clone, Debug)]
+pub struct Builder {
+ noncontiguous: noncontiguous::Builder,
+ start_kind: StartKind,
+ byte_classes: bool,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder {
+ noncontiguous: noncontiguous::Builder::new(),
+ start_kind: StartKind::Unanchored,
+ byte_classes: true,
+ }
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring an Aho-Corasick DFA.
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ /// Build an Aho-Corasick DFA from the given iterator of patterns.
+ ///
+ /// A builder may be reused to create more DFAs.
+ pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ let nnfa = self.noncontiguous.build(patterns)?;
+ self.build_from_noncontiguous(&nnfa)
+ }
+
+ /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
+ ///
+ /// Note that when this method is used, only the `start_kind` and
+ /// `byte_classes` settings on this builder are respected. The other
+ /// settings only apply to the initial construction of the Aho-Corasick
+ /// automaton. Since using this method requires that initial construction
+ /// has already completed, all settings impacting only initial construction
+ /// are no longer relevant.
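+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (the patterns are made up, and this assumes
+    /// that `noncontiguous::NFA::new` accepts patterns just like
+    /// [`DFA::new`] does):
+    ///
+    /// ```
+    /// use aho_corasick::{
+    ///     automaton::Automaton, dfa::DFA, nfa::noncontiguous, Input, Match,
+    /// };
+    ///
+    /// let nnfa = noncontiguous::NFA::new(&["samwise", "sam"]).unwrap();
+    /// let dfa = DFA::builder().build_from_noncontiguous(&nnfa).unwrap();
+    /// assert_eq!(
+    ///     Some(Match::must(1, 0..3)),
+    ///     dfa.try_find(&Input::new("samwise"))?,
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```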
+ pub fn build_from_noncontiguous(
+ &self,
+ nnfa: &noncontiguous::NFA,
+ ) -> Result<DFA, BuildError> {
+ debug!("building DFA");
+ let byte_classes = if self.byte_classes {
+ nnfa.byte_classes().clone()
+ } else {
+ ByteClasses::singletons()
+ };
+ let state_len = match self.start_kind {
+ StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
+ StartKind::Both => {
+ // These unwraps are OK because we know that the number of
+ // NFA states is < StateID::LIMIT which is in turn less than
+ // i32::MAX. Thus, there is always room to multiply by 2.
+ // Finally, the number of states is always at least 4 in the
+ // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
+ // subtraction of 4 is okay.
+ //
+ // Note that we subtract 4 because the "anchored" part of
+ // the DFA duplicates the unanchored part (without failure
+ // transitions), but reuses the DEAD, FAIL and START states.
+ nnfa.states()
+ .len()
+ .checked_mul(2)
+ .unwrap()
+ .checked_sub(4)
+ .unwrap()
+ }
+ };
+ let trans_len =
+ match state_len.checked_shl(byte_classes.stride2().as_u32()) {
+ Some(trans_len) => trans_len,
+ None => {
+ return Err(BuildError::state_id_overflow(
+ StateID::MAX.as_u64(),
+ usize::MAX.as_u64(),
+ ))
+ }
+ };
+ StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
+ .map_err(|e| {
+ BuildError::state_id_overflow(
+ StateID::MAX.as_u64(),
+ e.attempted(),
+ )
+ })?;
+ let num_match_states = match self.start_kind {
+ StartKind::Unanchored | StartKind::Anchored => {
+ nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
+ }
+ StartKind::Both => nnfa
+ .special()
+ .max_match_id
+ .as_usize()
+ .checked_sub(1)
+ .unwrap()
+ .checked_mul(2)
+ .unwrap(),
+ };
+ let mut dfa = DFA {
+ trans: vec![DFA::DEAD; trans_len],
+ matches: vec![vec![]; num_match_states],
+ matches_memory_usage: 0,
+ pattern_lens: nnfa.pattern_lens_raw().to_vec(),
+ prefilter: nnfa.prefilter().map(|p| p.clone()),
+ match_kind: nnfa.match_kind(),
+ state_len,
+ alphabet_len: byte_classes.alphabet_len(),
+ stride2: byte_classes.stride2(),
+ byte_classes,
+ min_pattern_len: nnfa.min_pattern_len(),
+ max_pattern_len: nnfa.max_pattern_len(),
+ // The special state IDs are set later.
+ special: Special::zero(),
+ };
+ match self.start_kind {
+ StartKind::Both => {
+ self.finish_build_both_starts(nnfa, &mut dfa);
+ }
+ StartKind::Unanchored => {
+ self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
+ }
+ StartKind::Anchored => {
+ self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
+ }
+ }
+ debug!(
+ "DFA built, <states: {:?}, size: {:?}, \
+ alphabet len: {:?}, stride: {:?}>",
+ dfa.state_len,
+ dfa.memory_usage(),
+ dfa.byte_classes.alphabet_len(),
+ dfa.byte_classes.stride(),
+ );
+ // The vectors can grow ~twice as big during construction because a
+ // Vec amortizes growth. But here, let's shrink things back down to
+ // what we actually need since we're never going to add more to it.
+ dfa.trans.shrink_to_fit();
+ dfa.pattern_lens.shrink_to_fit();
+ dfa.matches.shrink_to_fit();
+ // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
+ // or even better, convert it to one contiguous allocation. But I think
+ // I went with nested allocs for good reason (can't remember), so this
+ // may be tricky to do. I decided not to shrink them here because it
+ // might require a fair bit of work to do. It's unclear whether it's
+ // worth it.
+ Ok(dfa)
+ }
+
+ /// Finishes building a DFA for either unanchored or anchored searches,
+ /// but NOT both.
+ fn finish_build_one_start(
+ &self,
+ anchored: Anchored,
+ nnfa: &noncontiguous::NFA,
+ dfa: &mut DFA,
+ ) {
+ // This function always succeeds because we check above that all of the
+ // states in the NFA can be mapped to DFA state IDs.
+ let stride2 = dfa.stride2;
+ let old2new = |oldsid: StateID| {
+ StateID::new_unchecked(oldsid.as_usize() << stride2)
+ };
+ for (oldsid, state) in nnfa.states().iter().with_state_ids() {
+ let newsid = old2new(oldsid);
+ if state.is_match() {
+ dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
+ }
+ sparse_iter(
+ nnfa,
+ oldsid,
+ &dfa.byte_classes,
+ |byte, class, mut oldnextsid| {
+ if oldnextsid == noncontiguous::NFA::FAIL {
+ if anchored.is_anchored() {
+ oldnextsid = noncontiguous::NFA::DEAD;
+ } else {
+ oldnextsid = nnfa.next_state(
+ Anchored::No,
+ state.fail(),
+ byte,
+ );
+ }
+ }
+ dfa.trans[newsid.as_usize() + usize::from(class)] =
+ old2new(oldnextsid);
+ },
+ );
+ }
+ // Now that we've remapped all the IDs in our states, all that's left
+ // is remapping the special state IDs.
+ let old = nnfa.special();
+ let new = &mut dfa.special;
+ new.max_special_id = old2new(old.max_special_id);
+ new.max_match_id = old2new(old.max_match_id);
+ if anchored.is_anchored() {
+ new.start_unanchored_id = DFA::DEAD;
+ new.start_anchored_id = old2new(old.start_anchored_id);
+ } else {
+ new.start_unanchored_id = old2new(old.start_unanchored_id);
+ new.start_anchored_id = DFA::DEAD;
+ }
+ }
+
+ /// Finishes building a DFA that supports BOTH unanchored and anchored
+    /// searches. It works by interleaving unanchored states with anchored
+ /// states in the same transition table. This way, we avoid needing to
+ /// re-shuffle states afterward to ensure that our states still look like
+ /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
+ ///
+ /// Honestly this is pretty inscrutable... Simplifications are most
+ /// welcome.
+ fn finish_build_both_starts(
+ &self,
+ nnfa: &noncontiguous::NFA,
+ dfa: &mut DFA,
+ ) {
+ let stride2 = dfa.stride2;
+ let stride = 1 << stride2;
+ let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
+ let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
+ let mut is_anchored = vec![false; dfa.state_len];
+ let mut newsid = DFA::DEAD;
+ let next_dfa_id =
+ |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
+ for (oldsid, state) in nnfa.states().iter().with_state_ids() {
+ if oldsid == noncontiguous::NFA::DEAD
+ || oldsid == noncontiguous::NFA::FAIL
+ {
+ remap_unanchored[oldsid] = newsid;
+ remap_anchored[oldsid] = newsid;
+ newsid = next_dfa_id(newsid);
+ } else if oldsid == nnfa.special().start_unanchored_id
+ || oldsid == nnfa.special().start_anchored_id
+ {
+ if oldsid == nnfa.special().start_unanchored_id {
+ remap_unanchored[oldsid] = newsid;
+ remap_anchored[oldsid] = DFA::DEAD;
+ } else {
+ remap_unanchored[oldsid] = DFA::DEAD;
+ remap_anchored[oldsid] = newsid;
+ is_anchored[newsid.as_usize() >> stride2] = true;
+ }
+ if state.is_match() {
+ dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
+ }
+ sparse_iter(
+ nnfa,
+ oldsid,
+ &dfa.byte_classes,
+ |_, class, oldnextsid| {
+ let class = usize::from(class);
+ if oldnextsid == noncontiguous::NFA::FAIL {
+ dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
+ } else {
+ dfa.trans[newsid.as_usize() + class] = oldnextsid;
+ }
+ },
+ );
+ newsid = next_dfa_id(newsid);
+ } else {
+ let unewsid = newsid;
+ newsid = next_dfa_id(newsid);
+ let anewsid = newsid;
+ newsid = next_dfa_id(newsid);
+
+ remap_unanchored[oldsid] = unewsid;
+ remap_anchored[oldsid] = anewsid;
+ is_anchored[anewsid.as_usize() >> stride2] = true;
+ if state.is_match() {
+ dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
+ dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
+ }
+ sparse_iter(
+ nnfa,
+ oldsid,
+ &dfa.byte_classes,
+ |byte, class, oldnextsid| {
+ let class = usize::from(class);
+ if oldnextsid == noncontiguous::NFA::FAIL {
+ dfa.trans[unewsid.as_usize() + class] = nnfa
+ .next_state(Anchored::No, state.fail(), byte);
+ } else {
+ dfa.trans[unewsid.as_usize() + class] = oldnextsid;
+ dfa.trans[anewsid.as_usize() + class] = oldnextsid;
+ }
+ },
+ );
+ }
+ }
+ for i in 0..dfa.state_len {
+ let sid = i << stride2;
+ if is_anchored[i] {
+ for next in dfa.trans[sid..][..stride].iter_mut() {
+ *next = remap_anchored[*next];
+ }
+ } else {
+ for next in dfa.trans[sid..][..stride].iter_mut() {
+ *next = remap_unanchored[*next];
+ }
+ }
+ }
+ // Now that we've remapped all the IDs in our states, all that's left
+ // is remapping the special state IDs.
+ let old = nnfa.special();
+ let new = &mut dfa.special;
+ new.max_special_id = remap_anchored[old.max_special_id];
+ new.max_match_id = remap_anchored[old.max_match_id];
+ new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
+ new.start_anchored_id = remap_anchored[old.start_anchored_id];
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
+ /// for more documentation and examples.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
+ self.noncontiguous.match_kind(kind);
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
+ /// for more documentation and examples.
+ pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.noncontiguous.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
+ /// for more documentation and examples.
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
+ self.noncontiguous.prefilter(yes);
+ self
+ }
+
+ /// Sets the starting state configuration for the automaton.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
+ /// for more documentation and examples.
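+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (the pattern and haystack are made up, and
+    /// this assumes `Input::anchored` is used to select the search mode) of
+    /// a DFA built to support both anchored and unanchored searches:
+    ///
+    /// ```
+    /// use aho_corasick::{
+    ///     automaton::Automaton, dfa::DFA, Anchored, Input, Match, StartKind,
+    /// };
+    ///
+    /// let dfa = DFA::builder()
+    ///     .start_kind(StartKind::Both)
+    ///     .build(&["foo"])
+    ///     .unwrap();
+    ///
+    /// // Unanchored: the match may start anywhere in the haystack.
+    /// let input = Input::new("xx foo").anchored(Anchored::No);
+    /// assert_eq!(Some(Match::must(0, 3..6)), dfa.try_find(&input)?);
+    ///
+    /// // Anchored: the match must begin where the search begins.
+    /// let input = Input::new("xx foo").anchored(Anchored::Yes);
+    /// assert_eq!(None, dfa.try_find(&input)?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```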
+ pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
+ self.start_kind = kind;
+ self
+ }
+
+ /// A debug setting for whether to attempt to shrink the size of the
+ /// automaton's alphabet or not.
+ ///
+    /// This should never be disabled unless you're debugging an automaton.
+ /// Namely, disabling byte classes makes transitions easier to reason
+ /// about, since they use the actual bytes instead of equivalence classes.
+ /// Disabling this confers no performance benefit at search time.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
+ /// for more documentation and examples.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+ self.byte_classes = yes;
+ self
+ }
+}
+
+/// Iterate over all possible equivalence class transitions in this state.
+/// The closure is called for all transitions with a distinct equivalence
+/// class, even those not explicitly represented in this sparse state. For
+/// any implicitly defined transitions, the given closure is called with
+/// the fail state ID.
+///
+/// The closure is guaranteed to be called precisely
+/// `byte_classes.alphabet_len()` times, once for every possible class in
+/// ascending order.
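+///
+/// As an illustrative sketch: if a state only defines transitions for the
+/// bytes `b'a'` and `b'b'` and byte classes are singletons, then the closure
+/// is called 256 times, receiving the FAIL state ID for every byte other
+/// than `b'a'` and `b'b'`.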
+fn sparse_iter<F: FnMut(u8, u8, StateID)>(
+ nnfa: &noncontiguous::NFA,
+ oldsid: StateID,
+ classes: &ByteClasses,
+ mut f: F,
+) {
+ let mut prev_class = None;
+ let mut byte = 0usize;
+ for t in nnfa.iter_trans(oldsid) {
+ while byte < usize::from(t.byte()) {
+ let rep = byte.as_u8();
+ let class = classes.get(rep);
+ byte += 1;
+ if prev_class != Some(class) {
+ f(rep, class, noncontiguous::NFA::FAIL);
+ prev_class = Some(class);
+ }
+ }
+ let rep = t.byte();
+ let class = classes.get(rep);
+ byte += 1;
+ if prev_class != Some(class) {
+ f(rep, class, t.next());
+ prev_class = Some(class);
+ }
+ }
+ for b in byte..=255 {
+ let rep = b.as_u8();
+ let class = classes.get(rep);
+ if prev_class != Some(class) {
+ f(rep, class, noncontiguous::NFA::FAIL);
+ prev_class = Some(class);
+ }
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/lib.rs b/third_party/rust/aho-corasick/src/lib.rs
new file mode 100644
index 0000000000..20e8b81115
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/lib.rs
@@ -0,0 +1,326 @@
+/*!
+A library for finding occurrences of many patterns at once. This library
+provides multiple pattern search principally through an implementation of the
+[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
+which builds a fast finite state machine for executing searches in linear time.
+
+Additionally, this library provides a number of configuration options for
+building the automaton that permit controlling the space versus time trade
+off. Other features include simple ASCII case insensitive matching, finding
+overlapping matches, replacements, searching streams and even searching and
+replacing text in streams.
+
+Finally, unlike most other Aho-Corasick implementations, this one
+supports enabling [leftmost-first](MatchKind::LeftmostFirst) or
+[leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a
+(seemingly) novel alternative construction algorithm. For more details on what
+match semantics means, see the [`MatchKind`] type.
+
+# Overview
+
+This section gives a brief overview of the primary types in this crate:
+
+* [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton.
+This is the type you use to execute searches.
+* [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and
+supports configuring a number of options.
+* [`Match`] represents a single match reported by an Aho-Corasick automaton.
+Each match has two pieces of information: the pattern that matched and the
+start and end byte offsets corresponding to the position in the haystack at
+which it matched.
+
+# Example: basic searching
+
+This example shows how to search for occurrences of multiple patterns
+simultaneously. Each match includes the pattern that matched along with the
+byte offsets of the match.
+
+```
+use aho_corasick::{AhoCorasick, PatternID};
+
+let patterns = &["apple", "maple", "Snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::new(patterns).unwrap();
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (PatternID::must(1), 13, 18),
+ (PatternID::must(0), 28, 33),
+ (PatternID::must(2), 43, 50),
+]);
+```
+
+# Example: case insensitivity
+
+This is like the previous example, but matches `Snapple` case insensitively
+using `AhoCorasickBuilder`:
+
+```
+use aho_corasick::{AhoCorasick, PatternID};
+
+let patterns = &["apple", "maple", "snapple"];
+let haystack = "Nobody likes maple in their apple flavored Snapple.";
+
+let ac = AhoCorasick::builder()
+ .ascii_case_insensitive(true)
+ .build(patterns)
+ .unwrap();
+let mut matches = vec![];
+for mat in ac.find_iter(haystack) {
+ matches.push((mat.pattern(), mat.start(), mat.end()));
+}
+assert_eq!(matches, vec![
+ (PatternID::must(1), 13, 18),
+ (PatternID::must(0), 28, 33),
+ (PatternID::must(2), 43, 50),
+]);
+```
+
+# Example: replacing matches in a stream
+
+This example shows how to execute a search and replace on a stream without
+loading the entire stream into memory first.
+
+```
+# #[cfg(feature = "std")] {
+use aho_corasick::AhoCorasick;
+
+# fn example() -> Result<(), std::io::Error> {
+let patterns = &["fox", "brown", "quick"];
+let replace_with = &["sloth", "grey", "slow"];
+
+// In a real example, these might be `std::fs::File`s instead. All you need to
+// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
+let rdr = "The quick brown fox.";
+let mut wtr = vec![];
+
+let ac = AhoCorasick::new(patterns).unwrap();
+ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
+assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
+# Ok(()) }; example().unwrap()
+# }
+```
+
+# Example: finding the leftmost first match
+
+In the textbook description of Aho-Corasick, its formulation is typically
+structured such that it reports all possible matches, even when they overlap
+with one another. In many cases, overlapping matches may not be desired, such
+as when finding all successive non-overlapping matches, as you might with a
+standard regular expression.
+
+Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
+this doesn't always work in the expected way, since it will report matches as
+soon as they are seen. For example, consider matching the regex `Samwise|Sam`
+against the text `Samwise`. Most regex engines (that are Perl-like, or
+non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
+algorithm modified for reporting non-overlapping matches will report `Sam`.
+
+A novel contribution of this library is the ability to change the match
+semantics of Aho-Corasick (without additional search time overhead) such that
+`Samwise` is reported instead. For example, here's the standard approach:
+
+```
+use aho_corasick::AhoCorasick;
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::new(patterns).unwrap();
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
+```
+
+And now here's the leftmost-first version, which matches how a Perl-like
+regex will work:
+
+```
+use aho_corasick::{AhoCorasick, MatchKind};
+
+let patterns = &["Samwise", "Sam"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(patterns)
+ .unwrap();
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```
+
+In addition to leftmost-first semantics, this library also supports
+leftmost-longest semantics, which match the POSIX behavior of a regular
+expression alternation. See [`MatchKind`] for more details.
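+
+As a brief sketch using the same patterns (their order no longer matters,
+since the longest match starting at the leftmost position wins):
+
+```
+use aho_corasick::{AhoCorasick, MatchKind};
+
+let patterns = &["Sam", "Samwise"];
+let haystack = "Samwise";
+
+let ac = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(patterns)
+ .unwrap();
+let mat = ac.find(haystack).expect("should have a match");
+assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
+```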
+
+# Prefilters
+
+While an Aho-Corasick automaton can perform admirably when compared to more
+naive solutions, it is generally slower than more specialized algorithms that
+are accelerated using vector instructions such as SIMD.
+
+For that reason, this library will internally use a "prefilter" to attempt
+to accelerate searches when possible. Currently, this library has several
+different algorithms it might use depending on the patterns provided. Once the
+number of patterns gets too big, prefilters are no longer used.
+
+While a prefilter is generally good to have on by default since it works
+well in the common case, it can lead to less predictable or even sub-optimal
+performance in some cases. For that reason, prefilters can be explicitly
+disabled via [`AhoCorasickBuilder::prefilter`].
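+
+As a brief sketch, disabling the prefilter is just a builder setting. The
+matches reported are identical either way; only the internal acceleration
+strategy changes:
+
+```
+use aho_corasick::AhoCorasick;
+
+let patterns = &["apple", "maple"];
+let haystack = "a maple flavored apple";
+
+let ac = AhoCorasick::builder()
+ .prefilter(false)
+ .build(patterns)
+ .unwrap();
+assert_eq!(2, ac.find_iter(haystack).count());
+```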
+
+# Lower level APIs
+
+This crate also provides several sub-modules that collectively expose many of
+the implementation details of the main [`AhoCorasick`] type. Most users of this
+library can completely ignore the submodules and their contents, but if you
+needed finer grained control, some parts of them may be useful to you. Here is
+a brief overview of each and why you might want to use them:
+
+* The [`packed`] sub-module contains a lower level API for using fast
+vectorized routines for finding a small number of patterns in a haystack.
+You might want to use this API when you want to completely side-step using
+Aho-Corasick automata. Otherwise, the fast vectorized routines are used
+automatically as prefilters for `AhoCorasick` searches whenever possible.
+* The [`automaton`] sub-module provides a lower level finite state
+machine interface that the various Aho-Corasick implementations in
+this crate implement. This sub-module's main contribution is the
+[`Automaton`](automaton::Automaton) trait, which permits manually walking the
+state transitions of an Aho-Corasick automaton.
+* The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of
+the aforementioned `Automaton` trait. The main reason one might want to use
+these sub-modules is to get access to a type that implements the `Automaton`
+trait. (The top-level `AhoCorasick` type does not implement the `Automaton`
+trait.)
+
+As mentioned above, if you aren't sure whether you need these sub-modules,
+you should be able to safely ignore them and just focus on the [`AhoCorasick`]
+type.
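+
+For instance, here is a brief sketch of searching with the [`dfa`] sub-module
+directly via the [`Automaton`](automaton::Automaton) trait. (Most users will
+never need to do this.)
+
+```
+use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match};
+
+let dfa = DFA::new(&["foo", "bar"]).unwrap();
+assert_eq!(
+ Some(Match::must(1, 4..7)),
+ dfa.try_find(&Input::new("xxx bar"))?,
+);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```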
+
+# Crate features
+
+This crate exposes a few features for controlling dependency usage and whether
+this crate can be used without the standard library.
+
+* **std** -
+ Enables support for the standard library. This feature is enabled by
+ default. When disabled, only `core` and `alloc` are used. At an API
+ level, enabling `std` enables `std::error::Error` trait impls for the
+ various error types, and higher level stream search routines such as
+ [`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required
+ to enable vectorized prefilters. Prefilters can greatly accelerate searches,
+ but generally only apply when the number of patterns is small (less than
+ ~100).
+* **perf-literal** -
+ Enables support for literal prefilters that use vectorized routines from
+ external crates. This feature is enabled by default. If you're only using
+ Aho-Corasick for large numbers of patterns or otherwise can abide lower
+ throughput when searching with a small number of patterns, then it is
+ reasonable to disable this feature.
+* **logging** -
+ Enables a dependency on the `log` crate and emits messages to aid in
+ diagnostics. This feature is disabled by default.
+*/
+
+#![no_std]
+#![deny(missing_docs)]
+#![deny(rustdoc::broken_intra_doc_links)]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+
+extern crate alloc;
+#[cfg(any(test, feature = "std"))]
+extern crate std;
+
+#[cfg(doctest)]
+doc_comment::doctest!("../README.md");
+
+#[cfg(feature = "std")]
+pub use crate::ahocorasick::StreamFindIter;
+pub use crate::{
+ ahocorasick::{
+ AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter,
+ FindOverlappingIter,
+ },
+ util::{
+ error::{BuildError, MatchError, MatchErrorKind},
+ primitives::{PatternID, PatternIDError},
+ search::{Anchored, Input, Match, MatchKind, Span, StartKind},
+ },
+};
+
+#[macro_use]
+mod macros;
+
+mod ahocorasick;
+pub mod automaton;
+pub mod dfa;
+pub mod nfa;
+pub mod packed;
+#[cfg(test)]
+mod tests;
+// I wrote out the module for implementing fst::Automaton only to later realize
+// that this would make fst a public dependency and fst is not at 1.0 yet. I
+// decided to just keep the code in tree, but build it only during tests.
+//
+// TODO: I think I've changed my mind again. I'm considering pushing it out
+// into either a separate crate or into 'fst' directly as an optional feature.
+// #[cfg(test)]
+// #[allow(dead_code)]
+// mod transducer;
+pub(crate) mod util;
+
+#[cfg(test)]
+mod testoibits {
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ use super::*;
+
+ fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
+
+ #[test]
+ fn oibits_main() {
+ assert_all::<AhoCorasick>();
+ assert_all::<AhoCorasickBuilder>();
+ assert_all::<AhoCorasickKind>();
+ assert_all::<FindIter>();
+ assert_all::<FindOverlappingIter>();
+
+ assert_all::<BuildError>();
+ assert_all::<MatchError>();
+ assert_all::<MatchErrorKind>();
+
+ assert_all::<Anchored>();
+ assert_all::<Input>();
+ assert_all::<Match>();
+ assert_all::<MatchKind>();
+ assert_all::<Span>();
+ assert_all::<StartKind>();
+ }
+
+ #[test]
+ fn oibits_automaton() {
+ use crate::{automaton, dfa::DFA};
+
+ assert_all::<automaton::FindIter<DFA>>();
+ assert_all::<automaton::FindOverlappingIter<DFA>>();
+ #[cfg(feature = "std")]
+ assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>();
+ assert_all::<automaton::OverlappingState>();
+
+ assert_all::<automaton::Prefilter>();
+ assert_all::<automaton::Candidate>();
+ }
+
+ #[test]
+ fn oibits_packed() {
+ use crate::packed;
+
+ assert_all::<packed::Config>();
+ assert_all::<packed::Builder>();
+ assert_all::<packed::Searcher>();
+ assert_all::<packed::FindIter>();
+ assert_all::<packed::MatchKind>();
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/macros.rs b/third_party/rust/aho-corasick/src/macros.rs
new file mode 100644
index 0000000000..fc73e6eddd
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/macros.rs
@@ -0,0 +1,18 @@
+#![allow(unused_macros)]
+
+macro_rules! log {
+ ($($tt:tt)*) => {
+ #[cfg(feature = "logging")]
+ {
+ $($tt)*
+ }
+ }
+}
+
+macro_rules! debug {
+ ($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
+}
+
+macro_rules! trace {
+ ($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
+}
diff --git a/third_party/rust/aho-corasick/src/nfa/contiguous.rs b/third_party/rust/aho-corasick/src/nfa/contiguous.rs
new file mode 100644
index 0000000000..29c162107d
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/nfa/contiguous.rs
@@ -0,0 +1,1141 @@
+/*!
+Provides a contiguous NFA implementation of Aho-Corasick.
+
+This is a low-level API that generally only needs to be used in niche
+circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
+instead of a contiguous NFA directly. Using an `NFA` directly is typically only
+necessary when one needs access to the [`Automaton`] trait implementation.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ automaton::Automaton,
+ nfa::noncontiguous,
+ util::{
+ alphabet::ByteClasses,
+ error::{BuildError, MatchError},
+ int::{Usize, U16, U32},
+ prefilter::Prefilter,
+ primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
+ search::{Anchored, MatchKind},
+ special::Special,
+ },
+};
+
+/// A contiguous NFA implementation of Aho-Corasick.
+///
+/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
+/// this type directly. Using an `NFA` directly is typically only necessary
+/// when one needs access to the [`Automaton`] trait implementation.
+///
+/// This NFA can only be built by first constructing a [`noncontiguous::NFA`].
+/// Both [`NFA::new`] and [`Builder::build`] do this for you automatically, but
+/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
+///
+/// The main difference between a noncontiguous NFA and a contiguous NFA is
+/// that the latter represents all of its states and transitions in a single
+/// allocation, whereas the former uses a separate allocation for each state.
+/// Doing this at construction time while keeping a low memory footprint isn't
+/// feasible, which is primarily why there are two different NFA types: one
+/// that does the least amount of work possible to build itself, and another
+/// that does a little extra work to compact itself and make state transitions
+/// faster by making some states use a dense representation.
+///
+/// Because a contiguous NFA uses a single allocation, there is a lot more
+/// opportunity for compression tricks to reduce the heap memory used. Indeed,
+/// it is not uncommon for a contiguous NFA to use an order of magnitude less
+/// heap memory than a noncontiguous NFA. Since building a contiguous NFA
+/// usually only takes a fraction of the time it takes to build a noncontiguous
+/// NFA, the overall build time is not much slower. Thus, in most cases, a
+/// contiguous NFA is the best choice.
+///
+/// Since a contiguous NFA uses various tricks for compression and to achieve
+/// faster state transitions, currently, its limit on the number of states
+/// is somewhat smaller than what a noncontiguous NFA can achieve. Generally
+/// speaking, you shouldn't expect to run into this limit if the number of
+/// patterns is under 1 million. It is plausible that this limit will be
+/// increased in the future. If the limit is reached, building a contiguous NFA
+/// will return an error. Often, since building a contiguous NFA is relatively
+/// cheap, it can make sense to always try it even if you aren't sure if it
+/// will fail or not. If it does, you can always fall back to a noncontiguous
+/// NFA. (Indeed, the main [`AhoCorasick`](crate::AhoCorasick) type employs a
+/// strategy similar to this at construction time.)
+///
+/// # Example
+///
+/// This example shows how to build an `NFA` directly and use it to execute
+/// [`Automaton::try_find`]:
+///
+/// ```
+/// use aho_corasick::{
+/// automaton::Automaton,
+/// nfa::contiguous::NFA,
+/// Input, Match,
+/// };
+///
+/// let patterns = &["b", "abc", "abcd"];
+/// let haystack = "abcd";
+///
+/// let nfa = NFA::new(patterns).unwrap();
+/// assert_eq!(
+/// Some(Match::must(0, 1..2)),
+/// nfa.try_find(&Input::new(haystack))?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// It is also possible to implement your own version of `try_find`. See the
+/// [`Automaton`] documentation for an example.
+#[derive(Clone)]
+pub struct NFA {
+ /// The raw NFA representation. Each state is packed with a header
+ /// (containing the format of the state, the failure transition and, for
+ /// a sparse state, the number of transitions), its transitions and any
+ /// matching pattern IDs for match states.
+ repr: Vec<u32>,
+ /// The length of each pattern. This is used to compute the start offset
+ /// of a match.
+ pattern_lens: Vec<SmallIndex>,
+ /// The total number of states in this NFA.
+ state_len: usize,
+ /// A prefilter for accelerating searches, if one exists.
+ prefilter: Option<Prefilter>,
+ /// The match semantics built into this NFA.
+ match_kind: MatchKind,
+ /// The alphabet size, or total number of equivalence classes, for this
+ /// NFA. Dense states always have this many transitions.
+ alphabet_len: usize,
+ /// The equivalence classes for this NFA. All transitions, dense and
+ /// sparse, are defined on equivalence classes and not on the 256 distinct
+ /// byte values.
+ byte_classes: ByteClasses,
+ /// The length of the shortest pattern in this automaton.
+ min_pattern_len: usize,
+ /// The length of the longest pattern in this automaton.
+ max_pattern_len: usize,
+ /// The information required to deduce which states are "special" in this
+ /// NFA.
+ special: Special,
+}
+
+impl NFA {
+ /// Create a new Aho-Corasick contiguous NFA using the default
+ /// configuration.
+ ///
+ /// Use a [`Builder`] if you want to change the configuration.
+ pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ NFA::builder().build(patterns)
+ }
+
+ /// A convenience method for returning a new Aho-Corasick contiguous NFA
+ /// builder.
+ ///
+ /// This usually permits one to just import the `NFA` type.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+impl NFA {
+ /// A sentinel state ID indicating that a search should stop once it has
+ /// entered this state. When a search stops, it returns a match if one
+ /// has been found, otherwise no match. A contiguous NFA always has an
+ /// actual dead state at this ID.
+ const DEAD: StateID = StateID::new_unchecked(0);
+ /// Another sentinel state ID indicating that a search should move through
+ /// current state's failure transition.
+ ///
+ /// Note that unlike DEAD, this does not actually point to a valid state
+ /// in a contiguous NFA. (noncontiguous::NFA::FAIL does point to a valid
+ /// state.) Instead, this points to the position that is guaranteed to
+ /// never be a valid state ID (by making sure it points to a place in the
+ /// middle of the encoding of the DEAD state). Since we never need to
+ /// actually look at the FAIL state itself, this works out.
+ ///
+ /// But why do it this way? So that FAIL is a constant. I don't have any
+ /// concrete evidence that this materially helps matters, but it's easy to
+ /// do. The alternative would be making the FAIL ID point to the second
+ /// state, which could be made a constant but is a little trickier to do.
+ /// The easiest path is to just make the FAIL state a runtime value, but
+ /// since comparisons with FAIL occur in perf critical parts of the search,
+ /// we want it to be as tight as possible and not waste any registers.
+ ///
+ /// Very hand wavy... But the code complexity that results from this is
+ /// very mild.
+ const FAIL: StateID = StateID::new_unchecked(1);
+}
+
+// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
+// returns a valid state ID given a valid state ID. We otherwise claim that
+// all other methods are correct as well.
+unsafe impl Automaton for NFA {
+ #[inline(always)]
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
+ match anchored {
+ Anchored::No => Ok(self.special.start_unanchored_id),
+ Anchored::Yes => Ok(self.special.start_anchored_id),
+ }
+ }
+
+ #[inline(always)]
+ fn next_state(
+ &self,
+ anchored: Anchored,
+ mut sid: StateID,
+ byte: u8,
+ ) -> StateID {
+ let repr = &self.repr;
+ let class = self.byte_classes.get(byte);
+ let u32tosid = StateID::from_u32_unchecked;
+ loop {
+ let o = sid.as_usize();
+ let kind = repr[o] & 0xFF;
+ // I tried to encapsulate the "next transition" logic into its own
+ // function, but it seemed to always result in sub-optimal codegen
+ // that led to real and significant slowdowns. So we just inline
+ // the logic here.
+ //
+ // I've also tried a lot of different ways to speed up this
+ // routine, and most of them have failed.
+ if kind == State::KIND_DENSE {
+ let next = u32tosid(repr[o + 2 + usize::from(class)]);
+ if next != NFA::FAIL {
+ return next;
+ }
+ } else if kind == State::KIND_ONE {
+ if class == repr[o].low_u16().high_u8() {
+ return u32tosid(repr[o + 2]);
+ }
+ } else {
+ // NOTE: I tried a SWAR technique in the loop below, but found
+ // it slower. See the 'swar' test in the tests for this module.
+ let trans_len = kind.as_usize();
+ let classes_len = u32_len(trans_len);
+ let trans_offset = o + 2 + classes_len;
+ for (i, &chunk) in
+ repr[o + 2..][..classes_len].iter().enumerate()
+ {
+ let classes = chunk.to_ne_bytes();
+ if classes[0] == class {
+ return u32tosid(repr[trans_offset + i * 4]);
+ }
+ if classes[1] == class {
+ return u32tosid(repr[trans_offset + i * 4 + 1]);
+ }
+ if classes[2] == class {
+ return u32tosid(repr[trans_offset + i * 4 + 2]);
+ }
+ if classes[3] == class {
+ return u32tosid(repr[trans_offset + i * 4 + 3]);
+ }
+ }
+ }
+ // For an anchored search, we never follow failure transitions
+ // because failure transitions lead us down a path to matching
+ // a *proper* suffix of the path we were on. Thus, it can only
+ // produce matches that appear after the beginning of the search.
+ if anchored.is_anchored() {
+ return NFA::DEAD;
+ }
+ sid = u32tosid(repr[o + 1]);
+ }
+ }
+
+ #[inline(always)]
+ fn is_special(&self, sid: StateID) -> bool {
+ sid <= self.special.max_special_id
+ }
+
+ #[inline(always)]
+ fn is_dead(&self, sid: StateID) -> bool {
+ sid == NFA::DEAD
+ }
+
+ #[inline(always)]
+ fn is_match(&self, sid: StateID) -> bool {
+ !self.is_dead(sid) && sid <= self.special.max_match_id
+ }
+
+ #[inline(always)]
+ fn is_start(&self, sid: StateID) -> bool {
+ sid == self.special.start_unanchored_id
+ || sid == self.special.start_anchored_id
+ }
+
+ #[inline(always)]
+ fn match_kind(&self) -> MatchKind {
+ self.match_kind
+ }
+
+ #[inline(always)]
+ fn patterns_len(&self) -> usize {
+ self.pattern_lens.len()
+ }
+
+ #[inline(always)]
+ fn pattern_len(&self, pid: PatternID) -> usize {
+ self.pattern_lens[pid].as_usize()
+ }
+
+ #[inline(always)]
+ fn min_pattern_len(&self) -> usize {
+ self.min_pattern_len
+ }
+
+ #[inline(always)]
+ fn max_pattern_len(&self) -> usize {
+ self.max_pattern_len
+ }
+
+ #[inline(always)]
+ fn match_len(&self, sid: StateID) -> usize {
+ State::match_len(self.alphabet_len, &self.repr[sid.as_usize()..])
+ }
+
+ #[inline(always)]
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
+ State::match_pattern(
+ self.alphabet_len,
+ &self.repr[sid.as_usize()..],
+ index,
+ )
+ }
+
+ #[inline(always)]
+ fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ (self.repr.len() * size_of::<u32>())
+ + (self.pattern_lens.len() * size_of::<SmallIndex>())
+ + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
+ }
+
+ #[inline(always)]
+ fn prefilter(&self) -> Option<&Prefilter> {
+ self.prefilter.as_ref()
+ }
+}
+
+impl core::fmt::Debug for NFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::automaton::fmt_state_indicator;
+
+ writeln!(f, "contiguous::NFA(")?;
+ let mut sid = NFA::DEAD; // always the first state and always present
+ loop {
+ let raw = &self.repr[sid.as_usize()..];
+ if raw.is_empty() {
+ break;
+ }
+ let is_match = self.is_match(sid);
+ let state = State::read(self.alphabet_len, is_match, raw);
+ fmt_state_indicator(f, self, sid)?;
+ write!(
+ f,
+ "{:06}({:06}): ",
+ sid.as_usize(),
+ state.fail.as_usize()
+ )?;
+ state.fmt(f)?;
+ write!(f, "\n")?;
+ if self.is_match(sid) {
+ write!(f, " matches: ")?;
+ for i in 0..state.match_len {
+ let pid = State::match_pattern(self.alphabet_len, raw, i);
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{}", pid.as_usize())?;
+ }
+ write!(f, "\n")?;
+ }
+ // The FAIL state doesn't actually have any space allocated for
+ // it, so we have to treat it as a special case and write it out
+ // just below the DEAD state.
+ if sid == NFA::DEAD {
+ writeln!(f, "F {:06}:", NFA::FAIL.as_usize())?;
+ }
+ let len = State::len(self.alphabet_len, is_match, raw);
+ sid = StateID::new(sid.as_usize().checked_add(len).unwrap())
+ .unwrap();
+ }
+ writeln!(f, "match kind: {:?}", self.match_kind)?;
+ writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
+ writeln!(f, "state length: {:?}", self.state_len)?;
+ writeln!(f, "pattern length: {:?}", self.patterns_len())?;
+ writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
+ writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
+ writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
+ writeln!(f, "byte classes: {:?}", self.byte_classes)?;
+ writeln!(f, "memory usage: {:?}", self.memory_usage())?;
+ writeln!(f, ")")?;
+
+ Ok(())
+ }
+}
+
+/// The "in memory" representation a single dense or sparse state.
+///
+/// A `State`'s in memory representation is not ever actually materialized
+/// during a search with a contiguous NFA. Doing so would be too slow. (Indeed,
+/// the only time a `State` is actually constructed is in `Debug` impls.)
+/// Instead, a `State` exposes a number of static methods for reading certain
+/// things from the raw binary encoding of the state.
+#[derive(Clone)]
+struct State<'a> {
+ /// The state to transition to when 'class_to_next' yields a transition
+ /// to the FAIL state.
+ fail: StateID,
+ /// The number of pattern IDs in this state. For a non-match state, this is
+ /// always zero. Otherwise it is always bigger than zero.
+ match_len: usize,
+ /// The sparse or dense representation of the transitions for this state.
+ trans: StateTrans<'a>,
+}
+
+/// The underlying representation of sparse or dense transitions for a state.
+///
+/// Note that like `State`, we don't typically construct values of this type
+/// during a search, since we don't always need all of its values and
+/// materializing them would be a lot of wasted work.
+#[derive(Clone)]
+enum StateTrans<'a> {
+ /// A sparse representation of transitions for a state, where only non-FAIL
+ /// transitions are explicitly represented.
+ Sparse {
+ classes: &'a [u32],
+ /// The transitions for this state, where each transition is packed
+ /// into a u32. The low 8 bits correspond to the byte class for the
+ /// transition, and the high 24 bits correspond to the next state ID.
+ ///
+ /// This packing is why the max state ID allowed for a contiguous
+ /// NFA is 2^24-1.
+ nexts: &'a [u32],
+ },
+ /// A "one transition" state that is never a match state.
+ ///
+ /// These are by far the most common state, so we use a specialized and
+ /// very compact representation for them.
+ One {
+ /// The element of this NFA's alphabet that this transition is
+ /// defined for.
+ class: u8,
+ /// The state this should transition to if the current symbol is
+ /// equal to 'class'.
+ next: u32,
+ },
+ /// A dense representation of transitions for a state, where all
+ /// transitions are explicitly represented, including transitions to the
+ /// FAIL state.
+ Dense {
+ /// A dense set of transitions to other states. The transitions may
+ /// point to a FAIL state, in which case, the search should try the
+ /// same transition lookup at 'fail'.
+ ///
+ /// Note that this is indexed by byte equivalence classes and not
+ /// byte values. That means 'class_to_next[byte]' is wrong and
+ /// 'class_to_next[classes.get(byte)]' is correct. The number of
+ /// transitions is always equivalent to 'classes.alphabet_len()'.
+ class_to_next: &'a [u32],
+ },
+}
+
+impl<'a> State<'a> {
+ /// The offset of where the "kind" of a state is stored. If it isn't one
+ /// of the sentinel values below, then it's a sparse state and the kind
+ /// corresponds to the number of transitions in the state.
+ const KIND: usize = 0;
+
+ /// A sentinel value indicating that the state uses a dense representation.
+ const KIND_DENSE: u32 = 0xFF;
+ /// A sentinel value indicating that the state uses a special "one
+ /// transition" encoding. In practice, non-match states with one transition
+ /// make up the overwhelming majority of all states in any given
+ /// Aho-Corasick automaton, so we can specialize them using a very compact
+ /// representation.
+ const KIND_ONE: u32 = 0xFE;
+
+ /// The maximum number of transitions to encode as a sparse state. Usually
+ /// states with a lot of transitions are either very rare, or occur near
+ /// the start state. In the latter case, they are probably dense already
+ /// anyway. In the former case, making them dense is fine because they're
+ /// rare.
+ ///
+ /// This needs to be small enough to permit each of the sentinel values for
+ /// 'KIND' above. Namely, a sparse state embeds the number of transitions
+ /// into the 'KIND'. Basically, "sparse" is a state kind too, but it's the
+ /// "else" branch.
+ ///
+ /// N.B. There isn't anything particularly magical about 127 here. I
+ /// just picked it because I figured any sparse state with this many
+ /// transitions is going to be exceptionally rare, and if it did have this
+ /// many transitions, then it would be quite slow to do a linear scan on
+ /// the transitions during a search anyway.
+ const MAX_SPARSE_TRANSITIONS: usize = 127;
+
+ /// Remap state IDs in-place.
+ ///
+ /// `state` should be the raw binary encoding of a state. (The start
+ /// of the slice must correspond to the start of the state, but the slice
+ /// may extend past the end of the encoding of the state.)
+ fn remap(
+ alphabet_len: usize,
+ old_to_new: &[StateID],
+ state: &mut [u32],
+ ) -> Result<(), BuildError> {
+ let kind = State::kind(state);
+ if kind == State::KIND_DENSE {
+ state[1] = old_to_new[state[1].as_usize()].as_u32();
+ for next in state[2..][..alphabet_len].iter_mut() {
+ *next = old_to_new[next.as_usize()].as_u32();
+ }
+ } else if kind == State::KIND_ONE {
+ state[1] = old_to_new[state[1].as_usize()].as_u32();
+ state[2] = old_to_new[state[2].as_usize()].as_u32();
+ } else {
+ let trans_len = State::sparse_trans_len(state);
+ let classes_len = u32_len(trans_len);
+ state[1] = old_to_new[state[1].as_usize()].as_u32();
+ for next in state[2 + classes_len..][..trans_len].iter_mut() {
+ *next = old_to_new[next.as_usize()].as_u32();
+ }
+ }
+ Ok(())
+ }
+
+ /// Returns the length, in number of u32s, of this state.
+ ///
+ /// This is useful for reading states consecutively, e.g., in the Debug
+ /// impl without needing to store a separate map from state index to state
+ /// identifier.
+ ///
+ /// `state` should be the raw binary encoding of a state. (The start
+ /// of the slice must correspond to the start of the state, but the slice
+ /// may extend past the end of the encoding of the state.)
+ fn len(alphabet_len: usize, is_match: bool, state: &[u32]) -> usize {
+ let kind_len = 1;
+ let fail_len = 1;
+ let kind = State::kind(state);
+ let (classes_len, trans_len) = if kind == State::KIND_DENSE {
+ (0, alphabet_len)
+ } else if kind == State::KIND_ONE {
+ (0, 1)
+ } else {
+ let trans_len = State::sparse_trans_len(state);
+ let classes_len = u32_len(trans_len);
+ (classes_len, trans_len)
+ };
+ let match_len = if !is_match {
+ 0
+ } else if State::match_len(alphabet_len, state) == 1 {
+ // This is a special case because when there is one pattern ID for
+ // a match state, it is represented by a single u32 with its high
+ // bit set (which is impossible for a valid pattern ID).
+ 1
+ } else {
+ // We add 1 to include the u32 that indicates the number of
+ // pattern IDs that follow.
+ 1 + State::match_len(alphabet_len, state)
+ };
+ kind_len + fail_len + classes_len + trans_len + match_len
+ }
+
+ /// Returns the kind of this state.
+ ///
+ /// This only includes the low byte.
+ #[inline(always)]
+ fn kind(state: &[u32]) -> u32 {
+ state[State::KIND] & 0xFF
+ }
+
+ /// Get the number of sparse transitions in this state. This can never
+ /// be more than State::MAX_SPARSE_TRANSITIONS, as all states with more
+ /// transitions are encoded as dense states.
+ ///
+ /// `state` should be the raw binary encoding of a sparse state. (The
+ /// start of the slice must correspond to the start of the state, but the
+ /// slice may extend past the end of the encoding of the state.) If this
+ /// isn't a sparse state, then the return value is unspecified.
+ ///
+ /// Do note that this is only legal to call on a sparse state. So for
+ /// example, "one transition" state is not a sparse state, so it would not
+ /// be legal to call this method on such a state.
+ #[inline(always)]
+ fn sparse_trans_len(state: &[u32]) -> usize {
+ (state[State::KIND] & 0xFF).as_usize()
+ }
+
+ /// Returns the total number of matching pattern IDs in this state. Calling
+ /// this on a state that isn't a match results in unspecified behavior.
+ /// Thus, the returned number is never 0 for all correct calls.
+ ///
+ /// `state` should be the raw binary encoding of a state. (The start
+ /// of the slice must correspond to the start of the state, but the slice
+ /// may extend past the end of the encoding of the state.)
+ #[inline(always)]
+ fn match_len(alphabet_len: usize, state: &[u32]) -> usize {
+ // We don't need to handle KIND_ONE here because it can never be a
+ // match state.
+ let packed = if State::kind(state) == State::KIND_DENSE {
+ let start = 2 + alphabet_len;
+ state[start].as_usize()
+ } else {
+ let trans_len = State::sparse_trans_len(state);
+ let classes_len = u32_len(trans_len);
+ let start = 2 + classes_len + trans_len;
+ state[start].as_usize()
+ };
+ if packed & (1 << 31) == 0 {
+ packed
+ } else {
+ 1
+ }
+ }
+
+ /// Returns the pattern ID corresponding to the given index for the state
+ /// given. The `index` provided must be less than the number of pattern IDs
+ /// in this state.
+ ///
+ /// `state` should be the raw binary encoding of a state. (The start of
+ /// the slice must correspond to the start of the state, but the slice may
+ /// extend past the end of the encoding of the state.)
+ ///
+ /// If the given state is not a match state or if the index is out of
+ /// bounds, then this has unspecified behavior.
+ #[inline(always)]
+ fn match_pattern(
+ alphabet_len: usize,
+ state: &[u32],
+ index: usize,
+ ) -> PatternID {
+ // We don't need to handle KIND_ONE here because it can never be a
+ // match state.
+ let start = if State::kind(state) == State::KIND_DENSE {
+ 2 + alphabet_len
+ } else {
+ let trans_len = State::sparse_trans_len(state);
+ let classes_len = u32_len(trans_len);
+ 2 + classes_len + trans_len
+ };
+ let packed = state[start];
+ let pid = if packed & (1 << 31) == 0 {
+ state[start + 1 + index]
+ } else {
+ assert_eq!(0, index);
+ packed & !(1 << 31)
+ };
+ PatternID::from_u32_unchecked(pid)
+ }
+
+ /// Read a state's binary encoding to its in-memory representation.
+ ///
+ /// `alphabet_len` should be the total number of transitions defined for
+ /// dense states.
+ ///
+ /// `is_match` should be true if this state is a match state and false
+ /// otherwise.
+ ///
+ /// `state` should be the raw binary encoding of a state. (The start
+ /// of the slice must correspond to the start of the state, but the slice
+ /// may extend past the end of the encoding of the state.)
+ fn read(
+ alphabet_len: usize,
+ is_match: bool,
+ state: &'a [u32],
+ ) -> State<'a> {
+ let kind = State::kind(state);
+ let match_len =
+ if !is_match { 0 } else { State::match_len(alphabet_len, state) };
+ let (trans, fail) = if kind == State::KIND_DENSE {
+ let fail = StateID::from_u32_unchecked(state[1]);
+ let class_to_next = &state[2..][..alphabet_len];
+ (StateTrans::Dense { class_to_next }, fail)
+ } else if kind == State::KIND_ONE {
+ let fail = StateID::from_u32_unchecked(state[1]);
+ let class = state[State::KIND].low_u16().high_u8();
+ let next = state[2];
+ (StateTrans::One { class, next }, fail)
+ } else {
+ let fail = StateID::from_u32_unchecked(state[1]);
+ let trans_len = State::sparse_trans_len(state);
+ let classes_len = u32_len(trans_len);
+ let classes = &state[2..][..classes_len];
+ let nexts = &state[2 + classes_len..][..trans_len];
+ (StateTrans::Sparse { classes, nexts }, fail)
+ };
+ State { fail, match_len, trans }
+ }
+
+ /// Encode the "old" state from a noncontiguous NFA to its binary
+ /// representation to the given `dst` slice. `classes` should be the byte
+ /// classes computed for the noncontiguous NFA that the given state came
+ /// from.
+ ///
+ /// This returns an error if `dst` became so big that `StateID`s can no
+ /// longer be created for new states. Otherwise, it returns the state ID of
+ /// the new state created.
+ ///
+ /// When `force_dense` is true, then the encoded state will always use a
+ /// dense format. Otherwise, the choice between dense and sparse will be
+ /// automatically chosen based on the old state.
+ fn write(
+ nnfa: &noncontiguous::NFA,
+ oldsid: StateID,
+ old: &noncontiguous::State,
+ classes: &ByteClasses,
+ dst: &mut Vec<u32>,
+ force_dense: bool,
+ ) -> Result<StateID, BuildError> {
+ let sid = StateID::new(dst.len()).map_err(|e| {
+ BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
+ })?;
+ let old_len = nnfa.iter_trans(oldsid).count();
+ // For states with a lot of transitions, we might as well just make
+ // them dense. These kinds of hot states tend to be very rare, so we're
+ // okay with it. This also gives us more sentinels in the state's
+ // 'kind', which lets us create different state kinds to save on
+ // space.
+ let kind = if force_dense || old_len > State::MAX_SPARSE_TRANSITIONS {
+ State::KIND_DENSE
+ } else if old_len == 1 && !old.is_match() {
+ State::KIND_ONE
+ } else {
+ // For a sparse state, the kind is just the number of transitions.
+ u32::try_from(old_len).unwrap()
+ };
+ if kind == State::KIND_DENSE {
+ dst.push(kind);
+ dst.push(old.fail().as_u32());
+ State::write_dense_trans(nnfa, oldsid, classes, dst)?;
+ } else if kind == State::KIND_ONE {
+ let t = nnfa.iter_trans(oldsid).next().unwrap();
+ let class = u32::from(classes.get(t.byte()));
+ dst.push(kind | (class << 8));
+ dst.push(old.fail().as_u32());
+ dst.push(t.next().as_u32());
+ } else {
+ dst.push(kind);
+ dst.push(old.fail().as_u32());
+ State::write_sparse_trans(nnfa, oldsid, classes, dst)?;
+ }
+ // Now finally write the number of matches and the matches themselves.
+ if old.is_match() {
+ let matches_len = nnfa.iter_matches(oldsid).count();
+ if matches_len == 1 {
+ let pid = nnfa.iter_matches(oldsid).next().unwrap().as_u32();
+ assert_eq!(0, pid & (1 << 31));
+ dst.push((1 << 31) | pid);
+ } else {
+ assert_eq!(0, matches_len & (1 << 31));
+ dst.push(matches_len.as_u32());
+ dst.extend(nnfa.iter_matches(oldsid).map(|pid| pid.as_u32()));
+ }
+ }
+ Ok(sid)
+ }
+
+ /// Encode the "old" state transitions from a noncontiguous NFA to its
+ /// binary sparse representation to the given `dst` slice. `classes` should
+ /// be the byte classes computed for the noncontiguous NFA that the given
+ /// state came from.
+ ///
+ /// This returns an error if `dst` became so big that `StateID`s can no
+ /// longer be created for new states.
+ fn write_sparse_trans(
+ nnfa: &noncontiguous::NFA,
+ oldsid: StateID,
+ classes: &ByteClasses,
+ dst: &mut Vec<u32>,
+ ) -> Result<(), BuildError> {
+ let (mut chunk, mut len) = ([0; 4], 0);
+ for t in nnfa.iter_trans(oldsid) {
+ chunk[len] = classes.get(t.byte());
+ len += 1;
+ if len == 4 {
+ dst.push(u32::from_ne_bytes(chunk));
+ chunk = [0; 4];
+ len = 0;
+ }
+ }
+ if len > 0 {
+ // In the case where the number of transitions isn't divisible
+ // by 4, the last u32 chunk will have some left over room. In
+ // this case, we "just" repeat the last equivalence class. By
+ // doing this, we know the leftover faux transitions will never
+ // be followed: if one were, the earlier occurrence of that same
+ // equivalence class in the chunk would have been followed first.
+ // This saves us some branching in the search time state
+ // transition code.
+ let repeat = chunk[len - 1];
+ while len < 4 {
+ chunk[len] = repeat;
+ len += 1;
+ }
+ dst.push(u32::from_ne_bytes(chunk));
+ }
+ for t in nnfa.iter_trans(oldsid) {
+ dst.push(t.next().as_u32());
+ }
+ Ok(())
+ }
+
+ /// Encode the "old" state transitions from a noncontiguous NFA to its
+ /// binary dense representation to the given `dst` slice. `classes` should
+ /// be the byte classes computed for the noncontiguous NFA that the given
+ /// state came from.
+ ///
+ /// This returns an error if `dst` became so big that `StateID`s can no
+ /// longer be created for new states.
+ fn write_dense_trans(
+ nnfa: &noncontiguous::NFA,
+ oldsid: StateID,
+ classes: &ByteClasses,
+ dst: &mut Vec<u32>,
+ ) -> Result<(), BuildError> {
+ // Our byte classes let us shrink the size of our dense states to the
+ // number of equivalence classes instead of just fixing it to 256.
+ // Any non-explicitly defined transition is just a transition to the
+ // FAIL state, so we fill that in first and then overwrite them with
+ // explicitly defined transitions. (Most states probably only have one
+ // or two explicitly defined transitions.)
+ //
+ // N.B. Remember that while building the contiguous NFA, we use state
+ // IDs from the noncontiguous NFA. It isn't until we've added all
+ // states that we go back and map noncontiguous IDs to contiguous IDs.
+ let start = dst.len();
+ dst.extend(
+ core::iter::repeat(noncontiguous::NFA::FAIL.as_u32())
+ .take(classes.alphabet_len()),
+ );
+ assert!(start < dst.len(), "equivalence classes are never empty");
+ for t in nnfa.iter_trans(oldsid) {
+ dst[start + usize::from(classes.get(t.byte()))] =
+ t.next().as_u32();
+ }
+ Ok(())
+ }
+
+ /// Return an iterator over every explicitly defined transition in this
+ /// state.
+ fn transitions<'b>(&'b self) -> impl Iterator<Item = (u8, StateID)> + 'b {
+ let mut i = 0;
+ core::iter::from_fn(move || match self.trans {
+ StateTrans::Sparse { classes, nexts } => {
+ if i >= nexts.len() {
+ return None;
+ }
+ let chunk = classes[i / 4];
+ let class = chunk.to_ne_bytes()[i % 4];
+ let next = StateID::from_u32_unchecked(nexts[i]);
+ i += 1;
+ Some((class, next))
+ }
+ StateTrans::One { class, next } => {
+ if i == 0 {
+ i += 1;
+ Some((class, StateID::from_u32_unchecked(next)))
+ } else {
+ None
+ }
+ }
+ StateTrans::Dense { class_to_next } => {
+ if i >= class_to_next.len() {
+ return None;
+ }
+ let class = i.as_u8();
+ let next = StateID::from_u32_unchecked(class_to_next[i]);
+ i += 1;
+ Some((class, next))
+ }
+ })
+ }
+}
+
+impl<'a> core::fmt::Debug for State<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ use crate::{automaton::sparse_transitions, util::debug::DebugByte};
+
+ let it = sparse_transitions(self.transitions())
+ // Writing out all FAIL transitions is quite noisy. Instead, we
+ // just require readers of the output to assume anything absent
+ // maps to the FAIL transition.
+ .filter(|&(_, _, sid)| sid != NFA::FAIL)
+ .enumerate();
+ for (i, (start, end, sid)) in it {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ sid.as_usize()
+ )?;
+ }
+ }
+ Ok(())
+ }
+}
+
+/// A builder for configuring an Aho-Corasick contiguous NFA.
+///
+/// This builder has a subset of the options available to a
+/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
+/// their behavior is identical.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ noncontiguous: noncontiguous::Builder,
+ dense_depth: usize,
+ byte_classes: bool,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder {
+ noncontiguous: noncontiguous::Builder::new(),
+ dense_depth: 2,
+ byte_classes: true,
+ }
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring an Aho-Corasick contiguous NFA.
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ /// Build an Aho-Corasick contiguous NFA from the given iterator of
+ /// patterns.
+ ///
+ /// A builder may be reused to create more NFAs.
+ pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ let nnfa = self.noncontiguous.build(patterns)?;
+ self.build_from_noncontiguous(&nnfa)
+ }
+
+ /// Build an Aho-Corasick contiguous NFA from the given noncontiguous NFA.
+ ///
+ /// Note that when this method is used, only the `dense_depth` and
+ /// `byte_classes` settings on this builder are respected. The other
+ /// settings only apply to the initial construction of the Aho-Corasick
+ /// automaton. Since using this method requires that initial construction
+ /// has already completed, all settings impacting only initial construction
+ /// are no longer relevant.
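+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the explicit two-step construction:
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// automaton::Automaton,
+ /// nfa::{contiguous, noncontiguous},
+ /// Input, Match,
+ /// };
+ ///
+ /// let nnfa = noncontiguous::NFA::new(&["foo", "bar"]).unwrap();
+ /// let nfa = contiguous::NFA::builder()
+ /// .build_from_noncontiguous(&nnfa)
+ /// .unwrap();
+ /// assert_eq!(
+ /// Some(Match::must(1, 0..3)),
+ /// nfa.try_find(&Input::new("bar foo"))?,
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```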
+ pub fn build_from_noncontiguous(
+ &self,
+ nnfa: &noncontiguous::NFA,
+ ) -> Result<NFA, BuildError> {
+ debug!("building contiguous NFA");
+ let byte_classes = if self.byte_classes {
+ nnfa.byte_classes().clone()
+ } else {
+ ByteClasses::singletons()
+ };
+ let mut index_to_state_id = vec![NFA::DEAD; nnfa.states().len()];
+ let mut nfa = NFA {
+ repr: vec![],
+ pattern_lens: nnfa.pattern_lens_raw().to_vec(),
+ state_len: nnfa.states().len(),
+ prefilter: nnfa.prefilter().map(|p| p.clone()),
+ match_kind: nnfa.match_kind(),
+ alphabet_len: byte_classes.alphabet_len(),
+ byte_classes,
+ min_pattern_len: nnfa.min_pattern_len(),
+ max_pattern_len: nnfa.max_pattern_len(),
+ // The special state IDs are set later.
+ special: Special::zero(),
+ };
+ for (oldsid, state) in nnfa.states().iter().with_state_ids() {
+ // We don't actually encode a fail state since it isn't necessary.
+ // But we still want to make sure any FAIL ids are mapped
+ // correctly.
+ if oldsid == noncontiguous::NFA::FAIL {
+ index_to_state_id[oldsid] = NFA::FAIL;
+ continue;
+ }
+ let force_dense = state.depth().as_usize() < self.dense_depth;
+ let newsid = State::write(
+ nnfa,
+ oldsid,
+ state,
+ &nfa.byte_classes,
+ &mut nfa.repr,
+ force_dense,
+ )?;
+ index_to_state_id[oldsid] = newsid;
+ }
+ for &newsid in index_to_state_id.iter() {
+ if newsid == NFA::FAIL {
+ continue;
+ }
+ let state = &mut nfa.repr[newsid.as_usize()..];
+ State::remap(nfa.alphabet_len, &index_to_state_id, state)?;
+ }
+ // Now that we've remapped all the IDs in our states, all that's left
+ // is remapping the special state IDs.
+ let remap = &index_to_state_id;
+ let old = nnfa.special();
+ let new = &mut nfa.special;
+ new.max_special_id = remap[old.max_special_id];
+ new.max_match_id = remap[old.max_match_id];
+ new.start_unanchored_id = remap[old.start_unanchored_id];
+ new.start_anchored_id = remap[old.start_anchored_id];
+ debug!(
+ "contiguous NFA built, <states: {:?}, size: {:?}, \
+ alphabet len: {:?}>",
+ nfa.state_len,
+ nfa.memory_usage(),
+ nfa.byte_classes.alphabet_len(),
+ );
+ // The vectors can grow ~twice as big during construction because a
+ // Vec amortizes growth. But here, let's shrink things back down to
+ // what we actually need since we're never going to add more to it.
+ nfa.repr.shrink_to_fit();
+ nfa.pattern_lens.shrink_to_fit();
+ Ok(nfa)
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
+ /// for more documentation and examples.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
+ self.noncontiguous.match_kind(kind);
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
+ /// for more documentation and examples.
+ pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.noncontiguous.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// This only applies when using [`Builder::build`] and not
+ /// [`Builder::build_from_noncontiguous`].
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
+ /// for more documentation and examples.
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
+ self.noncontiguous.prefilter(yes);
+ self
+ }
+
+ /// Set the limit on how many states use a dense representation for their
+ /// transitions. Other states will generally use a sparse representation.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
+ /// for more documentation and examples.
+ pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
+ self.dense_depth = depth;
+ self
+ }
+
+ /// A debug setting for whether to attempt to shrink the size of the
+ /// automaton's alphabet or not.
+ ///
+ /// This should never be disabled unless you're debugging an automaton.
+ /// Namely, disabling byte classes makes transitions easier to reason
+ /// about, since they use the actual bytes instead of equivalence classes.
+ /// Disabling this confers no performance benefit at search time.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
+ /// for more documentation and examples.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+ self.byte_classes = yes;
+ self
+ }
+}
+
+/// Computes the number of u32 values needed to store one byte for each of
+/// the given number of transitions. That is, `ceil(ntrans / 4)`.
+fn u32_len(ntrans: usize) -> usize {
+ if ntrans % 4 == 0 {
+ ntrans >> 2
+ } else {
+ (ntrans >> 2) + 1
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ // This test demonstrates a SWAR technique I tried in the sparse transition
+ // code inside of 'next_state'. Namely, sparse transitions work by
+ // iterating over u32 chunks, with each chunk containing up to 4 classes
+ // corresponding to 4 transitions. This SWAR technique lets us find a
+ // matching transition without converting the u32 to a [u8; 4].
+ //
+ // It turned out to be a little slower unfortunately, which isn't too
+ // surprising, since this is likely a throughput oriented optimization.
+ // Loop unrolling doesn't really help us because the vast majority of
+ // states have very few transitions.
+ //
+ // Anyway, this code was a little tricky to write, so I converted it to a
+ // test in case someone figures out how to use it more effectively than
+ // I could.
+ //
+ // (This also only works on little endian. So big endian would need to be
+ // accounted for if we ever decided to use this I think.)
+ #[cfg(target_endian = "little")]
+ #[test]
+ fn swar() {
+ use super::*;
+
+ fn has_zero_byte(x: u32) -> u32 {
+ const LO_U32: u32 = 0x01010101;
+ const HI_U32: u32 = 0x80808080;
+
+ x.wrapping_sub(LO_U32) & !x & HI_U32
+ }
+
+ fn broadcast(b: u8) -> u32 {
+ (u32::from(b)) * (u32::MAX / 255)
+ }
+
+ fn index_of(x: u32) -> usize {
+ let o =
+ (((x - 1) & 0x01010101).wrapping_mul(0x01010101) >> 24) - 1;
+ o.as_usize()
+ }
+
+ let bytes: [u8; 4] = [b'1', b'A', b'a', b'z'];
+ let chunk = u32::from_ne_bytes(bytes);
+
+ let needle = broadcast(b'1');
+ assert_eq!(0, index_of(has_zero_byte(needle ^ chunk)));
+ let needle = broadcast(b'A');
+ assert_eq!(1, index_of(has_zero_byte(needle ^ chunk)));
+ let needle = broadcast(b'a');
+ assert_eq!(2, index_of(has_zero_byte(needle ^ chunk)));
+ let needle = broadcast(b'z');
+ assert_eq!(3, index_of(has_zero_byte(needle ^ chunk)));
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/nfa/mod.rs b/third_party/rust/aho-corasick/src/nfa/mod.rs
new file mode 100644
index 0000000000..93f4dc25c2
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/nfa/mod.rs
@@ -0,0 +1,40 @@
+/*!
+Provides direct access to NFA implementations of Aho-Corasick.
+
+The principle characteristic of an NFA in this crate is that it may
+transition through multiple states per byte of haystack. In Aho-Corasick
+parlance, NFAs follow failure transitions during a search. In contrast,
+a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during
+compilation at the expense of a much bigger memory footprint.
+
+Currently, there are two NFA implementations provided: noncontiguous and
+contiguous. The names reflect their internal representation, and consequently,
+the trade offs associated with them:
+
+* A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to
+represent its transitions in a sparse format. This is ideal for building an
+NFA, since it cheaply permits different states to have a different number of
+transitions. A noncontiguous NFA is where the main Aho-Corasick construction
+algorithm is implemented. All other Aho-Corasick implementations are built by
+first constructing a noncontiguous NFA.
+* A [`contiguous::NFA`] uses a single allocation to represent all states,
+while still encoding most states as sparse states but permitting states near
+the starting state to have a dense representation. The dense representation
+uses more memory, but permits computing transitions during a search more
+quickly. By only making the most active states dense (the states near the
+starting state), a contiguous NFA better balances memory usage with search
+speed. The single contiguous allocation also uses less overhead per state and
+enables compression tricks where most states only use 8 bytes of heap memory.
+
+When given the choice between these two, you almost always want to pick a
+contiguous NFA. It takes only a little longer to build, but both its memory
+usage and search speed are typically much better than a noncontiguous NFA. A
+noncontiguous NFA is useful when prioritizing build times, or when there are
+so many patterns that a contiguous NFA could not be built. (Currently, because
+of the compression tricks used to improve memory usage and search speed, a
+contiguous NFA has a smaller internal limit on the total number of NFA states
+it can represent. But you
+would likely need to have hundreds of thousands or even millions of patterns
+before you hit this limit.)
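+
+As a small sketch, both NFAs can be built directly and searched through the
+same [`Automaton`](crate::automaton::Automaton) trait:
+
+```
+use aho_corasick::{
+ automaton::Automaton,
+ nfa::{contiguous, noncontiguous},
+ Input, Match,
+};
+
+let patterns = &["samwise", "sam"];
+let haystack = "samwise";
+
+let nnfa = noncontiguous::NFA::new(patterns).unwrap();
+let cnfa = contiguous::NFA::new(patterns).unwrap();
+let expected = Some(Match::must(1, 0..3));
+assert_eq!(expected, nnfa.try_find(&Input::new(haystack))?);
+assert_eq!(expected, cnfa.try_find(&Input::new(haystack))?);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```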
+*/
+pub mod contiguous;
+pub mod noncontiguous;
diff --git a/third_party/rust/aho-corasick/src/nfa/noncontiguous.rs b/third_party/rust/aho-corasick/src/nfa/noncontiguous.rs
new file mode 100644
index 0000000000..af32617c90
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/nfa/noncontiguous.rs
@@ -0,0 +1,1762 @@
+/*!
+Provides a noncontiguous NFA implementation of Aho-Corasick.
+
+This is a low-level API that generally only needs to be used in niche
+circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
+instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
+only necessary when one needs access to the [`Automaton`] trait implementation.
+*/
+
+use alloc::{
+ collections::{BTreeSet, VecDeque},
+ vec,
+ vec::Vec,
+};
+
+use crate::{
+ automaton::Automaton,
+ util::{
+ alphabet::{ByteClassSet, ByteClasses},
+ error::{BuildError, MatchError},
+ prefilter::{self, opposite_ascii_case, Prefilter},
+ primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
+ remapper::Remapper,
+ search::{Anchored, MatchKind},
+ special::Special,
+ },
+};
+
+/// A noncontiguous NFA implementation of Aho-Corasick.
+///
+/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
+/// this type directly. Using an `NFA` directly is typically only necessary
+/// when one needs access to the [`Automaton`] trait implementation.
+///
+/// This NFA represents the "core" implementation of Aho-Corasick in this
+/// crate. Namely, constructing this NFA involves building a trie and then
+/// filling in the failure transitions between states, similar to what is
+/// described in any standard textbook description of Aho-Corasick.
+///
+/// In order to minimize heap usage and to avoid additional construction costs,
+/// this implementation represents the transitions of all states as distinct
+/// sparse memory allocations. This is where it gets its name from. That is,
+/// this NFA has no contiguous memory allocation for its transition table. Each
+/// state gets its own allocation.
+///
+/// While the sparse representation keeps memory usage to somewhat reasonable
+/// levels, it is still quite large and also results in somewhat mediocre
+/// search performance. For this reason, it is almost always a good idea to
+/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
+/// marginally slower to build, but has higher throughput and can sometimes use
+/// an order of magnitude less memory. The main reason to use a noncontiguous
+/// NFA is when you need the fastest possible construction time, or when a
+/// contiguous NFA does not have the desired capacity. (The total number of NFA
+/// states it can have is fewer than a noncontiguous NFA.)
+///
+/// # Example
+///
+/// This example shows how to build an `NFA` directly and use it to execute
+/// [`Automaton::try_find`]:
+///
+/// ```
+/// use aho_corasick::{
+/// automaton::Automaton,
+/// nfa::noncontiguous::NFA,
+/// Input, Match,
+/// };
+///
+/// let patterns = &["b", "abc", "abcd"];
+/// let haystack = "abcd";
+///
+/// let nfa = NFA::new(patterns).unwrap();
+/// assert_eq!(
+/// Some(Match::must(0, 1..2)),
+/// nfa.try_find(&Input::new(haystack))?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// It is also possible to implement your own version of `try_find`. See the
+/// [`Automaton`] documentation for an example.
+#[derive(Clone)]
+pub struct NFA {
+ /// The match semantics built into this NFA.
+ match_kind: MatchKind,
+ /// A set of states. Each state defines its own transitions, a fail
+ /// transition and a set of indices corresponding to matches.
+ ///
+ /// The first state is always the fail state, which is used only as a
+ /// sentinel. Namely, in the final NFA, no transition into the fail state
+ /// exists. (Well, they do, but they aren't followed. Instead, the state's
+ /// failure transition is followed.)
+ ///
+ /// The second state (index 1) is always the dead state. Dead states are
+ /// in every automaton, but only used when leftmost-{first,longest} match
+ /// semantics are enabled. Specifically, they instruct search to stop
+ /// at specific points in order to report the correct match location. In
+ /// the standard Aho-Corasick construction, there are no transitions to
+ /// the dead state.
+ ///
+ /// The third state (index 2) is generally intended to be the starting or
+ /// "root" state.
+ states: Vec<State>,
+ /// Transitions stored in a sparse representation via a linked list.
+ ///
+ /// Each transition contains three pieces of information: the byte it
+ /// is defined for, the state it transitions to and a link to the next
+ /// transition in the same state (or `StateID::ZERO` if it is the last
+ /// transition).
+ ///
+ /// The first transition for each state is determined by `State::sparse`.
+ ///
+ /// Note that this contains a complete set of all transitions in this NFA,
+ /// including states that have a dense representation for transitions.
+ /// (Adding dense transitions for a state doesn't remove its sparse
+ /// transitions, since deleting transitions from this particular sparse
+ /// representation would be fairly expensive.)
+ sparse: Vec<Transition>,
+ /// Transitions stored in a dense representation.
+ ///
+ /// A state has a row in this table if and only if `State::dense` is
+ /// not equal to `StateID::ZERO`. When not zero, there are precisely
+ /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
+ /// in this table.
+ ///
+ /// Generally a very small minority of states have a dense representation
+ /// since it uses so much memory.
+ dense: Vec<StateID>,
+ /// Matches stored in linked list for each state.
+ ///
+ /// Like sparse transitions, each match has a link to the next match in the
+ /// state.
+ ///
+ /// The first match for each state is determined by `State::matches`.
+ matches: Vec<Match>,
+ /// The length, in bytes, of each pattern in this NFA. This slice is
+ /// indexed by `PatternID`.
+ ///
+ /// The number of entries in this vector corresponds to the total number of
+ /// patterns in this automaton.
+ pattern_lens: Vec<SmallIndex>,
+ /// A prefilter for quickly skipping to candidate matches, if pertinent.
+ prefilter: Option<Prefilter>,
+    /// A set of equivalence classes in terms of bytes. We compute this while
+    /// building the NFA and use it both for states with a dense
+    /// representation of their transitions and for building the DFA. We
+    /// store it on the NFA since it's easy to compute while visiting the
+    /// patterns.
+ byte_classes: ByteClasses,
+ /// The length, in bytes, of the shortest pattern in this automaton. This
+ /// information is useful for detecting whether an automaton matches the
+ /// empty string or not.
+ min_pattern_len: usize,
+ /// The length, in bytes, of the longest pattern in this automaton. This
+ /// information is useful for keeping correct buffer sizes when searching
+ /// on streams.
+ max_pattern_len: usize,
+ /// The information required to deduce which states are "special" in this
+ /// NFA.
+ ///
+ /// Since the DEAD and FAIL states are always the first two states and
+ /// there are only ever two start states (which follow all of the match
+ /// states), it follows that we can determine whether a state is a fail,
+ /// dead, match or start with just a few comparisons on the ID itself:
+ ///
+ /// is_dead(sid): sid == NFA::DEAD
+ /// is_fail(sid): sid == NFA::FAIL
+ /// is_match(sid): NFA::FAIL < sid && sid <= max_match_id
+ /// is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
+ ///
+ /// Note that this only applies to the NFA after it has been constructed.
+ /// During construction, the start states are the first ones added and the
+    /// match states are interleaved with non-match states. Once all of the
+ /// states have been added, the states are shuffled such that the above
+ /// predicates hold.
+ special: Special,
+}
+
+impl NFA {
+ /// Create a new Aho-Corasick noncontiguous NFA using the default
+ /// configuration.
+ ///
+ /// Use a [`Builder`] if you want to change the configuration.
+ pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ NFA::builder().build(patterns)
+ }
+
+ /// A convenience method for returning a new Aho-Corasick noncontiguous NFA
+ /// builder.
+ ///
+ /// This usually permits one to just import the `NFA` type.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+impl NFA {
+ /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
+ /// instructs any search to stop and return any currently recorded match,
+ /// or no match otherwise. Generally speaking, it is impossible for an
+ /// unanchored standard search to enter a DEAD state. But an anchored
+    /// search can, and so too can a leftmost search.
+ ///
+ /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
+    /// decision across the other Aho-Corasick automata, so that DEAD
+ /// states there are always 0 too. It's not that we need all of the
+ /// implementations to agree, but rather, the contiguous NFA and the DFA
+ /// use a sort of "premultiplied" state identifier where the only state
+ /// whose ID is always known and constant is the first state. Subsequent
+ /// state IDs depend on how much space has already been used in the
+ /// transition table.
+ pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
+ /// The FAIL state mostly just corresponds to the ID of any transition on a
+ /// state that isn't explicitly defined. When one transitions into the FAIL
+ /// state, one must follow the previous state's failure transition before
+ /// doing the next state lookup. In this way, FAIL is more of a sentinel
+ /// than a state that one actually transitions into. In particular, it is
+ /// never exposed in the `Automaton` interface.
+ pub(crate) const FAIL: StateID = StateID::new_unchecked(1);
+
+ /// Returns the equivalence classes of bytes found while constructing
+ /// this NFA.
+ ///
+    /// Note that this NFA only makes use of these equivalence classes for
+    /// states with a dense representation of their transitions. They are
+    /// also useful for building the DFA when desired.
+ pub(crate) fn byte_classes(&self) -> &ByteClasses {
+ &self.byte_classes
+ }
+
+ /// Returns a slice containing the length of each pattern in this searcher.
+ /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
+ ///
+ /// This is exposed for convenience when building a contiguous NFA. But it
+ /// can be reconstructed from the `Automaton` API if necessary.
+ pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
+ &self.pattern_lens
+ }
+
+ /// Returns a slice of all states in this non-contiguous NFA.
+ pub(crate) fn states(&self) -> &[State] {
+ &self.states
+ }
+
+ /// Returns the underlying "special" state information for this NFA.
+ pub(crate) fn special(&self) -> &Special {
+ &self.special
+ }
+
+ /// Swaps the states at `id1` and `id2`.
+ ///
+ /// This does not update the transitions of any state to account for the
+ /// state swap.
+ pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ self.states.swap(id1.as_usize(), id2.as_usize());
+ }
+
+ /// Re-maps all state IDs in this NFA according to the `map` function
+ /// given.
+ pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ let alphabet_len = self.byte_classes.alphabet_len();
+ for state in self.states.iter_mut() {
+ state.fail = map(state.fail);
+ let mut link = state.sparse;
+ while link != StateID::ZERO {
+ let t = &mut self.sparse[link];
+ t.next = map(t.next);
+ link = t.link;
+ }
+ if state.dense != StateID::ZERO {
+ let start = state.dense.as_usize();
+ for next in self.dense[start..][..alphabet_len].iter_mut() {
+ *next = map(*next);
+ }
+ }
+ }
+ }
+
+ /// Iterate over all of the transitions for the given state ID.
+ pub(crate) fn iter_trans(
+ &self,
+ sid: StateID,
+ ) -> impl Iterator<Item = Transition> + '_ {
+ let mut link = self.states[sid].sparse;
+ core::iter::from_fn(move || {
+ if link == StateID::ZERO {
+ return None;
+ }
+ let t = self.sparse[link];
+ link = t.link;
+ Some(t)
+ })
+ }
+
+ /// Iterate over all of the matches for the given state ID.
+ pub(crate) fn iter_matches(
+ &self,
+ sid: StateID,
+ ) -> impl Iterator<Item = PatternID> + '_ {
+ let mut link = self.states[sid].matches;
+ core::iter::from_fn(move || {
+ if link == StateID::ZERO {
+ return None;
+ }
+ let m = self.matches[link];
+ link = m.link;
+ Some(m.pid)
+ })
+ }
+
+ /// Return the link following the one given. If the one given is the last
+ /// link for the given state, then return `None`.
+ ///
+ /// If no previous link is given, then this returns the first link in the
+ /// state, if one exists.
+ ///
+ /// This is useful for manually iterating over the transitions in a single
+ /// state without borrowing the NFA. This permits mutating other parts of
+ /// the NFA during iteration. Namely, one can access the transition pointed
+ /// to by the link via `self.sparse[link]`.
+ fn next_link(
+ &self,
+ sid: StateID,
+ prev: Option<StateID>,
+ ) -> Option<StateID> {
+ let link =
+ prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
+ if link == StateID::ZERO {
+ None
+ } else {
+ Some(link)
+ }
+ }
+
+ /// Follow the transition for the given byte in the given state. If no such
+ /// transition exists, then the FAIL state ID is returned.
+ #[inline(always)]
+ fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
+ let s = &self.states[sid];
+ // This is a special case that targets starting states and states
+ // near a start state. Namely, after the initial trie is constructed,
+ // we look for states close to the start state to convert to a dense
+ // representation for their transitions. This winds up using a lot more
+ // memory per state in exchange for faster transition lookups. But
+ // since we only do this for a small number of states (by default), the
+ // memory usage is usually minimal.
+ //
+ // This has *massive* benefit when executing searches because the
+ // unanchored starting state is by far the hottest state and is
+ // frequently visited. Moreover, the 'for' loop below that works
+ // decently on an actually sparse state is disastrous on a state that
+ // is nearly or completely dense.
+ if s.dense == StateID::ZERO {
+ self.follow_transition_sparse(sid, byte)
+ } else {
+ let class = usize::from(self.byte_classes.get(byte));
+ self.dense[s.dense.as_usize() + class]
+ }
+ }
+
+ /// Like `follow_transition`, but always uses the sparse representation.
+ #[inline(always)]
+ fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
+ for t in self.iter_trans(sid) {
+ if byte <= t.byte {
+ if byte == t.byte {
+ return t.next;
+ }
+ break;
+ }
+ }
+ NFA::FAIL
+ }
+
+ /// Set the transition for the given byte to the state ID given.
+ ///
+ /// Note that one should not set transitions to the FAIL state. It is not
+ /// technically incorrect, but it wastes space. If a transition is not
+ /// defined, then it is automatically assumed to lead to the FAIL state.
+ fn add_transition(
+ &mut self,
+ prev: StateID,
+ byte: u8,
+ next: StateID,
+ ) -> Result<(), BuildError> {
+ if self.states[prev].dense != StateID::ZERO {
+ let dense = self.states[prev].dense;
+ let class = usize::from(self.byte_classes.get(byte));
+ self.dense[dense.as_usize() + class] = next;
+ }
+
+ let head = self.states[prev].sparse;
+ if head == StateID::ZERO || byte < self.sparse[head].byte {
+ let new_link = self.alloc_transition()?;
+ self.sparse[new_link] = Transition { byte, next, link: head };
+ self.states[prev].sparse = new_link;
+ return Ok(());
+ } else if byte == self.sparse[head].byte {
+ self.sparse[head].next = next;
+ return Ok(());
+ }
+
+ // We handled the only cases where the beginning of the transition
+ // chain needs to change. At this point, we now know that there is
+ // at least one entry in the transition chain and the byte for that
+ // transition is less than the byte for the transition we're adding.
+ let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
+ while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
+ {
+ link_prev = link_next;
+ link_next = self.sparse[link_next].link;
+ }
+ if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
+ let link = self.alloc_transition()?;
+ self.sparse[link] = Transition { byte, next, link: link_next };
+ self.sparse[link_prev].link = link;
+ } else {
+ assert_eq!(byte, self.sparse[link_next].byte);
+ self.sparse[link_next].next = next;
+ }
+ Ok(())
+ }
+
+    /// This sets every possible transition (all 256 of them) for the given
+    /// state to the same `next` value.
+ ///
+ /// This is useful for efficiently initializing start/dead states.
+ ///
+ /// # Panics
+ ///
+ /// This requires that the state has no transitions added to it already.
+ /// If it has any transitions, then this panics. It will also panic if
+ /// the state has been densified prior to calling this.
+ fn init_full_state(
+ &mut self,
+ prev: StateID,
+ next: StateID,
+ ) -> Result<(), BuildError> {
+ assert_eq!(
+ StateID::ZERO,
+ self.states[prev].dense,
+ "state must not be dense yet"
+ );
+ assert_eq!(
+ StateID::ZERO,
+ self.states[prev].sparse,
+ "state must have zero transitions"
+ );
+ let mut prev_link = StateID::ZERO;
+ for byte in 0..=255 {
+ let new_link = self.alloc_transition()?;
+ self.sparse[new_link] =
+ Transition { byte, next, link: StateID::ZERO };
+ if prev_link == StateID::ZERO {
+ self.states[prev].sparse = new_link;
+ } else {
+ self.sparse[prev_link].link = new_link;
+ }
+ prev_link = new_link;
+ }
+ Ok(())
+ }
+
+ /// Add a match for the given pattern ID to the state for the given ID.
+ fn add_match(
+ &mut self,
+ sid: StateID,
+ pid: PatternID,
+ ) -> Result<(), BuildError> {
+ let head = self.states[sid].matches;
+ let mut link = head;
+ while self.matches[link].link != StateID::ZERO {
+ link = self.matches[link].link;
+ }
+ let new_match_link = self.alloc_match()?;
+ self.matches[new_match_link].pid = pid;
+ if link == StateID::ZERO {
+ self.states[sid].matches = new_match_link;
+ } else {
+ self.matches[link].link = new_match_link;
+ }
+ Ok(())
+ }
+
+ /// Copy matches from the `src` state to the `dst` state. This is useful
+ /// when a match state can be reached via a failure transition. In which
+ /// case, you'll want to copy the matches (if any) from the state reached
+ /// by the failure transition to the original state you were at.
+ fn copy_matches(
+ &mut self,
+ src: StateID,
+ dst: StateID,
+ ) -> Result<(), BuildError> {
+ let head_dst = self.states[dst].matches;
+ let mut link_dst = head_dst;
+ while self.matches[link_dst].link != StateID::ZERO {
+ link_dst = self.matches[link_dst].link;
+ }
+ let mut link_src = self.states[src].matches;
+ while link_src != StateID::ZERO {
+ let new_match_link =
+ StateID::new(self.matches.len()).map_err(|e| {
+ BuildError::state_id_overflow(
+ StateID::MAX.as_u64(),
+ e.attempted(),
+ )
+ })?;
+ self.matches.push(Match {
+ pid: self.matches[link_src].pid,
+ link: StateID::ZERO,
+ });
+ if link_dst == StateID::ZERO {
+ self.states[dst].matches = new_match_link;
+ } else {
+ self.matches[link_dst].link = new_match_link;
+ }
+
+ link_dst = new_match_link;
+ link_src = self.matches[link_src].link;
+ }
+ Ok(())
+ }
+
+    /// Create a new entry in `NFA::sparse`, if there's room, and return that
+ /// entry's ID. If there's no room, then an error is returned.
+ fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
+ let id = StateID::new(self.sparse.len()).map_err(|e| {
+ BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
+ })?;
+ self.sparse.push(Transition::default());
+ Ok(id)
+ }
+
+ /// Create a new entry in `NFA::matches`, if there's room, and return that
+ /// entry's ID. If there's no room, then an error is returned.
+ fn alloc_match(&mut self) -> Result<StateID, BuildError> {
+ let id = StateID::new(self.matches.len()).map_err(|e| {
+ BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
+ })?;
+ self.matches.push(Match::default());
+ Ok(id)
+ }
+
+ /// Create a new set of `N` transitions in this NFA's dense transition
+    /// table. The ID returned corresponds to the index at which the `N`
+ /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
+ /// the last.
+ ///
+ /// `N` is determined via `NFA::byte_classes::alphabet_len`.
+ fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
+ let id = StateID::new(self.dense.len()).map_err(|e| {
+ BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
+ })?;
+ // We use FAIL because it's the correct default. If a state doesn't
+ // have a transition defined for every possible byte value, then the
+ // transition function should return NFA::FAIL.
+ self.dense.extend(
+ core::iter::repeat(NFA::FAIL)
+ .take(self.byte_classes.alphabet_len()),
+ );
+ Ok(id)
+ }
+
+ /// Allocate and add a fresh state to the underlying NFA and return its
+ /// ID (guaranteed to be one more than the ID of the previously allocated
+ /// state). If the ID would overflow `StateID`, then this returns an error.
+ fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
+ // This is OK because we error when building the trie if we see a
+ // pattern whose length cannot fit into a 'SmallIndex', and the longest
+ // possible depth corresponds to the length of the longest pattern.
+ let depth = SmallIndex::new(depth)
+ .expect("patterns longer than SmallIndex::MAX are not allowed");
+ let id = StateID::new(self.states.len()).map_err(|e| {
+ BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
+ })?;
+ self.states.push(State {
+ sparse: StateID::ZERO,
+ dense: StateID::ZERO,
+ matches: StateID::ZERO,
+ fail: self.special.start_unanchored_id,
+ depth,
+ });
+ Ok(id)
+ }
+}
+
+// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
+// returns a valid state ID given a valid state ID. We otherwise claim that
+// all other methods are correct as well.
+unsafe impl Automaton for NFA {
+ #[inline(always)]
+ fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
+ match anchored {
+ Anchored::No => Ok(self.special.start_unanchored_id),
+ Anchored::Yes => Ok(self.special.start_anchored_id),
+ }
+ }
+
+ #[inline(always)]
+ fn next_state(
+ &self,
+ anchored: Anchored,
+ mut sid: StateID,
+ byte: u8,
+ ) -> StateID {
+ // This terminates since:
+ //
+ // 1. state.fail never points to the FAIL state.
+ // 2. All state.fail values point to a state closer to the start state.
+ // 3. The start state has no transitions to the FAIL state.
+ loop {
+ let next = self.follow_transition(sid, byte);
+ if next != NFA::FAIL {
+ return next;
+ }
+ // For an anchored search, we never follow failure transitions
+ // because failure transitions lead us down a path to matching
+ // a *proper* suffix of the path we were on. Thus, it can only
+ // produce matches that appear after the beginning of the search.
+ if anchored.is_anchored() {
+ return NFA::DEAD;
+ }
+ sid = self.states[sid].fail();
+ }
+ }
+
+ #[inline(always)]
+ fn is_special(&self, sid: StateID) -> bool {
+ sid <= self.special.max_special_id
+ }
+
+ #[inline(always)]
+ fn is_dead(&self, sid: StateID) -> bool {
+ sid == NFA::DEAD
+ }
+
+ #[inline(always)]
+ fn is_match(&self, sid: StateID) -> bool {
+ // N.B. This returns true when sid==NFA::FAIL but that's okay because
+ // NFA::FAIL is not actually a valid state ID from the perspective of
+ // the Automaton trait. Namely, it is never returned by 'start_state'
+ // or by 'next_state'. So we don't need to care about it here.
+ !self.is_dead(sid) && sid <= self.special.max_match_id
+ }
+
+ #[inline(always)]
+ fn is_start(&self, sid: StateID) -> bool {
+ sid == self.special.start_unanchored_id
+ || sid == self.special.start_anchored_id
+ }
+
+ #[inline(always)]
+ fn match_kind(&self) -> MatchKind {
+ self.match_kind
+ }
+
+ #[inline(always)]
+ fn patterns_len(&self) -> usize {
+ self.pattern_lens.len()
+ }
+
+ #[inline(always)]
+ fn pattern_len(&self, pid: PatternID) -> usize {
+ self.pattern_lens[pid].as_usize()
+ }
+
+ #[inline(always)]
+ fn min_pattern_len(&self) -> usize {
+ self.min_pattern_len
+ }
+
+ #[inline(always)]
+ fn max_pattern_len(&self) -> usize {
+ self.max_pattern_len
+ }
+
+ #[inline(always)]
+ fn match_len(&self, sid: StateID) -> usize {
+ self.iter_matches(sid).count()
+ }
+
+ #[inline(always)]
+ fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
+ self.iter_matches(sid).nth(index).unwrap()
+ }
+
+ #[inline(always)]
+ fn memory_usage(&self) -> usize {
+ self.states.len() * core::mem::size_of::<State>()
+ + self.sparse.len() * core::mem::size_of::<Transition>()
+ + self.matches.len() * core::mem::size_of::<Match>()
+ + self.dense.len() * StateID::SIZE
+ + self.pattern_lens.len() * SmallIndex::SIZE
+ + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
+ }
+
+ #[inline(always)]
+ fn prefilter(&self) -> Option<&Prefilter> {
+ self.prefilter.as_ref()
+ }
+}
+
+/// A representation of a sparse NFA state for an Aho-Corasick automaton.
+///
+/// It contains the transitions to the next state, a failure transition for
+/// cases where there exists no other transition for the current input byte
+/// and the matches implied by visiting this state (if any).
+#[derive(Clone, Debug)]
+pub(crate) struct State {
+    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
+ /// containing all of the transitions for this state.
+ ///
+ /// This is `StateID::ZERO` if and only if this state has zero transitions.
+ sparse: StateID,
+ /// A pointer to a row of `N` transitions in `NFA::dense`. These
+ /// transitions correspond precisely to what is obtained by traversing
+ /// `sparse`, but permits constant time lookup.
+ ///
+ /// When this is zero (which is true for most states in the default
+ /// configuration), then this state has no dense representation.
+ ///
+ /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
+ /// typically much less than 256 (the maximum value).
+ dense: StateID,
+ /// A pointer to `NFA::matches` corresponding to the head of a linked list
+ /// containing all of the matches for this state.
+ ///
+ /// This is `StateID::ZERO` if and only if this state is not a match state.
+ matches: StateID,
+ /// The state that should be transitioned to if the current byte in the
+ /// haystack does not have a corresponding transition defined in this
+ /// state.
+ fail: StateID,
+ /// The depth of this state. Specifically, this is the distance from this
+ /// state to the starting state. (For the special sentinel states DEAD and
+ /// FAIL, their depth is always 0.) The depth of a starting state is 0.
+ ///
+    /// Note that depth is used by both this non-contiguous NFA and the
+    /// contiguous NFA. Namely, it permits an optimization where states near
+    /// the starting state have their transitions stored in a dense fashion,
+    /// while all other states have their transitions stored in a sparse
+    /// fashion. (See `Compiler::densify` below.) In any case, this is really
+    /// the only convenient place to compute and store this information,
+    /// which we need when building the contiguous NFA.
+ depth: SmallIndex,
+}
+
+impl State {
+ /// Return true if and only if this state is a match state.
+ pub(crate) fn is_match(&self) -> bool {
+ self.matches != StateID::ZERO
+ }
+
+ /// Returns the failure transition for this state.
+ pub(crate) fn fail(&self) -> StateID {
+ self.fail
+ }
+
+ /// Returns the depth of this state. That is, the number of transitions
+ /// this state is from the start state of the NFA.
+ pub(crate) fn depth(&self) -> SmallIndex {
+ self.depth
+ }
+}
+
+/// A single transition in a non-contiguous NFA.
+#[derive(Clone, Copy, Default)]
+#[repr(packed)]
+pub(crate) struct Transition {
+ byte: u8,
+ next: StateID,
+ link: StateID,
+}
+
+impl Transition {
+ /// Return the byte for which this transition is defined.
+ pub(crate) fn byte(&self) -> u8 {
+ self.byte
+ }
+
+ /// Return the ID of the state that this transition points to.
+ pub(crate) fn next(&self) -> StateID {
+ self.next
+ }
+
+ /// Return the ID of the next transition.
+ fn link(&self) -> StateID {
+ self.link
+ }
+}
+
+impl core::fmt::Debug for Transition {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "Transition(byte: {:X?}, next: {:?}, link: {:?})",
+ self.byte,
+ self.next().as_usize(),
+ self.link().as_usize()
+ )
+ }
+}
+
+/// A single match in a non-contiguous NFA.
+#[derive(Clone, Copy, Default)]
+struct Match {
+ pid: PatternID,
+ link: StateID,
+}
+
+impl Match {
+ /// Return the pattern ID for this match.
+ pub(crate) fn pattern(&self) -> PatternID {
+ self.pid
+ }
+
+ /// Return the ID of the next match.
+ fn link(&self) -> StateID {
+ self.link
+ }
+}
+
+impl core::fmt::Debug for Match {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "Match(pid: {:?}, link: {:?})",
+ self.pattern().as_usize(),
+ self.link().as_usize()
+ )
+ }
+}
+
+/// A builder for configuring an Aho-Corasick noncontiguous NFA.
+///
+/// This builder has a subset of the options available to a
+/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
+/// their behavior is identical.
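+///
+/// # Example
+///
+/// A minimal sketch of configuring and using a builder. It assumes
+/// [`MatchKind::LeftmostFirst`](crate::MatchKind) behaves here as it does for
+/// `AhoCorasickBuilder` (see above):
+///
+/// ```
+/// use aho_corasick::{
+///     automaton::Automaton,
+///     nfa::noncontiguous::NFA,
+///     Input, Match, MatchKind,
+/// };
+///
+/// let nfa = NFA::builder()
+///     .match_kind(MatchKind::LeftmostFirst)
+///     .build(&["samwise", "sam"])?;
+/// assert_eq!(
+///     Some(Match::must(0, 0..7)),
+///     nfa.try_find(&Input::new("samwise"))?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```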
+#[derive(Clone, Debug)]
+pub struct Builder {
+ match_kind: MatchKind,
+ prefilter: bool,
+ ascii_case_insensitive: bool,
+ dense_depth: usize,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder {
+ match_kind: MatchKind::default(),
+ prefilter: true,
+ ascii_case_insensitive: false,
+ dense_depth: 3,
+ }
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
+ /// patterns.
+ ///
+ /// A builder may be reused to create more NFAs.
+ pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ debug!("building non-contiguous NFA");
+ let nfa = Compiler::new(self)?.compile(patterns)?;
+ debug!(
+ "non-contiguous NFA built, <states: {:?}, size: {:?}>",
+ nfa.states.len(),
+ nfa.memory_usage()
+ );
+ Ok(nfa)
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
+ /// for more documentation and examples.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
+ self.match_kind = kind;
+ self
+ }
+
+ /// Enable ASCII-aware case insensitive matching.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
+ /// for more documentation and examples.
+ pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Set the limit on how many states use a dense representation for their
+ /// transitions. Other states will generally use a sparse representation.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
+ /// for more documentation and examples.
+ pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
+ self.dense_depth = depth;
+ self
+ }
+
+ /// Enable heuristic prefilter optimizations.
+ ///
+ /// See
+ /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
+ /// for more documentation and examples.
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
+ self.prefilter = yes;
+ self
+ }
+}
+
+/// A compiler uses a builder configuration and builds up the NFA formulation
+/// of an Aho-Corasick automaton. This roughly corresponds to the standard
+/// formulation described in textbooks, with some tweaks to support leftmost
+/// searching.
+#[derive(Debug)]
+struct Compiler<'a> {
+ builder: &'a Builder,
+ prefilter: prefilter::Builder,
+ nfa: NFA,
+ byteset: ByteClassSet,
+}
+
+impl<'a> Compiler<'a> {
+ fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
+ let prefilter = prefilter::Builder::new(builder.match_kind)
+ .ascii_case_insensitive(builder.ascii_case_insensitive);
+ Ok(Compiler {
+ builder,
+ prefilter,
+ nfa: NFA {
+ match_kind: builder.match_kind,
+ states: vec![],
+ sparse: vec![],
+ dense: vec![],
+ matches: vec![],
+ pattern_lens: vec![],
+ prefilter: None,
+ byte_classes: ByteClasses::singletons(),
+ min_pattern_len: usize::MAX,
+ max_pattern_len: 0,
+ special: Special::zero(),
+ },
+ byteset: ByteClassSet::empty(),
+ })
+ }
+
+ fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ // Add dummy transition/match links, so that no valid link will point
+ // to another link at index 0.
+ self.nfa.sparse.push(Transition::default());
+ self.nfa.matches.push(Match::default());
+ // Add a dummy dense transition so that no states can have dense==0
+ // represent a valid pointer to dense transitions. This permits
+ // dense==0 to be a sentinel indicating "no dense transitions."
+ self.nfa.dense.push(NFA::DEAD);
+ // the dead state, only used for leftmost and fixed to id==0
+ self.nfa.alloc_state(0)?;
+ // the fail state, which is never entered and fixed to id==1
+ self.nfa.alloc_state(0)?;
+ // unanchored start state, initially fixed to id==2 but later shuffled
+ // to appear after all non-start match states.
+ self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
+ // anchored start state, initially fixed to id==3 but later shuffled
+ // to appear after unanchored start state.
+ self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
+ // Initialize the unanchored starting state in order to make it dense,
+ // and thus make transition lookups on this state faster.
+ self.init_unanchored_start_state()?;
+ // Set all transitions on the DEAD state to point to itself. This way,
+ // the DEAD state can never be escaped. It MUST be used as a sentinel
+ // in any correct search.
+ self.add_dead_state_loop()?;
+ // Build the base trie from the given patterns.
+ self.build_trie(patterns)?;
+ self.nfa.states.shrink_to_fit();
+        // Turn our set of bytes into equivalence classes. This NFA
+ // implementation uses byte classes only for states that use a dense
+ // representation of transitions. (And that's why this comes before
+ // `self.densify()`, as the byte classes need to be set first.)
+ self.nfa.byte_classes = self.byteset.byte_classes();
+ // Add transitions (and maybe matches) to the anchored starting state.
+ // The anchored starting state is used for anchored searches. The only
+ // mechanical difference between it and the unanchored start state is
+ // that missing transitions map to the DEAD state instead of the FAIL
+ // state.
+ self.set_anchored_start_state()?;
+ // Rewrite transitions to the FAIL state on the unanchored start state
+ // as self-transitions. This keeps the start state active at all times.
+ self.add_unanchored_start_state_loop();
+ // Make some (possibly zero) states use a dense representation for
+ // transitions. It's important to do this right after the states
+ // and non-failure transitions are solidified. That way, subsequent
+ // accesses (particularly `fill_failure_transitions`) will benefit from
+ // the faster transition lookup in densified states.
+ self.densify()?;
+ // The meat of the Aho-Corasick algorithm: compute and write failure
+ // transitions. i.e., the state to move to when a transition isn't
+ // defined in the current state. These are epsilon transitions and thus
+ // make this formulation an NFA.
+ self.fill_failure_transitions()?;
+ // Handle a special case under leftmost semantics when at least one
+ // of the patterns is the empty string.
+ self.close_start_state_loop_for_leftmost();
+ // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
+ // NON-MATCH, ... This permits us to very quickly query the type of
+ // the state we're currently in during a search.
+ self.shuffle();
+ self.nfa.prefilter = self.prefilter.build();
+ // Store the maximum ID of all *relevant* special states. Start states
+ // are only relevant when we have a prefilter, otherwise, there is zero
+ // reason to care about whether a state is a start state or not during
+ // a search. Indeed, without a prefilter, we are careful to explicitly
+ // NOT care about start states, otherwise the search can ping pong
+ // between the unrolled loop and the handling of special-status states
+ // and destroy perf.
+ self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
+ // Why the anchored starting state? Because we always put it
+ // after the unanchored starting state and it is therefore the
+ // maximum. Why put unanchored followed by anchored? No particular
+ // reason, but that's how the states are logically organized in the
+ // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
+ self.nfa.special.start_anchored_id
+ } else {
+ self.nfa.special.max_match_id
+ };
+ self.nfa.sparse.shrink_to_fit();
+ self.nfa.dense.shrink_to_fit();
+ self.nfa.matches.shrink_to_fit();
+ self.nfa.pattern_lens.shrink_to_fit();
+ Ok(self.nfa)
+ }
+
+ /// This sets up the initial prefix trie that makes up the Aho-Corasick
+ /// automaton. Effectively, it creates the basic structure of the
+ /// automaton, where every pattern given has a path from the start state to
+ /// the end of the pattern.
+ fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ 'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
+ let pid = PatternID::new(i).map_err(|e| {
+ BuildError::pattern_id_overflow(
+ PatternID::MAX.as_u64(),
+ e.attempted(),
+ )
+ })?;
+ let pat = pat.as_ref();
+ let patlen = SmallIndex::new(pat.len())
+ .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
+ self.nfa.min_pattern_len =
+ core::cmp::min(self.nfa.min_pattern_len, pat.len());
+ self.nfa.max_pattern_len =
+ core::cmp::max(self.nfa.max_pattern_len, pat.len());
+ assert_eq!(
+ i,
+ self.nfa.pattern_lens.len(),
+ "expected number of patterns to match pattern ID"
+ );
+ self.nfa.pattern_lens.push(patlen);
+ // We add the pattern to the prefilter here because the pattern
+ // ID in the prefilter is determined with respect to the patterns
+ // added to the prefilter. That is, it isn't the ID we have here,
+ // but the one determined by its own accounting of patterns.
+ // To ensure they line up, we add every pattern we see to the
+ // prefilter, even if some patterns ultimately are impossible to
+ // match (in leftmost-first semantics specifically).
+ //
+ // Another way of doing this would be to expose an API in the
+ // prefilter to permit setting your own pattern IDs. Or to just use
+ // our own map and go between them. But this case is sufficiently
+ // rare that we don't bother and just make sure they're in sync.
+ if self.builder.prefilter {
+ self.prefilter.add(pat);
+ }
+
+ let mut prev = self.nfa.special.start_unanchored_id;
+ let mut saw_match = false;
+ for (depth, &b) in pat.iter().enumerate() {
+ // When leftmost-first match semantics are requested, we
+ // specifically stop adding patterns when a previously added
+ // pattern is a prefix of it. We avoid adding it because
+ // leftmost-first semantics imply that the pattern can never
+ // match. This is not just an optimization to save space! It
+ // is necessary for correctness. In fact, this is the only
+ // difference in the automaton between the implementations for
+ // leftmost-first and leftmost-longest.
+ saw_match = saw_match || self.nfa.states[prev].is_match();
+ if self.builder.match_kind.is_leftmost_first() && saw_match {
+ // Skip to the next pattern immediately. This avoids
+ // incorrectly adding a match after this loop terminates.
+ continue 'PATTERNS;
+ }
+
+ // Add this byte to our equivalence classes. These don't
+ // get used while building the trie, but other Aho-Corasick
+ // implementations may use them.
+ self.byteset.set_range(b, b);
+ if self.builder.ascii_case_insensitive {
+ let b = opposite_ascii_case(b);
+ self.byteset.set_range(b, b);
+ }
+
+ // If the transition from prev using the current byte already
+ // exists, then just move through it. Otherwise, add a new
+ // state. We track the depth here so that we can determine
+ // how to represent transitions. States near the start state
+ // use a dense representation that uses more memory but is
+ // faster. Other states use a sparse representation that uses
+ // less memory but is slower.
+ let next = self.nfa.follow_transition(prev, b);
+ if next != NFA::FAIL {
+ prev = next;
+ } else {
+ let next = self.nfa.alloc_state(depth)?;
+ self.nfa.add_transition(prev, b, next)?;
+ if self.builder.ascii_case_insensitive {
+ let b = opposite_ascii_case(b);
+ self.nfa.add_transition(prev, b, next)?;
+ }
+ prev = next;
+ }
+ }
+ // Once the pattern has been added, log the match in the final
+ // state that it reached.
+ self.nfa.add_match(prev, pid)?;
+ }
+ Ok(())
+ }
+
+ /// This routine creates failure transitions according to the standard
+ /// textbook formulation of the Aho-Corasick algorithm, with a couple small
+ /// tweaks to support "leftmost" semantics.
+ ///
+ /// Building failure transitions is the most interesting part of building
+ /// the Aho-Corasick automaton, because they are what allow searches to
+ /// be performed in linear time. Specifically, a failure transition is
+ /// a single transition associated with each state that points back to
+ /// the longest proper suffix of the pattern being searched. The failure
+ /// transition is followed whenever there exists no transition on the
+ /// current state for the current input byte. If there is no other proper
+ /// suffix, then the failure transition points back to the starting state.
+ ///
+ /// For example, let's say we built an Aho-Corasick automaton with the
+ /// following patterns: 'abcd' and 'cef'. The trie looks like this:
+ ///
+ /// ```ignore
+ /// a - S1 - b - S2 - c - S3 - d - S4*
+ /// /
+ /// S0 - c - S5 - e - S6 - f - S7*
+ /// ```
+ ///
+ /// At this point, it should be fairly straight-forward to see how this
+ /// trie can be used in a simplistic way. At any given position in the
+ /// text we're searching (called the "subject" string), all we need to do
+ /// is follow the transitions in the trie by consuming one transition for
+ /// each byte in the subject string. If we reach a match state, then we can
+ /// report that location as a match.
+ ///
+ /// The trick comes when searching a subject string like 'abcef'. We'll
+ /// initially follow the transition from S0 to S1 and wind up in S3 after
+    /// observing the 'c' byte. At this point, the next byte is 'e' but state
+ /// S3 has no transition for 'e', so the search fails. We then would need
+ /// to restart the search at the next position in 'abcef', which
+ /// corresponds to 'b'. The match would fail, but the next search starting
+ /// at 'c' would finally succeed. The problem with this approach is that
+ /// we wind up searching the subject string potentially many times. In
+ /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
+ /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
+ /// like to achieve a `O(n + m)` worst case complexity.
+ ///
+ /// This is where failure transitions come in. Instead of dying at S3 in
+ /// the first search, the automaton can instruct the search to move to
+ /// another part of the automaton that corresponds to a suffix of what
+ /// we've seen so far. Recall that we've seen 'abc' in the subject string,
+ /// and the automaton does indeed have a non-empty suffix, 'c', that could
+ /// potentially lead to another match. Thus, the actual Aho-Corasick
+ /// automaton for our patterns in this case looks like this:
+ ///
+ /// ```ignore
+ /// a - S1 - b - S2 - c - S3 - d - S4*
+ /// / /
+ /// / ----------------
+ /// / /
+ /// S0 - c - S5 - e - S6 - f - S7*
+ /// ```
+ ///
+ /// That is, we have a failure transition from S3 to S5, which is followed
+ /// exactly in cases when we are in state S3 but see any byte other than
+ /// 'd' (that is, we've "failed" to find a match in this portion of our
+ /// trie). We know we can transition back to S5 because we've already seen
+ /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
+ /// with the search starting at S5 and complete our match.
+ ///
+ /// Adding failure transitions to a trie is fairly simple, but subtle. The
+    /// key issue is that you might have multiple failure transitions that you
+ /// need to follow. For example, look at the trie for the patterns
+ /// 'abcd', 'b', 'bcd' and 'cd':
+ ///
+ /// ```ignore
+ /// - a - S1 - b - S2* - c - S3 - d - S4*
+ /// / / /
+ /// / ------- -------
+ /// / / /
+ /// S0 --- b - S5* - c - S6 - d - S7*
+ /// \ /
+ /// \ --------
+ /// \ /
+ /// - c - S8 - d - S9*
+ /// ```
+ ///
+ /// The failure transitions for this trie are defined from S2 to S5,
+ /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
+ /// corresponds to a match, since its failure transition to S5 is itself
+ /// a match state.
+ ///
+    /// Perhaps the simplest way to think about adding these failure transitions
+ /// is recursively. That is, if you know the failure transitions for every
+ /// possible previous state that could be visited (e.g., when computing the
+ /// failure transition for S3, you already know the failure transitions
+ /// for S0, S1 and S2), then you can simply follow the failure transition
+ /// of the previous state and check whether the incoming transition is
+ /// defined after following the failure transition.
+ ///
+ /// For example, when determining the failure state for S3, by our
+ /// assumptions, we already know that there is a failure transition from
+ /// S2 (the previous state) to S5. So we follow that transition and check
+ /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
+ /// as there is a transition from S5 to S6 for the byte 'c'. If no such
+ /// transition existed, we could keep following the failure transitions
+ /// until we reach the start state, which is the failure transition for
+ /// every state that has no corresponding proper suffix.
+ ///
+ /// We don't actually use recursion to implement this, but instead, use a
+ /// breadth first search of the automaton. Our base case is the start
+ /// state, whose failure transition is just a transition to itself.
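+    ///
+    /// In pseudo-code, ignoring the leftmost tweaks described below, the
+    /// breadth first search is roughly the following sketch (the actual
+    /// routine below handles more details):
+    ///
+    /// ```ignore
+    /// queue = [next for (byte, next) in transitions(start) if next != start]
+    /// while id = queue.pop_front():
+    ///     for (byte, next) in transitions(id):
+    ///         queue.push_back(next)
+    ///         fail = states[id].fail
+    ///         while transition(fail, byte) is FAIL:
+    ///             fail = states[fail].fail
+    ///         states[next].fail = transition(fail, byte)
+    ///         copy_matches(from: states[next].fail, to: next)
+    /// ```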
+ ///
+ /// When building a leftmost automaton, we proceed as above, but only
+ /// include a subset of failure transitions. Namely, we omit any failure
+ /// transitions that appear after a match state in the trie. This is
+ /// because failure transitions always point back to a proper suffix of
+ /// what has been seen so far. Thus, following a failure transition after
+ /// a match implies looking for a match that starts after the one that has
+ /// already been seen, which is of course therefore not the leftmost match.
+ ///
+ /// N.B. I came up with this algorithm on my own, and after scouring all of
+ /// the other AC implementations I know of (Perl, Snort, many on GitHub).
+ /// I couldn't find any that implement leftmost semantics like this.
+ /// Perl of course needs leftmost-first semantics, but they implement it
+ /// with a seeming hack at *search* time instead of encoding it into the
+ /// automaton. There are also a couple Java libraries that support leftmost
+ /// longest semantics, but they do it by building a queue of matches at
+ /// search time, which is even worse than what Perl is doing. ---AG
+ fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
+ let is_leftmost = self.builder.match_kind.is_leftmost();
+ let start_uid = self.nfa.special.start_unanchored_id;
+ // Initialize the queue for breadth first search with all transitions
+ // out of the start state. We handle the start state specially because
+ // we only want to follow non-self transitions. If we followed self
+ // transitions, then this would never terminate.
+ let mut queue = VecDeque::new();
+ let mut seen = self.queued_set();
+ let mut prev_link = None;
+ while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
+ prev_link = Some(link);
+ let t = self.nfa.sparse[link];
+
+ // Skip anything we've seen before and any self-transitions on the
+ // start state.
+ if start_uid == t.next() || seen.contains(t.next) {
+ continue;
+ }
+ queue.push_back(t.next);
+ seen.insert(t.next);
+ // Under leftmost semantics, if a state immediately following
+ // the start state is a match state, then we never want to
+ // follow its failure transition since the failure transition
+ // necessarily leads back to the start state, which we never
+ // want to do for leftmost matching after a match has been
+ // found.
+ //
+ // We apply the same logic to non-start states below as well.
+ if is_leftmost && self.nfa.states[t.next].is_match() {
+ self.nfa.states[t.next].fail = NFA::DEAD;
+ }
+ }
+ while let Some(id) = queue.pop_front() {
+ let mut prev_link = None;
+ while let Some(link) = self.nfa.next_link(id, prev_link) {
+ prev_link = Some(link);
+ let t = self.nfa.sparse[link];
+
+ if seen.contains(t.next) {
+ // The only way to visit a duplicate state in a transition
+ // list is when ASCII case insensitivity is enabled. In
+ // this case, we want to skip it since it's redundant work.
+ // But it would also end up duplicating matches, which
+ // results in reporting duplicate matches in some cases.
+ // See the 'acasei010' regression test.
+ continue;
+ }
+ queue.push_back(t.next);
+ seen.insert(t.next);
+
+ // As above for start states, under leftmost semantics, once
+ // we see a match all subsequent states should have no failure
+ // transitions because failure transitions always imply looking
+ // for a match that is a suffix of what has been seen so far
+ // (where "seen so far" corresponds to the string formed by
+ // following the transitions from the start state to the
+ // current state). Under leftmost semantics, we specifically do
+ // not want to allow this to happen because we always want to
+ // report the match found at the leftmost position.
+ //
+ // The difference between leftmost-first and leftmost-longest
+ // occurs previously while we build the trie. For
+ // leftmost-first, we simply omit any entries that would
+ // otherwise require passing through a match state.
+ //
+ // Note that for correctness, the failure transition has to be
+ // set to the dead state for ALL states following a match, not
+ // just the match state itself. However, by setting the failure
+ // transition to the dead state on all match states, the dead
+ // state will automatically propagate to all subsequent states
+ // via the failure state computation below.
+ if is_leftmost && self.nfa.states[t.next].is_match() {
+ self.nfa.states[t.next].fail = NFA::DEAD;
+ continue;
+ }
+ let mut fail = self.nfa.states[id].fail;
+ while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
+ fail = self.nfa.states[fail].fail;
+ }
+ fail = self.nfa.follow_transition(fail, t.byte);
+ self.nfa.states[t.next].fail = fail;
+ self.nfa.copy_matches(fail, t.next)?;
+ }
+ // If the start state is a match state, then this automaton can
+ // match the empty string. This implies all states are match states
+ // since every position matches the empty string, so copy the
+ // matches from the start state to every state. Strictly speaking,
+ // this is only necessary for overlapping matches since each
+ // non-empty non-start match state needs to report empty matches
+ // in addition to its own. For the non-overlapping case, such
+ // states only report the first match, which is never empty since
+ // it isn't a start state.
+ if !is_leftmost {
+ self.nfa
+ .copy_matches(self.nfa.special.start_unanchored_id, id)?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Shuffle the states so that they appear in this sequence:
+ ///
+ /// DEAD, FAIL, MATCH..., START, START, NON-MATCH...
+ ///
+ /// The idea here is that if we know how special states are laid out in our
+ /// transition table, then we can determine what "kind" of state we're in
+ /// just by comparing our current state ID with a particular value. In this
+ /// way, we avoid doing extra memory lookups.
+ ///
+ /// Before shuffling begins, our states look something like this:
+ ///
+ /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
+ ///
+ /// So all we need to do is move all of the MATCH states so that they
+ /// all appear before any NON-MATCH state, like so:
+ ///
+ /// DEAD, FAIL, START, START, MATCH... NON-MATCH...
+ ///
+ /// Then it's just a simple matter of swapping the two START states with
+ /// the last two MATCH states.
+ ///
+ /// (This is the same technique used for fully compiled DFAs in
+ /// regex-automata.)
+ fn shuffle(&mut self) {
+ let old_start_uid = self.nfa.special.start_unanchored_id;
+ let old_start_aid = self.nfa.special.start_anchored_id;
+ assert!(old_start_uid < old_start_aid);
+ assert_eq!(
+ 3,
+ old_start_aid.as_usize(),
+ "anchored start state should be at index 3"
+ );
+ // We implement shuffling by a sequence of pairwise swaps of states.
+ // Since we have a number of things referencing states via their
+ // IDs and swapping them changes their IDs, we need to record every
+ // swap we make so that we can remap IDs. The remapper handles this
+ // book-keeping for us.
+ let mut remapper = Remapper::new(&self.nfa, 0);
+ // The way we proceed here is by moving all match states so that
+ // they directly follow the start states. So it will go: DEAD, FAIL,
+ // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
+ //
+ // To do that, we proceed forward through all states after
+ // START-ANCHORED and swap match states so that they appear before all
+ // non-match states.
+ let mut next_avail = StateID::from(4u8);
+ for i in next_avail.as_usize()..self.nfa.states.len() {
+ let sid = StateID::new(i).unwrap();
+ if !self.nfa.states[sid].is_match() {
+ continue;
+ }
+ remapper.swap(&mut self.nfa, sid, next_avail);
+ // The key invariant here is that only non-match states exist
+ // between 'next_avail' and 'sid' (with them being potentially
+ // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
+ // to land on the leftmost non-match state. (Unless 'next_avail'
+ // and 'sid' are equivalent, in which case, a swap will occur but
+ // it is a no-op.)
+ next_avail = StateID::new(next_avail.one_more()).unwrap();
+ }
+ // Now we'd like to move the start states to immediately following the
+ // match states. (The start states may themselves be match states, but
+ // we'll handle that later.) We arrange the states this way so that we
+ // don't necessarily need to check whether a state is a start state or
+ // not before checking whether a state is a match state. For example,
+ // we'd like to be able to write this as our state machine loop:
+ //
+ // sid = start()
+ // for byte in haystack:
+ // sid = next(sid, byte)
+ // if sid <= nfa.max_start_id:
+ // if sid <= nfa.max_dead_id:
+ // # search complete
+ // elif sid <= nfa.max_match_id:
+ // # found match
+ //
+ // The important context here is that we might not want to look for
+ // start states at all. Namely, if a searcher doesn't have a prefilter,
+ // then there is no reason to care about whether we're in a start state
+ // or not. And indeed, if we did check for it, this very hot loop would
+ // ping pong between the special state handling and the main state
+ // transition logic. This in turn stalls the CPU by killing branch
+ // prediction.
+ //
+ // So essentially, we really want to be able to "forget" that start
+ // states even exist and this is why we put them at the end.
+ let new_start_aid =
+ StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
+ .unwrap();
+ remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
+ let new_start_uid =
+ StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
+ .unwrap();
+ remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
+ let new_max_match_id =
+ StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
+ .unwrap();
+ self.nfa.special.max_match_id = new_max_match_id;
+ self.nfa.special.start_unanchored_id = new_start_uid;
+ self.nfa.special.start_anchored_id = new_start_aid;
+ // If one start state is a match state, then they both are.
+ if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
+ self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
+ }
+ remapper.remap(&mut self.nfa);
+ }
+
+ /// Attempts to convert the transition representation of a subset of states
+ /// in this NFA from sparse to dense. This can greatly improve search
+ /// performance since states with a higher number of transitions tend to
+ /// correlate with very active states.
+ ///
+ /// We generally only densify states that are close to the start state.
+ /// These tend to be the most active states and thus benefit from a dense
+ /// representation more than other states.
+ ///
+    /// This tends to strike the best balance between memory usage and
+    /// performance. In particular, the *vast majority* of all states in a
+    /// typical Aho-Corasick automaton have only 1 transition and are usually
+    /// farther from the start state, and thus don't get densified.
+ ///
+ /// Note that this doesn't remove the sparse representation of transitions
+ /// for states that are densified. It could be done, but actually removing
+ /// entries from `NFA::sparse` is likely more expensive than it's worth.
+ fn densify(&mut self) -> Result<(), BuildError> {
+ for i in 0..self.nfa.states.len() {
+ let sid = StateID::new(i).unwrap();
+ // Don't bother densifying states that are only used as sentinels.
+ if sid == NFA::DEAD || sid == NFA::FAIL {
+ continue;
+ }
+ // Only densify states that are "close enough" to the start state.
+ if self.nfa.states[sid].depth.as_usize()
+ >= self.builder.dense_depth
+ {
+ continue;
+ }
+ let dense = self.nfa.alloc_dense_state()?;
+ let mut prev_link = None;
+ while let Some(link) = self.nfa.next_link(sid, prev_link) {
+ prev_link = Some(link);
+ let t = self.nfa.sparse[link];
+
+ let class = usize::from(self.nfa.byte_classes.get(t.byte));
+ let index = dense.as_usize() + class;
+ self.nfa.dense[index] = t.next;
+ }
+ self.nfa.states[sid].dense = dense;
+ }
+ Ok(())
+ }
+
+    /// Returns a set that tracks queued states.
+ ///
+ /// This is only necessary when ASCII case insensitivity is enabled, since
+ /// it is the only way to visit the same state twice. Otherwise, this
+    /// returns an inert set that never adds anything and always reports
+ /// `false` for every member test.
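+    ///
+    /// For example, with ASCII case insensitivity enabled, a pattern like
+    /// `ab` gets transitions for both `a` and `A` out of the start state
+    /// that lead to the same child state, so a breadth-first traversal of
+    /// the trie could otherwise enqueue that child twice.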
+ fn queued_set(&self) -> QueuedSet {
+ if self.builder.ascii_case_insensitive {
+ QueuedSet::active()
+ } else {
+ QueuedSet::inert()
+ }
+ }
+
+    /// Initializes the unanchored and anchored start states by making them
+    /// dense. This is achieved by explicitly setting every transition to the
+    /// FAIL state. This isn't necessary for correctness, since any missing
+    /// transition is automatically assumed to be mapped to the FAIL state.
+    /// We do this to make the starting states dense, and thus in turn make
+    /// transition lookups on them faster. (Which is worth doing because the
+    /// unanchored start state is the most active state.)
+ fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> {
+ let start_uid = self.nfa.special.start_unanchored_id;
+ let start_aid = self.nfa.special.start_anchored_id;
+ self.nfa.init_full_state(start_uid, NFA::FAIL)?;
+ self.nfa.init_full_state(start_aid, NFA::FAIL)?;
+ Ok(())
+ }
+
+    /// Set up the anchored start state by copying all of the transitions and
+ /// matches from the unanchored starting state with one change: the failure
+ /// transition is changed to the DEAD state, so that for any undefined
+ /// transitions, the search will stop.
+ fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
+ let start_uid = self.nfa.special.start_unanchored_id;
+ let start_aid = self.nfa.special.start_anchored_id;
+ let (mut uprev_link, mut aprev_link) = (None, None);
+ loop {
+ let unext = self.nfa.next_link(start_uid, uprev_link);
+ let anext = self.nfa.next_link(start_aid, aprev_link);
+ let (ulink, alink) = match (unext, anext) {
+ (Some(ulink), Some(alink)) => (ulink, alink),
+ (None, None) => break,
+ _ => unreachable!(),
+ };
+ uprev_link = Some(ulink);
+ aprev_link = Some(alink);
+ self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
+ }
+ self.nfa.copy_matches(start_uid, start_aid)?;
+ // This is the main difference between the unanchored and anchored
+ // starting states. If a lookup on an anchored starting state fails,
+ // then the search should stop.
+ //
+ // N.B. This assumes that the loop on the unanchored starting state
+ // hasn't been created yet.
+ self.nfa.states[start_aid].fail = NFA::DEAD;
+ Ok(())
+ }
+
+ /// Set the failure transitions on the start state to loop back to the
+ /// start state. This effectively permits the Aho-Corasick automaton to
+    /// match at any position. This is also required so that the search for
+    /// the next state always terminates; namely, finding the next state
+    /// should never return a fail_id.
+ ///
+ /// This must be done after building the initial trie, since trie
+ /// construction depends on transitions to `fail_id` to determine whether a
+ /// state already exists or not.
+ fn add_unanchored_start_state_loop(&mut self) {
+ let start_uid = self.nfa.special.start_unanchored_id;
+ let mut prev_link = None;
+ while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
+ prev_link = Some(link);
+ if self.nfa.sparse[link].next() == NFA::FAIL {
+ self.nfa.sparse[link].next = start_uid;
+ }
+ }
+ }
+
+ /// Remove the start state loop by rewriting any transitions on the start
+ /// state back to the start state with transitions to the dead state.
+ ///
+ /// The loop is only closed when two conditions are met: the start state
+ /// is a match state and the match kind is leftmost-first or
+ /// leftmost-longest.
+ ///
+ /// The reason for this is that under leftmost semantics, a start state
+ /// that is also a match implies that we should never restart the search
+ /// process. We allow normal transitions out of the start state, but if
+ /// none exist, we transition to the dead state, which signals that
+ /// searching should stop.
+ fn close_start_state_loop_for_leftmost(&mut self) {
+ let start_uid = self.nfa.special.start_unanchored_id;
+ let start = &mut self.nfa.states[start_uid];
+ let dense = start.dense;
+ if self.builder.match_kind.is_leftmost() && start.is_match() {
+ let mut prev_link = None;
+ while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
+ prev_link = Some(link);
+ if self.nfa.sparse[link].next() == start_uid {
+ self.nfa.sparse[link].next = NFA::DEAD;
+ if dense != StateID::ZERO {
+ let b = self.nfa.sparse[link].byte;
+ let class = usize::from(self.nfa.byte_classes.get(b));
+ self.nfa.dense[dense.as_usize() + class] = NFA::DEAD;
+ }
+ }
+ }
+ }
+ }
+
+ /// Sets all transitions on the dead state to point back to the dead state.
+ /// Normally, missing transitions map back to the failure state, but the
+ /// point of the dead state is to act as a sink that can never be escaped.
+ fn add_dead_state_loop(&mut self) -> Result<(), BuildError> {
+ self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?;
+ Ok(())
+ }
+}
+
+/// A set of state identifiers used to avoid revisiting the same state multiple
+/// times when filling in failure transitions.
+///
+/// This set has an "inert" and an "active" mode. When inert, the set never
+/// stores anything and always returns `false` for every member test. This is
+/// useful to avoid the performance and memory overhead of maintaining this
+/// set when it is not needed.
+#[derive(Debug)]
+struct QueuedSet {
+ set: Option<BTreeSet<StateID>>,
+}
+
+impl QueuedSet {
+ /// Return an inert set that returns `false` for every state ID membership
+ /// test.
+ fn inert() -> QueuedSet {
+ QueuedSet { set: None }
+ }
+
+ /// Return an active set that tracks state ID membership.
+ fn active() -> QueuedSet {
+ QueuedSet { set: Some(BTreeSet::new()) }
+ }
+
+ /// Inserts the given state ID into this set. (If the set is inert, then
+ /// this is a no-op.)
+ fn insert(&mut self, state_id: StateID) {
+ if let Some(ref mut set) = self.set {
+ set.insert(state_id);
+ }
+ }
+
+ /// Returns true if and only if the given state ID is in this set. If the
+ /// set is inert, this always returns false.
+ fn contains(&self, state_id: StateID) -> bool {
+ match self.set {
+ None => false,
+ Some(ref set) => set.contains(&state_id),
+ }
+ }
+}
+
+impl core::fmt::Debug for NFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ use crate::{
+ automaton::{fmt_state_indicator, sparse_transitions},
+ util::debug::DebugByte,
+ };
+
+ writeln!(f, "noncontiguous::NFA(")?;
+ for (sid, state) in self.states.iter().with_state_ids() {
+ // The FAIL state doesn't actually have space for a state allocated
+ // for it, so we have to treat it as a special case.
+ if sid == NFA::FAIL {
+ writeln!(f, "F {:06}:", sid.as_usize())?;
+ continue;
+ }
+ fmt_state_indicator(f, self, sid)?;
+ write!(
+ f,
+ "{:06}({:06}): ",
+ sid.as_usize(),
+ state.fail.as_usize()
+ )?;
+
+ let it = sparse_transitions(
+ self.iter_trans(sid).map(|t| (t.byte, t.next)),
+ )
+ .enumerate();
+ for (i, (start, end, sid)) in it {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(
+ f,
+ "{:?} => {:?}",
+ DebugByte(start),
+ sid.as_usize()
+ )?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ sid.as_usize()
+ )?;
+ }
+ }
+
+ write!(f, "\n")?;
+ if self.is_match(sid) {
+ write!(f, " matches: ")?;
+ for (i, pid) in self.iter_matches(sid).enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{}", pid.as_usize())?;
+ }
+ write!(f, "\n")?;
+ }
+ }
+ writeln!(f, "match kind: {:?}", self.match_kind)?;
+ writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
+ writeln!(f, "state length: {:?}", self.states.len())?;
+ writeln!(f, "pattern length: {:?}", self.patterns_len())?;
+ writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
+ writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
+ writeln!(f, "memory usage: {:?}", self.memory_usage())?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/api.rs b/third_party/rust/aho-corasick/src/packed/api.rs
new file mode 100644
index 0000000000..44f0bc9be3
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/api.rs
@@ -0,0 +1,687 @@
+use alloc::sync::Arc;
+
+use crate::{
+ packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy},
+ util::search::{Match, Span},
+};
+
+/// This is a limit placed on the total number of patterns we're willing to try
+/// and match at once. As more sophisticated algorithms are added, this number
+/// may be increased.
+const PATTERN_LIMIT: usize = 128;
+
+/// A knob for controlling the match semantics of a packed multiple string
+/// searcher.
+///
+/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
+/// crate module in that it doesn't support "standard" match semantics,
+/// and instead only supports leftmost-first or leftmost-longest. Namely,
+/// "standard" semantics cannot be easily supported by packed searchers.
+///
+/// For more information on the distinction between leftmost-first and
+/// leftmost-longest, see the docs on the top-level `MatchKind` type.
+///
+/// Unlike the top-level `MatchKind` type, the default match semantics for this
+/// type are leftmost-first.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[non_exhaustive]
+pub enum MatchKind {
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This is the default.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ LeftmostLongest,
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// The configuration for a packed multiple pattern searcher.
+///
+/// The configuration is currently limited only to being able to select the
+/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
+/// future, more knobs may be made available.
+///
+/// A configuration produces a [`packed::Builder`](Builder), which in turn can
+/// be used to construct a [`packed::Searcher`](Searcher) for searching.
+///
+/// # Example
+///
+/// This example shows how to use leftmost-longest semantics instead of the
+/// default (leftmost-first).
+///
+/// ```
+/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Config::new()
+/// .match_kind(MatchKind::LeftmostLongest)
+/// .builder()
+/// .add("foo")
+/// .add("foobar")
+/// .build()?;
+/// let matches: Vec<PatternID> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![PatternID::must(1)], matches);
+/// # Some(()) }
+/// # if cfg!(all(feature = "std", any(
+/// # target_arch = "x86_64", target_arch = "aarch64",
+/// # ))) {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+ kind: MatchKind,
+ force: Option<ForceAlgorithm>,
+ only_teddy_fat: Option<bool>,
+ only_teddy_256bit: Option<bool>,
+ heuristic_pattern_limits: bool,
+}
+
+/// An internal option for forcing the use of a particular packed algorithm.
+///
+/// When an algorithm is forced, if a searcher could not be constructed for it,
+/// then no searcher will be returned even if an alternative algorithm would
+/// work.
+#[derive(Clone, Debug)]
+enum ForceAlgorithm {
+ Teddy,
+ RabinKarp,
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config::new()
+ }
+}
+
+impl Config {
+ /// Create a new default configuration. A default configuration uses
+ /// leftmost-first match semantics.
+ pub fn new() -> Config {
+ Config {
+ kind: MatchKind::LeftmostFirst,
+ force: None,
+ only_teddy_fat: None,
+ only_teddy_256bit: None,
+ heuristic_pattern_limits: true,
+ }
+ }
+
+ /// Create a packed builder from this configuration. The builder can be
+ /// used to accumulate patterns and create a [`Searcher`] from them.
+ pub fn builder(&self) -> Builder {
+ Builder::from_config(self.clone())
+ }
+
+ /// Set the match semantics for this configuration.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.kind = kind;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_teddy(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::Teddy);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Fat Teddy algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
+ self.only_teddy_fat = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of SSE (`Some(false)`) or
+ /// AVX (`Some(true)`) algorithms.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config {
+ self.only_teddy_256bit = yes;
+ self
+ }
+
+ /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
+ ///
+ /// This is only exposed for more precise testing and benchmarks. Callers
+ /// should not use it as it is not part of the API stability guarantees of
+ /// this crate.
+ #[doc(hidden)]
+ pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config {
+ if yes {
+ self.force = Some(ForceAlgorithm::RabinKarp);
+ } else {
+ self.force = None;
+ }
+ self
+ }
+
+    /// Request that heuristic limitations on the number of patterns be
+    /// employed. This is useful to disable for benchmarking, where one wants
+    /// to explore how Teddy performs on a large number of patterns even if
+    /// the heuristics would otherwise refuse construction.
+ ///
+ /// This is enabled by default.
+ pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config {
+ self.heuristic_pattern_limits = yes;
+ self
+ }
+}
+
+/// A builder for constructing a packed searcher from a collection of patterns.
+///
+/// # Example
+///
+/// This example shows how to use a builder to construct a searcher. By
+/// default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Builder::new()
+/// .add("foobar")
+/// .add("foo")
+/// .build()?;
+/// let matches: Vec<PatternID> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![PatternID::ZERO], matches);
+/// # Some(()) }
+/// # if cfg!(all(feature = "std", any(
+/// # target_arch = "x86_64", target_arch = "aarch64",
+/// # ))) {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ /// The configuration of this builder and subsequent matcher.
+ config: Config,
+ /// Set to true if the builder detects that a matcher cannot be built.
+ inert: bool,
+ /// The patterns provided by the caller.
+ patterns: Patterns,
+}
+
+impl Builder {
+ /// Create a new builder for constructing a multi-pattern searcher. This
+ /// constructor uses the default configuration.
+ pub fn new() -> Builder {
+ Builder::from_config(Config::new())
+ }
+
+ fn from_config(config: Config) -> Builder {
+ Builder { config, inert: false, patterns: Patterns::new() }
+ }
+
+ /// Build a searcher from the patterns added to this builder so far.
+ pub fn build(&self) -> Option<Searcher> {
+ if self.inert || self.patterns.is_empty() {
+ return None;
+ }
+ let mut patterns = self.patterns.clone();
+ patterns.set_match_kind(self.config.kind);
+ let patterns = Arc::new(patterns);
+ let rabinkarp = RabinKarp::new(&patterns);
+ // Effectively, we only want to return a searcher if we can use Teddy,
+ // since Teddy is our only fast packed searcher at the moment.
+ // Rabin-Karp is only used when searching haystacks smaller than what
+ // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
+ // is to force it using undocumented APIs (for tests/benchmarks).
+ let (search_kind, minimum_len) = match self.config.force {
+ None | Some(ForceAlgorithm::Teddy) => {
+ debug!("trying to build Teddy packed matcher");
+ let teddy = match self.build_teddy(Arc::clone(&patterns)) {
+ None => return None,
+ Some(teddy) => teddy,
+ };
+ let minimum_len = teddy.minimum_len();
+ (SearchKind::Teddy(teddy), minimum_len)
+ }
+ Some(ForceAlgorithm::RabinKarp) => {
+ debug!("using Rabin-Karp packed matcher");
+ (SearchKind::RabinKarp, 0)
+ }
+ };
+ Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
+ }
+
+ fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> {
+ teddy::Builder::new()
+ .only_256bit(self.config.only_teddy_256bit)
+ .only_fat(self.config.only_teddy_fat)
+ .heuristic_pattern_limits(self.config.heuristic_pattern_limits)
+ .build(patterns)
+ }
+
+ /// Add the given pattern to this set to match.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
+ if self.inert {
+ return self;
+ } else if self.patterns.len() >= PATTERN_LIMIT {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ // Just in case PATTERN_LIMIT increases beyond u16::MAX.
+ assert!(self.patterns.len() <= core::u16::MAX as usize);
+
+ let pattern = pattern.as_ref();
+ if pattern.is_empty() {
+ self.inert = true;
+ self.patterns.reset();
+ return self;
+ }
+ self.patterns.add(pattern);
+ self
+ }
+
+ /// Add the given iterator of patterns to this set to match.
+ ///
+ /// The iterator must yield elements that can be converted into a `&[u8]`.
+ ///
+ /// The order in which patterns are added is significant. Namely, when
+ /// using leftmost-first match semantics, then when multiple patterns can
+ /// match at a particular location, the pattern that was added first is
+ /// used as the match.
+ ///
+ /// If the number of patterns added exceeds the amount supported by packed
+ /// searchers, then the builder will stop accumulating patterns and render
+ /// itself inert. At this point, constructing a searcher will always return
+ /// `None`.
+ pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ for p in patterns {
+ self.add(p);
+ }
+ self
+ }
+
+ /// Returns the number of patterns added to this builder.
+ pub fn len(&self) -> usize {
+ self.patterns.len()
+ }
+
+ /// Returns the length, in bytes, of the shortest pattern added.
+ pub fn minimum_len(&self) -> usize {
+ self.patterns.minimum_len()
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+/// A packed searcher for quickly finding occurrences of multiple patterns.
+///
+/// If callers need more flexible construction, or if one wants to change the
+/// match semantics (either leftmost-first or leftmost-longest), then one can
+/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
+///
+/// # Example
+///
+/// This example shows how to create a searcher from an iterator of patterns.
+/// By default, leftmost-first match semantics are used.
+///
+/// ```
+/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+///
+/// # fn example() -> Option<()> {
+/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+/// let matches: Vec<PatternID> = searcher
+/// .find_iter("foobar")
+/// .map(|mat| mat.pattern())
+/// .collect();
+/// assert_eq!(vec![PatternID::ZERO], matches);
+/// # Some(()) }
+/// # if cfg!(all(feature = "std", any(
+/// # target_arch = "x86_64", target_arch = "aarch64",
+/// # ))) {
+/// # example().unwrap()
+/// # } else {
+/// # assert!(example().is_none());
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct Searcher {
+ patterns: Arc<Patterns>,
+ rabinkarp: RabinKarp,
+ search_kind: SearchKind,
+ minimum_len: usize,
+}
+
+#[derive(Clone, Debug)]
+enum SearchKind {
+ Teddy(teddy::Searcher),
+ RabinKarp,
+}
+
+impl Searcher {
+ /// A convenience function for constructing a searcher from an iterator
+ /// of things that can be converted to a `&[u8]`.
+ ///
+ /// If a searcher could not be constructed (either because of an
+ /// unsupported CPU or because there are too many patterns), then `None`
+ /// is returned.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<PatternID> = searcher
+ /// .find_iter("foobar")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![PatternID::ZERO], matches);
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ pub fn new<I, P>(patterns: I) -> Option<Searcher>
+ where
+ I: IntoIterator<Item = P>,
+ P: AsRef<[u8]>,
+ {
+ Builder::new().extend(patterns).build()
+ }
+
+ /// A convenience function for calling `Config::new()`.
+ ///
+ /// This is useful for avoiding an additional import.
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// A convenience function for calling `Builder::new()`.
+ ///
+ /// This is useful for avoiding an additional import.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack. The `Match`
+ /// returned will include the identifier of the pattern that matched, which
+ /// corresponds to the index of the pattern (starting from `0`) in which it
+ /// was added.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find("foobar")?;
+ /// assert_eq!(PatternID::ZERO, mat.pattern());
+ /// assert_eq!(0, mat.start());
+ /// assert_eq!(6, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
+ let haystack = haystack.as_ref();
+ self.find_in(haystack, Span::from(0..haystack.len()))
+ }
+
+ /// Return the first occurrence of any of the patterns in this searcher,
+ /// according to its match semantics, in the given haystack starting from
+ /// the given position.
+ ///
+ /// The `Match` returned will include the identifier of the pattern that
+ /// matched, which corresponds to the index of the pattern (starting from
+ /// `0`) in which it was added. The offsets in the `Match` will be relative
+ /// to the start of `haystack` (and not `at`).
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let haystack = "foofoobar";
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
+ /// assert_eq!(PatternID::ZERO, mat.pattern());
+ /// assert_eq!(3, mat.start());
+ /// assert_eq!(9, mat.end());
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn find_in<B: AsRef<[u8]>>(
+ &self,
+ haystack: B,
+ span: Span,
+ ) -> Option<Match> {
+ let haystack = haystack.as_ref();
+ match self.search_kind {
+ SearchKind::Teddy(ref teddy) => {
+ if haystack[span].len() < teddy.minimum_len() {
+ return self.find_in_slow(haystack, span);
+ }
+ teddy.find(&haystack[..span.end], span.start)
+ }
+ SearchKind::RabinKarp => {
+ self.rabinkarp.find_at(&haystack[..span.end], span.start)
+ }
+ }
+ }
+
+ /// Return an iterator of non-overlapping occurrences of the patterns in
+ /// this searcher, according to its match semantics, in the given haystack.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// let matches: Vec<PatternID> = searcher
+ /// .find_iter("foobar fooba foofoo")
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(1),
+ /// PatternID::must(1),
+ /// PatternID::must(1),
+ /// ], matches);
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ haystack: &'b B,
+ ) -> FindIter<'a, 'b> {
+ let haystack = haystack.as_ref();
+ let span = Span::from(0..haystack.len());
+ FindIter { searcher: self, haystack, span }
+ }
+
+ /// Returns the match kind used by this packed searcher.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use aho_corasick::packed::{MatchKind, Searcher};
+ ///
+ /// # fn example() -> Option<()> {
+ /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+ /// // leftmost-first is the default.
+ /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
+ /// # Some(()) }
+ /// # if cfg!(all(feature = "std", any(
+ /// # target_arch = "x86_64", target_arch = "aarch64",
+ /// # ))) {
+ /// # example().unwrap()
+ /// # } else {
+ /// # assert!(example().is_none());
+ /// # }
+ /// ```
+ #[inline]
+ pub fn match_kind(&self) -> &MatchKind {
+ self.patterns.match_kind()
+ }
+
+ /// Returns the minimum length of a haystack that is required in order for
+ /// packed searching to be effective.
+ ///
+ /// In some cases, the underlying packed searcher may not be able to search
+ /// very short haystacks. When that occurs, the implementation will defer
+ /// to a slower non-packed searcher (which is still generally faster than
+ /// Aho-Corasick for a small number of patterns). However, callers may
+ /// want to avoid ever using the slower variant, which one can do by
+ /// never passing a haystack shorter than the minimum length returned by
+ /// this method.
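+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of checking this before searching (the patterns and
+    /// haystack here are only illustrative):
+    ///
+    /// ```
+    /// use aho_corasick::packed::Searcher;
+    ///
+    /// # fn example() -> Option<()> {
+    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+    /// let haystack = "foobar";
+    /// if haystack.len() >= searcher.minimum_len() {
+    ///     // The fast vectorized search path can be used directly.
+    ///     assert!(searcher.find(haystack).is_some());
+    /// } else {
+    ///     // Short haystacks still work, but fall back to a slower
+    ///     // non-packed search internally. A caller that wants to avoid
+    ///     // that fallback could route short haystacks elsewhere.
+    ///     assert!(searcher.find(haystack).is_some());
+    /// }
+    /// # Some(()) }
+    /// # if cfg!(all(feature = "std", any(
+    /// #     target_arch = "x86_64", target_arch = "aarch64",
+    /// # ))) {
+    /// #     example().unwrap()
+    /// # } else {
+    /// #     assert!(example().is_none());
+    /// # }
+    /// ```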
+ #[inline]
+ pub fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ self.patterns.memory_usage()
+ + self.rabinkarp.memory_usage()
+ + self.search_kind.memory_usage()
+ }
+
+ /// Use a slow (non-packed) searcher.
+ ///
+ /// This is useful when a packed searcher could be constructed, but could
+ /// not be used to search a specific haystack. For example, if Teddy was
+ /// built but the haystack is smaller than ~34 bytes, then Teddy might not
+ /// be able to run.
+ fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
+ self.rabinkarp.find_at(&haystack[..span.end], span.start)
+ }
+}
+
+impl SearchKind {
+ fn memory_usage(&self) -> usize {
+ match *self {
+ SearchKind::Teddy(ref ted) => ted.memory_usage(),
+ SearchKind::RabinKarp => 0,
+ }
+ }
+}
+
+/// An iterator over non-overlapping matches from a packed searcher.
+///
+/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
+/// while the lifetime `'h` refers to the lifetime of the haystack being
+/// searched.
+#[derive(Debug)]
+pub struct FindIter<'s, 'h> {
+ searcher: &'s Searcher,
+ haystack: &'h [u8],
+ span: Span,
+}
+
+impl<'s, 'h> Iterator for FindIter<'s, 'h> {
+ type Item = Match;
+
+ fn next(&mut self) -> Option<Match> {
+ if self.span.start > self.span.end {
+ return None;
+ }
+ match self.searcher.find_in(&self.haystack, self.span) {
+ None => None,
+ Some(m) => {
+ self.span.start = m.end();
+ Some(m)
+ }
+ }
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/ext.rs b/third_party/rust/aho-corasick/src/packed/ext.rs
new file mode 100644
index 0000000000..b689642bca
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/ext.rs
@@ -0,0 +1,39 @@
+/// A trait for adding some helper routines to pointers.
+pub(crate) trait Pointer {
+ /// Returns the distance, in units of `T`, between `self` and `origin`.
+ ///
+ /// # Safety
+ ///
+    /// Same requirements as `ptr::offset_from`, with the additional
+    /// requirement that `self >= origin`.
+ unsafe fn distance(self, origin: Self) -> usize;
+
+ /// Casts this pointer to `usize`.
+ ///
+ /// Callers should not convert the `usize` back to a pointer if at all
+ /// possible. (And if you believe it's necessary, open an issue to discuss
+ /// why. Otherwise, it has the potential to violate pointer provenance.)
+ /// The purpose of this function is just to be able to do arithmetic, i.e.,
+ /// computing offsets or alignments.
+ fn as_usize(self) -> usize;
+}
+
+impl<T> Pointer for *const T {
+ unsafe fn distance(self, origin: *const T) -> usize {
+ // TODO: Replace with `ptr::sub_ptr` once stabilized.
+ usize::try_from(self.offset_from(origin)).unwrap_unchecked()
+ }
+
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
+
+impl<T> Pointer for *mut T {
+ unsafe fn distance(self, origin: *mut T) -> usize {
+ (self as *const T).distance(origin as *const T)
+ }
+
+ fn as_usize(self) -> usize {
+ (self as *const T).as_usize()
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/mod.rs b/third_party/rust/aho-corasick/src/packed/mod.rs
new file mode 100644
index 0000000000..3990bc9330
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/mod.rs
@@ -0,0 +1,120 @@
+/*!
+Provides packed multiple substring search, principally for a small number of
+patterns.
+
+This sub-module provides vectorized routines for quickly finding
+matches of a small number of patterns. In general, users of this crate
+shouldn't need to interface with this module directly, as the primary
+[`AhoCorasick`](crate::AhoCorasick) searcher will use these routines
+automatically as a prefilter when applicable. However, in some cases, callers
+may want to bypass the Aho-Corasick machinery entirely and use this vectorized
+searcher directly.
+
+# Overview
+
+The primary types in this sub-module are:
+
+* [`Searcher`] executes the actual search algorithm to report matches in a
+haystack.
+* [`Builder`] accumulates patterns incrementally and can construct a
+`Searcher`.
+* [`Config`] permits tuning the searcher, and itself will produce a `Builder`
+(which can then be used to build a `Searcher`). Currently, the only tuneable
+knob is the match semantics, but this may be expanded in the future.
+
+# Examples
+
+This example shows how to create a searcher from an iterator of patterns.
+By default, leftmost-first match semantics are used. (See the top-level
+[`MatchKind`] type for more details about match semantics, which apply
+similarly to packed substring search.)
+
+```
+use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+
+# fn example() -> Option<()> {
+let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+let matches: Vec<PatternID> = searcher
+ .find_iter("foobar")
+ .map(|mat| mat.pattern())
+ .collect();
+assert_eq!(vec![PatternID::ZERO], matches);
+# Some(()) }
+# if cfg!(all(feature = "std", any(
+# target_arch = "x86_64", target_arch = "aarch64",
+# ))) {
+# example().unwrap()
+# } else {
+# assert!(example().is_none());
+# }
+```
+
+This example shows how to use [`Config`] to change the match semantics to
+leftmost-longest:
+
+```
+use aho_corasick::{packed::{Config, MatchKind}, PatternID};
+
+# fn example() -> Option<()> {
+let searcher = Config::new()
+ .match_kind(MatchKind::LeftmostLongest)
+ .builder()
+ .add("foo")
+ .add("foobar")
+ .build()?;
+let matches: Vec<PatternID> = searcher
+ .find_iter("foobar")
+ .map(|mat| mat.pattern())
+ .collect();
+assert_eq!(vec![PatternID::must(1)], matches);
+# Some(()) }
+# if cfg!(all(feature = "std", any(
+# target_arch = "x86_64", target_arch = "aarch64",
+# ))) {
+# example().unwrap()
+# } else {
+# assert!(example().is_none());
+# }
+```
+
+# Packed substring searching
+
+Packed substring searching refers to the use of SIMD (Single Instruction,
+Multiple Data) to accelerate the detection of matches in a haystack. Unlike
+conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
+search tend to do better with a small number of patterns, whereas Aho-Corasick
+generally maintains reasonably consistent performance regardless of the number
+of patterns you give it. Because of this, the vectorized searcher in this
+sub-module cannot be used as a general purpose searcher, since building the
+searcher may fail even when given a small number of patterns. However, in
+exchange, when searching for a small number of patterns, searching can be quite
+a bit faster than Aho-Corasick (sometimes by an order of magnitude).
+
+The key takeaway here is that constructing a searcher from a list of patterns
+is a fallible operation with no clear rules for when it will fail. While the
+precise conditions under which building a searcher can fail are an
+implementation detail, here are some common reasons (a sketch of handling a
+failed construction follows this list):
+
+* Too many patterns were given. Typically, the limit is on the order of 100 or
+ so, but this limit may fluctuate based on available CPU features.
+* The available packed algorithms require CPU features that aren't available.
+ For example, currently, this crate only provides packed algorithms for
+ `x86_64` and `aarch64`. Therefore, constructing a packed searcher on any
+ other target will always fail.
+* Zero patterns were given, or one of the patterns given was empty. Packed
+ searchers require at least one pattern and that all patterns are non-empty.
+* Something else about the nature of the patterns (typically based on
+ heuristics) suggests that a packed searcher would perform very poorly, so
+ no searcher is built.
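+
+One possible way to handle a failed construction is to fall back to the
+general purpose [`AhoCorasick`](crate::AhoCorasick) searcher. A minimal
+sketch (the patterns here are only illustrative):
+
+```
+use aho_corasick::{packed, AhoCorasick};
+
+let patterns = &["foobar", "foo"];
+match packed::Searcher::new(patterns.iter().cloned()) {
+    Some(searcher) => {
+        // The vectorized searcher was built successfully.
+        assert!(searcher.find("foobar").is_some());
+    }
+    None => {
+        // Fall back to the general purpose Aho-Corasick searcher, which can
+        // always be built for a small, non-empty set of patterns.
+        let ac = AhoCorasick::new(patterns).unwrap();
+        assert!(ac.find("foobar").is_some());
+    }
+}
+```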
+*/
+
+pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
+
+mod api;
+mod ext;
+mod pattern;
+mod rabinkarp;
+mod teddy;
+#[cfg(all(feature = "std", test))]
+mod tests;
+mod vector;
diff --git a/third_party/rust/aho-corasick/src/packed/pattern.rs b/third_party/rust/aho-corasick/src/packed/pattern.rs
new file mode 100644
index 0000000000..95aca4d95b
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/pattern.rs
@@ -0,0 +1,480 @@
+use core::{cmp, fmt, mem, u16, usize};
+
+use alloc::{boxed::Box, string::String, vec, vec::Vec};
+
+use crate::{
+ packed::{api::MatchKind, ext::Pointer},
+ PatternID,
+};
+
+/// A non-empty collection of non-empty patterns to search for.
+///
+/// This collection of patterns is what is passed around to both execute
+/// searches and to construct the searchers themselves. Namely, this permits
+/// searches to avoid copying all of the patterns, and allows us to keep only
+/// one copy throughout all packed searchers.
+///
+/// Note that this collection is not a set. The same pattern can appear more
+/// than once.
+#[derive(Clone, Debug)]
+pub(crate) struct Patterns {
+ /// The match semantics supported by this collection of patterns.
+ ///
+ /// The match semantics determines the order of the iterator over patterns.
+ /// For leftmost-first, patterns are provided in the same order as were
+ /// provided by the caller. For leftmost-longest, patterns are provided in
+ /// descending order of length, with ties broken by the order in which they
+ /// were provided by the caller.
+ kind: MatchKind,
+ /// The collection of patterns, indexed by their identifier.
+ by_id: Vec<Vec<u8>>,
+ /// The order of patterns defined for iteration, given by pattern
+ /// identifiers. The order of `by_id` and `order` is always the same for
+ /// leftmost-first semantics, but may be different for leftmost-longest
+ /// semantics.
+ order: Vec<PatternID>,
+ /// The length of the smallest pattern, in bytes.
+ minimum_len: usize,
+ /// The total number of pattern bytes across the entire collection. This
+ /// is used for reporting total heap usage in constant time.
+ total_pattern_bytes: usize,
+}
+
+// BREADCRUMBS: I think we want to experiment with a different bucket
+// representation. Basically, each bucket is just a Range<usize> to a single
+// contiguous allocation? Maybe length-prefixed patterns or something? The
+// idea is to try to get rid of the pointer chasing in verification. I don't
+// know that that is the issue, but I suspect it is.
+
+impl Patterns {
+ /// Create a new collection of patterns for the given match semantics. The
+ /// ID of each pattern is the index of the pattern at which it occurs in
+ /// the `by_id` slice.
+ ///
+ /// If any of the patterns in the slice given are empty, then this panics.
+ /// Similarly, if the number of patterns given is zero, then this also
+ /// panics.
+ pub(crate) fn new() -> Patterns {
+ Patterns {
+ kind: MatchKind::default(),
+ by_id: vec![],
+ order: vec![],
+ minimum_len: usize::MAX,
+ total_pattern_bytes: 0,
+ }
+ }
+
+ /// Add a pattern to this collection.
+ ///
+ /// This panics if the pattern given is empty.
+ pub(crate) fn add(&mut self, bytes: &[u8]) {
+ assert!(!bytes.is_empty());
+ assert!(self.by_id.len() <= u16::MAX as usize);
+
+ let id = PatternID::new(self.by_id.len()).unwrap();
+ self.order.push(id);
+ self.by_id.push(bytes.to_vec());
+ self.minimum_len = cmp::min(self.minimum_len, bytes.len());
+ self.total_pattern_bytes += bytes.len();
+ }
+
+ /// Set the match kind semantics for this collection of patterns.
+ ///
+ /// If the kind is not set, then the default is leftmost-first.
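+    ///
+    /// As a small illustration: with patterns `foo` (ID 0) and `foobar`
+    /// (ID 1) added in that order, leftmost-first leaves the iteration order
+    /// as `[0, 1]`, while leftmost-longest reorders it to `[1, 0]` so that
+    /// the longer pattern is tried first.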
+ pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
+ self.kind = kind;
+ match self.kind {
+ MatchKind::LeftmostFirst => {
+ self.order.sort();
+ }
+ MatchKind::LeftmostLongest => {
+ let (order, by_id) = (&mut self.order, &mut self.by_id);
+ order.sort_by(|&id1, &id2| {
+ by_id[id1].len().cmp(&by_id[id2].len()).reverse()
+ });
+ }
+ }
+ }
+
+ /// Return the number of patterns in this collection.
+ ///
+ /// This is guaranteed to be greater than zero.
+ pub(crate) fn len(&self) -> usize {
+ self.by_id.len()
+ }
+
+ /// Returns true if and only if this collection of patterns is empty.
+ pub(crate) fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the approximate total amount of heap used by these patterns, in
+ /// units of bytes.
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.order.len() * mem::size_of::<PatternID>()
+ + self.by_id.len() * mem::size_of::<Vec<u8>>()
+ + self.total_pattern_bytes
+ }
+
+ /// Clears all heap memory associated with this collection of patterns and
+ /// resets all state such that it is a valid empty collection.
+ pub(crate) fn reset(&mut self) {
+ self.kind = MatchKind::default();
+ self.by_id.clear();
+ self.order.clear();
+ self.minimum_len = usize::MAX;
+ }
+
+ /// Returns the length, in bytes, of the smallest pattern.
+ ///
+ /// This is guaranteed to be at least one.
+ pub(crate) fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+
+ /// Returns the match semantics used by these patterns.
+ pub(crate) fn match_kind(&self) -> &MatchKind {
+ &self.kind
+ }
+
+ /// Return the pattern with the given identifier. If such a pattern does
+ /// not exist, then this panics.
+ pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
+ Pattern(&self.by_id[id])
+ }
+
+ /// Return the pattern with the given identifier without performing bounds
+ /// checks.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that a pattern with the given identifier exists
+ /// before using this method.
+ pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
+ Pattern(self.by_id.get_unchecked(id.as_usize()))
+ }
+
+ /// Return an iterator over all the patterns in this collection, in the
+ /// order in which they should be matched.
+ ///
+ /// Specifically, in a naive multi-pattern matcher, the following is
+ /// guaranteed to satisfy the match semantics of this collection of
+ /// patterns:
+ ///
+ /// ```ignore
+ /// for i in 0..haystack.len():
+ /// for p in patterns.iter():
+ /// if haystack[i..].starts_with(p.bytes()):
+ /// return Match(p.id(), i, i + p.bytes().len())
+ /// ```
+ ///
+ /// Namely, among the patterns in a collection, if they are matched in
+ /// the order provided by this iterator, then the result is guaranteed
+ /// to satisfy the correct match semantics. (Either leftmost-first or
+ /// leftmost-longest.)
+ pub(crate) fn iter(&self) -> PatternIter<'_> {
+ PatternIter { patterns: self, i: 0 }
+ }
+}
+
+/// An iterator over the patterns in the `Patterns` collection.
+///
+/// The order of the patterns provided by this iterator is consistent with the
+/// match semantics of the originating collection of patterns.
+///
+/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
+/// this is iterating over.
+#[derive(Debug)]
+pub(crate) struct PatternIter<'p> {
+ patterns: &'p Patterns,
+ i: usize,
+}
+
+impl<'p> Iterator for PatternIter<'p> {
+ type Item = (PatternID, Pattern<'p>);
+
+ fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
+ if self.i >= self.patterns.len() {
+ return None;
+ }
+ let id = self.patterns.order[self.i];
+ let p = self.patterns.get(id);
+ self.i += 1;
+ Some((id, p))
+ }
+}
+
+/// A pattern that is used in packed searching.
+#[derive(Clone)]
+pub(crate) struct Pattern<'a>(&'a [u8]);
+
+impl<'a> fmt::Debug for Pattern<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("Pattern")
+ .field("lit", &String::from_utf8_lossy(&self.0))
+ .finish()
+ }
+}
+
+impl<'p> Pattern<'p> {
+ /// Returns the length of this pattern, in bytes.
+ pub(crate) fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// Returns the bytes of this pattern.
+ pub(crate) fn bytes(&self) -> &[u8] {
+ &self.0
+ }
+
+ /// Returns the first `len` low nybbles from this pattern. If this pattern
+ /// is shorter than `len`, then this panics.
+ pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {
+ let mut nybs = vec![0; len].into_boxed_slice();
+ for (i, byte) in self.bytes().iter().take(len).enumerate() {
+ nybs[i] = byte & 0xF;
+ }
+ nybs
+ }
+
+ /// Returns true if this pattern is a prefix of the given bytes.
+ #[inline(always)]
+ pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {
+ is_prefix(bytes, self.bytes())
+ }
+
+ /// Returns true if this pattern is a prefix of the haystack given by the
+ /// raw `start` and `end` pointers.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn is_prefix_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> bool {
+ let patlen = self.bytes().len();
+ let haylen = end.distance(start);
+ if patlen > haylen {
+ return false;
+ }
+ // SAFETY: We've checked that the haystack has length at least equal
+ // to this pattern. All other safety concerns are the responsibility
+ // of the caller.
+ is_equal_raw(start, self.bytes().as_ptr(), patlen)
+ }
+}
+
+/// Returns true if and only if `needle` is a prefix of `haystack`.
+///
+/// This uses a latency optimized variant of `memcmp` internally which *might*
+/// make this faster for very short strings.
+///
+/// # Inlining
+///
+/// This routine is marked `inline(always)`. If you want to call this function
+/// in a way that is not always inlined, you'll need to wrap a call to it in
+/// another function that is marked as `inline(never)` or just `inline`.
+#[inline(always)]
+fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
+ if needle.len() > haystack.len() {
+ return false;
+ }
+ // SAFETY: Our pointers are derived directly from borrowed slices which
+ // uphold all of our safety guarantees except for length. We account for
+ // length with the check above.
+ unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }
+}
+
+/// Compare corresponding bytes in `x` and `y` for equality.
+///
+/// That is, this returns true if and only if `x.len() == y.len()` and
+/// `x[i] == y[i]` for all `0 <= i < x.len()`.
+///
+/// Note that this isn't used in the main code paths. We only use it in tests
+/// as a convenient way of testing `is_equal_raw`.
+///
+/// # Inlining
+///
+/// This routine is marked `inline(always)`. If you want to call this function
+/// in a way that is not always inlined, you'll need to wrap a call to it in
+/// another function that is marked as `inline(never)` or just `inline`.
+///
+/// # Motivation
+///
+/// Why not use slice equality instead? Well, slice equality usually results in
+/// a call out to the current platform's `libc` which might not be inlineable
+/// or have other overhead. This routine isn't guaranteed to be a win, but it
+/// might be in some cases.
+#[cfg(test)]
+#[inline(always)]
+fn is_equal(x: &[u8], y: &[u8]) -> bool {
+ if x.len() != y.len() {
+ return false;
+ }
+ // SAFETY: Our pointers are derived directly from borrowed slices which
+ // uphold all of our safety guarantees except for length. We account for
+ // length with the check above.
+ unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
+}
+
+/// Compare `n` bytes at the given pointers for equality.
+///
+/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
+/// `0 <= i < n`.
+///
+/// # Inlining
+///
+/// This routine is marked `inline(always)`. If you want to call this function
+/// in a way that is not always inlined, you'll need to wrap a call to it in
+/// another function that is marked as `inline(never)` or just `inline`.
+///
+/// # Motivation
+///
+/// Why not use slice equality instead? Well, slice equality usually results in
+/// a call out to the current platform's `libc` which might not be inlineable
+/// or have other overhead. This routine isn't guaranteed to be a win, but it
+/// might be in some cases.
+///
+/// # Safety
+///
+/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
+/// * Both `x` and `y` must point to an initialized value.
+/// * Both `x` and `y` must each point to an allocated object and
+/// must either be in bounds or at most one byte past the end of the
+/// allocated object. `x` and `y` do not need to point to the same allocated
+/// object, but they may.
+/// * Both `x` and `y` must be _derived from_ a pointer to their respective
+/// allocated objects.
+/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
+/// for `y` and `y+n`.
+/// * The distance being in bounds must not rely on "wrapping around" the
+/// address space.
+#[inline(always)]
+unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
+ // If we don't have enough bytes to do 4-byte at a time loads, then
+ // handle each possible length specially. Note that I used to have a
+ // byte-at-a-time loop here and that turned out to be quite a bit slower
+ // for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
+ if n < 4 {
+ return match n {
+ 0 => true,
+ 1 => x.read() == y.read(),
+ 2 => {
+ x.cast::<u16>().read_unaligned()
+ == y.cast::<u16>().read_unaligned()
+ }
+ // I also tried copy_nonoverlapping here and it looks like the
+ // codegen is the same.
+ 3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
+ _ => unreachable!(),
+ };
+ }
+ // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
+ // a time using unaligned loads.
+ //
+ // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
+ // that this particular version of memcmp is likely to be called with tiny
+ // needles. That means that if we do 8 byte loads, then a higher proportion
+ // of memcmp calls will use the slower variant above. With that said, this
+ // is a hypothesis and is only loosely supported by benchmarks. There's
+ // likely some improvement that could be made here. The main thing here
+ // though is to optimize for latency, not throughput.
+
+ // SAFETY: The caller is responsible for ensuring the pointers we get are
+ // valid and readable for at least `n` bytes. We also do unaligned loads,
+ // so there's no need to ensure we're aligned. (This is justified by this
+ // routine being specifically for short strings.)
+ let xend = x.add(n.wrapping_sub(4));
+ let yend = y.add(n.wrapping_sub(4));
+ while x < xend {
+ let vx = x.cast::<u32>().read_unaligned();
+ let vy = y.cast::<u32>().read_unaligned();
+ if vx != vy {
+ return false;
+ }
+ x = x.add(4);
+ y = y.add(4);
+ }
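+    // The final load below deliberately re-reads the last four bytes of each
+    // buffer, which may overlap bytes already compared in the loop above.
+    // That overlap is how lengths that aren't a multiple of 4 are handled
+    // without a byte-at-a-time tail loop.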
+ let vx = xend.cast::<u32>().read_unaligned();
+ let vy = yend.cast::<u32>().read_unaligned();
+ vx == vy
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn equals_different_lengths() {
+ assert!(!is_equal(b"", b"a"));
+ assert!(!is_equal(b"a", b""));
+ assert!(!is_equal(b"ab", b"a"));
+ assert!(!is_equal(b"a", b"ab"));
+ }
+
+ #[test]
+ fn equals_mismatch() {
+ let one_mismatch = [
+ (&b"a"[..], &b"x"[..]),
+ (&b"ab"[..], &b"ax"[..]),
+ (&b"abc"[..], &b"abx"[..]),
+ (&b"abcd"[..], &b"abcx"[..]),
+ (&b"abcde"[..], &b"abcdx"[..]),
+ (&b"abcdef"[..], &b"abcdex"[..]),
+ (&b"abcdefg"[..], &b"abcdefx"[..]),
+ (&b"abcdefgh"[..], &b"abcdefgx"[..]),
+ (&b"abcdefghi"[..], &b"abcdefghx"[..]),
+ (&b"abcdefghij"[..], &b"abcdefghix"[..]),
+ (&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
+ (&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
+ (&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
+ (&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
+ ];
+ for (x, y) in one_mismatch {
+ assert_eq!(x.len(), y.len(), "lengths should match");
+ assert!(!is_equal(x, y));
+ assert!(!is_equal(y, x));
+ }
+ }
+
+ #[test]
+ fn equals_yes() {
+ assert!(is_equal(b"", b""));
+ assert!(is_equal(b"a", b"a"));
+ assert!(is_equal(b"ab", b"ab"));
+ assert!(is_equal(b"abc", b"abc"));
+ assert!(is_equal(b"abcd", b"abcd"));
+ assert!(is_equal(b"abcde", b"abcde"));
+ assert!(is_equal(b"abcdef", b"abcdef"));
+ assert!(is_equal(b"abcdefg", b"abcdefg"));
+ assert!(is_equal(b"abcdefgh", b"abcdefgh"));
+ assert!(is_equal(b"abcdefghi", b"abcdefghi"));
+ }
+
+ #[test]
+ fn prefix() {
+ assert!(is_prefix(b"", b""));
+ assert!(is_prefix(b"a", b""));
+ assert!(is_prefix(b"ab", b""));
+ assert!(is_prefix(b"foo", b"foo"));
+ assert!(is_prefix(b"foobar", b"foo"));
+
+ assert!(!is_prefix(b"foo", b"fob"));
+ assert!(!is_prefix(b"foobar", b"fob"));
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/rabinkarp.rs b/third_party/rust/aho-corasick/src/packed/rabinkarp.rs
new file mode 100644
index 0000000000..fdd8a6f0b4
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/rabinkarp.rs
@@ -0,0 +1,168 @@
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
+
+/// The type of the rolling hash used in the Rabin-Karp algorithm.
+type Hash = usize;
+
+/// The number of buckets to store our patterns in. We don't want this to be
+/// too big in order to avoid wasting memory, but we don't want it to be too
+/// small either to avoid spending too much time confirming literals.
+///
+/// The number of buckets MUST be a power of two. Otherwise, determining the
+/// bucket from a hash will slow down the code considerably. Using a power
+/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
+/// instruction.
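+///
+/// For example, with 64 buckets, `hash % NUM_BUCKETS` can be compiled down to
+/// `hash & 63`.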
+const NUM_BUCKETS: usize = 64;
+
+/// An implementation of the Rabin-Karp algorithm. The main idea of this
+/// algorithm is to maintain a rolling hash as it moves through the input, and
+/// then check whether that hash corresponds to the same hash for any of the
+/// patterns we're looking for.
+///
+/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
+/// it requires all of the patterns to be the same length, which in turn
+/// corresponds to the number of bytes to hash. We adapt this to work for
+/// multiple patterns of varying size by fixing the number of bytes to hash
+/// to be the length of the smallest pattern. We also split the patterns into
+/// several buckets to hopefully make the confirmation step faster.
+///
+/// Wikipedia has a decent explanation, if a bit heavy on the theory:
+/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
+///
+/// But ESMAJ provides something a bit more concrete:
+/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
+#[derive(Clone, Debug)]
+pub(crate) struct RabinKarp {
+ /// The patterns we're searching for.
+ patterns: Arc<Patterns>,
+ /// The order of patterns in each bucket is significant. Namely, they are
+ /// arranged such that the first one to match is the correct match. This
+ /// may not necessarily correspond to the order provided by the caller.
+ /// For example, if leftmost-longest semantics are used, then the patterns
+ /// are sorted by their length in descending order. If leftmost-first
+ /// semantics are used, then the patterns are sorted by their pattern ID
+ /// in ascending order (which corresponds to the caller's order).
+ buckets: Vec<Vec<(Hash, PatternID)>>,
+ /// The length of the hashing window. Generally, this corresponds to the
+ /// length of the smallest pattern.
+ hash_len: usize,
+ /// The factor to subtract out of a hash before updating it with a new
+ /// byte.
+ hash_2pow: usize,
+}
+
+impl RabinKarp {
+ /// Compile a new Rabin-Karp matcher from the patterns given.
+ ///
+ /// This panics if any of the patterns in the collection are empty, or if
+ /// the collection is itself empty.
+ pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
+ assert!(patterns.len() >= 1);
+ let hash_len = patterns.minimum_len();
+ assert!(hash_len >= 1);
+
+ let mut hash_2pow = 1usize;
+ for _ in 1..hash_len {
+ hash_2pow = hash_2pow.wrapping_shl(1);
+ }
+
+ let mut rk = RabinKarp {
+ patterns: Arc::clone(patterns),
+ buckets: vec![vec![]; NUM_BUCKETS],
+ hash_len,
+ hash_2pow,
+ };
+ for (id, pat) in patterns.iter() {
+ let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
+ let bucket = hash % NUM_BUCKETS;
+ rk.buckets[bucket].push((hash, id));
+ }
+ rk
+ }
+
+    /// Return the first matching pattern in the given haystack, beginning the
+ /// search at `at`.
+ pub(crate) fn find_at(
+ &self,
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Option<Match> {
+ assert_eq!(NUM_BUCKETS, self.buckets.len());
+
+ if at + self.hash_len > haystack.len() {
+ return None;
+ }
+ let mut hash = self.hash(&haystack[at..at + self.hash_len]);
+ loop {
+ let bucket = &self.buckets[hash % NUM_BUCKETS];
+ for &(phash, pid) in bucket {
+ if phash == hash {
+ if let Some(c) = self.verify(pid, haystack, at) {
+ return Some(c);
+ }
+ }
+ }
+ if at + self.hash_len >= haystack.len() {
+ return None;
+ }
+ hash = self.update_hash(
+ hash,
+ haystack[at],
+ haystack[at + self.hash_len],
+ );
+ at += 1;
+ }
+ }
+
+ /// Returns the approximate total amount of heap used by this searcher, in
+ /// units of bytes.
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
+ + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
+ }
+
+ /// Verify whether the pattern with the given id matches at
+ /// `haystack[at..]`.
+ ///
+ /// We tag this function as `cold` because it helps improve codegen.
+ /// Intuitively, it would seem like inlining it would be better. However,
+    /// the only time this is called and a match is not found is when there
+    /// is a hash collision, or when a prefix of a pattern matches but
+ /// the entire pattern doesn't match. This is hopefully fairly rare, and
+ /// if it does occur a lot, it's going to be slow no matter what we do.
+ #[cold]
+ fn verify(
+ &self,
+ id: PatternID,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<Match> {
+ let pat = self.patterns.get(id);
+ if pat.is_prefix(&haystack[at..]) {
+ Some(Match::new(id, at..at + pat.len()))
+ } else {
+ None
+ }
+ }
+
+ /// Hash the given bytes.
+ fn hash(&self, bytes: &[u8]) -> Hash {
+ assert_eq!(self.hash_len, bytes.len());
+
+ let mut hash = 0usize;
+ for &b in bytes {
+ hash = hash.wrapping_shl(1).wrapping_add(b as usize);
+ }
+ hash
+ }
+
+ /// Update the hash given based on removing `old_byte` at the beginning
+ /// of some byte string, and appending `new_byte` to the end of that same
+ /// byte string.
+ fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
+ prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
+ .wrapping_shl(1)
+ .wrapping_add(new_byte as usize)
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/teddy/README.md b/third_party/rust/aho-corasick/src/packed/teddy/README.md
new file mode 100644
index 0000000000..f0928cbe5c
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/teddy/README.md
@@ -0,0 +1,386 @@
+Teddy is a SIMD accelerated multiple substring matching algorithm. The name
+and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
+project. The implementation in this repository was mostly motivated for use in
+accelerating regex searches by searching for small sets of required literals
+extracted from the regex.
+
+
+# Background
+
+The key idea of Teddy is to do *packed* substring matching. In the literature,
+packed substring matching is the idea of examining multiple bytes in a haystack
+at a time to detect matches. Implementations of, for example, memchr (which
+detects matches of a single byte) have been doing this for years. Only
+recently, with the introduction of various SIMD instructions, has this been
+extended to substring matching. The PCMPESTRI instruction (and its relatives),
+for example, implements substring matching in hardware. It is, however, limited
+to substrings of length 16 bytes or fewer, but this restriction is fine in a
+regex engine, since we rarely care about the performance difference between
+searching for a 16 byte literal and a 16 + N literal; 16 is already long
+enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs
+at least, is its latency and throughput. As a result, it is often faster to
+do substring search with a Boyer-Moore (or Two-Way) variant and a well placed
+memchr to quickly skip through the haystack.
+
+There are fewer results from the literature on packed substring matching,
+and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
+describe the use of PCMPESTRI for substring matching, but the treatment is
+mostly theoretical and hand-waves performance. There is other theoretical work
+done by Bille [3] as well.
+
+The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci
+and is generally focused on multiple pattern search. Their first paper [4a]
+introduces the concept of a fingerprint, which is computed for every block of
+N bytes in every pattern. The haystack is then scanned N bytes at a time and
+a fingerprint is computed in the same way it was computed for blocks in the
+patterns. If the fingerprint corresponds to one that was found in a pattern,
+then a verification step follows to confirm that one of the substrings with the
+corresponding fingerprint actually matches at the current location. Various
+implementation tricks are employed to make sure the fingerprint lookup is fast;
+typically by truncating the fingerprint. (This may, of course, provoke more
+steps in the verification process, so a balance must be struck.)
+
+The main downside of [4a] is that the minimum substring length is 32 bytes,
+presumably because of how the algorithm uses certain SIMD instructions. This
+essentially makes it useless for general purpose regex matching, where a small
+number of short patterns is far more likely.
+
+Faro and Kulekci published another paper [4b] that is conceptually very similar
+to [4a]. The key difference is that it uses the CRC32 instruction (introduced
+as part of SSE 4.2) to compute fingerprint values. This also enables the
+algorithm to work effectively on substrings as short as 7 bytes with 4 byte
+windows. 7 bytes is unfortunately still too long. The window could be
+technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the
+small window size ends up negating most performance benefits—and it's likely
+the common case in a general purpose regex engine.
+
+Faro and Kulekci also published [4c] that appears to be intended as a
+replacement to using PCMPESTRI. In particular, it is specifically motivated by
+the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD
+instructions that are faster. While this approach works for short substrings,
+I personally couldn't see a way to generalize it to multiple substring search.
+
+Faro and Kulekci have another paper [4d] that I haven't been able to read
+because it is behind a paywall.
+
+
+# Teddy
+
+Finally, we get to Teddy. If the above literature review is complete, then it
+appears that Teddy is a novel algorithm. More than that, in my experience, it
+completely blows away the competition for short substrings, which is exactly
+what we want in a general purpose regex engine. Again, the algorithm appears
+to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced
+late 2015, and no earlier history could be found. Therefore, tracking the exact
+provenance of the algorithm with respect to the published literature seems
+difficult.
+
+At a high level, Teddy works somewhat similarly to the fingerprint algorithms
+published by Faro and Kulekci, but Teddy does it in a way that scales a bit
+better. Namely:
+
+1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX)
+ byte chunks. 16 (or 32) is significant because it corresponds to the number
+ of bytes in a SIMD vector.
+2. Bitwise operations are performed on each chunk to discover if any region of
+ it matches a set of precomputed fingerprints from the patterns. If there are
+ matches, then a verification step is performed. In this implementation, our
+ verification step is naive. This can be improved upon.
+
+The details to make this work are quite clever. First, we must choose how to
+pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
+last N bytes of each substring, where N can be at most the length of the
+shortest substring in the set being searched. In this implementation, we use the
+first N bytes of each substring. (The tradeoffs between these choices aren't
+yet clear to me.) We then must figure out how to quickly test whether an
+occurrence of any fingerprint from the set of patterns appears in a 16 byte
+block from the haystack. To keep things simple, let's assume N = 1 and examine
+some examples to motivate the approach. Here are our patterns:
+
+```ignore
+foo
+bar
+baz
+```
+
+The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
+our 16 byte block to:
+
+```ignore
+bat cat foo bump
+xxxxxxxxxxxxxxxx
+```
+
+To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates
+a mask that allows us to quickly compute membership of a fingerprint in a 16
+byte block that also tells which pattern the fingerprint corresponds to. In
+this case, our fingerprint is a single byte, so an appropriate abstraction is
+a map from a single byte to a list of patterns that contain that fingerprint:
+
+```ignore
+f |--> foo
+b |--> bar, baz
+```
+
+Now, all we need to do is figure out how to represent this map in vector space
+and use normal SIMD operations to perform a lookup. The first simplification
+we can make is to represent our patterns as bit fields occupying a single
+byte. This is important, because a single SIMD vector can store 16 bytes.
+
+```ignore
+f |--> 00000001
+b |--> 00000010, 00000100
+```
+
+How do we perform lookup though? It turns out that SSSE3 introduced a very cool
+instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`,
+and returns a third vector `C`. All vectors are treated as 16 8-bit integers.
+`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true
+for the purposes of this algorithm. For full details, see [Intel's Intrinsics
+Guide][5_u].) This essentially lets us use the values in `B` to lookup values
+in `A`.
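+
+As a rough scalar sketch of those semantics (a model for intuition only, not
+how the implementation actually issues the instruction), with `a` and `b`
+standing in for the two vectors as 16-byte arrays:
+
+```ignore
+// Scalar model of C = PSHUFB(A, B) on 128-bit vectors: a lane of B with its
+// high bit set produces 0; otherwise its low four bits index into A.
+let mut c = [0u8; 16];
+for i in 0..16 {
+    c[i] = if b[i] & 0x80 != 0 { 0 } else { a[usize::from(b[i] & 0xF)] };
+}
+```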
+
+If we could somehow cause `B` to contain our 16 byte block from the haystack,
+and if `A` could contain our bitmasks, then we'd end up with something like
+this for `A`:
+
+```ignore
+ 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF
+A = 0 0 00000110 00000001 0
+```
+
+And if `B` contains our window from our haystack, we could use shuffle to take
+the values from `B` and use them to look up our bitsets in `A`. But of course,
+we can't do this because `A` in the above example contains 256 bytes, which
+is much larger than the size of a SIMD vector.
+
+Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
+our bitsets, we can use two masks, where one mask corresponds to the lower four
+bits of our fingerprint and the other mask corresponds to the upper four bits.
+So our map now looks like:
+
+```ignore
+'f' & 0xF = 0x6 |--> 00000001
+'f' >> 4 = 0x6 |--> 00000111
+'b' & 0xF = 0x2 |--> 00000110
+'b' >> 4 = 0x6 |--> 00000111
+```
+
+Notice that the bitsets for each nybble correspond to the union of all
+fingerprints that contain that nybble. For example, both `f` and `b` have the
+same upper 4 bits but differ on the lower 4 bits. Putting this together, we
+have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is
+our mask for the upper nybble and `B` is our 16 byte block from the haystack:
+
+```ignore
+ 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF
+A0 = 0 0 00000110 0 00000001 0
+A1 = 0 0 0 0 00000111 0
+B = b a t _ t p
+B = 0x62 0x61 0x74 0x20 0x74 0x70
+```
+
+But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits,
+and we need indexes that are at most 4 bits (corresponding to one of 16
+values). We can apply the same transformation to split `B` into lower and upper
+nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and
+`B1` corresponds to the upper nybbles:
+
+```ignore
+ b a t _ c a t _ f o o _ b u m p
+B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0
+B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7
+```
+
+And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
+`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
+
+```ignore
+ b a ... f o ... p
+ A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0]
+C0 = 00000110 0 00000001 0 0
+```
+
+And `C1 = PSHUFB(A1, B1)`:
+
+```ignore
+ b a ... f o ... p
+ A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7]
+C1 = 00000111 00000111 00000111 00000111 0
+```
+
+Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
+results all on its own. For example, `C1` claims that `b` is a fingerprint for
+the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
+for all of our patterns. But if we combined `C0` and `C1` with an `AND`
+operation:
+
+```ignore
+ b a ... f o ... p
+C = 00000110 0 00000001 0 0
+```
+
+Then we now have that `C[i]` contains a bitset corresponding to the matching
+fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that
+block.
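+
+Putting those pieces together, here is a scalar sketch of the candidate
+computation for the running example (the names and the byte-at-a-time loop are
+illustrative only; the real code does all 16 lookups at once with two PSHUFBs
+and an AND):
+
+```ignore
+// Build the nybble masks from the single byte fingerprints. `bit` is the
+// bucket bit assigned to each pattern: foo = 00000001, bar = 00000010,
+// baz = 00000100.
+let (mut a0, mut a1) = ([0u8; 16], [0u8; 16]);
+let fps = [(b'f', 0b0000_0001u8), (b'b', 0b0000_0010), (b'b', 0b0000_0100)];
+for (fp, bit) in fps {
+    a0[usize::from(fp & 0xF)] |= bit;
+    a1[usize::from(fp >> 4)] |= bit;
+}
+// Per haystack byte, the candidate bitset is the AND of both nybble lookups.
+// A non-zero result means some pattern's fingerprint may occur at that byte.
+let candidate = |b: u8| a0[usize::from(b & 0xF)] & a1[usize::from(b >> 4)];
+assert_eq!(0b0000_0110, candidate(b'b'));
+assert_eq!(0b0000_0001, candidate(b'f'));
+assert_eq!(0b0000_0000, candidate(b'o'));
+```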
+
+Once we have that, we can look for the position of the least significant bit
+in `C`. (Least significant because we only target little endian here. Thus,
+the least significant bytes correspond to bytes in our haystack at a lower
+address.) That position, modulo `8`, gives us the pattern that the fingerprint
+matches. That position, integer divided by `8`, also gives us the byte offset
+that the fingerprint occurs in inside the 16 byte haystack block. Using those
+two pieces of information, we can run a verification procedure that tries
+to match all substrings containing that fingerprint at that position in the
+haystack.
+
+
+# Implementation notes
+
+The problem with the algorithm as described above is that it uses a single byte
+for a fingerprint. This will work well if the fingerprints are rare in the
+haystack (e.g., capital letters or special characters in normal English text),
+but if the fingerprints are common, you'll wind up spending too much time in
+the verification step, which effectively negates the performance benefits of
+scanning 16 bytes at a time. Remember, the key to the performance of this
+algorithm is to do as little work as possible per 16 (or 32) bytes.
+
+This algorithm can be extrapolated in a relatively straight-forward way to use
+larger fingerprints. That is, instead of a single byte prefix, we might use a
+two, three or four byte prefix. The implementation here implements N = {1, 2, 3, 4}
+and always picks the largest N possible. The rationale is that the bigger the
+fingerprint, the fewer verification steps we'll do. Of course, if N is too
+large, then we'll end up doing too much on each step.
+
+The way to extend it is:
+
+1. Add a mask for each byte in the fingerprint. (Remember that each mask is
+ composed of two SIMD vectors.) This results in a value of `C` for each byte
+ in the fingerprint while searching.
+2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
+ so that they are aligned. Once aligned, they should all be `AND`'d together.
+ This will give you only the bitsets corresponding to the full match of the
+ fingerprint. To do this, one needs to save the last byte (for N=2) or last
+ two bytes (for N=3) from the previous iteration, and then line them up with
+ the first one or two bytes of the next iteration.
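+
+In scalar terms, the combination step for N = 2 looks roughly like this (a
+sketch only; `c0` and `c1` stand for the per-byte lookup results against the
+masks for fingerprint byte 0 and fingerprint byte 1, respectively):
+
+```ignore
+// A candidate fingerprint ending at haystack position i requires byte 0 of
+// the fingerprint to match at i - 1 and byte 1 to match at i. At a chunk
+// boundary, c0[i - 1] comes from the previous iteration's saved result.
+let candidate_at = |i: usize| c0[i - 1] & c1[i];
+// For N = 3, the same idea extends to c0[i - 2] & c1[i - 1] & c2[i].
+```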
+
+## Verification
+
+Verification generally follows the procedure outlined above. The tricky parts
+are in the right formulation of operations to get our bits out of our vectors.
+We have a limited set of operations available to us on SIMD vectors as 128-bit
+or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
+from our vectors, and then run our verification step on each of those. The
+verification step looks at the least significant bit set, and from its
+position, we can derive the byte offset and bucket. (Again, as described
+above.) Once we know the bucket, we do a fairly naive exhaustive search for
+every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
+table, but I haven't had time to thoroughly explore that. A few initial
+half-hearted attempts resulted in worse performance.)
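+
+A sketch of that scan over one extracted 64-bit lane (the names `scan_lane`
+and `chunk_start` are illustrative, not the actual routines used in the
+implementation):
+
+```ignore
+// `lane` holds 8 candidate bytes ripped out of the vector; `chunk_start` is
+// the haystack offset of the first of those bytes.
+fn scan_lane(mut lane: u64, chunk_start: usize) {
+    while lane != 0 {
+        let bit = lane.trailing_zeros() as usize;
+        let offset = chunk_start + bit / 8; // haystack byte of the candidate
+        let bucket = bit % 8;               // bucket of patterns to confirm
+        // ...exhaustively try every pattern in `bucket` at `offset`...
+        lane &= lane - 1; // clear the lowest set bit and keep scanning
+    }
+}
+```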
+
+## AVX
+
+The AVX version of Teddy extrapolates almost perfectly from the SSE version.
+The only hiccup is that PALIGNR is used to align chunks in the 128-bit version,
+and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it
+only works within 128-bit lanes. So there's a bit of tomfoolery to get around
+this by shuffling the vectors before calling VPALIGNR.
+
+The only other aspect to AVX is that since our masks are still fundamentally
+16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to
+32-byte chunks.
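+
+Concretely (a sketch only; `mask128` stands for one 16-byte nybble table), the
+duplication is just a copy of the same table into both 128-bit lanes, because
+VPSHUFB performs two independent 16-byte lookups, one per lane:
+
+```ignore
+let mask256: [u8; 32] = {
+    let mut m = [0u8; 32];
+    m[..16].copy_from_slice(&mask128);
+    m[16..].copy_from_slice(&mask128);
+    m
+};
+```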
+
+## Fat Teddy
+
+In the version of Teddy described above, 8 buckets are used to group patterns
+that we want to search for. However, when AVX is available, we can extend the
+number of buckets to 16 by permitting each byte in our masks to use 16-bits
+instead of 8-bits to represent the buckets it belongs to. (This variant is also
+in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
+time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
+What we gain, though, is (hopefully) less work in our verification routine.
+If patterns are spread out across more buckets, then there should overall
+be fewer false positives. In general, Fat Teddy permits us to grow our capacity
+a bit and search for more literals before Teddy gets overwhelmed.
+
+The tricky part of Fat Teddy is in how we adjust our masks and our verification
+procedure. For the masks, we simply represent the first 8 buckets in each of
+the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
+Then, in the search loop, instead of loading 32 bytes from the haystack, we
+load the same 16 bytes from the haystack into both the low and high 16 byte
+portions of our 256-bit vector. So for example, a mask might look like this:
+
+ bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
+ byte: 31 30 16 15 14 0
+ offset: 15 14 0 15 14 0
+ buckets: 8-15 8-15 8-15 0-7 0-7 0-7
+
+Where `byte` is the position in the vector (higher numbers corresponding to
+more significant bits), `offset` is the corresponding position in the haystack
+chunk, and `buckets` corresponds to the bucket assignments for that particular
+byte.
+
+In particular, notice that the bucket assignments for offset `0` are spread
+out between bytes `0` and `16`. This works well for the chunk-by-chunk search
+procedure, but verification really wants to process all bucket assignments for
+each offset at once. Otherwise, we might wind up finding a match at offset
+`1` in one of the first 8 buckets, when we really should have reported a match
+at offset `0` in one of the second 8 buckets. (Because we want the leftmost
+match.)
+
+Thus, for verification, we rearrange the above vector such that it is a
+sequence of 16-bit integers, where the least significant 16-bit integer
+corresponds to all of the bucket assignments for offset `0`. So with the
+above vector, the least significant 16-bit integer would be
+
+    11000000 00000000
+
+which was taken from bytes `16` and `0`. Then the verification step pretty much
+runs as described, except with 16 buckets instead of 8.
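+
+In scalar terms, that rearrangement interleaves the two halves of the vector
+(a sketch only; `v` stands for the 32 candidate bytes, where the low half
+holds buckets 0-7 and the high half holds buckets 8-15):
+
+```ignore
+// Produce one u16 of bucket assignments per haystack offset 0..16.
+let assignments: [u16; 16] = core::array::from_fn(|offset| {
+    u16::from(v[offset]) | (u16::from(v[offset + 16]) << 8)
+});
+```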
+
+
+# References
+
+- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan),
+ [webpage](https://www.hyperscan.io/)
+- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
+ & Weimann, O. (2011).
+ _Optimal packed string matching_.
+ In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
+ Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
+ DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
+ [PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
+- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
+ & Weimann, O. (2014).
+ _Towards optimal packed string matching_.
+ Theoretical Computer Science, 525, 111-129.
+ DOI: 10.1016/j.tcs.2013.06.013.
+ [PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
+- **[3]** Bille, P. (2011).
+ _Fast searching in packed strings_.
+ Journal of Discrete Algorithms, 9(1), 49-56.
+ DOI: 10.1016/j.jda.2010.09.003.
+ [PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353).
+- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
+ _Fast multiple string matching using streaming SIMD extensions technology_.
+ In String Processing and Information Retrieval (pp. 217-228).
+ Springer Berlin Heidelberg.
+ DOI: 10.1007/978-3-642-34109-0_23.
+ [PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf).
+- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
+ _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
+ In Stringology (pp. 78-91).
+ [PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf).
+- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
+ _Fast packed string matching for short patterns_.
+  In Proceedings of the Meeting on Algorithm Engineering & Experiments
+ (pp. 113-121).
+ Society for Industrial and Applied Mathematics.
+ [PDF](https://arxiv.org/pdf/1209.6449.pdf).
+- **[4d]** Faro, S., & Külekci, M. O. (2014).
+ _Fast and flexible packed string matching_.
+ Journal of Discrete Algorithms, 28, 61-72.
+ DOI: 10.1016/j.jda.2014.07.003.
+
+[1_u]: https://github.com/intel/hyperscan
+[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
diff --git a/third_party/rust/aho-corasick/src/packed/teddy/builder.rs b/third_party/rust/aho-corasick/src/packed/teddy/builder.rs
new file mode 100644
index 0000000000..be91777beb
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/teddy/builder.rs
@@ -0,0 +1,780 @@
+use core::{
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+};
+
+use alloc::sync::Arc;
+
+use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match};
+
+/// A builder for constructing a Teddy matcher.
+///
+/// The builder primarily permits fine grained configuration of the Teddy
+/// matcher. Most options are made only available for testing/benchmarking
+/// purposes. In reality, options are automatically determined by the nature
+/// and number of patterns given to the builder.
+#[derive(Clone, Debug)]
+pub(crate) struct Builder {
+ /// When none, this is automatically determined. Otherwise, `false` means
+ /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
+ /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
+ /// available and Fat Teddy was requested, no matcher will be built.
+ only_fat: Option<bool>,
+ /// When none, this is automatically determined. Otherwise, `false` means
+    /// that 128-bit vectors will be used (up to SSSE3 instructions) whereas
+ /// `true` means that 256-bit vectors will be used. As with `fat`, if
+ /// 256-bit vectors are requested and they aren't available, then a
+ /// searcher will not be built.
+ only_256bit: Option<bool>,
+ /// When true (the default), the number of patterns will be used as a
+ /// heuristic for refusing construction of a Teddy searcher. The point here
+ /// is that too many patterns can overwhelm Teddy. But this can be disabled
+ /// in cases where the caller knows better.
+ heuristic_pattern_limits: bool,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+impl Builder {
+ /// Create a new builder for configuring a Teddy matcher.
+ pub(crate) fn new() -> Builder {
+ Builder {
+ only_fat: None,
+ only_256bit: None,
+ heuristic_pattern_limits: true,
+ }
+ }
+
+ /// Build a matcher for the set of patterns given. If a matcher could not
+ /// be built, then `None` is returned.
+ ///
+    /// Generally, a matcher isn't built if the necessary CPU features aren't
+    /// available, if the target isn't supported, or if the searcher is
+    /// believed to be slower than standard techniques (i.e., if there are too
+    /// many literals).
+ pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
+ self.build_imp(patterns)
+ }
+
+ /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
+    /// 16 buckets whereas Slim Teddy uses 8 buckets. More buckets are useful
+ /// for a larger set of literals.
+ ///
+ /// `None` is the default, which results in an automatic selection based
+ /// on the number of literals and available CPU features.
+ pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder {
+ self.only_fat = yes;
+ self
+ }
+
+ /// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
+ /// Generally, a larger vector size is better since it either permits
+ /// matching more patterns or matching more bytes in the haystack at once.
+ ///
+ /// `None` is the default, which results in an automatic selection based on
+ /// the number of literals and available CPU features.
+ pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder {
+ self.only_256bit = yes;
+ self
+ }
+
+ /// Request that heuristic limitations on the number of patterns be
+    /// employed. This is useful to disable for benchmarking, where one wants
+    /// to explore how Teddy performs on a large number of patterns even if the
+ /// heuristics would otherwise refuse construction.
+ ///
+ /// This is enabled by default.
+ pub(crate) fn heuristic_pattern_limits(
+ &mut self,
+ yes: bool,
+ ) -> &mut Builder {
+ self.heuristic_pattern_limits = yes;
+ self
+ }
+
+ fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
+ let patlimit = self.heuristic_pattern_limits;
+ // There's no particular reason why we limit ourselves to little endian
+ // here, but it seems likely that some parts of Teddy as they are
+ // currently written (e.g., the uses of `trailing_zeros`) are likely
+ // wrong on non-little-endian targets. Such things are likely easy to
+ // fix, but at the time of writing (2023/09/18), I actually do not know
+ // how to test this code on a big-endian target. So for now, we're
+ // conservative and just bail out.
+ if !cfg!(target_endian = "little") {
+ debug!("skipping Teddy because target isn't little endian");
+ return None;
+ }
+ // Too many patterns will overwhelm Teddy and likely lead to slow
+ // downs, typically in the verification step.
+ if patlimit && patterns.len() > 64 {
+ debug!("skipping Teddy because of too many patterns");
+ return None;
+ }
+
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ {
+ use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3};
+
+ let mask_len = core::cmp::min(4, patterns.minimum_len());
+ let beefy = patterns.len() > 32;
+ let has_avx2 = self::x86_64::is_available_avx2();
+ let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3();
+ let use_avx2 = if self.only_256bit == Some(true) {
+ if !has_avx2 {
+ debug!(
+ "skipping Teddy because avx2 was demanded but unavailable"
+ );
+ return None;
+ }
+ true
+ } else if self.only_256bit == Some(false) {
+ if !has_ssse3 {
+ debug!(
+ "skipping Teddy because ssse3 was demanded but unavailable"
+ );
+ return None;
+ }
+ false
+ } else if !has_ssse3 && !has_avx2 {
+ debug!(
+ "skipping Teddy because ssse3 and avx2 are unavailable"
+ );
+ return None;
+ } else {
+ has_avx2
+ };
+ let fat = match self.only_fat {
+ None => use_avx2 && beefy,
+ Some(false) => false,
+ Some(true) if !use_avx2 => {
+ debug!(
+ "skipping Teddy because fat was demanded, but fat \
+ Teddy requires avx2 which is unavailable"
+ );
+ return None;
+ }
+ Some(true) => true,
+ };
+ // Just like for aarch64, it's possible that too many patterns will
+            // overwhelm Teddy. Unlike aarch64 though, we have Fat Teddy which
+ // helps things scale a bit more by spreading patterns over more
+ // buckets.
+ //
+ // These thresholds were determined by looking at the measurements
+ // for the rust/aho-corasick/packed/leftmost-first and
+ // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
+ // benchmarks.
+ if patlimit && mask_len == 1 && patterns.len() > 16 {
+ debug!(
+ "skipping Teddy (mask len: 1) because there are \
+ too many patterns",
+ );
+ return None;
+ }
+ match (mask_len, use_avx2, fat) {
+ (1, false, _) => {
+ debug!("Teddy choice: 128-bit slim, 1 byte");
+ SlimSSSE3::<1>::new(&patterns)
+ }
+ (1, true, false) => {
+ debug!("Teddy choice: 256-bit slim, 1 byte");
+ SlimAVX2::<1>::new(&patterns)
+ }
+ (1, true, true) => {
+ debug!("Teddy choice: 256-bit fat, 1 byte");
+ FatAVX2::<1>::new(&patterns)
+ }
+ (2, false, _) => {
+ debug!("Teddy choice: 128-bit slim, 2 bytes");
+ SlimSSSE3::<2>::new(&patterns)
+ }
+ (2, true, false) => {
+ debug!("Teddy choice: 256-bit slim, 2 bytes");
+ SlimAVX2::<2>::new(&patterns)
+ }
+ (2, true, true) => {
+ debug!("Teddy choice: 256-bit fat, 2 bytes");
+ FatAVX2::<2>::new(&patterns)
+ }
+ (3, false, _) => {
+ debug!("Teddy choice: 128-bit slim, 3 bytes");
+ SlimSSSE3::<3>::new(&patterns)
+ }
+ (3, true, false) => {
+ debug!("Teddy choice: 256-bit slim, 3 bytes");
+ SlimAVX2::<3>::new(&patterns)
+ }
+ (3, true, true) => {
+ debug!("Teddy choice: 256-bit fat, 3 bytes");
+ FatAVX2::<3>::new(&patterns)
+ }
+ (4, false, _) => {
+ debug!("Teddy choice: 128-bit slim, 4 bytes");
+ SlimSSSE3::<4>::new(&patterns)
+ }
+ (4, true, false) => {
+ debug!("Teddy choice: 256-bit slim, 4 bytes");
+ SlimAVX2::<4>::new(&patterns)
+ }
+ (4, true, true) => {
+ debug!("Teddy choice: 256-bit fat, 4 bytes");
+ FatAVX2::<4>::new(&patterns)
+ }
+ _ => {
+ debug!("no supported Teddy configuration found");
+ None
+ }
+ }
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ use self::aarch64::SlimNeon;
+
+ let mask_len = core::cmp::min(4, patterns.minimum_len());
+ if self.only_256bit == Some(true) {
+ debug!(
+ "skipping Teddy because 256-bits were demanded \
+ but unavailable"
+ );
+ return None;
+ }
+ if self.only_fat == Some(true) {
+ debug!(
+ "skipping Teddy because fat was demanded but unavailable"
+                );
+                return None;
+            }
+ // Since we don't have Fat teddy in aarch64 (I think we'd want at
+ // least 256-bit vectors for that), we need to be careful not to
+ // allow too many patterns as it might overwhelm Teddy. Generally
+ // speaking, as the mask length goes up, the more patterns we can
+ // handle because the mask length results in fewer candidates
+ // generated.
+ //
+ // These thresholds were determined by looking at the measurements
+ // for the rust/aho-corasick/packed/leftmost-first and
+ // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
+ // benchmarks.
+ match mask_len {
+ 1 => {
+ if patlimit && patterns.len() > 16 {
+ debug!(
+ "skipping Teddy (mask len: 1) because there are \
+ too many patterns",
+                        );
+                        return None;
+                    }
+ debug!("Teddy choice: 128-bit slim, 1 byte");
+ SlimNeon::<1>::new(&patterns)
+ }
+ 2 => {
+ if patlimit && patterns.len() > 32 {
+ debug!(
+ "skipping Teddy (mask len: 2) because there are \
+ too many patterns",
+                        );
+                        return None;
+                    }
+ debug!("Teddy choice: 128-bit slim, 2 bytes");
+ SlimNeon::<2>::new(&patterns)
+ }
+ 3 => {
+ if patlimit && patterns.len() > 48 {
+ debug!(
+ "skipping Teddy (mask len: 3) because there are \
+ too many patterns",
+                        );
+                        return None;
+                    }
+ debug!("Teddy choice: 128-bit slim, 3 bytes");
+ SlimNeon::<3>::new(&patterns)
+ }
+ 4 => {
+ debug!("Teddy choice: 128-bit slim, 4 bytes");
+ SlimNeon::<4>::new(&patterns)
+ }
+ _ => {
+ debug!("no supported Teddy configuration found");
+ None
+ }
+ }
+ }
+ #[cfg(not(any(
+ all(target_arch = "x86_64", target_feature = "sse2"),
+ target_arch = "aarch64"
+ )))]
+ {
+ None
+ }
+ }
+}
+
+/// A searcher that dispatches to one of several possible Teddy variants.
+#[derive(Clone, Debug)]
+pub(crate) struct Searcher {
+ /// The Teddy variant we use. We use dynamic dispatch under the theory that
+    /// it results in better codegen than an enum, although this is a specious
+ /// claim.
+ ///
+ /// This `Searcher` is essentially a wrapper for a `SearcherT` trait
+ /// object. We just make `memory_usage` and `minimum_len` available without
+ /// going through dynamic dispatch.
+ imp: Arc<dyn SearcherT>,
+ /// Total heap memory used by the Teddy variant.
+ memory_usage: usize,
+ /// The minimum haystack length this searcher can handle. It is intended
+ /// for callers to use some other search routine (such as Rabin-Karp) in
+    /// cases where the haystack (or remainder of the haystack) is too short.
+ minimum_len: usize,
+}
+
+impl Searcher {
+ /// Look for the leftmost occurrence of any pattern in this search in the
+ /// given haystack starting at the given position.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `haystack[at..].len()` is less than the minimum length
+    /// for this searcher.
+ #[inline(always)]
+ pub(crate) fn find(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<crate::Match> {
+ // SAFETY: The Teddy implementations all require a minimum haystack
+ // length, and this is required for safety. Therefore, we assert it
+ // here in order to make this method sound.
+ assert!(haystack[at..].len() >= self.minimum_len);
+ let hayptr = haystack.as_ptr();
+ // SAFETY: Construction of the searcher guarantees that we are able
+ // to run it in the current environment (i.e., we won't get an AVX2
+ // searcher on a x86-64 CPU without AVX2 support). Also, the pointers
+ // are valid as they are derived directly from a borrowed slice.
+ let teddym = unsafe {
+ self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))?
+ };
+ let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize());
+ let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize());
+ let span = crate::Span { start, end };
+ // OK because we won't permit the construction of a searcher that
+ // could report a pattern ID bigger than what can fit in the crate-wide
+ // PatternID type.
+ let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize());
+ let m = crate::Match::new(pid, span);
+ Some(m)
+ }
+
+ /// Returns the approximate total amount of heap used by this type, in
+ /// units of bytes.
+ #[inline(always)]
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.memory_usage
+ }
+
+ /// Returns the minimum length, in bytes, that a haystack must be in order
+ /// to use it with this searcher.
+ #[inline(always)]
+ pub(crate) fn minimum_len(&self) -> usize {
+ self.minimum_len
+ }
+}
+
+/// A trait that provides dynamic dispatch over the different possible Teddy
+/// variants on the same algorithm.
+///
+/// On `x86_64` for example, it isn't known until runtime which of 12 possible
+/// variants will be used. One might use one of the four slim 128-bit vector
+/// variants, or one of the four 256-bit vector variants or even one of the
+/// four fat 256-bit vector variants.
+///
+/// Since this choice is generally made when the Teddy searcher is constructed
+/// and this choice is based on the patterns given and what the current CPU
+/// supports, it follows that there must be some kind of indirection at search
+/// time that "selects" the variant chosen at build time.
+///
+/// There are a few different ways to go about this. One approach is to use an
+/// enum. It works fine, but in my experiments, this generally results in worse
+/// codegen. Another approach, which is what we use here, is dynamic dispatch
+/// via a trait object. We basically implement this trait for each possible
+/// variant, select the variant we want at build time and convert it to a
+/// trait object for use at search time.
+///
+/// Another approach is to use function pointers and stick each of the possible
+/// variants into a union. This is essentially isomorphic to the dynamic
+/// dispatch approach, but doesn't require any allocations. Since this crate
+/// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The
+/// `memchr` crate does this.)
+trait SearcherT:
+ Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
+{
+ /// Execute a search on the given haystack (identified by `start` and `end`
+ /// raw pointers).
+ ///
+ /// # Safety
+ ///
+ /// Essentially, the `start` and `end` pointers must be valid and point
+ /// to a haystack one can read. As long as you derive them from, for
+ /// example, a `&[u8]`, they should automatically satisfy all of the safety
+ /// obligations:
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ /// * It must be the case that `start <= end`.
+ /// * `end - start` must be greater than the minimum length for this
+ /// searcher.
+ ///
+ /// Also, it is expected that implementations of this trait will tag this
+ /// method with a `target_feature` attribute. Callers must ensure that
+ /// they are executing this method in an environment where that attribute
+ /// is valid.
+ unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>;
+}
+
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+mod x86_64 {
+ use core::arch::x86_64::{__m128i, __m256i};
+
+ use alloc::sync::Arc;
+
+ use crate::packed::{
+ ext::Pointer,
+ pattern::Patterns,
+ teddy::generic::{self, Match},
+ };
+
+ use super::{Searcher, SearcherT};
+
+ #[derive(Clone, Debug)]
+ pub(super) struct SlimSSSE3<const BYTES: usize> {
+ slim128: generic::Slim<__m128i, BYTES>,
+ }
+
+ // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes.
+ macro_rules! slim_ssse3 {
+ ($len:expr) => {
+ impl SlimSSSE3<$len> {
+ /// Creates a new searcher using "slim" Teddy with 128-bit
+ /// vectors. If SSSE3 is not available in the current
+ /// environment, then this returns `None`.
+ pub(super) fn new(
+ patterns: &Arc<Patterns>,
+ ) -> Option<Searcher> {
+ if !is_available_ssse3() {
+ return None;
+ }
+ Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) })
+ }
+
+                /// Creates a new searcher using "slim" Teddy with 128-bit
+ /// vectors without checking whether SSSE3 is available or not.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that SSSE3 is available in the current
+ /// environment.
+ #[target_feature(enable = "ssse3")]
+ unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
+ let slim128 = generic::Slim::<__m128i, $len>::new(
+ Arc::clone(patterns),
+ );
+ let memory_usage = slim128.memory_usage();
+ let minimum_len = slim128.minimum_len();
+ let imp = Arc::new(SlimSSSE3 { slim128 });
+ Searcher { imp, memory_usage, minimum_len }
+ }
+ }
+
+ impl SearcherT for SlimSSSE3<$len> {
+ #[target_feature(enable = "ssse3")]
+ #[inline]
+ unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ // SAFETY: All obligations except for `target_feature` are
+ // passed to the caller. Our use of `target_feature` is
+ // safe because construction of this type requires that the
+ // requisite target features are available.
+ self.slim128.find(start, end)
+ }
+ }
+ };
+ }
+
+ slim_ssse3!(1);
+ slim_ssse3!(2);
+ slim_ssse3!(3);
+ slim_ssse3!(4);
+
+ #[derive(Clone, Debug)]
+ pub(super) struct SlimAVX2<const BYTES: usize> {
+ slim128: generic::Slim<__m128i, BYTES>,
+ slim256: generic::Slim<__m256i, BYTES>,
+ }
+
+ // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
+ macro_rules! slim_avx2 {
+ ($len:expr) => {
+ impl SlimAVX2<$len> {
+ /// Creates a new searcher using "slim" Teddy with 256-bit
+ /// vectors. If AVX2 is not available in the current
+ /// environment, then this returns `None`.
+ pub(super) fn new(
+ patterns: &Arc<Patterns>,
+ ) -> Option<Searcher> {
+ if !is_available_avx2() {
+ return None;
+ }
+ Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) })
+ }
+
+ /// Creates a new searcher using "slim" Teddy with 256-bit
+ /// vectors without checking whether AVX2 is available or not.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that AVX2 is available in the current
+ /// environment.
+ #[target_feature(enable = "avx2")]
+ unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
+ let slim128 = generic::Slim::<__m128i, $len>::new(
+ Arc::clone(&patterns),
+ );
+ let slim256 = generic::Slim::<__m256i, $len>::new(
+ Arc::clone(&patterns),
+ );
+ let memory_usage =
+ slim128.memory_usage() + slim256.memory_usage();
+ let minimum_len = slim128.minimum_len();
+ let imp = Arc::new(SlimAVX2 { slim128, slim256 });
+ Searcher { imp, memory_usage, minimum_len }
+ }
+ }
+
+ impl SearcherT for SlimAVX2<$len> {
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ // SAFETY: All obligations except for `target_feature` are
+ // passed to the caller. Our use of `target_feature` is
+ // safe because construction of this type requires that the
+ // requisite target features are available.
+ let len = end.distance(start);
+ if len < self.slim256.minimum_len() {
+ self.slim128.find(start, end)
+ } else {
+ self.slim256.find(start, end)
+ }
+ }
+ }
+ };
+ }
+
+ slim_avx2!(1);
+ slim_avx2!(2);
+ slim_avx2!(3);
+ slim_avx2!(4);
+
+ #[derive(Clone, Debug)]
+ pub(super) struct FatAVX2<const BYTES: usize> {
+ fat256: generic::Fat<__m256i, BYTES>,
+ }
+
+    // Defines FatAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
+ macro_rules! fat_avx2 {
+ ($len:expr) => {
+ impl FatAVX2<$len> {
+                /// Creates a new searcher using "fat" Teddy with 256-bit
+ /// vectors. If AVX2 is not available in the current
+ /// environment, then this returns `None`.
+ pub(super) fn new(
+ patterns: &Arc<Patterns>,
+ ) -> Option<Searcher> {
+ if !is_available_avx2() {
+ return None;
+ }
+ Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) })
+ }
+
+                /// Creates a new searcher using "fat" Teddy with 256-bit
+ /// vectors without checking whether AVX2 is available or not.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that AVX2 is available in the current
+ /// environment.
+ #[target_feature(enable = "avx2")]
+ unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
+ let fat256 = generic::Fat::<__m256i, $len>::new(
+ Arc::clone(&patterns),
+ );
+ let memory_usage = fat256.memory_usage();
+ let minimum_len = fat256.minimum_len();
+ let imp = Arc::new(FatAVX2 { fat256 });
+ Searcher { imp, memory_usage, minimum_len }
+ }
+ }
+
+ impl SearcherT for FatAVX2<$len> {
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ // SAFETY: All obligations except for `target_feature` are
+ // passed to the caller. Our use of `target_feature` is
+ // safe because construction of this type requires that the
+ // requisite target features are available.
+ self.fat256.find(start, end)
+ }
+ }
+ };
+ }
+
+ fat_avx2!(1);
+ fat_avx2!(2);
+ fat_avx2!(3);
+ fat_avx2!(4);
+
+ #[inline]
+ pub(super) fn is_available_ssse3() -> bool {
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ false
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ #[cfg(target_feature = "ssse3")]
+ {
+ true
+ }
+ #[cfg(not(target_feature = "ssse3"))]
+ {
+ #[cfg(feature = "std")]
+ {
+ std::is_x86_feature_detected!("ssse3")
+ }
+ #[cfg(not(feature = "std"))]
+ {
+ false
+ }
+ }
+ }
+ }
+
+ #[inline]
+ pub(super) fn is_available_avx2() -> bool {
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ false
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ #[cfg(target_feature = "avx2")]
+ {
+ true
+ }
+ #[cfg(not(target_feature = "avx2"))]
+ {
+ #[cfg(feature = "std")]
+ {
+ std::is_x86_feature_detected!("avx2")
+ }
+ #[cfg(not(feature = "std"))]
+ {
+ false
+ }
+ }
+ }
+ }
+}
+
+#[cfg(target_arch = "aarch64")]
+mod aarch64 {
+ use core::arch::aarch64::uint8x16_t;
+
+ use alloc::sync::Arc;
+
+ use crate::packed::{
+ pattern::Patterns,
+ teddy::generic::{self, Match},
+ };
+
+ use super::{Searcher, SearcherT};
+
+ #[derive(Clone, Debug)]
+ pub(super) struct SlimNeon<const BYTES: usize> {
+ slim128: generic::Slim<uint8x16_t, BYTES>,
+ }
+
+    // Defines SlimNeon wrapper functions for 1, 2, 3 and 4 bytes.
+ macro_rules! slim_neon {
+ ($len:expr) => {
+ impl SlimNeon<$len> {
+ /// Creates a new searcher using "slim" Teddy with 128-bit
+                /// vectors. Since NEON is always available on aarch64, this
+                /// always returns a searcher.
+ pub(super) fn new(
+ patterns: &Arc<Patterns>,
+ ) -> Option<Searcher> {
+ Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) })
+ }
+
+                /// Creates a new searcher using "slim" Teddy with 128-bit
+                /// vectors without checking whether NEON is available or not.
+ ///
+ /// # Safety
+ ///
+                /// Callers must ensure that NEON is available in the current
+ /// environment.
+ #[target_feature(enable = "neon")]
+ unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
+ let slim128 = generic::Slim::<uint8x16_t, $len>::new(
+ Arc::clone(patterns),
+ );
+ let memory_usage = slim128.memory_usage();
+ let minimum_len = slim128.minimum_len();
+ let imp = Arc::new(SlimNeon { slim128 });
+ Searcher { imp, memory_usage, minimum_len }
+ }
+ }
+
+ impl SearcherT for SlimNeon<$len> {
+ #[target_feature(enable = "neon")]
+ #[inline]
+ unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ // SAFETY: All obligations except for `target_feature` are
+ // passed to the caller. Our use of `target_feature` is
+ // safe because construction of this type requires that the
+ // requisite target features are available.
+ self.slim128.find(start, end)
+ }
+ }
+ };
+ }
+
+ slim_neon!(1);
+ slim_neon!(2);
+ slim_neon!(3);
+ slim_neon!(4);
+}
diff --git a/third_party/rust/aho-corasick/src/packed/teddy/generic.rs b/third_party/rust/aho-corasick/src/packed/teddy/generic.rs
new file mode 100644
index 0000000000..2aacd00357
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/teddy/generic.rs
@@ -0,0 +1,1382 @@
+use core::fmt::Debug;
+
+use alloc::{
+ boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec,
+};
+
+use crate::{
+ packed::{
+ ext::Pointer,
+ pattern::Patterns,
+ vector::{FatVector, Vector},
+ },
+ util::int::U32,
+ PatternID,
+};
+
+/// A match type specialized to the Teddy implementations below.
+///
+/// Essentially, instead of representing a match at byte offsets, we use
+/// raw pointers. This is because the implementations below operate on raw
+/// pointers, and so this is a more natural return type based on how the
+/// implementation works.
+///
+/// Also, the `PatternID` used here is a `u16`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Match {
+ pid: PatternID,
+ start: *const u8,
+ end: *const u8,
+}
+
+impl Match {
+ /// Returns the ID of the pattern that matched.
+ pub(crate) fn pattern(&self) -> PatternID {
+ self.pid
+ }
+
+ /// Returns a pointer into the haystack at which the match starts.
+ pub(crate) fn start(&self) -> *const u8 {
+ self.start
+ }
+
+ /// Returns a pointer into the haystack at which the match ends.
+ pub(crate) fn end(&self) -> *const u8 {
+ self.end
+ }
+}
+
+/// A "slim" Teddy implementation that is generic over both the vector type
+/// and the minimum length of the patterns being searched for.
+///
+/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths.
+#[derive(Clone, Debug)]
+pub(crate) struct Slim<V, const BYTES: usize> {
+ /// A generic data structure for doing "slim" Teddy verification.
+ teddy: Teddy<8>,
+ /// The masks used as inputs to the shuffle operation to generate
+ /// candidates (which are fed into the verification routines).
+ masks: [Mask<V>; BYTES],
+}
+
+impl<V: Vector, const BYTES: usize> Slim<V, BYTES> {
+ /// Create a new "slim" Teddy searcher for the given patterns.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `BYTES` is any value other than 1, 2, 3 or 4.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> {
+ assert!(
+ 1 <= BYTES && BYTES <= 4,
+ "only 1, 2, 3 or 4 bytes are supported"
+ );
+ let teddy = Teddy::new(patterns);
+ let masks = SlimMaskBuilder::from_teddy(&teddy);
+ Slim { teddy, masks }
+ }
+
+ /// Returns the approximate total amount of heap used by this type, in
+ /// units of bytes.
+ #[inline(always)]
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.teddy.memory_usage()
+ }
+
+ /// Returns the minimum length, in bytes, that a haystack must be in order
+ /// to use it with this searcher.
+ #[inline(always)]
+ pub(crate) fn minimum_len(&self) -> usize {
+ V::BYTES + (BYTES - 1)
+ }
+}
+
+impl<V: Vector> Slim<V, 1> {
+    /// Look for an occurrence of the patterns in this finder in the haystack
+ /// given by the `start` and `end` pointers.
+ ///
+ /// If no match could be found, then `None` is returned.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers representing the haystack must be valid to read
+ /// from. They must also point to a region of memory that is at least the
+ /// minimum length required by this searcher.
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start;
+ while cur <= end.sub(V::BYTES) {
+ if let Some(m) = self.find_one(cur, end) {
+ return Some(m);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::BYTES);
+ if let Some(m) = self.find_one(cur, end) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+    /// Look for a match in the `V::BYTES` bytes at and after `cur`. If
+ /// there isn't one, then `None` is returned.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers representing the haystack must be valid to read
+ /// from. They must also point to a region of memory that is at least the
+ /// minimum length required by this searcher.
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let c = self.candidate(cur);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur, end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+    /// Look for a candidate match (represented as a vector) in the `V::BYTES`
+    /// bytes at and after `cur`. If there isn't one, then a vector with
+ /// all bits set to zero is returned.
+ ///
+ /// # Safety
+ ///
+ /// The given pointer representing the haystack must be valid to read
+ /// from.
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn candidate(&self, cur: *const u8) -> V {
+ let chunk = V::load_unaligned(cur);
+ Mask::members1(chunk, self.masks)
+ }
+}
+
+impl<V: Vector> Slim<V, 2> {
+ /// See Slim<V, 1>::find.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start.add(1);
+ let mut prev0 = V::splat(0xFF);
+ while cur <= end.sub(V::BYTES) {
+ if let Some(m) = self.find_one(cur, end, &mut prev0) {
+ return Some(m);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::BYTES);
+ prev0 = V::splat(0xFF);
+ if let Some(m) = self.find_one(cur, end, &mut prev0) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See Slim<V, 1>::find_one.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ prev0: &mut V,
+ ) -> Option<Match> {
+ let c = self.candidate(cur, prev0);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur.sub(1), end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See Slim<V, 1>::candidate.
+ #[inline(always)]
+ unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V {
+ let chunk = V::load_unaligned(cur);
+ let (res0, res1) = Mask::members2(chunk, self.masks);
+ let res0prev0 = res0.shift_in_one_byte(*prev0);
+ let res = res0prev0.and(res1);
+ *prev0 = res0;
+ res
+ }
+}
+
+impl<V: Vector> Slim<V, 3> {
+ /// See Slim<V, 1>::find.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start.add(2);
+ let mut prev0 = V::splat(0xFF);
+ let mut prev1 = V::splat(0xFF);
+ while cur <= end.sub(V::BYTES) {
+ if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
+ return Some(m);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::BYTES);
+ prev0 = V::splat(0xFF);
+ prev1 = V::splat(0xFF);
+ if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See Slim<V, 1>::find_one.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ ) -> Option<Match> {
+ let c = self.candidate(cur, prev0, prev1);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur.sub(2), end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See Slim<V, 1>::candidate.
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ cur: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ ) -> V {
+ let chunk = V::load_unaligned(cur);
+ let (res0, res1, res2) = Mask::members3(chunk, self.masks);
+ let res0prev0 = res0.shift_in_two_bytes(*prev0);
+ let res1prev1 = res1.shift_in_one_byte(*prev1);
+ let res = res0prev0.and(res1prev1).and(res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+impl<V: Vector> Slim<V, 4> {
+ /// See Slim<V, 1>::find.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start.add(3);
+ let mut prev0 = V::splat(0xFF);
+ let mut prev1 = V::splat(0xFF);
+ let mut prev2 = V::splat(0xFF);
+ while cur <= end.sub(V::BYTES) {
+ if let Some(m) =
+ self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
+ {
+ return Some(m);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::BYTES);
+ prev0 = V::splat(0xFF);
+ prev1 = V::splat(0xFF);
+ prev2 = V::splat(0xFF);
+ if let Some(m) =
+ self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
+ {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See Slim<V, 1>::find_one.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ prev2: &mut V,
+ ) -> Option<Match> {
+ let c = self.candidate(cur, prev0, prev1, prev2);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur.sub(3), end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See Slim<V, 1>::candidate.
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ cur: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ prev2: &mut V,
+ ) -> V {
+ let chunk = V::load_unaligned(cur);
+ let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks);
+ let res0prev0 = res0.shift_in_three_bytes(*prev0);
+ let res1prev1 = res1.shift_in_two_bytes(*prev1);
+ let res2prev2 = res2.shift_in_one_byte(*prev2);
+ let res = res0prev0.and(res1prev1).and(res2prev2).and(res3);
+ *prev0 = res0;
+ *prev1 = res1;
+ *prev2 = res2;
+ res
+ }
+}
+
+/// A "fat" Teddy implementation that is generic over both the vector type
+/// and the minimum length of the patterns being searched for.
+///
+/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths.
+#[derive(Clone, Debug)]
+pub(crate) struct Fat<V, const BYTES: usize> {
+ /// A generic data structure for doing "fat" Teddy verification.
+ teddy: Teddy<16>,
+ /// The masks used as inputs to the shuffle operation to generate
+ /// candidates (which are fed into the verification routines).
+ masks: [Mask<V>; BYTES],
+}
+
+impl<V: FatVector, const BYTES: usize> Fat<V, BYTES> {
+ /// Create a new "fat" Teddy searcher for the given patterns.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `BYTES` is any value other than 1, 2, 3 or 4.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Fat<V, BYTES> {
+ assert!(
+ 1 <= BYTES && BYTES <= 4,
+ "only 1, 2, 3 or 4 bytes are supported"
+ );
+ let teddy = Teddy::new(patterns);
+ let masks = FatMaskBuilder::from_teddy(&teddy);
+ Fat { teddy, masks }
+ }
+
+ /// Returns the approximate total amount of heap used by this type, in
+ /// units of bytes.
+ #[inline(always)]
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.teddy.memory_usage()
+ }
+
+ /// Returns the minimum length, in bytes, that a haystack must be in order
+ /// to use it with this searcher.
+ #[inline(always)]
+ pub(crate) fn minimum_len(&self) -> usize {
+ V::Half::BYTES + (BYTES - 1)
+ }
+}
+
+impl<V: FatVector> Fat<V, 1> {
+    /// Look for an occurrence of the patterns in this finder in the haystack
+ /// given by the `start` and `end` pointers.
+ ///
+ /// If no match could be found, then `None` is returned.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers representing the haystack must be valid to read
+ /// from. They must also point to a region of memory that is at least the
+ /// minimum length required by this searcher.
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start;
+ while cur <= end.sub(V::Half::BYTES) {
+ if let Some(m) = self.find_one(cur, end) {
+ return Some(m);
+ }
+ cur = cur.add(V::Half::BYTES);
+ }
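+ // Handle any leftover bytes by searching the final V::Half::BYTES
+ // window of the haystack. This window overlaps bytes already scanned,
+ // but since the loop above found no match starting in those bytes, any
+ // match reported here necessarily starts in the unscanned tail.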
+ if cur < end {
+ cur = end.sub(V::Half::BYTES);
+ if let Some(m) = self.find_one(cur, end) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Look for a match starting in the `V::Half::BYTES` haystack bytes at and
+ /// after `cur`. If there isn't one, then `None` is returned.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers representing the haystack must be valid to read
+ /// from. They must also point to a region of memory that is at least the
+ /// minimum length required by this searcher.
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let c = self.candidate(cur);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur, end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Look for a candidate match (represented as a vector) starting in the
+ /// `V::Half::BYTES` haystack bytes at and after `cur`. If there isn't one,
+ /// then a vector with all bits set to zero is returned.
+ ///
+ /// # Safety
+ ///
+ /// The given pointer representing the haystack must be valid to read
+ /// from.
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn candidate(&self, cur: *const u8) -> V {
+ let chunk = V::load_half_unaligned(cur);
+ Mask::members1(chunk, self.masks)
+ }
+}
+
+impl<V: FatVector> Fat<V, 2> {
+ /// See `Fat<V, 1>::find`.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start.add(1);
+ let mut prev0 = V::splat(0xFF);
+ while cur <= end.sub(V::Half::BYTES) {
+ if let Some(m) = self.find_one(cur, end, &mut prev0) {
+ return Some(m);
+ }
+ cur = cur.add(V::Half::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::Half::BYTES);
+ prev0 = V::splat(0xFF);
+ if let Some(m) = self.find_one(cur, end, &mut prev0) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See `Fat<V, 1>::find_one`.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ prev0: &mut V,
+ ) -> Option<Match> {
+ let c = self.candidate(cur, prev0);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur.sub(1), end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See `Fat<V, 1>::candidate`.
+ #[inline(always)]
+ unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V {
+ let chunk = V::load_half_unaligned(cur);
+ let (res0, res1) = Mask::members2(chunk, self.masks);
+ let res0prev0 = res0.half_shift_in_one_byte(*prev0);
+ let res = res0prev0.and(res1);
+ *prev0 = res0;
+ res
+ }
+}
+
+impl<V: FatVector> Fat<V, 3> {
+ /// See `Fat<V, 1>::find`.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start.add(2);
+ let mut prev0 = V::splat(0xFF);
+ let mut prev1 = V::splat(0xFF);
+ while cur <= end.sub(V::Half::BYTES) {
+ if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
+ return Some(m);
+ }
+ cur = cur.add(V::Half::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::Half::BYTES);
+ prev0 = V::splat(0xFF);
+ prev1 = V::splat(0xFF);
+ if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See `Fat<V, 1>::find_one`.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ ) -> Option<Match> {
+ let c = self.candidate(cur, prev0, prev1);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur.sub(2), end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See `Fat<V, 1>::candidate`.
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ cur: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ ) -> V {
+ let chunk = V::load_half_unaligned(cur);
+ let (res0, res1, res2) = Mask::members3(chunk, self.masks);
+ let res0prev0 = res0.half_shift_in_two_bytes(*prev0);
+ let res1prev1 = res1.half_shift_in_one_byte(*prev1);
+ let res = res0prev0.and(res1prev1).and(res2);
+ *prev0 = res0;
+ *prev1 = res1;
+ res
+ }
+}
+
+impl<V: FatVector> Fat<V, 4> {
+ /// See `Fat<V, 1>::find`.
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<Match> {
+ let len = end.distance(start);
+ debug_assert!(len >= self.minimum_len());
+ let mut cur = start.add(3);
+ let mut prev0 = V::splat(0xFF);
+ let mut prev1 = V::splat(0xFF);
+ let mut prev2 = V::splat(0xFF);
+ while cur <= end.sub(V::Half::BYTES) {
+ if let Some(m) =
+ self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
+ {
+ return Some(m);
+ }
+ cur = cur.add(V::Half::BYTES);
+ }
+ if cur < end {
+ cur = end.sub(V::Half::BYTES);
+ prev0 = V::splat(0xFF);
+ prev1 = V::splat(0xFF);
+ prev2 = V::splat(0xFF);
+ if let Some(m) =
+ self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2)
+ {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See `Fat<V, 1>::find_one`.
+ #[inline(always)]
+ unsafe fn find_one(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ prev2: &mut V,
+ ) -> Option<Match> {
+ let c = self.candidate(cur, prev0, prev1, prev2);
+ if !c.is_zero() {
+ if let Some(m) = self.teddy.verify(cur.sub(3), end, c) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// See `Fat<V, 1>::candidate`.
+ #[inline(always)]
+ unsafe fn candidate(
+ &self,
+ cur: *const u8,
+ prev0: &mut V,
+ prev1: &mut V,
+ prev2: &mut V,
+ ) -> V {
+ let chunk = V::load_half_unaligned(cur);
+ let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks);
+ let res0prev0 = res0.half_shift_in_three_bytes(*prev0);
+ let res1prev1 = res1.half_shift_in_two_bytes(*prev1);
+ let res2prev2 = res2.half_shift_in_one_byte(*prev2);
+ let res = res0prev0.and(res1prev1).and(res2prev2).and(res3);
+ *prev0 = res0;
+ *prev1 = res1;
+ *prev2 = res2;
+ res
+ }
+}
+
+/// The common elements of all "slim" and "fat" Teddy search implementations.
+///
+/// Essentially, this contains the patterns and the buckets. Namely, it
+/// contains enough to implement the verification step after candidates are
+/// identified via the shuffle masks.
+///
+/// It is generic over the number of buckets used. In general, the number of
+/// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The generic
+/// parameter isn't really meant to be instantiated for any value other than
+/// 8 or 16, although it is technically possible. The main hiccup is that there
+/// is some bit-shifting done in the critical part of verification that could
+/// be quite expensive if `N` is not a power of 2.
+#[derive(Clone, Debug)]
+struct Teddy<const BUCKETS: usize> {
+ /// The patterns we are searching for.
+ ///
+ /// A pattern string can be found by its `PatternID`.
+ patterns: Arc<Patterns>,
+ /// The allocation of patterns in buckets. This only contains the IDs of
+ /// patterns. In order to do full verification, callers must provide the
+ /// actual patterns when using Teddy.
+ buckets: [Vec<PatternID>; BUCKETS],
+ // N.B. The above representation is very simple, but it definitely results
+ // in ping-ponging between different allocations during verification. I've
+ // tried experimenting with other representations that flatten the pattern
+ // strings into a single allocation, but it doesn't seem to help much.
+ // Probably everything is small enough to fit into cache anyway, and so the
+ // pointer chasing isn't a big deal?
+ //
+ // One other avenue I haven't explored is some kind of hashing trick
+ // that lets us do another high-confidence check before launching into
+ // `memcmp`.
+}
+
+impl<const BUCKETS: usize> Teddy<BUCKETS> {
+ /// Create a new generic data structure for Teddy verification.
+ fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> {
+ assert_ne!(0, patterns.len(), "Teddy requires at least one pattern");
+ assert_ne!(
+ 0,
+ patterns.minimum_len(),
+ "Teddy does not support zero-length patterns"
+ );
+ assert!(
+ BUCKETS == 8 || BUCKETS == 16,
+ "Teddy only supports 8 or 16 buckets"
+ );
+ // MSRV(1.63): Use core::array::from_fn below instead of allocating a
+ // superfluous outer Vec. Not a big deal (especially given the BTreeMap
+ // allocation below), but nice to not do it.
+ let buckets =
+ <[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS])
+ .unwrap();
+ let mut t = Teddy { patterns, buckets };
+
+ let mut map: BTreeMap<Box<[u8]>, usize> = BTreeMap::new();
+ for (id, pattern) in t.patterns.iter() {
+ // We try to be slightly clever in how we assign patterns into
+ // buckets. Generally speaking, we want patterns with the same
+ // prefix to be in the same bucket, since it minimizes the amount
+ // of time we spend churning through buckets in the verification
+ // step.
+ //
+ // So we could assign patterns with the same N-prefix (where N is
+ // the size of the mask, which is one of {1, 2, 3}) to the same
+ // bucket. However, case insensitive searches are fairly common, so
+ // we'd for example, ideally want to treat `abc` and `ABC` as if
+ // they shared the same prefix. ASCII has the nice property that
+ // the lower 4 bits of A and a are the same, so we therefore group
+ // patterns with the same low-nybble-N-prefix into the same bucket.
+ //
+ // MOREOVER, this is actually necessary for correctness! In
+ // particular, by grouping patterns with the same prefix into the
+ // same bucket, we ensure that we preserve correct leftmost-first
+ // and leftmost-longest match semantics. In addition to the fact
+ // that `patterns.iter()` iterates in the correct order, this
+ // guarantees that all possible ambiguous matches will occur in
+ // the same bucket. The verification routine could be adjusted to
+ // support correct leftmost match semantics regardless of bucket
+ // allocation, but that results in a performance hit. It's much
+ // nicer to be able to just stop as soon as a match is found.
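+ //
+ // For example, `abc` (bytes 0x61 0x62 0x63) and `ABC` (0x41 0x42 0x43)
+ // share the low nybbles [0x1, 0x2, 0x3], so with a 3-byte mask they are
+ // assigned to the same bucket.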
+ let lonybs = pattern.low_nybbles(t.mask_len());
+ if let Some(&bucket) = map.get(&lonybs) {
+ t.buckets[bucket].push(id);
+ } else {
+ // N.B. We assign buckets in reverse because it shouldn't have
+ // any influence on performance, but it does make it harder to
+ // get leftmost match semantics accidentally correct.
+ let bucket = (BUCKETS - 1) - (id.as_usize() % BUCKETS);
+ t.buckets[bucket].push(id);
+ map.insert(lonybs, bucket);
+ }
+ }
+ t
+ }
+
+ /// Verify whether there are any matches starting at or after `cur` in the
+ /// haystack. The candidate chunk given should correspond to 8-bit bitsets
+ /// for N buckets.
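+ ///
+ /// For example, with 8 buckets (slim Teddy), bit 19 of the candidate
+ /// chunk refers to the byte at offset `19 / 8 = 2` from `cur` and to
+ /// bucket `19 % 8 = 3`.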
+ ///
+ /// # Safety
+ ///
+ /// The given pointers representing the haystack must be valid to read
+ /// from.
+ #[inline(always)]
+ unsafe fn verify64(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ mut candidate_chunk: u64,
+ ) -> Option<Match> {
+ while candidate_chunk != 0 {
+ let bit = candidate_chunk.trailing_zeros().as_usize();
+ candidate_chunk &= !(1 << bit);
+
+ let cur = cur.add(bit / BUCKETS);
+ let bucket = bit % BUCKETS;
+ if let Some(m) = self.verify_bucket(cur, end, bucket) {
+ return Some(m);
+ }
+ }
+ None
+ }
+
+ /// Verify whether there are any matches starting at `cur` in the haystack
+ /// (bounded by `end`) corresponding only to patterns in the given bucket.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers representing the haystack must be valid to read
+ /// from.
+ ///
+ /// The bucket index must be less than `self.buckets.len()`.
+ #[inline(always)]
+ unsafe fn verify_bucket(
+ &self,
+ cur: *const u8,
+ end: *const u8,
+ bucket: usize,
+ ) -> Option<Match> {
+ debug_assert!(bucket < self.buckets.len());
+ // SAFETY: The caller must ensure that the bucket index is correct.
+ for pid in self.buckets.get_unchecked(bucket).iter().copied() {
+ // SAFETY: This is safe because we are guaranteed that every
+ // index in a Teddy bucket is a valid index into `pats`, by
+ // construction.
+ debug_assert!(pid.as_usize() < self.patterns.len());
+ let pat = self.patterns.get_unchecked(pid);
+ if pat.is_prefix_raw(cur, end) {
+ let start = cur;
+ let end = start.add(pat.len());
+ return Some(Match { pid, start, end });
+ }
+ }
+ None
+ }
+
+ /// Returns the total number of masks required by the patterns in this
+ /// Teddy searcher.
+ ///
+ /// Basically, the mask length corresponds to the type of Teddy searcher
+ /// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the
+ /// better, typically, since searching for longer substrings usually
+ /// decreases the rate of false positives. Therefore, the number of masks
+ /// needed is the length of the shortest pattern in this searcher. If the
+ /// length of the shortest pattern (in bytes) is bigger than 4, then the
+ /// mask length is 4 since there are no Teddy searchers for more than 4
+ /// bytes.
+ fn mask_len(&self) -> usize {
+ core::cmp::min(4, self.patterns.minimum_len())
+ }
+
+ /// Returns the approximate total amount of heap used by this type, in
+ /// units of bytes.
+ fn memory_usage(&self) -> usize {
+ // This is an upper bound rather than a precise accounting. No
+ // particular reason, other than it's probably very close to actual
+ // memory usage in practice.
+ self.patterns.len() * core::mem::size_of::<PatternID>()
+ }
+}
+
+impl Teddy<8> {
+ /// Runs the verification routine for "slim" Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `cur + j` in the haystack is in the bucket `i`.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ ///
+ /// The given pointers must be valid to read from.
+ #[inline(always)]
+ unsafe fn verify<V: Vector>(
+ &self,
+ mut cur: *const u8,
+ end: *const u8,
+ candidate: V,
+ ) -> Option<Match> {
+ debug_assert!(!candidate.is_zero());
+ // Convert the candidate into 64-bit chunks, and then verify each of
+ // those chunks.
+ candidate.for_each_64bit_lane(
+ #[inline(always)]
+ |_, chunk| {
+ let result = self.verify64(cur, end, chunk);
+ cur = cur.add(8);
+ result
+ },
+ )
+ }
+}
+
+impl Teddy<16> {
+ /// Runs the verification routine for "fat" Teddy.
+ ///
+ /// The candidate given should be a collection of 8-bit bitsets (one bitset
+ /// per lane), where the ith bit is set in the jth lane if and only if the
+ /// byte occurring at `cur + (j < 16 ? j : j - 16)` in the haystack is in the
+ /// bucket `j < 16 ? i : i + 8`.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ ///
+ /// The given pointers must be valid to read from.
+ #[inline(always)]
+ unsafe fn verify<V: FatVector>(
+ &self,
+ mut cur: *const u8,
+ end: *const u8,
+ candidate: V,
+ ) -> Option<Match> {
+ // This is a bit tricky, but we basically want to convert our
+ // candidate, which looks like this (assuming a 256-bit vector):
+ //
+ // a31 a30 ... a17 a16 a15 a14 ... a01 a00
+ //
+ // where each a(i) is an 8-bit bitset corresponding to the activated
+ // buckets, to this
+ //
+ // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
+ //
+ // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
+ // to the same bytes in the haystack in the low 128-bits (so we only
+ // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
+ //
+ // The verification routine wants to look at all potentially matching
+ // buckets before moving on to the next lane. So for example, a16 and
+ // a00 both correspond to the first byte in our window; a00
+ // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
+ // a16 should be checked before a01. So the transformation shown above
+ // allows us to use our normal verification procedure with one small
+ // change: we treat each bitset as 16 bits instead of 8 bits.
+ debug_assert!(!candidate.is_zero());
+
+ // Swap the 128-bit lanes in the candidate vector.
+ let swapped = candidate.swap_halves();
+ // Interleave the bytes from the low 128-bit lanes, starting with
+ // cand first.
+ let r1 = candidate.interleave_low_8bit_lanes(swapped);
+ // Interleave the bytes from the high 128-bit lanes, starting with
+ // cand first.
+ let r2 = candidate.interleave_high_8bit_lanes(swapped);
+ // Now just take the 2 low 64-bit integers from both r1 and r2. We
+ // can drop the high 64-bit integers because they are a mirror image
+ // of the low 64-bit integers. All we care about are the low 128-bit
+ // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
+ // laid out in the desired order, as described above.
+ r1.for_each_low_64bit_lane(
+ r2,
+ #[inline(always)]
+ |_, chunk| {
+ let result = self.verify64(cur, end, chunk);
+ cur = cur.add(4);
+ result
+ },
+ )
+ }
+}
+
+/// A generic vector mask for the low and high nybbles in a set of patterns.
+/// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit
+/// is set if and only if the nybble `j` is in the bucket `i` at a particular
+/// position.
+///
+/// This is slightly tweaked depending on whether Slim or Fat Teddy is being
+/// used. For Slim Teddy, the bitsets in the lower half are the same as the
+/// bitsets in the higher half, so that we can search `V::BYTES` bytes at a
+/// time. (Remember, the nybbles in the haystack are used as indices into these
+/// masks, and 256-bit shuffles only operate on 128-bit lanes.)
+///
+/// For Fat Teddy, the bitsets are not repeated, but instead, the high half
+/// bits correspond to an additional 8 buckets. So a bitset `00100010` has
+/// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13 set
+/// if it's in the higher half.
+#[derive(Clone, Copy, Debug)]
+struct Mask<V> {
+ lo: V,
+ hi: V,
+}
+
+impl<V: Vector> Mask<V> {
+ /// Return a candidate for Teddy (fat or slim) that is searching for 1-byte
+ /// candidates.
+ ///
+ /// If a candidate is returned, it will be a collection of 8-bit bitsets
+ /// (one bitset per lane), where the ith bit is set in the jth lane if and
+ /// only if the byte occurring at the jth lane in `chunk` is in the bucket
+ /// `i`. If no candidate is found, then the vector returned will have all
+ /// lanes set to zero.
+ ///
+ /// `chunk` should correspond to a `V::BYTES` window of the haystack (where
+ /// the least significant byte corresponds to the start of the window). For
+ /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
+ /// the window repeated in each half of the vector.
+ ///
+ /// `mask1` should correspond to a low/high mask for the first byte of all
+ /// patterns that are being searched.
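+ ///
+ /// As a rough scalar sketch, for a single haystack byte `b` this computes
+ /// `bits = lo[usize::from(b & 0xF)] & hi[usize::from(b >> 4)]`, where `lo`
+ /// and `hi` are viewed as 16-entry nybble tables. Bit `i` of `bits` is set
+ /// when `b` is a candidate first byte for bucket `i` (possibly a false
+ /// positive, since the two nybbles may come from different patterns in the
+ /// same bucket).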
+ #[inline(always)]
+ unsafe fn members1(chunk: V, masks: [Mask<V>; 1]) -> V {
+ let lomask = V::splat(0xF);
+ let hlo = chunk.and(lomask);
+ let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask);
+ let locand = masks[0].lo.shuffle_bytes(hlo);
+ let hicand = masks[0].hi.shuffle_bytes(hhi);
+ locand.and(hicand)
+ }
+
+ /// Return a candidate for Teddy (fat or slim) that is searching for 2-byte
+ /// candidates.
+ ///
+ /// If candidates are returned, each will be a collection of 8-bit bitsets
+ /// (one bitset per lane), where the ith bit is set in the jth lane if and
+ /// only if the byte occurring at the jth lane in `chunk` is in the bucket
+ /// `i`. Each candidate returned corresponds to the first and second bytes
+ /// of the patterns being searched. If no candidate is found, then all of
+ /// the lanes will be set to zero in at least one of the vectors returned.
+ ///
+ /// `chunk` should correspond to a `V::BYTES` window of the haystack (where
+ /// the least significant byte corresponds to the start of the window). For
+ /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
+ /// the window repeated in each half of the vector.
+ ///
+ /// The masks should correspond to the masks computed for the first and
+ /// second bytes of all patterns that are being searched.
+ #[inline(always)]
+ unsafe fn members2(chunk: V, masks: [Mask<V>; 2]) -> (V, V) {
+ let lomask = V::splat(0xF);
+ let hlo = chunk.and(lomask);
+ let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask);
+
+ let locand1 = masks[0].lo.shuffle_bytes(hlo);
+ let hicand1 = masks[0].hi.shuffle_bytes(hhi);
+ let cand1 = locand1.and(hicand1);
+
+ let locand2 = masks[1].lo.shuffle_bytes(hlo);
+ let hicand2 = masks[1].hi.shuffle_bytes(hhi);
+ let cand2 = locand2.and(hicand2);
+
+ (cand1, cand2)
+ }
+
+ /// Return a candidate for Teddy (fat or slim) that is searching for 3-byte
+ /// candidates.
+ ///
+ /// If candidates are returned, each will be a collection of 8-bit bitsets
+ /// (one bitset per lane), where the ith bit is set in the jth lane if and
+ /// only if the byte occurring at the jth lane in `chunk` is in the bucket
+ /// `i`. Each candidate returned corresponds to the first, second and third
+ /// bytes of the patterns being searched. If no candidate is found, then
+ /// all of the lanes will be set to zero in at least one of the vectors
+ /// returned.
+ ///
+ /// `chunk` should correspond to a `V::BYTES` window of the haystack (where
+ /// the least significant byte corresponds to the start of the window). For
+ /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
+ /// the window repeated in each half of the vector.
+ ///
+ /// The masks should correspond to the masks computed for the first, second
+ /// and third bytes of all patterns that are being searched.
+ #[inline(always)]
+ unsafe fn members3(chunk: V, masks: [Mask<V>; 3]) -> (V, V, V) {
+ let lomask = V::splat(0xF);
+ let hlo = chunk.and(lomask);
+ let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask);
+
+ let locand1 = masks[0].lo.shuffle_bytes(hlo);
+ let hicand1 = masks[0].hi.shuffle_bytes(hhi);
+ let cand1 = locand1.and(hicand1);
+
+ let locand2 = masks[1].lo.shuffle_bytes(hlo);
+ let hicand2 = masks[1].hi.shuffle_bytes(hhi);
+ let cand2 = locand2.and(hicand2);
+
+ let locand3 = masks[2].lo.shuffle_bytes(hlo);
+ let hicand3 = masks[2].hi.shuffle_bytes(hhi);
+ let cand3 = locand3.and(hicand3);
+
+ (cand1, cand2, cand3)
+ }
+
+ /// Return a candidate for Teddy (fat or slim) that is searching for 4-byte
+ /// candidates.
+ ///
+ /// If candidates are returned, each will be a collection of 8-bit bitsets
+ /// (one bitset per lane), where the ith bit is set in the jth lane if and
+ /// only if the byte occurring at the jth lane in `chunk` is in the bucket
+ /// `i`. Each candidate returned corresponds to the first, second, third
+ /// and fourth bytes of the patterns being searched. If no candidate is
+ /// found, then all of the lanes will be set to zero in at least one of the
+ /// vectors returned.
+ ///
+ /// `chunk` should correspond to a `V::BYTES` window of the haystack (where
+ /// the least significant byte corresponds to the start of the window). For
+ /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with
+ /// the window repeated in each half of the vector.
+ ///
+ /// The masks should correspond to the masks computed for the first,
+ /// second, third and fourth bytes of all patterns that are being searched.
+ #[inline(always)]
+ unsafe fn members4(chunk: V, masks: [Mask<V>; 4]) -> (V, V, V, V) {
+ let lomask = V::splat(0xF);
+ let hlo = chunk.and(lomask);
+ let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask);
+
+ let locand1 = masks[0].lo.shuffle_bytes(hlo);
+ let hicand1 = masks[0].hi.shuffle_bytes(hhi);
+ let cand1 = locand1.and(hicand1);
+
+ let locand2 = masks[1].lo.shuffle_bytes(hlo);
+ let hicand2 = masks[1].hi.shuffle_bytes(hhi);
+ let cand2 = locand2.and(hicand2);
+
+ let locand3 = masks[2].lo.shuffle_bytes(hlo);
+ let hicand3 = masks[2].hi.shuffle_bytes(hhi);
+ let cand3 = locand3.and(hicand3);
+
+ let locand4 = masks[3].lo.shuffle_bytes(hlo);
+ let hicand4 = masks[3].hi.shuffle_bytes(hhi);
+ let cand4 = locand4.and(hicand4);
+
+ (cand1, cand2, cand3, cand4)
+ }
+}
+
+/// Represents the low and high nybble masks that will be used during
+/// search. Each mask is 32 bytes wide, although only the first 16 bytes are
+/// used for 128-bit vectors.
+///
+/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set
+/// if and only if the corresponding nybble is in the ith bucket. The index of
+/// the byte (0-15, inclusive) corresponds to the nybble.
+///
+/// Each mask is used as the target of a shuffle, where the indices for the
+/// shuffle are taken from the haystack. AND'ing the shuffles for both the
+/// low and high masks together also results in 8-bit bitsets, but where bit
+/// `i` is set if and only if the corresponding *byte* is in the ith bucket.
+#[derive(Clone, Default)]
+struct SlimMaskBuilder {
+ lo: [u8; 32],
+ hi: [u8; 32],
+}
+
+impl SlimMaskBuilder {
+ /// Update this mask by adding the given byte to the given bucket. The
+ /// given bucket must be in the range 0-7.
+ ///
+ /// # Panics
+ ///
+ /// When `bucket >= 8`.
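+ ///
+ /// For example, `add(3, b'a')` (byte 0x61, low nybble 0x1, high nybble
+ /// 0x6) sets bit 3 in `lo[0x01]`, `lo[0x11]`, `hi[0x06]` and `hi[0x16]`.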
+ fn add(&mut self, bucket: usize, byte: u8) {
+ assert!(bucket < 8);
+
+ let bucket = u8::try_from(bucket).unwrap();
+ let byte_lo = usize::from(byte & 0xF);
+ let byte_hi = usize::from((byte >> 4) & 0xF);
+ // When using 256-bit vectors, we need to set this bucket assignment in
+ // the low and high 128-bit portions of the mask. This allows us to
+ // process 32 bytes at a time. Namely, AVX2 shuffles operate on each
+ // of the 128-bit lanes, rather than the full 256-bit vector at once.
+ self.lo[byte_lo] |= 1 << bucket;
+ self.lo[byte_lo + 16] |= 1 << bucket;
+ self.hi[byte_hi] |= 1 << bucket;
+ self.hi[byte_hi + 16] |= 1 << bucket;
+ }
+
+ /// Turn this builder into a vector mask.
+ ///
+ /// # Panics
+ ///
+ /// When `V` represents a vector wider than the 32-byte masks in this builder.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn build<V: Vector>(&self) -> Mask<V> {
+ assert!(V::BYTES <= self.lo.len());
+ assert!(V::BYTES <= self.hi.len());
+ Mask {
+ lo: V::load_unaligned(self.lo[..].as_ptr()),
+ hi: V::load_unaligned(self.hi[..].as_ptr()),
+ }
+ }
+
+ /// A convenience function for building `N` vector masks from a slim
+ /// `Teddy` value.
+ ///
+ /// # Panics
+ ///
+ /// When `V` represents a vector wider than the 32-byte masks in this builder.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn from_teddy<const BYTES: usize, V: Vector>(
+ teddy: &Teddy<8>,
+ ) -> [Mask<V>; BYTES] {
+ // MSRV(1.63): Use core::array::from_fn to just build the array here
+ // instead of creating a vector and turning it into an array.
+ let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES];
+ for (bucket_index, bucket) in teddy.buckets.iter().enumerate() {
+ for pid in bucket.iter().copied() {
+ let pat = teddy.patterns.get(pid);
+ for (i, builder) in mask_builders.iter_mut().enumerate() {
+ builder.add(bucket_index, pat.bytes()[i]);
+ }
+ }
+ }
+ let array =
+ <[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap();
+ array.map(|builder| builder.build())
+ }
+}
+
+impl Debug for SlimMaskBuilder {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
+ for i in 0..32 {
+ parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));
+ parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i]));
+ }
+ f.debug_struct("SlimMaskBuilder")
+ .field("lo", &parts_lo)
+ .field("hi", &parts_hi)
+ .finish()
+ }
+}
+
+/// Represents the low and high nybble masks that will be used during "fat"
+/// Teddy search.
+///
+/// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors
+/// support fat Teddy.
+///
+/// A fat Teddy mask is like a slim Teddy mask, except that instead of
+/// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the
+/// high and low 128-bit halves each represent distinct buckets. (Bringing the
+/// total to 16 instead of 8.) This permits spreading the patterns out a bit
+/// more and thus putting less pressure on verification to be fast.
+///
+/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set
+/// if and only if the corresponding nybble is in the ith bucket. The index of
+/// the byte (0-15, inclusive) corresponds to the nybble.
+#[derive(Clone, Copy, Default)]
+struct FatMaskBuilder {
+ lo: [u8; 32],
+ hi: [u8; 32],
+}
+
+impl FatMaskBuilder {
+ /// Update this mask by adding the given byte to the given bucket. The
+ /// given bucket must be in the range 0-15.
+ ///
+ /// # Panics
+ ///
+ /// When `bucket >= 16`.
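+ ///
+ /// For example, `add(11, b'a')` (byte 0x61, low nybble 0x1, high nybble
+ /// 0x6) sets bit `11 % 8 = 3` in `lo[0x11]` and `hi[0x16]`, since bucket
+ /// 11 lives in the high 128-bit half of the mask.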
+ fn add(&mut self, bucket: usize, byte: u8) {
+ assert!(bucket < 16);
+
+ let bucket = u8::try_from(bucket).unwrap();
+ let byte_lo = usize::from(byte & 0xF);
+ let byte_hi = usize::from((byte >> 4) & 0xF);
+ // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy,
+ // the high 128 bits of our mask correspond to buckets 8-15, while the
+ // low 128 bits correspond to buckets 0-7.
+ if bucket < 8 {
+ self.lo[byte_lo] |= 1 << bucket;
+ self.hi[byte_hi] |= 1 << bucket;
+ } else {
+ self.lo[byte_lo + 16] |= 1 << (bucket % 8);
+ self.hi[byte_hi + 16] |= 1 << (bucket % 8);
+ }
+ }
+
+ /// Turn this builder into a vector mask.
+ ///
+ /// # Panics
+ ///
+ /// When `V` represents a vector wider than the 32-byte masks in this builder.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn build<V: Vector>(&self) -> Mask<V> {
+ assert!(V::BYTES <= self.lo.len());
+ assert!(V::BYTES <= self.hi.len());
+ Mask {
+ lo: V::load_unaligned(self.lo[..].as_ptr()),
+ hi: V::load_unaligned(self.hi[..].as_ptr()),
+ }
+ }
+
+ /// A convenience function for building `N` vector masks from a fat
+ /// `Teddy` value.
+ ///
+ /// # Panics
+ ///
+ /// When `V` represents a vector wider than the 32-byte masks in this builder.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ #[inline(always)]
+ unsafe fn from_teddy<const BYTES: usize, V: Vector>(
+ teddy: &Teddy<16>,
+ ) -> [Mask<V>; BYTES] {
+ // MSRV(1.63): Use core::array::from_fn to just build the array here
+ // instead of creating a vector and turning it into an array.
+ let mut mask_builders = vec![FatMaskBuilder::default(); BYTES];
+ for (bucket_index, bucket) in teddy.buckets.iter().enumerate() {
+ for pid in bucket.iter().copied() {
+ let pat = teddy.patterns.get(pid);
+ for (i, builder) in mask_builders.iter_mut().enumerate() {
+ builder.add(bucket_index, pat.bytes()[i]);
+ }
+ }
+ }
+ let array =
+ <[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap();
+ array.map(|builder| builder.build())
+ }
+}
+
+impl Debug for FatMaskBuilder {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
+ for i in 0..32 {
+ parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));
+ parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i]));
+ }
+ f.debug_struct("FatMaskBuilder")
+ .field("lo", &parts_lo)
+ .field("hi", &parts_hi)
+ .finish()
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/teddy/mod.rs b/third_party/rust/aho-corasick/src/packed/teddy/mod.rs
new file mode 100644
index 0000000000..26cfcdc450
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/teddy/mod.rs
@@ -0,0 +1,9 @@
+// Regrettable, but Teddy stuff just isn't used on all targets. And for some
+// targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a
+// bunch of dead-code warnings. Just not worth trying to squash them. Blech.
+#![allow(dead_code)]
+
+pub(crate) use self::builder::{Builder, Searcher};
+
+mod builder;
+mod generic;
diff --git a/third_party/rust/aho-corasick/src/packed/tests.rs b/third_party/rust/aho-corasick/src/packed/tests.rs
new file mode 100644
index 0000000000..2b0d44ee6f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/tests.rs
@@ -0,0 +1,583 @@
+use std::collections::HashMap;
+
+use alloc::{
+ format,
+ string::{String, ToString},
+ vec,
+ vec::Vec,
+};
+
+use crate::{
+ packed::{Config, MatchKind},
+ util::search::Match,
+};
+
+/// A description of a single test against a multi-pattern searcher.
+///
+/// A single test may not necessarily pass on every configuration of a
+/// searcher. The tests are categorized and grouped appropriately below.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct SearchTest {
+ /// The name of this test, for debugging.
+ name: &'static str,
+ /// The patterns to search for.
+ patterns: &'static [&'static str],
+ /// The text to search.
+ haystack: &'static str,
+ /// Each match is a triple of (pattern_index, start, end), where
+ /// pattern_index is an index into `patterns` and `start`/`end` are indices
+ /// into `haystack`.
+ matches: &'static [(usize, usize, usize)],
+}
+
+struct SearchTestOwned {
+ offset: usize,
+ name: String,
+ patterns: Vec<String>,
+ haystack: String,
+ matches: Vec<(usize, usize, usize)>,
+}
+
+impl SearchTest {
+ fn variations(&self) -> Vec<SearchTestOwned> {
+ let count = if cfg!(miri) { 1 } else { 261 };
+ let mut tests = vec![];
+ for i in 0..count {
+ tests.push(self.offset_prefix(i));
+ tests.push(self.offset_suffix(i));
+ tests.push(self.offset_both(i));
+ }
+ tests
+ }
+
+ fn offset_both(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!(
+ "{}{}{}",
+ "Z".repeat(off),
+ self.haystack,
+ "Z".repeat(off)
+ ),
+ matches: self
+ .matches
+ .iter()
+ .map(|&(id, s, e)| (id, s + off, e + off))
+ .collect(),
+ }
+ }
+
+ fn offset_prefix(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!("{}{}", "Z".repeat(off), self.haystack),
+ matches: self
+ .matches
+ .iter()
+ .map(|&(id, s, e)| (id, s + off, e + off))
+ .collect(),
+ }
+ }
+
+ fn offset_suffix(&self, off: usize) -> SearchTestOwned {
+ SearchTestOwned {
+ offset: off,
+ name: self.name.to_string(),
+ patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+ haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
+ matches: self.matches.to_vec(),
+ }
+ }
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+ ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+ SearchTest {
+ name: stringify!($name),
+ patterns: $patterns,
+ haystack: $haystack,
+ matches: $matches,
+ }
+ };
+}
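+
+// For example, `t!(basic010, &["a"], "a", &[(0, 0, 1)])` below describes a
+// search for the single pattern "a" in the haystack "a", expecting one match
+// of pattern 0 spanning bytes 0..1.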
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported. These collections have some overlap, but each
+// collection should have some tests that no other collection has.
+
+/// Tests for leftmost-first match semantics.
+const PACKED_LEFTMOST_FIRST: TestCollection =
+ &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];
+
+/// Tests for leftmost-longest match semantics.
+const PACKED_LEFTMOST_LONGEST: TestCollection =
+ &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests that should always be true regardless of
+/// match semantics. That is, all combinations of leftmost-{first, longest}
+/// should produce the same answer.
+const BASICS: &'static [SearchTest] = &[
+ t!(basic001, &["a"], "", &[]),
+ t!(basic010, &["a"], "a", &[(0, 0, 1)]),
+ t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
+ t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
+ t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
+ t!(basic060, &["a"], "bbb", &[]),
+ t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
+ t!(basic100, &["aa"], "", &[]),
+ t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
+ t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
+ t!(basic130, &["aa"], "abbab", &[]),
+ t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
+ t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]),
+ t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
+ t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+ t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
+ t!(basic230, &["abcd"], "abcd", &[(0, 0, 4)]),
+ t!(basic240, &["abcd"], "zazabzabcdz", &[(0, 6, 10)]),
+ t!(basic250, &["abcd"], "zazabcdzabcdz", &[(0, 3, 7), (0, 8, 12)]),
+ t!(basic300, &["a", "b"], "", &[]),
+ t!(basic310, &["a", "b"], "z", &[]),
+ t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
+ t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
+ t!(
+ basic340,
+ &["a", "b"],
+ "abba",
+ &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
+ ),
+ t!(
+ basic350,
+ &["b", "a"],
+ "abba",
+ &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
+ ),
+ t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
+ t!(basic400, &["foo", "bar"], "", &[]),
+ t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
+ t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
+ t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
+ t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
+ t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
+ t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
+ t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
+ t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
+ t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
+ t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
+ t!(
+ basic720,
+ &["yabcdef", "bcdeyabc", "abcdezghi"],
+ "yabcdezghi",
+ &[(2, 1, 10),]
+ ),
+ t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
+ t!(
+ basic840,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
+ ),
+ t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
+];
+
+/// Tests for leftmost match semantics. These should pass for both
+/// leftmost-first and leftmost-longest match kinds. Stated differently, among
+/// ambiguous matches, the longest match and the match that appeared first when
+/// constructing the automaton should always be the same.
+const LEFTMOST: &'static [SearchTest] = &[
+ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
+ t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
+ t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
+ t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ leftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8), (0, 8, 9),]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-first match semantics. These tests
+/// should generally be specific to leftmost-first, which means they should
+/// generally fail under leftmost-longest semantics.
+const LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
+ t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
+ t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
+ t!(
+ leftfirst310,
+ &["abcd", "b", "bce", "ce"],
+ "abce",
+ &[(1, 1, 2), (3, 2, 4),]
+ ),
+ t!(
+ leftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1), (2, 7, 9),]
+ ),
+ t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+ t!(
+ leftfirst340,
+ &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"],
+ "abcdef",
+ &[(0, 0, 6)]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-longest match semantics. These tests
+/// should generally be specific to leftmost-longest, which means they should
+/// generally fail under leftmost-first semantics.
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
+ t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
+ t!(
+ leftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
+ t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
+];
+
+/// Regression tests that are applied to all combinations.
+///
+/// If regression tests are needed for specific match semantics, then add them
+/// to the appropriate group above.
+const REGRESSION: &'static [SearchTest] = &[
+ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
+ t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
+ t!(
+ regression030,
+ &["libcore/", "libstd/"],
+ "libcore/char/methods.rs",
+ &[(0, 0, 8),]
+ ),
+ t!(
+ regression040,
+ &["libstd/", "libcore/"],
+ "libcore/char/methods.rs",
+ &[(1, 0, 8),]
+ ),
+ t!(
+ regression050,
+ &["\x00\x00\x01", "\x00\x00\x00"],
+ "\x00\x00\x00",
+ &[(1, 0, 3),]
+ ),
+ t!(
+ regression060,
+ &["\x00\x00\x00", "\x00\x00\x01"],
+ "\x00\x00\x00",
+ &[(0, 0, 3),]
+ ),
+];
+
+const TEDDY: &'static [SearchTest] = &[
+ t!(
+ teddy010,
+ &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+ "abcdefghijk",
+ &[
+ (0, 0, 1),
+ (1, 1, 2),
+ (2, 2, 3),
+ (3, 3, 4),
+ (4, 4, 5),
+ (5, 5, 6),
+ (6, 6, 7),
+ (7, 7, 8),
+ (8, 8, 9),
+ (9, 9, 10),
+ (10, 10, 11)
+ ]
+ ),
+ t!(
+ teddy020,
+ &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"],
+ "abcdefghijk",
+ &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),]
+ ),
+ t!(
+ teddy030,
+ &["abc"],
+ "abcdefghijklmnopqrstuvwxyzabcdefghijk",
+ &[(0, 0, 3), (0, 26, 29)]
+ ),
+];
+
+// Now define a test for each combination of things above that we want to run.
+// Since there are a few different combinations for each collection of tests,
+// we define a macro to avoid repetition drudgery. The testconfig macro
+// constructs the automaton from a given match kind, and runs the search
+// tests one-by-one over the given collection. The `with` parameter allows one
+// to configure the config with additional parameters. Each invocation of
+// testconfig below turns a different knob on Config.
+
+macro_rules! testconfig {
+ ($name:ident, $collection:expr, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut config = Config::new();
+ $with(&mut config);
+ let mut builder = config.builder();
+ builder.extend(test.patterns.iter().map(|p| p.as_bytes()));
+ let searcher = match builder.build() {
+ Some(searcher) => searcher,
+ None => {
+ // For x86-64 and aarch64, not building a searcher is
+ // probably a bug, so be loud.
+ if cfg!(any(
+ target_arch = "x86_64",
+ target_arch = "aarch64"
+ )) {
+ panic!("failed to build packed searcher")
+ }
+ return None;
+ }
+ };
+ Some(searcher.find_iter(&test.haystack).collect())
+ });
+ }
+ };
+}
+
+testconfig!(
+ search_default_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |_: &mut Config| {}
+);
+
+testconfig!(
+ search_default_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+testconfig!(
+ search_teddy_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.only_teddy(true);
+ }
+);
+
+testconfig!(
+ search_teddy_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+testconfig!(
+ search_teddy_ssse3_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.only_teddy(true);
+ #[cfg(target_arch = "x86_64")]
+ if std::is_x86_feature_detected!("ssse3") {
+ c.only_teddy_256bit(Some(false));
+ }
+ }
+);
+
+testconfig!(
+ search_teddy_ssse3_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ #[cfg(target_arch = "x86_64")]
+ if std::is_x86_feature_detected!("ssse3") {
+ c.only_teddy_256bit(Some(false));
+ }
+ }
+);
+
+testconfig!(
+ search_teddy_avx2_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.only_teddy(true);
+ #[cfg(target_arch = "x86_64")]
+ if std::is_x86_feature_detected!("avx2") {
+ c.only_teddy_256bit(Some(true));
+ }
+ }
+);
+
+testconfig!(
+ search_teddy_avx2_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ #[cfg(target_arch = "x86_64")]
+ if std::is_x86_feature_detected!("avx2") {
+ c.only_teddy_256bit(Some(true));
+ }
+ }
+);
+
+testconfig!(
+ search_teddy_fat_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.only_teddy(true);
+ #[cfg(target_arch = "x86_64")]
+ if std::is_x86_feature_detected!("avx2") {
+ c.only_teddy_fat(Some(true));
+ }
+ }
+);
+
+testconfig!(
+ search_teddy_fat_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
+ #[cfg(target_arch = "x86_64")]
+ if std::is_x86_feature_detected!("avx2") {
+ c.only_teddy_fat(Some(true));
+ }
+ }
+);
+
+testconfig!(
+ search_rabinkarp_leftmost_first,
+ PACKED_LEFTMOST_FIRST,
+ |c: &mut Config| {
+ c.only_rabin_karp(true);
+ }
+);
+
+testconfig!(
+ search_rabinkarp_leftmost_longest,
+ PACKED_LEFTMOST_LONGEST,
+ |c: &mut Config| {
+ c.only_rabin_karp(true).match_kind(MatchKind::LeftmostLongest);
+ }
+);
+
+#[test]
+fn search_tests_have_unique_names() {
+ let assert = |constname, tests: &[SearchTest]| {
+ let mut seen = HashMap::new(); // map from test name to position
+ for (i, test) in tests.iter().enumerate() {
+ if !seen.contains_key(test.name) {
+ seen.insert(test.name, i);
+ } else {
+ let last = seen[test.name];
+ panic!(
+ "{} tests have duplicate names at positions {} and {}",
+ constname, last, i
+ );
+ }
+ }
+ };
+ assert("BASICS", BASICS);
+ assert("LEFTMOST", LEFTMOST);
+ assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+ assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+ assert("REGRESSION", REGRESSION);
+ assert("TEDDY", TEDDY);
+}
+
+fn run_search_tests<F: FnMut(&SearchTestOwned) -> Option<Vec<Match>>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for spec in tests {
+ for test in spec.variations() {
+ let results = match f(&test) {
+ None => continue,
+ Some(results) => results,
+ };
+ assert_eq!(
+ test.matches,
+ get_match_triples(results).as_slice(),
+ "test: {}, patterns: {:?}, haystack(len={:?}): {:?}, \
+ offset: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack.len(),
+ test.haystack,
+ test.offset,
+ );
+ }
+ }
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/packed/vector.rs b/third_party/rust/aho-corasick/src/packed/vector.rs
new file mode 100644
index 0000000000..f19b86ce1e
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/packed/vector.rs
@@ -0,0 +1,1750 @@
+// NOTE: The descriptions for each of the vector methods on the traits below
+// are pretty inscrutable. For this reason, there are tests for every method
+// for every trait impl below. If you're confused about what an op does,
+// consult its test. (They probably should be doc tests, but I couldn't figure
+// out how to write them in a non-annoying way.)
+
+use core::{
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+};
+
+/// A trait for describing vector operations used by vectorized searchers.
+///
+/// The trait is highly constrained to low level vector operations needed for
+/// the specific algorithms used in this crate. In general, it was invented
+/// mostly to be generic over x86's __m128i and __m256i types. At the time of
+/// writing, it also supports wasm and aarch64 128-bit vector types.
+///
+/// # Safety
+///
+/// All methods are unsafe since they are intended to be implemented using
+/// vendor intrinsics, which are also unsafe. Callers must ensure that
+/// the appropriate target features are enabled in the calling function,
+/// and that the current CPU supports them. All implementations should
+/// avoid marking the routines with `#[target_feature]` and instead mark
+/// them as `#[inline(always)]` to ensure they get appropriately inlined.
+/// (`inline(always)` cannot be used with target_feature.)
+pub(crate) trait Vector:
+ Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
+{
+ /// The number of bits in the vector.
+ const BITS: usize;
+ /// The number of bytes in the vector. That is, this is the size of the
+ /// vector in memory.
+ const BYTES: usize;
+
+ /// Create a vector with 8-bit lanes with the given byte repeated into each
+ /// lane.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn splat(byte: u8) -> Self;
+
+ /// Read a vector-size number of bytes from the given pointer. The pointer
+ /// does not need to be aligned.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ ///
+ /// Callers must guarantee that at least `BYTES` bytes are readable from
+ /// `data`.
+ unsafe fn load_unaligned(data: *const u8) -> Self;
+
+ /// Returns true if and only if this vector has zero in all of its lanes.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn is_zero(self) -> bool;
+
+ /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
+ /// vector and the one given, then lane `i` in the resulting vector is set
+ /// to `0xFF`. Otherwise, it is set to `0x00`.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn cmpeq(self, vector2: Self) -> Self;
+
+ /// Perform a bitwise 'and' of this vector and the one given and return
+ /// the result.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn and(self, vector2: Self) -> Self;
+
+ /// Perform a bitwise 'or' of this vector and the one given and return
+ /// the result.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn or(self, vector2: Self) -> Self;
+
+ /// Shift each 8-bit lane in this vector to the right by the number of
+ /// bits indicated by the `BITS` parameter.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;
+
+ /// Shift this vector to the left by one byte and shift the most
+ /// significant byte of `vector2` into the least significant position of
+ /// this vector.
+ ///
+ /// Stated differently, this behaves as if `self` and `vector2` were
+ /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
+ /// right by `Self::BYTES - 1` bytes.
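+ ///
+ /// For example, with 16-byte vectors written least significant lane
+ /// first, if `self = [x0, x1, ..., x15]` and `vector2 = [y0, y1, ...,
+ /// y15]`, then the result is `[y15, x0, x1, ..., x14]`.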
+ ///
+ /// With respect to the Teddy algorithm, `vector2` is usually a previous
+ /// `Self::BYTES` chunk from the haystack and `self` is the chunk
+ /// immediately following it. This permits combining the last byte
+ /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
+ /// bytes from the current chunk. This permits aligning the result of
+ /// various shuffles so that they can be and-ed together and a possible
+ /// candidate discovered.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;
+
+ /// Shift this vector to the left by two bytes and shift the two most
+ /// significant bytes of `vector2` into the least significant position of
+ /// this vector.
+ ///
+ /// Stated differently, this behaves as if `self` and `vector2` were
+ /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
+ /// right by `Self::BYTES - 2` bytes.
+ ///
+ /// With respect to the Teddy algorithm, `vector2` is usually a previous
+ /// `Self::BYTES` chunk from the haystack and `self` is the chunk
+ /// immediately following it. This permits combining the last two bytes
+ /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
+ /// bytes from the current chunk. This permits aligning the result of
+ /// various shuffles so that they can be and-ed together and a possible
+ /// candidate discovered.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;
+
+ /// Shift this vector to the left by three bytes and shift the three most
+ /// significant bytes of `vector2` into the least significant position of
+ /// this vector.
+ ///
+ /// Stated differently, this behaves as if `self` and `vector2` were
+ /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
+ /// right by `Self::BYTES - 3` bytes.
+ ///
+ /// With respect to the Teddy algorithm, `vector2` is usually a previous
+ /// `Self::BYTES` chunk from the haystack and `self` is the chunk
+ /// immediately following it. This permits combining the last three bytes
+ /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
+ /// bytes from the current chunk. This permits aligning the result of
+ /// various shuffles so that they can be and-ed together and a possible
+ /// candidate discovered.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;
+
+ /// Shuffles the bytes in this vector according to the indices in each of
+ /// the corresponding lanes in `indices`.
+ ///
+ /// If `i` is the index of corresponding lanes, `A` is this vector, `B` is
+ /// indices and `C` is the resulting vector, then `C[i] = A[B[i]]`.
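+ /// For example, looking at just four lanes, if `A = [10, 20, 30, 40]` and
+ /// `B = [3, 3, 0, 1]`, then `C = [40, 40, 10, 20]`.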
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn shuffle_bytes(self, indices: Self) -> Self;
+
+ /// Call the provided function for each 64-bit lane in this vector. The
+ /// given function is provided the lane index and lane value as a `u64`.
+ ///
+ /// If `f` returns `Some`, then iteration over the lanes is stopped and the
+ /// value is returned. Otherwise, this returns `None`.
+ ///
+ /// # Notes
+ ///
+ /// Conceptually it would be nice if we could have an
+ /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
+ /// tricky given Rust's [current support for const generics][support].
+ /// And even if we could, it would be tricky to write generic code over
+ /// it. (Not impossible. We could introduce another layer that requires
+ /// `AsRef<[u64]>` or something.)
+ ///
+ /// [support]: https://github.com/rust-lang/rust/issues/60551
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn for_each_64bit_lane<T>(
+ self,
+ f: impl FnMut(usize, u64) -> Option<T>,
+ ) -> Option<T>;
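+
+ // Illustration only: for a 16-byte vector loaded from the bytes
+ // [0x01, 0x02, ..., 0x10], the two lanes handed to `f` are expected to be
+ // 0x0807060504030201 and 0x100F0E0D0C0B0A09 (little-endian), as in the
+ // unit tests at the bottom of this file.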
+}
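+
+// A minimal sketch (not part of the crate's API, added purely for
+// illustration) of the intended call pattern for `Vector`: load a chunk of
+// haystack, do some SIMD comparisons and then check whether anything
+// matched. The helper name `chunk_has_byte` is made up here and is not used
+// anywhere else.
+#[allow(dead_code)]
+#[inline(always)]
+unsafe fn chunk_has_byte<V: Vector>(chunk: *const u8, needle: u8) -> bool {
+ // SAFETY: per the `Vector` contract, the caller must ensure that the
+ // required target feature is enabled and that at least `V::BYTES` bytes
+ // are readable from `chunk`.
+ let haystack = V::load_unaligned(chunk);
+ let eq = haystack.cmpeq(V::splat(needle));
+ !eq.is_zero()
+}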
+
+/// This trait extends the `Vector` trait with additional operations to support
+/// Fat Teddy.
+///
+/// Fat Teddy uses 16 buckets instead of 8, but reads half a vector's worth
+/// of bytes per iteration instead of a full vector's worth. For example,
+/// when using a 256-bit vector, Slim Teddy reads 32 bytes at a time but Fat
+/// Teddy reads 16 bytes at a time.
+///
+/// Fat Teddy is useful when searching for a large number of literals.
+/// The extra number of buckets spreads the literals out more and reduces
+/// verification time.
+///
+/// Currently we only implement this for AVX2 on x86_64. It would be nice to
+/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
+/// only reading 8 bytes at a time. It's not clear how well it would work, and
+/// there are some tricky things to figure out in terms of implementation. The
+/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
+/// the trickiest of the bunch. For AVX2, these are implemented by taking
+/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
+/// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8`
+/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't
+/// do a careful survey of NEON to see if it could easily support these
+/// operations.
+pub(crate) trait FatVector: Vector {
+ type Half: Vector;
+
+ /// Read a half-vector-size number of bytes from the given pointer, and
+ /// broadcast it across both halves of a full vector. The pointer does not
+ /// need to be aligned.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ ///
+ /// Callers must guarantee that at least `Self::Half::BYTES` bytes are
+ /// readable from `data`.
+ unsafe fn load_half_unaligned(data: *const u8) -> Self;
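+
+ // Illustration only, assuming `Self::BYTES == 32`: loading the 16 bytes
+ // [1, 2, ..., 16] with `load_half_unaligned` is expected to yield a vector
+ // whose two 128-bit halves both contain [1, 2, ..., 16], which is what the
+ // AVX2 implementation below does via `_mm256_broadcastsi128_si256`.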
+
+ /// Like `Vector::shift_in_one_byte`, except this is done for each half
+ /// of the vector instead.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;
+
+ /// Like `Vector::shift_in_two_bytes`, except this is done for each half
+ /// of the vector instead.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;
+
+ /// Like `Vector::shift_in_three_bytes`, except this is done for each half
+ /// of the vector instead.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;
+
+ /// Swap the 128-bit lanes in this vector.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn swap_halves(self) -> Self;
+
+ /// Unpack and interleave the 8-bit lanes from the low 128 bits of each
+ /// vector and return the result.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;
+
+ /// Unpack and interleave the 8-bit lanes from the high 128 bits of each
+ /// vector and return the result.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;
+
+ /// Call the provided function for each 64-bit lane in the lower half
+ /// of this vector and then in the other vector. The given function is
+ /// provided the lane index and lane value as a `u64`. (The high 128-bits
+ /// of each vector are ignored.)
+ ///
+ /// If `f` returns `Some`, then iteration over the lanes is stopped and the
+ /// value is returned. Otherwise, this returns `None`.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that this is okay to call in the current target for
+ /// the current CPU.
+ unsafe fn for_each_low_64bit_lane<T>(
+ self,
+ vector2: Self,
+ f: impl FnMut(usize, u64) -> Option<T>,
+ ) -> Option<T>;
+}
+
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+mod x86_64_ssse3 {
+ use core::arch::x86_64::*;
+
+ use crate::util::int::{I32, I64, I8};
+
+ use super::Vector;
+
+ impl Vector for __m128i {
+ const BITS: usize = 128;
+ const BYTES: usize = 16;
+
+ #[inline(always)]
+ unsafe fn splat(byte: u8) -> __m128i {
+ _mm_set1_epi8(i8::from_bits(byte))
+ }
+
+ #[inline(always)]
+ unsafe fn load_unaligned(data: *const u8) -> __m128i {
+ _mm_loadu_si128(data.cast::<__m128i>())
+ }
+
+ #[inline(always)]
+ unsafe fn is_zero(self) -> bool {
+ let cmp = self.cmpeq(Self::splat(0));
+ _mm_movemask_epi8(cmp).to_bits() == 0xFFFF
+ }
+
+ #[inline(always)]
+ unsafe fn cmpeq(self, vector2: Self) -> __m128i {
+ _mm_cmpeq_epi8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn and(self, vector2: Self) -> __m128i {
+ _mm_and_si128(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn or(self, vector2: Self) -> __m128i {
+ _mm_or_si128(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
+ // Apparently there is no _mm_srli_epi8, so we emulate it by
+ // shifting 16-bit integers and masking out the high nybble of each
+ // 8-bit lane (since that nybble will contain bits from the low
+ // nybble of the previous lane).
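+ //
+ // A worked example of the emulation (illustration only): for `BITS = 4`,
+ // a 16-bit lane holding the bytes [0xCD, 0xAB] (the 16-bit value 0xABCD)
+ // becomes 0x0ABC after the shift, i.e. the bytes [0xBC, 0x0A]. The high
+ // nybble of the low byte (0xB) leaked in from the neighboring byte, and
+ // masking with 0xF yields [0x0C, 0x0A], which matches shifting each byte
+ // right by 4 on its own.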
+ let lomask = Self::splat(0xF);
+ _mm_srli_epi16(self, BITS).and(lomask)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
+ _mm_alignr_epi8(self, vector2, 15)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
+ _mm_alignr_epi8(self, vector2, 14)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
+ _mm_alignr_epi8(self, vector2, 13)
+ }
+
+ #[inline(always)]
+ unsafe fn shuffle_bytes(self, indices: Self) -> Self {
+ _mm_shuffle_epi8(self, indices)
+ }
+
+ #[inline(always)]
+ unsafe fn for_each_64bit_lane<T>(
+ self,
+ mut f: impl FnMut(usize, u64) -> Option<T>,
+ ) -> Option<T> {
+ let lane = _mm_extract_epi64(self, 0).to_bits();
+ if let Some(t) = f(0, lane) {
+ return Some(t);
+ }
+ let lane = _mm_extract_epi64(self, 1).to_bits();
+ if let Some(t) = f(1, lane) {
+ return Some(t);
+ }
+ None
+ }
+ }
+}
+
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+mod x86_64_avx2 {
+ use core::arch::x86_64::*;
+
+ use crate::util::int::{I32, I64, I8};
+
+ use super::{FatVector, Vector};
+
+ impl Vector for __m256i {
+ const BITS: usize = 256;
+ const BYTES: usize = 32;
+
+ #[inline(always)]
+ unsafe fn splat(byte: u8) -> __m256i {
+ _mm256_set1_epi8(i8::from_bits(byte))
+ }
+
+ #[inline(always)]
+ unsafe fn load_unaligned(data: *const u8) -> __m256i {
+ _mm256_loadu_si256(data.cast::<__m256i>())
+ }
+
+ #[inline(always)]
+ unsafe fn is_zero(self) -> bool {
+ let cmp = self.cmpeq(Self::splat(0));
+ _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF
+ }
+
+ #[inline(always)]
+ unsafe fn cmpeq(self, vector2: Self) -> __m256i {
+ _mm256_cmpeq_epi8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn and(self, vector2: Self) -> __m256i {
+ _mm256_and_si256(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn or(self, vector2: Self) -> __m256i {
+ _mm256_or_si256(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
+ let lomask = Self::splat(0xF);
+ _mm256_srli_epi16(self, BITS).and(lomask)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
+ // Credit goes to jneem for figuring this out:
+ // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
+ //
+ // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
+ // PALIGNR instructions, which is not what we want, so we need to
+ // do some extra shuffling.
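+ //
+ // Sketch of why this works, going by the intrinsic definitions:
+ // `_mm256_permute2x128_si256(vector2, self, 0x21)` selects the high
+ // 128 bits of `vector2` as the low 128-bit lane of `v` and the low
+ // 128 bits of `self` as its high lane. The per-lane `alignr` below then
+ // shifts in bytes from the right place: the low lane of the result gets
+ // the last byte of `vector2`, and the high lane gets the last byte of
+ // `self`'s low lane.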
+ let v = _mm256_permute2x128_si256(vector2, self, 0x21);
+ _mm256_alignr_epi8(self, v, 15)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
+ // Credit goes to jneem for figuring this out:
+ // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
+ //
+ // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
+ // PALIGNR instructions, which is not what we want, so we need to
+ // do some extra shuffling.
+ let v = _mm256_permute2x128_si256(vector2, self, 0x21);
+ _mm256_alignr_epi8(self, v, 14)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
+ // Credit goes to jneem for figuring this out:
+ // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
+ //
+ // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
+ // PALIGNR instructions, which is not what we want, so we need to
+ // do some extra shuffling.
+ let v = _mm256_permute2x128_si256(vector2, self, 0x21);
+ _mm256_alignr_epi8(self, v, 13)
+ }
+
+ #[inline(always)]
+ unsafe fn shuffle_bytes(self, indices: Self) -> Self {
+ _mm256_shuffle_epi8(self, indices)
+ }
+
+ #[inline(always)]
+ unsafe fn for_each_64bit_lane<T>(
+ self,
+ mut f: impl FnMut(usize, u64) -> Option<T>,
+ ) -> Option<T> {
+ // NOTE: At one point in the past, I used a transmute here to
+ // get a [u64; 4], but it turned out to lead to worse codegen IIRC.
+ // I've tried it more recently, and it looks like that's no longer
+ // the case. But since there's no difference, we stick with the
+ // slightly more complicated but transmute-free version.
+ let lane = _mm256_extract_epi64(self, 0).to_bits();
+ if let Some(t) = f(0, lane) {
+ return Some(t);
+ }
+ let lane = _mm256_extract_epi64(self, 1).to_bits();
+ if let Some(t) = f(1, lane) {
+ return Some(t);
+ }
+ let lane = _mm256_extract_epi64(self, 2).to_bits();
+ if let Some(t) = f(2, lane) {
+ return Some(t);
+ }
+ let lane = _mm256_extract_epi64(self, 3).to_bits();
+ if let Some(t) = f(3, lane) {
+ return Some(t);
+ }
+ None
+ }
+ }
+
+ impl FatVector for __m256i {
+ type Half = __m128i;
+
+ #[inline(always)]
+ unsafe fn load_half_unaligned(data: *const u8) -> Self {
+ let half = Self::Half::load_unaligned(data);
+ _mm256_broadcastsi128_si256(half)
+ }
+
+ #[inline(always)]
+ unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
+ _mm256_alignr_epi8(self, vector2, 15)
+ }
+
+ #[inline(always)]
+ unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
+ _mm256_alignr_epi8(self, vector2, 14)
+ }
+
+ #[inline(always)]
+ unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
+ _mm256_alignr_epi8(self, vector2, 13)
+ }
+
+ #[inline(always)]
+ unsafe fn swap_halves(self) -> Self {
+ _mm256_permute4x64_epi64(self, 0x4E)
+ }
+
+ #[inline(always)]
+ unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
+ _mm256_unpacklo_epi8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
+ _mm256_unpackhi_epi8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn for_each_low_64bit_lane<T>(
+ self,
+ vector2: Self,
+ mut f: impl FnMut(usize, u64) -> Option<T>,
+ ) -> Option<T> {
+ let lane = _mm256_extract_epi64(self, 0).to_bits();
+ if let Some(t) = f(0, lane) {
+ return Some(t);
+ }
+ let lane = _mm256_extract_epi64(self, 1).to_bits();
+ if let Some(t) = f(1, lane) {
+ return Some(t);
+ }
+ let lane = _mm256_extract_epi64(vector2, 0).to_bits();
+ if let Some(t) = f(2, lane) {
+ return Some(t);
+ }
+ let lane = _mm256_extract_epi64(vector2, 1).to_bits();
+ if let Some(t) = f(3, lane) {
+ return Some(t);
+ }
+ None
+ }
+ }
+}
+
+#[cfg(target_arch = "aarch64")]
+mod aarch64_neon {
+ use core::arch::aarch64::*;
+
+ use super::Vector;
+
+ impl Vector for uint8x16_t {
+ const BITS: usize = 128;
+ const BYTES: usize = 16;
+
+ #[inline(always)]
+ unsafe fn splat(byte: u8) -> uint8x16_t {
+ vdupq_n_u8(byte)
+ }
+
+ #[inline(always)]
+ unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
+ vld1q_u8(data)
+ }
+
+ #[inline(always)]
+ unsafe fn is_zero(self) -> bool {
+ // Could also use vmaxvq_u8.
+ // ... I tried that and couldn't observe any meaningful difference
+ // in benchmarks.
+ let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
+ vgetq_lane_u64(maxes, 0) == 0
+ }
+
+ #[inline(always)]
+ unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
+ vceqq_u8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn and(self, vector2: Self) -> uint8x16_t {
+ vandq_u8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn or(self, vector2: Self) -> uint8x16_t {
+ vorrq_u8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
+ debug_assert!(BITS <= 7);
+ vshrq_n_u8(self, BITS)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
+ vextq_u8(vector2, self, 15)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
+ vextq_u8(vector2, self, 14)
+ }
+
+ #[inline(always)]
+ unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
+ vextq_u8(vector2, self, 13)
+ }
+
+ #[inline(always)]
+ unsafe fn shuffle_bytes(self, indices: Self) -> Self {
+ vqtbl1q_u8(self, indices)
+ }
+
+ #[inline(always)]
+ unsafe fn for_each_64bit_lane<T>(
+ self,
+ mut f: impl FnMut(usize, u64) -> Option<T>,
+ ) -> Option<T> {
+ let this = vreinterpretq_u64_u8(self);
+ let lane = vgetq_lane_u64(this, 0);
+ if let Some(t) = f(0, lane) {
+ return Some(t);
+ }
+ let lane = vgetq_lane_u64(this, 1);
+ if let Some(t) = f(1, lane) {
+ return Some(t);
+ }
+ None
+ }
+ }
+}
+
+#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
+mod tests_x86_64_ssse3 {
+ use core::arch::x86_64::*;
+
+ use crate::util::int::{I32, U32};
+
+ use super::*;
+
+ fn is_runnable() -> bool {
+ std::is_x86_feature_detected!("ssse3")
+ }
+
+ #[target_feature(enable = "ssse3")]
+ unsafe fn load(lanes: [u8; 16]) -> __m128i {
+ __m128i::load_unaligned(&lanes as *const u8)
+ }
+
+ #[target_feature(enable = "ssse3")]
+ unsafe fn unload(v: __m128i) -> [u8; 16] {
+ [
+ _mm_extract_epi8(v, 0).to_bits().low_u8(),
+ _mm_extract_epi8(v, 1).to_bits().low_u8(),
+ _mm_extract_epi8(v, 2).to_bits().low_u8(),
+ _mm_extract_epi8(v, 3).to_bits().low_u8(),
+ _mm_extract_epi8(v, 4).to_bits().low_u8(),
+ _mm_extract_epi8(v, 5).to_bits().low_u8(),
+ _mm_extract_epi8(v, 6).to_bits().low_u8(),
+ _mm_extract_epi8(v, 7).to_bits().low_u8(),
+ _mm_extract_epi8(v, 8).to_bits().low_u8(),
+ _mm_extract_epi8(v, 9).to_bits().low_u8(),
+ _mm_extract_epi8(v, 10).to_bits().low_u8(),
+ _mm_extract_epi8(v, 11).to_bits().low_u8(),
+ _mm_extract_epi8(v, 12).to_bits().low_u8(),
+ _mm_extract_epi8(v, 13).to_bits().low_u8(),
+ _mm_extract_epi8(v, 14).to_bits().low_u8(),
+ _mm_extract_epi8(v, 15).to_bits().low_u8(),
+ ]
+ }
+
+ #[test]
+ fn vector_splat() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v = __m128i::splat(0xAF);
+ assert_eq!(
+ unload(v),
+ [
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_is_zero() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert!(!v.is_zero());
+ let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert!(v.is_zero());
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_cmpeq() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
+ let v2 =
+ load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
+ assert_eq!(
+ unload(v1.cmpeq(v2)),
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_and() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ let v2 =
+ load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert_eq!(
+ unload(v1.and(v2)),
+ [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_or() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ let v2 =
+ load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert_eq!(
+ unload(v1.or(v2)),
+ [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_8bit_lane_right() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v = load([
+ 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert_eq!(
+ unload(v.shift_8bit_lane_right::<2>()),
+ [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_one_byte() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_one_byte(v2)),
+ [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_two_bytes() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_two_bytes(v2)),
+ [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_three_bytes() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_three_bytes(v2)),
+ [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shuffle_bytes() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 =
+ load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
+ assert_eq!(
+ unload(v1.shuffle_bytes(v2)),
+ [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_for_each_64bit_lane() {
+ #[target_feature(enable = "ssse3")]
+ unsafe fn test() {
+ let v = load([
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+ ]);
+ let mut lanes = [0u64; 2];
+ v.for_each_64bit_lane(|i, lane| {
+ lanes[i] = lane;
+ None::<()>
+ });
+ assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+}
+
+#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
+mod tests_x86_64_avx2 {
+ use core::arch::x86_64::*;
+
+ use crate::util::int::{I32, U32};
+
+ use super::*;
+
+ fn is_runnable() -> bool {
+ std::is_x86_feature_detected!("avx2")
+ }
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn load(lanes: [u8; 32]) -> __m256i {
+ __m256i::load_unaligned(&lanes as *const u8)
+ }
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn load_half(lanes: [u8; 16]) -> __m256i {
+ __m256i::load_half_unaligned(&lanes as *const u8)
+ }
+
+ #[target_feature(enable = "avx2")]
+ unsafe fn unload(v: __m256i) -> [u8; 32] {
+ [
+ _mm256_extract_epi8(v, 0).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 1).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 2).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 3).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 4).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 5).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 6).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 7).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 8).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 9).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 10).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 11).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 12).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 13).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 14).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 15).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 16).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 17).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 18).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 19).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 20).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 21).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 22).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 23).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 24).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 25).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 26).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 27).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 28).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 29).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 30).to_bits().low_u8(),
+ _mm256_extract_epi8(v, 31).to_bits().low_u8(),
+ ]
+ }
+
+ #[test]
+ fn vector_splat() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v = __m256i::splat(0xAF);
+ assert_eq!(
+ unload(v),
+ [
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_is_zero() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v = load([
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert!(!v.is_zero());
+ let v = load([
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert!(v.is_zero());
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_cmpeq() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
+ ]);
+ let v2 = load([
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+ 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ ]);
+ assert_eq!(
+ unload(v1.cmpeq(v2)),
+ [
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_and() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ let v2 = load([
+ 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert_eq!(
+ unload(v1.and(v2)),
+ [
+ 0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_or() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ let v2 = load([
+ 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert_eq!(
+ unload(v1.or(v2)),
+ [
+ 0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_8bit_lane_right() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v = load([
+ 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert_eq!(
+ unload(v.shift_8bit_lane_right::<2>()),
+ [
+ 0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_one_byte() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let v2 = load([
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_one_byte(v2)),
+ [
+ 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_two_bytes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let v2 = load([
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_two_bytes(v2)),
+ [
+ 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_three_bytes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let v2 = load([
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_three_bytes(v2)),
+ [
+ 62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shuffle_bytes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let v2 = load([
+ 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16,
+ 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
+ ]);
+ assert_eq!(
+ unload(v1.shuffle_bytes(v2)),
+ [
+ 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17,
+ 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29,
+ 29
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_for_each_64bit_lane() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v = load([
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
+ 0x1F, 0x20,
+ ]);
+ let mut lanes = [0u64; 4];
+ v.for_each_64bit_lane(|i, lane| {
+ lanes[i] = lane;
+ None::<()>
+ });
+ assert_eq!(
+ lanes,
+ [
+ 0x0807060504030201,
+ 0x100F0E0D0C0B0A09,
+ 0x1817161514131211,
+ 0x201F1E1D1C1B1A19
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_half_shift_in_one_byte() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load_half([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ ]);
+ let v2 = load_half([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.half_shift_in_one_byte(v2)),
+ [
+ 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_half_shift_in_two_bytes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load_half([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ ]);
+ let v2 = load_half([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.half_shift_in_two_bytes(v2)),
+ [
+ 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31,
+ 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_half_shift_in_three_bytes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load_half([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ ]);
+ let v2 = load_half([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.half_shift_in_three_bytes(v2)),
+ [
+ 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30,
+ 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_swap_halves() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v.swap_halves()),
+ [
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_interleave_low_8bit_lanes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let v2 = load([
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64,
+ ]);
+ assert_eq!(
+ unload(v1.interleave_low_8bit_lanes(v2)),
+ [
+ 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
+ 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+ 24, 56,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_interleave_high_8bit_lanes() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let v2 = load([
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64,
+ ]);
+ assert_eq!(
+ unload(v1.interleave_high_8bit_lanes(v2)),
+ [
+ 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16,
+ 48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31,
+ 63, 32, 64,
+ ],
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn fat_vector_for_each_low_64bit_lane() {
+ #[target_feature(enable = "avx2")]
+ unsafe fn test() {
+ let v1 = load([
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
+ 0x1F, 0x20,
+ ]);
+ let v2 = load([
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A,
+ 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34,
+ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
+ 0x3F, 0x40,
+ ]);
+ let mut lanes = [0u64; 4];
+ v1.for_each_low_64bit_lane(v2, |i, lane| {
+ lanes[i] = lane;
+ None::<()>
+ });
+ assert_eq!(
+ lanes,
+ [
+ 0x0807060504030201,
+ 0x100F0E0D0C0B0A09,
+ 0x2827262524232221,
+ 0x302F2E2D2C2B2A29
+ ]
+ );
+ }
+ if !is_runnable() {
+ return;
+ }
+ unsafe { test() }
+ }
+}
+
+#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
+mod tests_aarch64_neon {
+ use core::arch::aarch64::*;
+
+ use super::*;
+
+ #[target_feature(enable = "neon")]
+ unsafe fn load(lanes: [u8; 16]) -> uint8x16_t {
+ uint8x16_t::load_unaligned(&lanes as *const u8)
+ }
+
+ #[target_feature(enable = "neon")]
+ unsafe fn unload(v: uint8x16_t) -> [u8; 16] {
+ [
+ vgetq_lane_u8(v, 0),
+ vgetq_lane_u8(v, 1),
+ vgetq_lane_u8(v, 2),
+ vgetq_lane_u8(v, 3),
+ vgetq_lane_u8(v, 4),
+ vgetq_lane_u8(v, 5),
+ vgetq_lane_u8(v, 6),
+ vgetq_lane_u8(v, 7),
+ vgetq_lane_u8(v, 8),
+ vgetq_lane_u8(v, 9),
+ vgetq_lane_u8(v, 10),
+ vgetq_lane_u8(v, 11),
+ vgetq_lane_u8(v, 12),
+ vgetq_lane_u8(v, 13),
+ vgetq_lane_u8(v, 14),
+ vgetq_lane_u8(v, 15),
+ ]
+ }
+
+ // Example functions. These don't test the Vector traits, but rather,
+ // specific NEON instructions. They are basically little experiments I
+ // wrote to figure out what an instruction does since their descriptions
+ // are so dense. I decided to keep the experiments around as example tests
+ // in case they're useful.
+
+ #[test]
+ fn example_vmaxvq_u8_non_zero() {
+ #[target_feature(enable = "neon")]
+ unsafe fn example() {
+ let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert_eq!(vmaxvq_u8(v), 1);
+ }
+ unsafe { example() }
+ }
+
+ #[test]
+ fn example_vmaxvq_u8_zero() {
+ #[target_feature(enable = "neon")]
+ unsafe fn example() {
+ let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert_eq!(vmaxvq_u8(v), 0);
+ }
+ unsafe { example() }
+ }
+
+ #[test]
+ fn example_vpmaxq_u8_non_zero() {
+ #[target_feature(enable = "neon")]
+ unsafe fn example() {
+ let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ let r = vpmaxq_u8(v, v);
+ assert_eq!(
+ unload(r),
+ [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
+ );
+ }
+ unsafe { example() }
+ }
+
+ #[test]
+ fn example_vpmaxq_u8_self() {
+ #[target_feature(enable = "neon")]
+ unsafe fn example() {
+ let v =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let r = vpmaxq_u8(v, v);
+ assert_eq!(
+ unload(r),
+ [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16]
+ );
+ }
+ unsafe { example() }
+ }
+
+ #[test]
+ fn example_vpmaxq_u8_other() {
+ #[target_feature(enable = "neon")]
+ unsafe fn example() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ let r = vpmaxq_u8(v1, v2);
+ assert_eq!(
+ unload(r),
+ [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]
+ );
+ }
+ unsafe { example() }
+ }
+
+ // Now we test the actual methods on the Vector trait.
+
+ #[test]
+ fn vector_splat() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v = uint8x16_t::splat(0xAF);
+ assert_eq!(
+ unload(v),
+ [
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
+ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
+ ]
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_is_zero() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert!(!v.is_zero());
+ let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert!(v.is_zero());
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_cmpeq() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
+ let v2 =
+ load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
+ assert_eq!(
+ unload(v1.cmpeq(v2)),
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_and() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ let v2 =
+ load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert_eq!(
+ unload(v1.and(v2)),
+ [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_or() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ let v2 =
+ load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+ assert_eq!(
+ unload(v1.or(v2)),
+ [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_8bit_lane_right() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v = load([
+ 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ]);
+ assert_eq!(
+ unload(v.shift_8bit_lane_right::<2>()),
+ [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_one_byte() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_one_byte(v2)),
+ [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_two_bytes() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_two_bytes(v2)),
+ [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shift_in_three_bytes() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 = load([
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ]);
+ assert_eq!(
+ unload(v1.shift_in_three_bytes(v2)),
+ [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_shuffle_bytes() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v1 =
+ load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
+ let v2 =
+ load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
+ assert_eq!(
+ unload(v1.shuffle_bytes(v2)),
+ [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
+ );
+ }
+ unsafe { test() }
+ }
+
+ #[test]
+ fn vector_for_each_64bit_lane() {
+ #[target_feature(enable = "neon")]
+ unsafe fn test() {
+ let v = load([
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+ ]);
+ let mut lanes = [0u64; 2];
+ v.for_each_64bit_lane(|i, lane| {
+ lanes[i] = lane;
+ None::<()>
+ });
+ assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
+ }
+ unsafe { test() }
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/tests.rs b/third_party/rust/aho-corasick/src/tests.rs
new file mode 100644
index 0000000000..a5276f85f6
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/tests.rs
@@ -0,0 +1,1664 @@
+use std::{collections::HashMap, format, string::String, vec::Vec};
+
+use crate::{
+ AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, Anchored, Input, Match,
+ MatchKind, StartKind,
+};
+
+/// A description of a single test against an Aho-Corasick automaton.
+///
+/// A single test may not necessarily pass on every configuration of an
+/// Aho-Corasick automaton. The tests are categorized and grouped appropriately
+/// below.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct SearchTest {
+ /// The name of this test, for debugging.
+ name: &'static str,
+ /// The patterns to search for.
+ patterns: &'static [&'static str],
+ /// The text to search.
+ haystack: &'static str,
+ /// Each match is a triple of (pattern_index, start, end), where
+ /// pattern_index is an index into `patterns` and `start`/`end` are indices
+ /// into `haystack`.
+ matches: &'static [(usize, usize, usize)],
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+ ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+ SearchTest {
+ name: stringify!($name),
+ patterns: $patterns,
+ haystack: $haystack,
+ matches: $matches,
+ }
+ };
+}
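+
+// For example, `t!(basic010, &["a"], "a", &[(0, 0, 1)])` (see `basic010`
+// below) expands to `SearchTest { name: "basic010", patterns: &["a"],
+// haystack: "a", matches: &[(0, 0, 1)] }`: searching the haystack "a" for
+// the single pattern "a" should report one match of pattern 0 spanning
+// bytes 0..1.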
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported by Aho-Corasick. These collections have some overlap,
+// but each collection should have some tests that no other collection has.
+
+/// Tests for Aho-Corasick's standard non-overlapping match semantics.
+const AC_STANDARD_NON_OVERLAPPING: TestCollection =
+ &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics.
+const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection =
+ &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED];
+
+/// Tests for Aho-Corasick's standard overlapping match semantics.
+const AC_STANDARD_OVERLAPPING: TestCollection =
+ &[BASICS, OVERLAPPING, REGRESSION];
+
+/*
+Iterators of anchored overlapping searches were removed from the API after
+0.7, but we leave the tests commented out for posterity.
+/// Tests for Aho-Corasick's anchored standard overlapping match semantics.
+const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection =
+ &[ANCHORED_BASICS, ANCHORED_OVERLAPPING];
+*/
+
+/// Tests for Aho-Corasick's leftmost-first match semantics.
+const AC_LEFTMOST_FIRST: TestCollection =
+ &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored leftmost-first match semantics.
+const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[
+ ANCHORED_BASICS,
+ ANCHORED_NON_OVERLAPPING,
+ ANCHORED_LEFTMOST,
+ ANCHORED_LEFTMOST_FIRST,
+];
+
+/// Tests for Aho-Corasick's leftmost-longest match semantics.
+const AC_LEFTMOST_LONGEST: TestCollection =
+ &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION];
+
+/// Tests for Aho-Corasick's anchored leftmost-longest match semantics.
+const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[
+ ANCHORED_BASICS,
+ ANCHORED_NON_OVERLAPPING,
+ ANCHORED_LEFTMOST,
+ ANCHORED_LEFTMOST_LONGEST,
+];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests for the Aho-Corasick algorithm that should always be
+/// true regardless of match semantics. That is, all combinations of
+/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping}
+/// should produce the same answer.
+const BASICS: &'static [SearchTest] = &[
+ t!(basic000, &[], "", &[]),
+ t!(basic001, &[""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(basic002, &["a"], "", &[]),
+ t!(basic010, &["a"], "a", &[(0, 0, 1)]),
+ t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
+ t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
+ t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
+ t!(basic060, &["a"], "bbb", &[]),
+ t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
+ t!(basic100, &["aa"], "", &[]),
+ t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
+ t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
+ t!(basic130, &["aa"], "abbab", &[]),
+ t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
+ t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
+ t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+ t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
+ t!(basic300, &["a", "b"], "", &[]),
+ t!(basic310, &["a", "b"], "z", &[]),
+ t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
+ t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
+ t!(
+ basic340,
+ &["a", "b"],
+ "abba",
+ &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
+ ),
+ t!(
+ basic350,
+ &["b", "a"],
+ "abba",
+ &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
+ ),
+ t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
+ t!(basic400, &["foo", "bar"], "", &[]),
+ t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
+ t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
+ t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
+ t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
+ t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
+ t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
+ t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
+ t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
+ t!(basic600, &[""], "", &[(0, 0, 0)]),
+ t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]),
+ t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
+ t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
+ t!(
+ basic720,
+ &["yabcdef", "bcdeyabc", "abcdezghi"],
+ "yabcdezghi",
+ &[(2, 1, 10),]
+ ),
+];
+
+/// A collection of *anchored* tests for the Aho-Corasick algorithm that should
+/// always be true regardless of match semantics. That is, all combinations of
+/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should
+/// produce the same answer.
+const ANCHORED_BASICS: &'static [SearchTest] = &[
+ t!(abasic000, &[], "", &[]),
+ t!(abasic001, &[], "a", &[]),
+ t!(abasic002, &[], "abc", &[]),
+ t!(abasic010, &[""], "", &[(0, 0, 0)]),
+ t!(abasic020, &[""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(abasic030, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]),
+ t!(abasic100, &["a"], "a", &[(0, 0, 1)]),
+ t!(abasic110, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1), (1, 1, 2)]),
+ t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1), (0, 1, 2)]),
+ t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]),
+ t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]),
+ t!(abasic200, &["foo"], "foofoo foo", &[(0, 0, 3), (0, 3, 6)]),
+];
+
+/// Tests for non-overlapping standard match semantics.
+///
+/// These tests generally shouldn't pass for leftmost-{first,longest}, although
+/// some do in order to write clearer tests. For example, standard000 will
+/// pass with leftmost-first semantics, but standard010 will not. We write
+/// both to emphasize how the match semantics work.
+const STANDARD: &'static [SearchTest] = &[
+ t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]),
+ t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]),
+ t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]),
+ t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]),
+ t!(
+ standard400,
+ &["abcd", "bcd", "cd", "b"],
+ "abcd",
+ &[(3, 1, 2), (2, 2, 4),]
+ ),
+ t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]),
+ t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]),
+ t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]),
+];
+
+/// Like STANDARD, but for anchored searches.
+const STANDARD_ANCHORED: &'static [SearchTest] = &[
+ t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]),
+ t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]),
+ t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]),
+ t!(astandard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]),
+ t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(astandard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(astandard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2)]),
+ t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1)]),
+ t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+];
+
+/// Tests for non-overlapping leftmost match semantics. These should pass for
+/// both leftmost-first and leftmost-longest match kinds. Stated differently,
+/// among ambiguous matches, the longest match and the match that appeared
+/// first when constructing the automaton should always be the same.
+const LEFTMOST: &'static [SearchTest] = &[
+ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftmost010, &["a", ""], "a", &[(0, 0, 1)]),
+ t!(leftmost011, &["a", ""], "ab", &[(0, 0, 1), (1, 2, 2)]),
+ t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
+ t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
+ t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
+ t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
+ t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ leftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ leftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ leftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8), (0, 8, 9),]
+ ),
+];
+
+/// Like LEFTMOST, but for anchored searches.
+const ANCHORED_LEFTMOST: &'static [SearchTest] = &[
+ t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ // We shouldn't allow an empty match immediately following a match, right?
+ t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]),
+ t!(aleftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
+ t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]),
+ t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]),
+ t!(aleftmost301, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]),
+ t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]),
+ t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]),
+ t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
+ t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
+ t!(
+ aleftmost360,
+ &["abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ aleftmost370,
+ &["abcdefghi", "cde", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost380,
+ &["abcdefghi", "hz", "abcdefgh", "a"],
+ "abcdefghz",
+ &[(2, 0, 8),]
+ ),
+ t!(
+ aleftmost390,
+ &["b", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost400,
+ &["h", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(
+ aleftmost410,
+ &["z", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghzyz",
+ &[(3, 0, 8), (0, 8, 9)]
+ ),
+];
+
+/// Tests for non-overlapping leftmost-first match semantics. These tests
+/// should generally be specific to leftmost-first, which means they should
+/// generally fail under leftmost-longest semantics.
+const LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+ t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(leftfirst014, &["a", ""], "a", &[(0, 0, 1)]),
+ t!(leftfirst015, &["a", ""], "ab", &[(0, 0, 1), (1, 2, 2)]),
+ t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
+ t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
+ t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
+ t!(
+ leftfirst310,
+ &["abcd", "b", "bce", "ce"],
+ "abce",
+ &[(1, 1, 2), (3, 2, 4),]
+ ),
+ t!(
+ leftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1), (2, 7, 9),]
+ ),
+ t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
+ t!(leftfirst400, &["amwix", "samwise", "sam"], "Zsamwix", &[(2, 1, 4)]),
+];
+
+/// Like LEFTMOST_FIRST, but for anchored searches.
+const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[
+ t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
+ t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
+ t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
+ t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
+ t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]),
+ t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
+ t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
+ t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]),
+ t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]),
+ t!(
+ aleftfirst320,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(0, 0, 1)]
+ ),
+ t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]),
+ t!(aleftfirst400, &["wise", "samwise", "sam"], "samwix", &[(2, 0, 3)]),
+];
+
+/// Tests for non-overlapping leftmost-longest match semantics. These tests
+/// should generally be specific to leftmost-longest, which means they should
+/// generally fail under leftmost-first semantics.
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(leftlong020, &["", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1)]),
+ t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1)]),
+ t!(leftlong024, &["", "a"], "ab", &[(1, 0, 1), (0, 2, 2)]),
+ t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
+ t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
+ t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
+ t!(
+ leftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
+ t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
+];
+
+/// Like LEFTMOST_LONGEST, but for anchored searches.
+const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[
+ t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
+ t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
+ t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]),
+ t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]),
+ t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]),
+ t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]),
+ t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
+ t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
+ t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
+ t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
+ t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
+ t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
+ t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
+ t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]),
+ t!(
+ aleftlong310,
+ &["a", "abcdefghi", "hz", "abcdefgh"],
+ "abcdefghz",
+ &[(3, 0, 8),]
+ ),
+ t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
+ t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]),
+ t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]),
+];
+
+/// Tests for non-overlapping match semantics.
+///
+/// Generally these tests shouldn't pass when using overlapping semantics.
+/// These should pass for both standard and leftmost match semantics.
+const NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
+ t!(
+ nover100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
+ ),
+ t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
+ t!(nover300, &["", ""], "", &[(0, 0, 0),]),
+ t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
+];
+
+/// Like NON_OVERLAPPING, but for anchored searches.
+const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
+ t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
+ t!(anover030, &["abc", "bc"], "zazabcz", &[]),
+ t!(
+ anover100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (0, 2, 4), (0, 4, 6)]
+ ),
+ t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]),
+ t!(anover300, &["", ""], "", &[(0, 0, 0)]),
+ t!(anover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
+];
+
+/// Tests for overlapping match semantics.
+///
+/// This only supports standard match semantics, since leftmost-{first,longest}
+/// do not support overlapping matches.
+const OVERLAPPING: &'static [SearchTest] = &[
+ t!(
+ over000,
+ &["abcd", "bcd", "cd", "b"],
+ "abcd",
+ &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over010,
+ &["bcd", "cd", "b", "abcd"],
+ "abcd",
+ &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),]
+ ),
+ t!(
+ over020,
+ &["abcd", "bcd", "cd"],
+ "abcd",
+ &[(0, 0, 4), (1, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over030,
+ &["bcd", "abcd", "cd"],
+ "abcd",
+ &[(1, 0, 4), (0, 1, 4), (2, 2, 4),]
+ ),
+ t!(
+ over040,
+ &["bcd", "cd", "abcd"],
+ "abcd",
+ &[(2, 0, 4), (0, 1, 4), (1, 2, 4),]
+ ),
+ t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]),
+ t!(
+ over100,
+ &["ab", "ba"],
+ "abababa",
+ &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),]
+ ),
+ t!(
+ over200,
+ &["foo", "foo"],
+ "foobarfoo",
+ &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),]
+ ),
+ t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
+ t!(
+ over310,
+ &["", ""],
+ "a",
+ &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),]
+ ),
+ t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]),
+ t!(
+ over330,
+ &["", "a", ""],
+ "a",
+ &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),]
+ ),
+ t!(
+ over340,
+ &["a", "", ""],
+ "a",
+ &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),]
+ ),
+ t!(
+ over350,
+ &["", "", "a"],
+ "a",
+ &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),]
+ ),
+ t!(
+ over360,
+ &["foo", "foofoo"],
+ "foofoo",
+ &[(0, 0, 3), (1, 0, 6), (0, 3, 6)]
+ ),
+];
+
+/*
+Iterators of anchored overlapping searches were removed from the API after
+0.7, but we leave the tests commented out for posterity.
+/// Like OVERLAPPING, but for anchored searches.
+const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[
+ t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
+ t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]),
+ t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]),
+ t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]),
+ t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]),
+ t!(aover050, &["abc", "bc"], "zazabcz", &[]),
+ t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]),
+ t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]),
+ t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
+ t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]),
+ t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]),
+ t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]),
+ t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]),
+ t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]),
+ t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]),
+];
+*/
+
+/// Tests for ASCII case insensitivity.
+///
+/// These tests should all have the same behavior regardless of match semantics
+/// or whether the search is overlapping.
+const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[
+ t!(acasei000, &["a"], "A", &[(0, 0, 1)]),
+ t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]),
+ t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]),
+ t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]),
+];
+
+/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests.
+const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[
+ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]),
+    t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3)]),
+ t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]),
+];
+
+/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests.
+const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[
+ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
+ t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
+ // This is a regression test from:
+ // https://github.com/BurntSushi/aho-corasick/issues/68
+ // Previously, it was reporting a duplicate (1, 3, 6) match.
+ t!(
+ acasei010,
+ &["abc", "def", "abcdef"],
+ "abcdef",
+ &[(0, 0, 3), (2, 0, 6), (1, 3, 6)]
+ ),
+];
+
+/// Regression tests that are applied to all Aho-Corasick combinations.
+///
+/// If regression tests are needed for specific match semantics, then add them
+/// to the appropriate group above.
+const REGRESSION: &'static [SearchTest] = &[
+ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
+ t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
+ t!(
+ regression030,
+ &["libcore/", "libstd/"],
+ "libcore/char/methods.rs",
+ &[(0, 0, 8),]
+ ),
+ t!(
+ regression040,
+ &["libstd/", "libcore/"],
+ "libcore/char/methods.rs",
+ &[(1, 0, 8),]
+ ),
+ t!(
+ regression050,
+ &["\x00\x00\x01", "\x00\x00\x00"],
+ "\x00\x00\x00",
+ &[(1, 0, 3),]
+ ),
+ t!(
+ regression060,
+ &["\x00\x00\x00", "\x00\x00\x01"],
+ "\x00\x00\x00",
+ &[(0, 0, 3),]
+ ),
+];
+
+// Now define a test for each combination of things above that we want to run.
+// Since there are a few different combinations for each collection of tests,
+// we define a couple of macros to avoid repetition drudgery. The testconfig
+// macro constructs the automaton from a given match kind, and runs the search
+// tests one-by-one over the given collection. The `with` parameter allows one
+// to configure the builder with additional parameters. The testcombo macro
+// invokes testconfig in precisely this way: it sets up several tests where
+// each one turns a different knob on AhoCorasickBuilder.
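+//
+// For orientation (a sketch, not part of the vendored code), a single
+// expansion of `testconfig!(default, AC_LEFTMOST_LONGEST, LeftmostLongest,
+// |_| ())` looks roughly like this (the no-op `$with` closure is elided):
+//
+//     #[test]
+//     fn default() {
+//         run_search_tests(AC_LEFTMOST_LONGEST, |test| {
+//             let mut builder = AhoCorasick::builder();
+//             builder
+//                 .match_kind(MatchKind::LeftmostLongest)
+//                 .build(test.patterns)
+//                 .unwrap()
+//                 .find_iter(test.haystack)
+//                 .collect()
+//         });
+//     }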
+
+macro_rules! testconfig {
+ (anchored, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasick::builder();
+ $with(&mut builder);
+ let input = Input::new(test.haystack).anchored(Anchored::Yes);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .unwrap()
+ .try_find_iter(input)
+ .unwrap()
+ .collect()
+ });
+ }
+ };
+ (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasick::builder();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .unwrap()
+ .find_overlapping_iter(test.haystack)
+ .collect()
+ });
+ }
+ };
+ (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_stream_search_tests($collection, |test| {
+ let buf = std::io::BufReader::with_capacity(
+ 1,
+ test.haystack.as_bytes(),
+ );
+ let mut builder = AhoCorasick::builder();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .unwrap()
+ .stream_find_iter(buf)
+ .map(|result| result.unwrap())
+ .collect()
+ });
+ }
+ };
+ ($name:ident, $collection:expr, $kind:ident, $with:expr) => {
+ #[test]
+ fn $name() {
+ run_search_tests($collection, |test| {
+ let mut builder = AhoCorasick::builder();
+ $with(&mut builder);
+ builder
+ .match_kind(MatchKind::$kind)
+ .build(test.patterns)
+ .unwrap()
+ .find_iter(test.haystack)
+ .collect()
+ });
+ }
+ };
+}
+
+macro_rules! testcombo {
+ ($name:ident, $collection:expr, $kind:ident) => {
+ mod $name {
+ use super::*;
+
+ testconfig!(default, $collection, $kind, |_| ());
+ testconfig!(
+ nfa_default,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA));
+ }
+ );
+ testconfig!(
+ nfa_noncontig_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .prefilter(false);
+ }
+ );
+ testconfig!(
+ nfa_noncontig_all_sparse,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .dense_depth(0);
+ }
+ );
+ testconfig!(
+ nfa_noncontig_all_dense,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .dense_depth(usize::MAX);
+ }
+ );
+ testconfig!(
+ nfa_contig_default,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA));
+ }
+ );
+ testconfig!(
+ nfa_contig_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .prefilter(false);
+ }
+ );
+ testconfig!(
+ nfa_contig_all_sparse,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .dense_depth(0);
+ }
+ );
+ testconfig!(
+ nfa_contig_all_dense,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .dense_depth(usize::MAX);
+ }
+ );
+ testconfig!(
+ nfa_contig_no_byte_class,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .byte_classes(false);
+ }
+ );
+ testconfig!(
+ dfa_default,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA));
+ }
+ );
+ testconfig!(
+ dfa_start_both,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Both);
+ }
+ );
+ testconfig!(
+ dfa_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).prefilter(false);
+ }
+ );
+ testconfig!(
+ dfa_start_both_no_prefilter,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Both)
+ .prefilter(false);
+ }
+ );
+ testconfig!(
+ dfa_no_byte_class,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).byte_classes(false);
+ }
+ );
+ testconfig!(
+ dfa_start_both_no_byte_class,
+ $collection,
+ $kind,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Both)
+ .byte_classes(false);
+ }
+ );
+ }
+ };
+}
+
+// Write out the various combinations of match semantics given the variety of
+// configurations tested by 'testcombo!'.
+testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest);
+testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst);
+testcombo!(
+ search_standard_nonoverlapping,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard
+);
+
+// Write out the overlapping combo by hand since there is only one of them.
+testconfig!(
+ overlapping,
+ search_standard_overlapping_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |_| ()
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_noncontig_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA));
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_noncontig_no_prefilter,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA)).prefilter(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_contig_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA));
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_contig_no_prefilter,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA)).prefilter(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_contig_all_sparse,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA)).dense_depth(0);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_nfa_contig_all_dense,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA)).dense_depth(usize::MAX);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_default,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA));
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_start_both,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).start_kind(StartKind::Both);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_prefilter,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).prefilter(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_start_both_no_prefilter,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Both)
+ .prefilter(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_no_byte_class,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).byte_classes(false);
+ }
+);
+testconfig!(
+ overlapping,
+ search_standard_overlapping_dfa_start_both_no_byte_class,
+ AC_STANDARD_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Both)
+ .byte_classes(false);
+ }
+);
+
+// Also write out tests manually for streams, since we only test the standard
+// match semantics. We also don't bother testing different automaton
+// configurations, since those are well covered by tests above.
+#[cfg(feature = "std")]
+testconfig!(
+ stream,
+ search_standard_stream_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |_| ()
+);
+#[cfg(feature = "std")]
+testconfig!(
+ stream,
+ search_standard_stream_nfa_noncontig_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA));
+ }
+);
+#[cfg(feature = "std")]
+testconfig!(
+ stream,
+ search_standard_stream_nfa_contig_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA));
+ }
+);
+#[cfg(feature = "std")]
+testconfig!(
+ stream,
+ search_standard_stream_dfa_default,
+ AC_STANDARD_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA));
+ }
+);
+
+// Same thing for anchored searches. Write them out manually.
+testconfig!(
+ anchored,
+ search_standard_anchored_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored);
+ }
+);
+testconfig!(
+ anchored,
+ search_standard_anchored_nfa_noncontig_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored)
+ .kind(Some(AhoCorasickKind::NoncontiguousNFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_standard_anchored_nfa_contig_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored)
+ .kind(Some(AhoCorasickKind::ContiguousNFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_standard_anchored_dfa_default,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_standard_anchored_dfa_start_both,
+ AC_STANDARD_ANCHORED_NON_OVERLAPPING,
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_first_anchored_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored);
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_first_anchored_nfa_noncontig_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored)
+ .kind(Some(AhoCorasickKind::NoncontiguousNFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_first_anchored_nfa_contig_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored)
+ .kind(Some(AhoCorasickKind::ContiguousNFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_first_anchored_dfa_default,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_first_anchored_dfa_start_both,
+ AC_LEFTMOST_FIRST_ANCHORED,
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_longest_anchored_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored);
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_longest_anchored_nfa_noncontig_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored)
+ .kind(Some(AhoCorasickKind::NoncontiguousNFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_longest_anchored_nfa_contig_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored)
+ .kind(Some(AhoCorasickKind::ContiguousNFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_longest_anchored_dfa_default,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA));
+ }
+);
+testconfig!(
+ anchored,
+ search_leftmost_longest_anchored_dfa_start_both,
+ AC_LEFTMOST_LONGEST_ANCHORED,
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA));
+ }
+);
+
+// And also write out the test combinations for ASCII case insensitivity.
+testconfig!(
+ acasei_standard_default,
+ &[ASCII_CASE_INSENSITIVE],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.prefilter(false).ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_standard_nfa_noncontig_default,
+ &[ASCII_CASE_INSENSITIVE],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .prefilter(false)
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_standard_nfa_contig_default,
+ &[ASCII_CASE_INSENSITIVE],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .prefilter(false)
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_standard_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_nfa_noncontig_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_nfa_contig_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ overlapping,
+ acasei_standard_overlapping_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+ Standard,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_nfa_noncontig_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_nfa_contig_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_first_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostFirst,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_nfa_noncontig_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_nfa_contig_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::ContiguousNFA))
+ .ascii_case_insensitive(true);
+ }
+);
+testconfig!(
+ acasei_leftmost_longest_dfa_default,
+ &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+ LeftmostLongest,
+ |b: &mut AhoCorasickBuilder| {
+ b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true);
+ }
+);
+
+fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for test in tests {
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack
+ );
+ }
+ }
+}
+
+// Like 'run_search_tests', but we skip any tests that contain the empty
+// pattern because stream searching doesn't support it.
+#[cfg(feature = "std")]
+fn run_stream_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+ which: TestCollection,
+ mut f: F,
+) {
+ let get_match_triples =
+ |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+ matches
+ .into_iter()
+ .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
+ .collect()
+ };
+ for &tests in which {
+ for test in tests {
+ if test.patterns.iter().any(|p| p.is_empty()) {
+ continue;
+ }
+ assert_eq!(
+ test.matches,
+ get_match_triples(f(&test)).as_slice(),
+ "test: {}, patterns: {:?}, haystack: {:?}",
+ test.name,
+ test.patterns,
+ test.haystack
+ );
+ }
+ }
+}
+
+#[test]
+fn search_tests_have_unique_names() {
+ let assert = |constname, tests: &[SearchTest]| {
+ let mut seen = HashMap::new(); // map from test name to position
+ for (i, test) in tests.iter().enumerate() {
+ if !seen.contains_key(test.name) {
+ seen.insert(test.name, i);
+ } else {
+ let last = seen[test.name];
+ panic!(
+ "{} tests have duplicate names at positions {} and {}",
+ constname, last, i
+ );
+ }
+ }
+ };
+ assert("BASICS", BASICS);
+ assert("STANDARD", STANDARD);
+ assert("LEFTMOST", LEFTMOST);
+ assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+ assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+ assert("NON_OVERLAPPING", NON_OVERLAPPING);
+ assert("OVERLAPPING", OVERLAPPING);
+ assert("REGRESSION", REGRESSION);
+}
+
+#[cfg(feature = "std")]
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_first() {
+ let fsm = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(None::<String>)
+ .unwrap();
+ assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[cfg(feature = "std")]
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_longest() {
+ let fsm = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(None::<String>)
+ .unwrap();
+ assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_first() {
+ let fsm = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(None::<String>)
+ .unwrap();
+ assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_longest() {
+ let fsm = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(None::<String>)
+ .unwrap();
+ assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
+// This tests that if we build an AC matcher with an "unanchored" start kind,
+// then we can't run an anchored search even if the underlying searcher
+// supports it.
+//
+// The key bit here is that both of the NFAs in this crate unconditionally
+// support both unanchored and anchored searches, but the DFA does not because
+// of the added cost of doing so. To avoid the top-level AC matcher sometimes
+// supporting anchored and sometimes not (depending on which searcher it
+// chooses to use internally), we ensure that the given 'StartKind' is always
+// respected.
+#[test]
+fn anchored_not_allowed_even_if_technically_available() {
+ let ac = AhoCorasick::builder()
+ .kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .start_kind(StartKind::Unanchored)
+ .build(&["foo"])
+ .unwrap();
+ assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());
+
+ let ac = AhoCorasick::builder()
+ .kind(Some(AhoCorasickKind::ContiguousNFA))
+ .start_kind(StartKind::Unanchored)
+ .build(&["foo"])
+ .unwrap();
+ assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());
+
+ // For completeness, check that the DFA returns an error too.
+ let ac = AhoCorasick::builder()
+ .kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Unanchored)
+ .build(&["foo"])
+ .unwrap();
+ assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());
+}
+
+// This is like the test above, but with unanchored and anchored flipped. That
+// is, we ask for an AC searcher with anchored support and we check that
+// unanchored searches return an error even if the underlying searcher would
+// technically support it.
+#[test]
+fn unanchored_not_allowed_even_if_technically_available() {
+ let ac = AhoCorasick::builder()
+ .kind(Some(AhoCorasickKind::NoncontiguousNFA))
+ .start_kind(StartKind::Anchored)
+ .build(&["foo"])
+ .unwrap();
+ assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());
+
+ let ac = AhoCorasick::builder()
+ .kind(Some(AhoCorasickKind::ContiguousNFA))
+ .start_kind(StartKind::Anchored)
+ .build(&["foo"])
+ .unwrap();
+ assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());
+
+ // For completeness, check that the DFA returns an error too.
+ let ac = AhoCorasick::builder()
+ .kind(Some(AhoCorasickKind::DFA))
+ .start_kind(StartKind::Anchored)
+ .build(&["foo"])
+ .unwrap();
+ assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());
+}
+
+// This tests that a prefilter does not cause a search to report a match
+// outside the bounds provided by the caller.
+//
+// This is a regression test for a bug I introduced during the rewrite of most
+// of the crate after 0.7. It was never released. The tricky part here is
+// ensuring we get a prefilter that can report matches on its own (such as the
+// packed searcher). Otherwise, prefilters that report false positives might
+// have searched past the bounds provided by the caller, but confirming the
+// match would subsequently fail.
+#[test]
+fn prefilter_stays_in_bounds() {
+ let ac = AhoCorasick::builder()
+ .match_kind(MatchKind::LeftmostFirst)
+ .build(&["sam", "frodo", "pippin", "merry", "gandalf", "sauron"])
+ .unwrap();
+ let haystack = "foo gandalf";
+ assert_eq!(None, ac.find(Input::new(haystack).range(0..10)));
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/44
+//
+// In short, this test ensures that enabling ASCII case insensitivity does not
+// visit an exponential number of states when filling in failure transitions.
+#[test]
+fn regression_ascii_case_insensitive_no_exponential() {
+ let ac = AhoCorasick::builder()
+ .ascii_case_insensitive(true)
+ .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"])
+ .unwrap();
+ assert!(ac.find("").is_none());
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/53
+//
+// This test ensures that the rare byte prefilter works in a particular corner
+// case. In particular, the shift offset detected for '/' in the patterns below
+// was incorrect, leading to a false negative.
+#[test]
+fn regression_rare_byte_prefilter() {
+ use crate::AhoCorasick;
+
+ let ac = AhoCorasick::new(&["ab/j/", "x/"]).unwrap();
+ assert!(ac.is_match("ab/j/"));
+}
+
+#[test]
+fn regression_case_insensitive_prefilter() {
+ for c in b'a'..b'z' {
+ for c2 in b'a'..b'z' {
+ let c = c as char;
+ let c2 = c2 as char;
+ let needle = format!("{}{}", c, c2).to_lowercase();
+ let haystack = needle.to_uppercase();
+ let ac = AhoCorasick::builder()
+ .ascii_case_insensitive(true)
+ .prefilter(true)
+ .build(&[&needle])
+ .unwrap();
+ assert_eq!(
+ 1,
+ ac.find_iter(&haystack).count(),
+ "failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
+ needle,
+ haystack,
+ ac,
+ );
+ }
+ }
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/64
+//
+// This occurs when the rare byte prefilter is active.
+#[cfg(feature = "std")]
+#[test]
+fn regression_stream_rare_byte_prefilter() {
+ use std::io::Read;
+
+ // NOTE: The test only fails if this ends with j.
+ const MAGIC: [u8; 5] = *b"1234j";
+
+    // NOTE: The test fails for values in 8188..=8191. These values put the
+    // string to search across two calls to read because the buffer size is
+    // 64KB by default.
+ const BEGIN: usize = 65_535;
+
+    /// This is just a structure that implements the `Read` trait. The reader
+ /// implementation will simulate a file filled with 0, except for the MAGIC
+ /// string at offset BEGIN.
+ #[derive(Default)]
+ struct R {
+ read: usize,
+ }
+
+ impl Read for R {
+ fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+ if self.read > 100000 {
+ return Ok(0);
+ }
+ let mut from = 0;
+ if self.read < BEGIN {
+ from = buf.len().min(BEGIN - self.read);
+ for x in 0..from {
+ buf[x] = 0;
+ }
+ self.read += from;
+ }
+ if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
+ let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
+ if to > from {
+ buf[from..to].copy_from_slice(
+ &MAGIC
+ [self.read - BEGIN..self.read - BEGIN + to - from],
+ );
+ self.read += to - from;
+ from = to;
+ }
+ }
+ for x in from..buf.len() {
+ buf[x] = 0;
+ self.read += 1;
+ }
+ Ok(buf.len())
+ }
+ }
+
+ fn run() -> std::io::Result<()> {
+ let aut = AhoCorasick::builder()
+ // Enable byte classes to make debugging the automaton easier. It
+ // should have no effect on the test result.
+ .byte_classes(false)
+ .build(&[&MAGIC])
+ .unwrap();
+
+ // While reading from a vector, it works:
+ let mut buf = alloc::vec![];
+ R::default().read_to_end(&mut buf)?;
+ let from_whole = aut.find_iter(&buf).next().unwrap().start();
+
+ // But using stream_find_iter fails!
+ let mut file = std::io::BufReader::new(R::default());
+ let begin = aut
+ .stream_find_iter(&mut file)
+ .next()
+ .expect("NOT FOUND!!!!")? // Panic here
+ .start();
+ assert_eq!(from_whole, begin);
+ Ok(())
+ }
+
+ run().unwrap()
+}
diff --git a/third_party/rust/aho-corasick/src/transducer.rs b/third_party/rust/aho-corasick/src/transducer.rs
new file mode 100644
index 0000000000..39bb240f44
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/transducer.rs
@@ -0,0 +1,270 @@
+/*!
+Provides implementations of `fst::Automaton` for Aho-Corasick automata.
+
+This works by providing two wrapper types, [`Anchored`] and [`Unanchored`].
+The former executes an anchored search on an FST while the latter executes
+an unanchored search. Building these wrappers is fallible and will fail if
+the underlying Aho-Corasick automaton does not support the type of search it
+represents.
+*/
+
+use crate::{
+ automaton::{Automaton, StateID},
+ Anchored as AcAnchored, Input, MatchError,
+};
+
+/// Represents an unanchored Aho-Corasick search of a finite state transducer.
+///
+/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
+/// underlying automaton does not support unanchored searches.
+///
+/// # Example
+///
+/// This shows how to build an FST of keys and then run an unanchored search on
+/// those keys using an Aho-Corasick automaton.
+///
+/// ```
+/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored};
+/// use fst::{Automaton, IntoStreamer, Set, Streamer};
+///
+/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
+/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
+/// // NFAs always support both unanchored and anchored searches.
+/// let searcher = Unanchored::new(&nfa).unwrap();
+///
+/// let mut stream = set.search(searcher).into_stream();
+/// let mut results = vec![];
+/// while let Some(key) = stream.next() {
+/// results.push(std::str::from_utf8(key).unwrap().to_string());
+/// }
+/// assert_eq!(vec!["abcd", "bcd", "xyz"], results);
+/// ```
+#[derive(Clone, Debug)]
+pub struct Unanchored<A>(A);
+
+impl<A: Automaton> Unanchored<A> {
+ /// Create a new `Unanchored` implementation of the `fst::Automaton` trait.
+ ///
+ /// If the given Aho-Corasick automaton does not support unanchored
+ /// searches, then this returns an error.
+ pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> {
+ let input = Input::new("").anchored(AcAnchored::No);
+ let _ = aut.start_state(&input)?;
+ Ok(Unanchored(aut))
+ }
+
+ /// Returns a borrow to the underlying automaton.
+ pub fn as_ref(&self) -> &A {
+ &self.0
+ }
+
+ /// Unwrap this value and return the inner automaton.
+ pub fn into_inner(self) -> A {
+ self.0
+ }
+}
+
+impl<A: Automaton> fst::Automaton for Unanchored<A> {
+ type State = StateID;
+
+ #[inline]
+ fn start(&self) -> StateID {
+ let input = Input::new("").anchored(AcAnchored::No);
+ self.0.start_state(&input).expect("support for unanchored searches")
+ }
+
+ #[inline]
+ fn is_match(&self, state: &StateID) -> bool {
+ self.0.is_match(*state)
+ }
+
+ #[inline]
+ fn accept(&self, state: &StateID, byte: u8) -> StateID {
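+        // Once a match state has been reached, every longer key also
+        // matches, so we deliberately stay in the current (matching) state
+        // rather than advancing the automaton any further.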
+ if fst::Automaton::is_match(self, state) {
+ return *state;
+ }
+ self.0.next_state(AcAnchored::No, *state, byte)
+ }
+
+ #[inline]
+ fn can_match(&self, state: &StateID) -> bool {
+ !self.0.is_dead(*state)
+ }
+}
+
+/// Represents an anchored Aho-Corasick search of a finite state transducer.
+///
+/// Wrapping an Aho-Corasick automaton in `Anchored` will fail if the
+/// underlying automaton does not support anchored searches.
+///
+/// # Example
+///
+/// This shows how to build an FST of keys and then run an anchored search on
+/// those keys using an Aho-Corasick automaton.
+///
+/// ```
+/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored};
+/// use fst::{Automaton, IntoStreamer, Set, Streamer};
+///
+/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
+/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
+/// // NFAs always support both unanchored and anchored searches.
+/// let searcher = Anchored::new(&nfa).unwrap();
+///
+/// let mut stream = set.search(searcher).into_stream();
+/// let mut results = vec![];
+/// while let Some(key) = stream.next() {
+/// results.push(std::str::from_utf8(key).unwrap().to_string());
+/// }
+/// assert_eq!(vec!["bcd", "xyz"], results);
+/// ```
+///
+/// This is like the example above, except we use an Aho-Corasick DFA, which
+/// requires explicitly configuring it to support anchored searches. (NFAs
+/// unconditionally support both unanchored and anchored searches.)
+///
+/// ```
+/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind};
+/// use fst::{Automaton, IntoStreamer, Set, Streamer};
+///
+/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
+/// let dfa = DFA::builder()
+/// .start_kind(StartKind::Anchored)
+/// .build(&["bcd", "x"])
+/// .unwrap();
+/// // We've explicitly configured our DFA to support anchored searches.
+/// let searcher = Anchored::new(&dfa).unwrap();
+///
+/// let mut stream = set.search(searcher).into_stream();
+/// let mut results = vec![];
+/// while let Some(key) = stream.next() {
+/// results.push(std::str::from_utf8(key).unwrap().to_string());
+/// }
+/// assert_eq!(vec!["bcd", "xyz"], results);
+/// ```
+#[derive(Clone, Debug)]
+pub struct Anchored<A>(A);
+
+impl<A: Automaton> Anchored<A> {
+ /// Create a new `Anchored` implementation of the `fst::Automaton` trait.
+ ///
+ /// If the given Aho-Corasick automaton does not support anchored searches,
+ /// then this returns an error.
+ pub fn new(aut: A) -> Result<Anchored<A>, MatchError> {
+ let input = Input::new("").anchored(AcAnchored::Yes);
+ let _ = aut.start_state(&input)?;
+ Ok(Anchored(aut))
+ }
+
+ /// Returns a borrow to the underlying automaton.
+ pub fn as_ref(&self) -> &A {
+ &self.0
+ }
+
+ /// Unwrap this value and return the inner automaton.
+ pub fn into_inner(self) -> A {
+ self.0
+ }
+}
+
+impl<A: Automaton> fst::Automaton for Anchored<A> {
+ type State = StateID;
+
+ #[inline]
+ fn start(&self) -> StateID {
+ let input = Input::new("").anchored(AcAnchored::Yes);
+        self.0.start_state(&input).expect("support for anchored searches")
+ }
+
+ #[inline]
+ fn is_match(&self, state: &StateID) -> bool {
+ self.0.is_match(*state)
+ }
+
+ #[inline]
+ fn accept(&self, state: &StateID, byte: u8) -> StateID {
+ if fst::Automaton::is_match(self, state) {
+ return *state;
+ }
+ self.0.next_state(AcAnchored::Yes, *state, byte)
+ }
+
+ #[inline]
+ fn can_match(&self, state: &StateID) -> bool {
+ !self.0.is_dead(*state)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::{string::String, vec, vec::Vec};
+
+ use fst::{Automaton, IntoStreamer, Set, Streamer};
+
+ use crate::{
+ dfa::DFA,
+ nfa::{contiguous, noncontiguous},
+ StartKind,
+ };
+
+ use super::*;
+
+ fn search<A: Automaton, D: AsRef<[u8]>>(
+ set: &Set<D>,
+ aut: A,
+ ) -> Vec<String> {
+ let mut stream = set.search(aut).into_stream();
+ let mut results = vec![];
+ while let Some(key) = stream.next() {
+ results.push(String::from(core::str::from_utf8(key).unwrap()));
+ }
+ results
+ }
+
+ #[test]
+ fn unanchored() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let patterns = vec!["baz", "bax"];
+ let expected = vec!["baz", "xbax"];
+
+ let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap());
+ let got = search(&set, &aut);
+ assert_eq!(got, expected);
+
+ let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap());
+ let got = search(&set, &aut);
+ assert_eq!(got, expected);
+
+ let aut = Unanchored(DFA::new(&patterns).unwrap());
+ let got = search(&set, &aut);
+ assert_eq!(got, expected);
+ }
+
+ #[test]
+ fn anchored() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let patterns = vec!["baz", "bax"];
+ let expected = vec!["baz"];
+
+ let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap());
+ let got = search(&set, &aut);
+ assert_eq!(got, expected);
+
+ let aut = Anchored(contiguous::NFA::new(&patterns).unwrap());
+ let got = search(&set, &aut);
+ assert_eq!(got, expected);
+
+ let aut = Anchored(
+ DFA::builder()
+ .start_kind(StartKind::Anchored)
+ .build(&patterns)
+ .unwrap(),
+ );
+ let got = search(&set, &aut);
+ assert_eq!(got, expected);
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/alphabet.rs b/third_party/rust/aho-corasick/src/util/alphabet.rs
new file mode 100644
index 0000000000..69724fa3ab
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/alphabet.rs
@@ -0,0 +1,409 @@
+use crate::util::int::Usize;
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in finite state machines to reduce the size of the transition
+/// table. This can have a particularly large impact not only on the total size
+/// of an FSM, but also on FSM build times because it reduces the number of
+/// transitions that need to be visited/set.
+#[derive(Clone, Copy)]
+pub(crate) struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+ /// Creates a new set of equivalence classes where all bytes are mapped to
+ /// the same class.
+ pub(crate) fn empty() -> ByteClasses {
+ ByteClasses([0; 256])
+ }
+
+ /// Creates a new set of equivalence classes where each byte belongs to
+ /// its own equivalence class.
+ pub(crate) fn singletons() -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ for b in 0..=255 {
+ classes.set(b, b);
+ }
+ classes
+ }
+
+ /// Set the equivalence class for the given byte.
+ #[inline]
+ pub(crate) fn set(&mut self, byte: u8, class: u8) {
+ self.0[usize::from(byte)] = class;
+ }
+
+ /// Get the equivalence class for the given byte.
+ #[inline]
+ pub(crate) fn get(&self, byte: u8) -> u8 {
+ self.0[usize::from(byte)]
+ }
+
+ /// Return the total number of elements in the alphabet represented by
+ /// these equivalence classes. Equivalently, this returns the total number
+ /// of equivalence classes.
+ #[inline]
+ pub(crate) fn alphabet_len(&self) -> usize {
+ // Add one since the number of equivalence classes is one bigger than
+ // the last one.
+ usize::from(self.0[255]) + 1
+ }
+
+ /// Returns the stride, as a base-2 exponent, required for these
+ /// equivalence classes.
+ ///
+ /// The stride is always the smallest power of 2 that is greater than or
+ /// equal to the alphabet length. This is done so that converting between
+ /// state IDs and indices can be done with shifts alone, which is much
+ /// faster than integer division. The "stride2" is the exponent. i.e.,
+ /// `2^stride2 = stride`.
+ pub(crate) fn stride2(&self) -> usize {
+ let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
+ usize::try_from(zeros).unwrap()
+ }
+
+ /// Returns the stride for these equivalence classes, which corresponds
+ /// to the smallest power of 2 greater than or equal to the number of
+ /// equivalence classes.
+ pub(crate) fn stride(&self) -> usize {
+ 1 << self.stride2()
+ }
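+
+    // Worked example (illustrative, not part of the vendored code): with 7
+    // equivalence classes, `alphabet_len() == 7`, the next power of two is 8,
+    // so `stride2() == 3` and `stride() == 8`. A transition table laid out
+    // with this stride can then locate `(state, class)` as
+    // `(state_index << stride2) | class`, using shifts instead of a
+    // multiplication by the alphabet length.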
+
+ /// Returns true if and only if every byte in this class maps to its own
+    /// equivalence class. Equivalently, there are 256 equivalence classes
+    /// and each class contains exactly one byte.
+ #[inline]
+ pub(crate) fn is_singleton(&self) -> bool {
+ self.alphabet_len() == 256
+ }
+
+ /// Returns an iterator over all equivalence classes in this set.
+ pub(crate) fn iter(&self) -> ByteClassIter {
+ ByteClassIter { it: 0..self.alphabet_len() }
+ }
+
+ /// Returns an iterator of the bytes in the given equivalence class.
+ pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
+ ByteClassElements { classes: self, class, bytes: 0..=255 }
+ }
+
+ /// Returns an iterator of byte ranges in the given equivalence class.
+ ///
+ /// That is, a sequence of contiguous ranges are returned. Typically, every
+ /// class maps to a single contiguous range.
+ fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
+ ByteClassElementRanges { elements: self.elements(class), range: None }
+ }
+}
+
+impl core::fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses(<one-class-per-byte>)")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for (i, class) in self.iter().enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?} => [", class)?;
+ for (start, end) in self.element_ranges(class) {
+ if start == end {
+ write!(f, "{:?}", start)?;
+ } else {
+ write!(f, "{:?}-{:?}", start, end)?;
+ }
+ }
+ write!(f, "]")?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
+/// An iterator over each equivalence class.
+#[derive(Debug)]
+pub(crate) struct ByteClassIter {
+ it: core::ops::Range<usize>,
+}
+
+impl Iterator for ByteClassIter {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ self.it.next().map(|class| class.as_u8())
+ }
+}
+
+/// An iterator over all elements in a specific equivalence class.
+#[derive(Debug)]
+pub(crate) struct ByteClassElements<'a> {
+ classes: &'a ByteClasses,
+ class: u8,
+ bytes: core::ops::RangeInclusive<u8>,
+}
+
+impl<'a> Iterator for ByteClassElements<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ while let Some(byte) = self.bytes.next() {
+ if self.class == self.classes.get(byte) {
+ return Some(byte);
+ }
+ }
+ None
+ }
+}
+
+/// An iterator over all elements in an equivalence class expressed as a
+/// sequence of contiguous ranges.
+#[derive(Debug)]
+pub(crate) struct ByteClassElementRanges<'a> {
+ elements: ByteClassElements<'a>,
+ range: Option<(u8, u8)>,
+}
+
+impl<'a> Iterator for ByteClassElementRanges<'a> {
+ type Item = (u8, u8);
+
+ fn next(&mut self) -> Option<(u8, u8)> {
+ loop {
+ let element = match self.elements.next() {
+ None => return self.range.take(),
+ Some(element) => element,
+ };
+ match self.range.take() {
+ None => {
+ self.range = Some((element, element));
+ }
+ Some((start, end)) => {
+ if usize::from(end) + 1 != usize::from(element) {
+ self.range = Some((element, element));
+ return Some((start, end));
+ }
+ self.range = Some((start, element));
+ }
+ }
+ }
+ }
+}
+
+/// A partitioning of bytes into equivalence classes.
+///
+/// A byte class set keeps track of an *approximation* of equivalence classes
+/// of bytes during NFA construction. That is, every byte in an equivalence
+/// class cannot discriminate between a match and a non-match.
+///
+/// Note that this may not compute the minimal set of equivalence classes.
+/// Basically, any byte in a pattern given to the noncontiguous NFA builder
+/// will automatically be treated as its own equivalence class. All other
+/// bytes---any byte not in any pattern---will be treated as their own
+/// equivalence classes. In theory, all bytes not in any pattern should
+/// be part of a single equivalence class, but in practice, we only treat
+/// contiguous ranges of bytes as an equivalence class. So the number of
+/// classes computed may be bigger than necessary. This usually doesn't make
+/// much of a difference, and keeps the implementation simple.
+#[derive(Clone, Debug)]
+pub(crate) struct ByteClassSet(ByteSet);
+
+impl Default for ByteClassSet {
+ fn default() -> ByteClassSet {
+ ByteClassSet::empty()
+ }
+}
+
+impl ByteClassSet {
+ /// Create a new set of byte classes where all bytes are part of the same
+ /// equivalence class.
+ pub(crate) fn empty() -> Self {
+ ByteClassSet(ByteSet::empty())
+ }
+
+    /// Indicate that the given range of bytes (inclusive) can discriminate a
+    /// match between it and all other bytes outside of the range.
+ pub(crate) fn set_range(&mut self, start: u8, end: u8) {
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0.add(start - 1);
+ }
+ self.0.add(end);
+ }
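+
+    // Worked example (illustrative, not part of the vendored code): calling
+    // `set_range(b'a', b'z')` marks `b'a' - 1` and `b'z'` as class
+    // boundaries. `byte_classes()` below then starts a new class immediately
+    // after each marked byte, producing three classes:
+    // `\x00-\x60`, `a-z` and `\x7B-\xFF`.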
+
+ /// Convert this boolean set to a map that maps all byte values to their
+ /// corresponding equivalence class. The last mapping indicates the largest
+ /// equivalence class identifier (which is never bigger than 255).
+ pub(crate) fn byte_classes(&self) -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ let mut class = 0u8;
+ let mut b = 0u8;
+ loop {
+ classes.set(b, class);
+ if b == 255 {
+ break;
+ }
+ if self.0.contains(b) {
+ class = class.checked_add(1).unwrap();
+ }
+ b = b.checked_add(1).unwrap();
+ }
+ classes
+ }
+}
+
+/// A simple set of bytes that is reasonably cheap to copy and allocation free.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) struct ByteSet {
+ bits: BitSet,
+}
+
+/// The representation of a byte set. Split out so that we can define a
+/// convenient Debug impl for it while keeping "ByteSet" in the output.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+struct BitSet([u128; 2]);
+
+impl ByteSet {
+ /// Create an empty set of bytes.
+ pub(crate) fn empty() -> ByteSet {
+ ByteSet { bits: BitSet([0; 2]) }
+ }
+
+ /// Add a byte to this set.
+ ///
+ /// If the given byte already belongs to this set, then this is a no-op.
+ pub(crate) fn add(&mut self, byte: u8) {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[usize::from(bucket)] |= 1 << bit;
+ }
+
+ /// Return true if and only if the given byte is in this set.
+ pub(crate) fn contains(&self, byte: u8) -> bool {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[usize::from(bucket)] & (1 << bit) > 0
+ }
+}
+
+impl core::fmt::Debug for BitSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmtd = f.debug_set();
+ for b in 0u8..=255 {
+ if (ByteSet { bits: *self }).contains(b) {
+ fmtd.entry(&b);
+ }
+ }
+ fmtd.finish()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::{vec, vec::Vec};
+
+ use super::*;
+
+ #[test]
+ fn byte_classes() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'a', b'z');
+
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(b'a' - 1), 0);
+ assert_eq!(classes.get(b'a'), 1);
+ assert_eq!(classes.get(b'm'), 1);
+ assert_eq!(classes.get(b'z'), 1);
+ assert_eq!(classes.get(b'z' + 1), 2);
+ assert_eq!(classes.get(254), 2);
+ assert_eq!(classes.get(255), 2);
+
+ let mut set = ByteClassSet::empty();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(3), 1);
+ assert_eq!(classes.get(4), 2);
+ assert_eq!(classes.get(5), 2);
+ assert_eq!(classes.get(6), 2);
+ assert_eq!(classes.get(7), 3);
+ assert_eq!(classes.get(255), 3);
+ }
+
+ #[test]
+ fn full_byte_classes() {
+ let mut set = ByteClassSet::empty();
+ for b in 0u8..=255 {
+ set.set_range(b, b);
+ }
+ assert_eq!(set.byte_classes().alphabet_len(), 256);
+ }
+
+ #[test]
+ fn elements_typical() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'b', b'd');
+ set.set_range(b'g', b'm');
+ set.set_range(b'z', b'z');
+ let classes = set.byte_classes();
+ // class 0: \x00-a
+ // class 1: b-d
+ // class 2: e-f
+ // class 3: g-m
+ // class 4: n-y
+ // class 5: z-z
+ // class 6: \x7B-\xFF
+ assert_eq!(classes.alphabet_len(), 7);
+
+ let elements = classes.elements(0).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 98);
+ assert_eq!(elements[0], b'\x00');
+ assert_eq!(elements[97], b'a');
+
+ let elements = classes.elements(1).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'b', b'c', b'd'],);
+
+ let elements = classes.elements(2).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'e', b'f'],);
+
+ let elements = classes.elements(3).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],);
+
+ let elements = classes.elements(4).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 12);
+ assert_eq!(elements[0], b'n');
+ assert_eq!(elements[11], b'y');
+
+ let elements = classes.elements(5).collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'z']);
+
+ let elements = classes.elements(6).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 133);
+ assert_eq!(elements[0], b'\x7B');
+ assert_eq!(elements[132], b'\xFF');
+ }
+
+ #[test]
+ fn elements_singletons() {
+ let classes = ByteClasses::singletons();
+ assert_eq!(classes.alphabet_len(), 256);
+
+ let elements = classes.elements(b'a').collect::<Vec<_>>();
+ assert_eq!(elements, vec![b'a']);
+ }
+
+ #[test]
+ fn elements_empty() {
+ let classes = ByteClasses::empty();
+ assert_eq!(classes.alphabet_len(), 1);
+
+ let elements = classes.elements(0).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 256);
+ assert_eq!(elements[0], b'\x00');
+ assert_eq!(elements[255], b'\xFF');
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/buffer.rs b/third_party/rust/aho-corasick/src/util/buffer.rs
new file mode 100644
index 0000000000..e9e982af58
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/buffer.rs
@@ -0,0 +1,124 @@
+use alloc::{vec, vec::Vec};
+
+/// The default buffer capacity that we use for the stream buffer.
+const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB
+
+/// A fairly simple roll buffer for supporting stream searches.
+///
+/// This buffer acts as a temporary place to store a fixed amount of data when
+/// reading from a stream. Its central purpose is to allow "rolling" some
+/// suffix of the data to the beginning of the buffer before refilling it with
+/// more data from the stream. For example, let's say we are trying to match
+/// "foobar" on a stream. When we report the match, we'd like to not only
+/// report the correct offsets at which the match occurs, but also the matching
+/// bytes themselves. So let's say our stream is a file with the following
+/// contents: `test test foobar test test`. Now assume that we happen to read
+/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
+/// Naively, it would not be possible to report a single contiguous `foobar`
+/// match, but this roll buffer allows us to do that. Namely, after the second
+/// read, the contents of the buffer should be `st foobar test test`, where the
+/// search should ultimately resume immediately after `foo`. (The prefix `st `
+/// is included because the roll buffer saves N bytes at the end of the buffer,
+/// where N is the maximum possible length of a match.)
+///
+/// A lot of the logic for dealing with this is unfortunately split out between
+/// this roll buffer and the `StreamChunkIter`.
+///
+/// Note also that this buffer is not actually required to just report matches.
+/// Because a `Match` is just some offsets. But it *is* required for supporting
+/// things like `try_stream_replace_all` because that needs some mechanism for
+/// knowing which bytes in the stream correspond to a match and which don't. So
+/// when a match occurs across two `read` calls, *something* needs to retain
+/// the bytes from the previous `read` call because you don't know before the
+/// second read call whether a match exists or not.
+#[derive(Debug)]
+pub(crate) struct Buffer {
+ /// The raw buffer contents. This has a fixed size and never increases.
+ buf: Vec<u8>,
+ /// The minimum size of the buffer, which is equivalent to the maximum
+ /// possible length of a match. This corresponds to the amount that we
+    /// roll the buffer by before refilling it.
+ min: usize,
+ /// The end of the contents of this buffer.
+ end: usize,
+}
+
+impl Buffer {
+ /// Create a new buffer for stream searching. The minimum buffer length
+ /// given should be the size of the maximum possible match length.
+ pub(crate) fn new(min_buffer_len: usize) -> Buffer {
+ let min = core::cmp::max(1, min_buffer_len);
+ // The minimum buffer amount is also the amount that we roll our
+ // buffer in order to support incremental searching. To this end,
+ // our actual capacity needs to be at least 1 byte bigger than our
+ // minimum amount, otherwise we won't have any overlap. In actuality,
+ // we want our buffer to be a bit bigger than that for performance
+ // reasons, so we set a lower bound of `8 * min`.
+ //
+ // TODO: It would be good to find a way to test the streaming
+ // implementation with the minimal buffer size. For now, we just
+ // uncomment the next line and comment out the subsequent one.
+ // let capacity = 1 + min;
+ let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
+ Buffer { buf: vec![0; capacity], min, end: 0 }
+ }
+
+ /// Return the contents of this buffer.
+ #[inline]
+ pub(crate) fn buffer(&self) -> &[u8] {
+ &self.buf[..self.end]
+ }
+
+ /// Return the minimum size of the buffer. The only way a buffer may be
+ /// smaller than this is if the stream itself contains less than the
+ /// minimum buffer amount.
+ #[inline]
+ pub(crate) fn min_buffer_len(&self) -> usize {
+ self.min
+ }
+
+ /// Return all free capacity in this buffer.
+ fn free_buffer(&mut self) -> &mut [u8] {
+ &mut self.buf[self.end..]
+ }
+
+ /// Refill the contents of this buffer by reading as much as possible into
+ /// this buffer's free capacity. If no more bytes could be read, then this
+ /// returns false. Otherwise, this reads until it has filled the buffer
+ /// past the minimum amount.
+ pub(crate) fn fill<R: std::io::Read>(
+ &mut self,
+ mut rdr: R,
+ ) -> std::io::Result<bool> {
+ let mut readany = false;
+ loop {
+ let readlen = rdr.read(self.free_buffer())?;
+ if readlen == 0 {
+ return Ok(readany);
+ }
+ readany = true;
+ self.end += readlen;
+ if self.buffer().len() >= self.min {
+ return Ok(true);
+ }
+ }
+ }
+
+ /// Roll the contents of the buffer so that the suffix of this buffer is
+ /// moved to the front and all other contents are dropped. The size of the
+ /// suffix corresponds precisely to the minimum buffer length.
+ ///
+ /// This should only be called when the entire contents of this buffer have
+ /// been searched.
+ pub(crate) fn roll(&mut self) {
+ let roll_start = self
+ .end
+ .checked_sub(self.min)
+ .expect("buffer capacity should be bigger than minimum amount");
+ let roll_end = roll_start + self.min;
+
+ assert!(roll_end <= self.end);
+ self.buf.copy_within(roll_start..roll_end, 0);
+ self.end = self.min;
+ }
+}
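+
+// A minimal sketch of the fill/roll cycle described above, assuming a reader
+// whose contents fit in the default buffer capacity and a maximum match
+// length of 6 (the length of "foobar").
+#[cfg(test)]
+mod roll_example {
+    use super::Buffer;
+
+    #[test]
+    fn fill_then_roll() {
+        let mut buf = Buffer::new(6);
+        let mut rdr: &[u8] = b"test test foobar test test";
+        // The first fill reads the entire (short) stream into the buffer.
+        assert!(buf.fill(&mut rdr).unwrap());
+        assert_eq!(buf.buffer(), &b"test test foobar test test"[..]);
+        // Once the contents have been searched, roll() keeps only the last
+        // min_buffer_len() bytes, so a match spanning the boundary of the
+        // next read remains visible.
+        buf.roll();
+        assert_eq!(buf.buffer(), &b"t test"[..]);
+        assert_eq!(buf.min_buffer_len(), 6);
+    }
+}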
diff --git a/third_party/rust/aho-corasick/src/util/byte_frequencies.rs b/third_party/rust/aho-corasick/src/util/byte_frequencies.rs
new file mode 100644
index 0000000000..c313b629db
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/byte_frequencies.rs
@@ -0,0 +1,258 @@
+pub const BYTE_FREQUENCIES: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
diff --git a/third_party/rust/aho-corasick/src/util/debug.rs b/third_party/rust/aho-corasick/src/util/debug.rs
new file mode 100644
index 0000000000..22b5f2231f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/debug.rs
@@ -0,0 +1,26 @@
+/// A type that wraps a single byte with a convenient fmt::Debug impl that
+/// escapes the byte.
+pub(crate) struct DebugByte(pub(crate) u8);
+
+impl core::fmt::Debug for DebugByte {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ // Special case ASCII space. It's too hard to read otherwise, so
+ // put quotes around it. I sometimes wonder whether just '\x20' would
+ // be better...
+ if self.0 == b' ' {
+ return write!(f, "' '");
+ }
+ // 10 bytes is enough to cover any output from ascii::escape_default.
+ let mut bytes = [0u8; 10];
+ let mut len = 0;
+ for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
+ // capitalize \xab to \xAB
+ if i >= 2 && b'a' <= b && b <= b'f' {
+ b -= 32;
+ }
+ bytes[len] = b;
+ len += 1;
+ }
+ write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
+ }
+}
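+
+// A small sketch of the output this impl produces, assuming only the standard
+// `format!` machinery: printable bytes are shown as-is, the space byte is
+// quoted, and escaped hex digits are upper-cased.
+#[cfg(test)]
+mod debug_byte_example {
+    use super::DebugByte;
+    use alloc::format;
+
+    #[test]
+    fn formats_bytes_readably() {
+        assert_eq!(format!("{:?}", DebugByte(b'a')), "a");
+        assert_eq!(format!("{:?}", DebugByte(b' ')), "' '");
+        assert_eq!(format!("{:?}", DebugByte(0xAB)), r"\xAB");
+    }
+}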
diff --git a/third_party/rust/aho-corasick/src/util/error.rs b/third_party/rust/aho-corasick/src/util/error.rs
new file mode 100644
index 0000000000..326d04657b
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/error.rs
@@ -0,0 +1,259 @@
+use crate::util::{
+ primitives::{PatternID, SmallIndex},
+ search::MatchKind,
+};
+
+/// An error that occurred during the construction of an Aho-Corasick
+/// automaton.
+///
+/// Build errors occur when some kind of limit has been exceeded, either in the
+/// number of states, the number of patterns or the length of a pattern. These
+/// limits aren't part of the public API, but they should generally be large
+/// enough to handle most use cases.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+ /// An error that occurs when allocating a new state would result in an
+ /// identifier that exceeds the capacity of a `StateID`.
+ StateIDOverflow {
+ /// The maximum possible id.
+ max: u64,
+ /// The maximum ID requested.
+ requested_max: u64,
+ },
+ /// An error that occurs when adding a pattern to an Aho-Corasick
+ /// automaton would result in an identifier that exceeds the capacity of a
+ /// `PatternID`.
+ PatternIDOverflow {
+ /// The maximum possible id.
+ max: u64,
+ /// The maximum ID requested.
+ requested_max: u64,
+ },
+ /// Occurs when a pattern string is given to the Aho-Corasick constructor
+ /// that is too long.
+ PatternTooLong {
+ /// The ID of the pattern that was too long.
+ pattern: PatternID,
+ /// The length that was too long.
+ len: usize,
+ },
+}
+
+impl BuildError {
+ pub(crate) fn state_id_overflow(
+ max: u64,
+ requested_max: u64,
+ ) -> BuildError {
+ BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } }
+ }
+
+ pub(crate) fn pattern_id_overflow(
+ max: u64,
+ requested_max: u64,
+ ) -> BuildError {
+ BuildError {
+ kind: ErrorKind::PatternIDOverflow { max, requested_max },
+ }
+ }
+
+ pub(crate) fn pattern_too_long(
+ pattern: PatternID,
+ len: usize,
+ ) -> BuildError {
+ BuildError { kind: ErrorKind::PatternTooLong { pattern, len } }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind {
+ ErrorKind::StateIDOverflow { max, requested_max } => {
+ write!(
+ f,
+ "state identifier overflow: failed to create state ID \
+ from {}, which exceeds the max of {}",
+ requested_max, max,
+ )
+ }
+ ErrorKind::PatternIDOverflow { max, requested_max } => {
+ write!(
+ f,
+ "pattern identifier overflow: failed to create pattern ID \
+ from {}, which exceeds the max of {}",
+ requested_max, max,
+ )
+ }
+ ErrorKind::PatternTooLong { pattern, len } => {
+ write!(
+ f,
+ "pattern {} with length {} exceeds \
+ the maximum pattern length of {}",
+ pattern.as_usize(),
+ len,
+ SmallIndex::MAX.as_usize(),
+ )
+ }
+ }
+ }
+}
+
+/// An error that occurred during an Aho-Corasick search.
+///
+/// An error that occurs during a search is limited to some kind of
+/// misconfiguration that resulted in an illegal call. Stated differently,
+/// whether an error occurs is not dependent on the specific bytes in the
+/// haystack.
+///
+/// Examples of misconfiguration:
+///
+/// * Executing a stream or overlapping search on a searcher that was built with
+/// something other than [`MatchKind::Standard`](crate::MatchKind::Standard)
+/// semantics.
+/// * Requested an anchored or an unanchored search on a searcher that doesn't
+/// support unanchored or anchored searches, respectively.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct MatchError(alloc::boxed::Box<MatchErrorKind>);
+
+impl MatchError {
+ /// Create a new error value with the given kind.
+ ///
+ /// This is a more verbose version of the kind-specific constructors, e.g.,
+ /// `MatchError::unsupported_stream`.
+ pub fn new(kind: MatchErrorKind) -> MatchError {
+ MatchError(alloc::boxed::Box::new(kind))
+ }
+
+ /// Returns a reference to the underlying error kind.
+ pub fn kind(&self) -> &MatchErrorKind {
+ &self.0
+ }
+
+ /// Create a new "invalid anchored search" error. This occurs when the
+ /// caller requests an anchored search but where anchored searches aren't
+ /// supported.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::InvalidInputAnchored`] kind.
+ pub fn invalid_input_anchored() -> MatchError {
+ MatchError::new(MatchErrorKind::InvalidInputAnchored)
+ }
+
+ /// Create a new "invalid unanchored search" error. This occurs when the
+ /// caller requests an unanchored search but where unanchored searches
+ /// aren't supported.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::InvalidInputUnanchored`] kind.
+ pub fn invalid_input_unanchored() -> MatchError {
+ MatchError::new(MatchErrorKind::InvalidInputUnanchored)
+ }
+
+ /// Create a new "unsupported stream search" error. This occurs when the
+ /// caller requests a stream search while using an Aho-Corasick automaton
+ /// with a match kind other than [`MatchKind::Standard`].
+ ///
+ /// The match kind given should be the match kind of the automaton. It
+ /// should never be `MatchKind::Standard`.
+ pub fn unsupported_stream(got: MatchKind) -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedStream { got })
+ }
+
+ /// Create a new "unsupported overlapping search" error. This occurs when
+ /// the caller requests an overlapping search while using an Aho-Corasick
+ /// automaton with a match kind other than [`MatchKind::Standard`].
+ ///
+ /// The match kind given should be the match kind of the automaton. It
+ /// should never be `MatchKind::Standard`.
+ pub fn unsupported_overlapping(got: MatchKind) -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedOverlapping { got })
+ }
+
+ /// Create a new "unsupported empty pattern" error. This occurs when the
+ /// caller requests a search on an automaton that contains an empty pattern
+ /// string, but the operation in question doesn't support that.
+ pub fn unsupported_empty() -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedEmpty)
+ }
+}
+
+/// The underlying kind of a [`MatchError`].
+///
+/// This is a **non-exhaustive** enum. That means new variants may be added in
+/// a semver-compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum MatchErrorKind {
+ /// An error indicating that an anchored search was requested, but from a
+ /// searcher that was built without anchored support.
+ InvalidInputAnchored,
+ /// An error indicating that an unanchored search was requested, but from a
+ /// searcher that was built without unanchored support.
+ InvalidInputUnanchored,
+ /// An error indicating that a stream search was attempted on an
+ /// Aho-Corasick automaton with an unsupported `MatchKind`.
+ UnsupportedStream {
+ /// The match semantics for the automaton that was used.
+ got: MatchKind,
+ },
+ /// An error indicating that an overlapping search was attempted on an
+ /// Aho-Corasick automaton with an unsupported `MatchKind`.
+ UnsupportedOverlapping {
+ /// The match semantics for the automaton that was used.
+ got: MatchKind,
+ },
+ /// An error indicating that the operation requested doesn't support
+ /// automatons that contain an empty pattern string.
+ UnsupportedEmpty,
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self.kind() {
+ MatchErrorKind::InvalidInputAnchored => {
+ write!(f, "anchored searches are not supported or enabled")
+ }
+ MatchErrorKind::InvalidInputUnanchored => {
+ write!(f, "unanchored searches are not supported or enabled")
+ }
+ MatchErrorKind::UnsupportedStream { got } => {
+ write!(
+ f,
+ "match kind {:?} does not support stream searching",
+ got,
+ )
+ }
+ MatchErrorKind::UnsupportedOverlapping { got } => {
+ write!(
+ f,
+ "match kind {:?} does not support overlapping searches",
+ got,
+ )
+ }
+ MatchErrorKind::UnsupportedEmpty => {
+ write!(
+ f,
+ "matching with an empty pattern string is not \
+ supported for this operation",
+ )
+ }
+ }
+ }
+}
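+
+// A brief sketch of how a caller might react to a search error: the kind is
+// matched on for programmatic handling, while the Display output is only for
+// humans. The match kind used here is an arbitrary example.
+#[cfg(test)]
+mod match_error_example {
+    use super::{MatchError, MatchErrorKind};
+    use crate::util::search::MatchKind;
+    use alloc::format;
+
+    #[test]
+    fn inspecting_a_match_error() {
+        let err = MatchError::unsupported_stream(MatchKind::LeftmostFirst);
+        assert!(matches!(
+            err.kind(),
+            MatchErrorKind::UnsupportedStream { got: MatchKind::LeftmostFirst }
+        ));
+        assert!(format!("{}", err).contains("stream"));
+    }
+}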
diff --git a/third_party/rust/aho-corasick/src/util/int.rs b/third_party/rust/aho-corasick/src/util/int.rs
new file mode 100644
index 0000000000..28ede7a47f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/int.rs
@@ -0,0 +1,284 @@
+/*!
+This module provides several integer oriented traits for converting between
+both fixed size integers and integers whose size varies based on the target
+(like `usize`).
+
+The main design principle for this module is to centralize all uses of `as`.
+The thinking here is that `as` makes it very easy to perform accidental lossy
+conversions, and if we centralize all its uses here under more descriptive
+higher level operations, its use and correctness becomes easier to audit.
+
+This was copied mostly wholesale from `regex-automata`.
+
+NOTE: for simplicity, we don't take target pointer width into account here for
+`usize` conversions. Since we currently only panic in debug mode, skipping the
+check when it can be proven it isn't needed at compile time doesn't really
+matter. Now, if we wind up wanting to do as many checks as possible in release
+mode, then we would want to skip those when we know the conversions are always
+non-lossy.
+*/
+
+pub(crate) trait U8 {
+ fn as_usize(self) -> usize;
+}
+
+impl U8 for u8 {
+ fn as_usize(self) -> usize {
+ usize::from(self)
+ }
+}
+
+pub(crate) trait U16 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn high_u8(self) -> u8;
+}
+
+impl U16 for u16 {
+ fn as_usize(self) -> usize {
+ usize::from(self)
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn high_u8(self) -> u8 {
+ (self >> 8) as u8
+ }
+}
+
+pub(crate) trait U32 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn low_u16(self) -> u16;
+ fn high_u16(self) -> u16;
+}
+
+impl U32 for u32 {
+ #[inline]
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("u32 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn low_u16(self) -> u16 {
+ self as u16
+ }
+
+ fn high_u16(self) -> u16 {
+ (self >> 16) as u16
+ }
+}
+
+pub(crate) trait U64 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn low_u16(self) -> u16;
+ fn low_u32(self) -> u32;
+ fn high_u32(self) -> u32;
+}
+
+impl U64 for u64 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("u64 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn low_u16(self) -> u16 {
+ self as u16
+ }
+
+ fn low_u32(self) -> u32 {
+ self as u32
+ }
+
+ fn high_u32(self) -> u32 {
+ (self >> 32) as u32
+ }
+}
+
+pub(crate) trait I8 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u8;
+ fn from_bits(n: u8) -> i8;
+}
+
+impl I8 for i8 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i8 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u8 {
+ self as u8
+ }
+
+ fn from_bits(n: u8) -> i8 {
+ n as i8
+ }
+}
+
+pub(crate) trait I32 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u32;
+ fn from_bits(n: u32) -> i32;
+}
+
+impl I32 for i32 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i32 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u32 {
+ self as u32
+ }
+
+ fn from_bits(n: u32) -> i32 {
+ n as i32
+ }
+}
+
+pub(crate) trait I64 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u64;
+ fn from_bits(n: u64) -> i64;
+}
+
+impl I64 for i64 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i64 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u64 {
+ self as u64
+ }
+
+ fn from_bits(n: u64) -> i64 {
+ n as i64
+ }
+}
+
+pub(crate) trait Usize {
+ fn as_u8(self) -> u8;
+ fn as_u16(self) -> u16;
+ fn as_u32(self) -> u32;
+ fn as_u64(self) -> u64;
+}
+
+impl Usize for usize {
+ fn as_u8(self) -> u8 {
+ #[cfg(debug_assertions)]
+ {
+ u8::try_from(self).expect("usize overflowed u8")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u8
+ }
+ }
+
+ fn as_u16(self) -> u16 {
+ #[cfg(debug_assertions)]
+ {
+ u16::try_from(self).expect("usize overflowed u16")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u16
+ }
+ }
+
+ fn as_u32(self) -> u32 {
+ #[cfg(debug_assertions)]
+ {
+ u32::try_from(self).expect("usize overflowed u32")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u32
+ }
+ }
+
+ fn as_u64(self) -> u64 {
+ #[cfg(debug_assertions)]
+ {
+ u64::try_from(self).expect("usize overflowed u64")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u64
+ }
+ }
+}
+
+// Pointers aren't integers, but we convert pointers to integers to perform
+// offset arithmetic in some places. (And no, we don't convert the integers
+// back to pointers.) So add 'as_usize' conversions here too for completeness.
+//
+// These 'as' casts are actually okay because they're always non-lossy. But the
+// idea here is to just try and remove as much 'as' as possible, particularly
+// in this crate where we are being really paranoid about offsets and making
+// sure we don't panic on inputs that might be untrusted. This way, the 'as'
+// casts become easier to audit if they're all in one place, even when some of
+// them are actually okay 100% of the time.
+
+pub(crate) trait Pointer {
+ fn as_usize(self) -> usize;
+}
+
+impl<T> Pointer for *const T {
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
+
+pub(crate) trait PointerMut {
+ fn as_usize(self) -> usize;
+}
+
+impl<T> PointerMut for *mut T {
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
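+
+// A short sketch of how these traits stand in for bare `as` casts. The values
+// here all fit in the target types, so the conversions succeed in both debug
+// and release builds; a lossy narrowing would panic in debug builds instead
+// of silently truncating.
+#[cfg(test)]
+mod conversion_example {
+    use super::{Usize, U32};
+
+    #[test]
+    fn conversions_are_explicit() {
+        let n: u32 = 0xABCD;
+        // Widening (or same-width) conversions are always lossless.
+        assert_eq!(n.as_usize(), 0xABCD);
+        assert_eq!(n.low_u16(), 0xABCD);
+        assert_eq!(n.high_u16(), 0);
+        // Narrowing from usize is funneled through one audited method.
+        let len: usize = 300;
+        assert_eq!(len.as_u32(), 300);
+    }
+}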
diff --git a/third_party/rust/aho-corasick/src/util/mod.rs b/third_party/rust/aho-corasick/src/util/mod.rs
new file mode 100644
index 0000000000..f7a1ddd07b
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/mod.rs
@@ -0,0 +1,12 @@
+pub(crate) mod alphabet;
+#[cfg(feature = "std")]
+pub(crate) mod buffer;
+pub(crate) mod byte_frequencies;
+pub(crate) mod debug;
+pub(crate) mod error;
+pub(crate) mod int;
+pub(crate) mod prefilter;
+pub(crate) mod primitives;
+pub(crate) mod remapper;
+pub(crate) mod search;
+pub(crate) mod special;
diff --git a/third_party/rust/aho-corasick/src/util/prefilter.rs b/third_party/rust/aho-corasick/src/util/prefilter.rs
new file mode 100644
index 0000000000..f5ddc75b7c
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/prefilter.rs
@@ -0,0 +1,924 @@
+use core::{
+ cmp,
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+ u8,
+};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+ packed,
+ util::{
+ alphabet::ByteSet,
+ search::{Match, MatchKind, Span},
+ },
+};
+
+/// A prefilter for accelerating a search.
+///
+/// This crate uses prefilters in the core search implementations to accelerate
+/// common cases. They typically only apply to cases where there are a small
+/// number of patterns (less than 100 or so), but when they do, throughput can
+/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
+/// is active, it is used whenever a search enters an automaton's start state.
+///
+/// Currently, prefilters cannot be constructed by
+/// callers. A `Prefilter` can only be accessed via the
+/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
+/// method and used to execute a search. In other words, a prefilter can be
+/// used to optimize your own search implementation if necessary, but cannot do
+/// much else. If you have a use case for more APIs, please submit an issue.
+#[derive(Clone, Debug)]
+pub struct Prefilter {
+ finder: Arc<dyn PrefilterI>,
+ memory_usage: usize,
+}
+
+impl Prefilter {
+ /// Execute a search in the haystack within the span given. If a match or
+ /// a possible match is returned, then it is guaranteed to occur within
+ /// the bounds of the span.
+ ///
+ /// If the span provided is invalid for the given haystack, then behavior
+ /// is unspecified.
+ #[inline]
+ pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ self.finder.find_in(haystack, span)
+ }
+
+ #[inline]
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.memory_usage
+ }
+}
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position.
+///
+/// The result is either no match, a confirmed match or a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ /// No match was found. Since false negatives are not possible, this means
+ /// the search can quit as it is guaranteed not to find another match.
+ None,
+ /// A confirmed match was found. Callers do not need to confirm it.
+ Match(Match),
+ /// The start of a possible match was found. Callers must confirm it before
+ /// reporting it as a match.
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers
+ /// do not distinguish between true positives and false positives (i.e.,
+ /// the caller must always confirm the match).
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
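+
+// A minimal sketch of how a search loop consumes candidates: `None` ends the
+// search, a confirmed `Match` can be reported directly, and a possible start
+// still has to be verified by the automaton. Which prefilter the builder
+// picks here depends on its heuristics, so the example only relies on the
+// no-false-negatives guarantee.
+#[cfg(test)]
+mod candidate_example {
+    use super::{Builder, Candidate};
+    use crate::util::search::{MatchKind, Span};
+
+    #[test]
+    fn candidates_drive_the_search() {
+        let mut builder = Builder::new(MatchKind::LeftmostFirst);
+        builder.add(b"foo");
+        builder.add(b"bar");
+        // The heuristics may decline to build a prefilter at all.
+        let pre = match builder.build() {
+            None => return,
+            Some(pre) => pre,
+        };
+        let haystack = b"zzz bar zzz";
+        let span = Span { start: 0, end: haystack.len() };
+        match pre.find_in(haystack, span) {
+            // "bar" is present, so reporting no candidate would be a false
+            // negative, which prefilters must never do.
+            Candidate::None => unreachable!("false negative"),
+            Candidate::Match(m) => assert_eq!(m.start(), 4),
+            Candidate::PossibleStartOfMatch(start) => assert!(start <= 4),
+        }
+    }
+}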
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+trait PrefilterI:
+ Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
+{
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
+}
+
+impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
+ #[inline(always)]
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ (**self).find_in(haystack, span)
+ }
+}
+
+/// A builder for constructing the best possible prefilter. When constructed,
+/// this builder will heuristically select the best prefilter it can build,
+/// if any, and discard the rest.
+#[derive(Debug)]
+pub(crate) struct Builder {
+ count: usize,
+ ascii_case_insensitive: bool,
+ start_bytes: StartBytesBuilder,
+ rare_bytes: RareBytesBuilder,
+ memmem: MemmemBuilder,
+ packed: Option<packed::Builder>,
+ // If we run across a condition that suggests we shouldn't use a prefilter
+ // at all (like an empty pattern), then disable prefilters entirely.
+ enabled: bool,
+}
+
+impl Builder {
+ /// Create a new builder for constructing the best possible prefilter.
+ pub(crate) fn new(kind: MatchKind) -> Builder {
+ let pbuilder = kind
+ .as_packed()
+ .map(|kind| packed::Config::new().match_kind(kind).builder());
+ Builder {
+ count: 0,
+ ascii_case_insensitive: false,
+ start_bytes: StartBytesBuilder::new(),
+ rare_bytes: RareBytesBuilder::new(),
+ memmem: MemmemBuilder::default(),
+ packed: pbuilder,
+ enabled: true,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
+ self.ascii_case_insensitive = yes;
+ self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
+ self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
+ self
+ }
+
+ /// Return a prefilter suitable for quickly finding potential matches.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ pub(crate) fn build(&self) -> Option<Prefilter> {
+ if !self.enabled {
+ debug!("prefilter not enabled, skipping");
+ return None;
+ }
+ // If we only have one pattern, then deferring to memmem is always
+ // the best choice. This is kind of a weird case, because, well, why
+ // use Aho-Corasick if you only have one pattern? But maybe you don't
+ // know exactly how many patterns you'll get up front, and you need to
+ // support the option of multiple patterns. So instead of relying on
+ // the caller to branch and use memmem explicitly, we just do it for
+ // them.
+ if !self.ascii_case_insensitive {
+ if let Some(pre) = self.memmem.build() {
+ debug!("using memmem prefilter");
+ return Some(pre);
+ }
+ }
+ let (packed, patlen, minlen) = if self.ascii_case_insensitive {
+ (None, usize::MAX, 0)
+ } else {
+ let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
+ let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
+ let packed =
+ self.packed.as_ref().and_then(|b| b.build()).map(|s| {
+ let memory_usage = s.memory_usage();
+ debug!(
+ "built packed prefilter (len: {}, \
+ minimum pattern len: {}, memory usage: {}) \
+ for consideration",
+ patlen, minlen, memory_usage,
+ );
+ Prefilter { finder: Arc::new(Packed(s)), memory_usage }
+ });
+ (packed, patlen, minlen)
+ };
+ match (self.start_bytes.build(), self.rare_bytes.build()) {
+ // If we could build both start and rare prefilters, then there are
+ // a few cases in which we'd want to use the start-byte prefilter
+ // over the rare-byte prefilter, since the former has lower
+ // overhead.
+ (prestart @ Some(_), prerare @ Some(_)) => {
+ debug!(
+ "both start (len={}, rank={}) and \
+ rare (len={}, rank={}) byte prefilters \
+ are available",
+ self.start_bytes.count,
+ self.start_bytes.rank_sum,
+ self.rare_bytes.count,
+ self.rare_bytes.rank_sum,
+ );
+ if patlen <= 16
+ && minlen >= 2
+ && self.start_bytes.count >= 3
+ && self.rare_bytes.count >= 3
+ {
+ debug!(
+ "start and rare byte prefilters available, but \
+ they're probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ // If the start-byte prefilter can scan for a smaller number
+ // of bytes than the rare-byte prefilter, then it's probably
+ // faster.
+ let has_fewer_bytes =
+ self.start_bytes.count < self.rare_bytes.count;
+ // Otherwise, if the combined frequency rank of the detected
+ // bytes in the start-byte prefilter is "close" to the combined
+ // frequency rank of the rare-byte prefilter, then we pick
+ // the start-byte prefilter even if the rare-byte prefilter
+ // heuristically searches for rare bytes. This is because the
+ // rare-byte prefilter has higher constant costs, so we tend to
+ // prefer the start-byte prefilter when we can.
+ let has_rarer_bytes =
+ self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
+ if has_fewer_bytes {
+ debug!(
+ "using start byte prefilter because it has fewer
+ bytes to search for than the rare byte prefilter",
+ );
+ prestart
+ } else if has_rarer_bytes {
+ debug!(
+ "using start byte prefilter because its byte \
+ frequency rank was determined to be \
+ \"good enough\" relative to the rare byte prefilter \
+ byte frequency rank",
+ );
+ prestart
+ } else {
+ debug!("using rare byte prefilter");
+ prerare
+ }
+ }
+ (prestart @ Some(_), None) => {
+ if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
+ debug!(
+ "start byte prefilter available, but \
+ it's probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ debug!(
+ "have start byte prefilter but not rare byte prefilter, \
+ so using start byte prefilter",
+ );
+ prestart
+ }
+ (None, prerare @ Some(_)) => {
+ if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
+ debug!(
+ "rare byte prefilter available, but \
+ it's probably slower than packed so using \
+ packed"
+ );
+ return packed;
+ }
+ debug!(
+ "have rare byte prefilter but not start byte prefilter, \
+ so using rare byte prefilter",
+ );
+ prerare
+ }
+ (None, None) if self.ascii_case_insensitive => {
+ debug!(
+ "no start or rare byte prefilter and ASCII case \
+ insensitivity was enabled, so skipping prefilter",
+ );
+ None
+ }
+ (None, None) => {
+ if packed.is_some() {
+ debug!("falling back to packed prefilter");
+ } else {
+ debug!("no prefilter available");
+ }
+ packed
+ }
+ }
+ }
+
+ /// Add a literal string to this prefilter builder.
+ pub(crate) fn add(&mut self, bytes: &[u8]) {
+ if bytes.is_empty() {
+ self.enabled = false;
+ }
+ if !self.enabled {
+ return;
+ }
+ self.count += 1;
+ self.start_bytes.add(bytes);
+ self.rare_bytes.add(bytes);
+ self.memmem.add(bytes);
+ if let Some(ref mut pbuilder) = self.packed {
+ pbuilder.add(bytes);
+ }
+ }
+}
+
+/// A type that wraps a packed searcher and implements the `Prefilter`
+/// interface.
+#[derive(Clone, Debug)]
+struct Packed(packed::Searcher);
+
+impl PrefilterI for Packed {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ self.0
+ .find_in(&haystack, span)
+ .map_or(Candidate::None, Candidate::Match)
+ }
+}
+
+/// A builder for constructing a prefilter that uses memmem.
+#[derive(Debug, Default)]
+struct MemmemBuilder {
+ /// The number of patterns that have been added.
+ count: usize,
+ /// The singular pattern to search for. This is only set when count==1.
+ one: Option<Vec<u8>>,
+}
+
+impl MemmemBuilder {
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(all(feature = "std", feature = "perf-literal"))]
+ fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
+ let pattern = builder.one.as_ref()?;
+ assert_eq!(1, builder.count);
+ let finder = Arc::new(Memmem(
+ memchr::memmem::Finder::new(pattern).into_owned(),
+ ));
+ let memory_usage = pattern.len();
+ Some(Prefilter { finder, memory_usage })
+ }
+
+ #[cfg(not(all(feature = "std", feature = "perf-literal")))]
+ fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ fn add(&mut self, bytes: &[u8]) {
+ self.count += 1;
+ if self.count == 1 {
+ self.one = Some(bytes.to_vec());
+ } else {
+ self.one = None;
+ }
+ }
+}
+
+/// A type that wraps a SIMD accelerated single substring search from the
+/// `memchr` crate for use as a prefilter.
+///
+/// Currently, this prefilter is only active for Aho-Corasick searchers with
+/// a single pattern. In theory, this could be extended to support searchers
+/// that have a common prefix of more than one byte (for one byte, we would use
+/// memchr), but it's not clear if it's worth it or not.
+///
+/// Also, unfortunately, this currently also requires the 'std' feature to
+/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
+/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
+/// disabled. But there should be an 'alloc' feature that brings in APIs like
+/// Finder::into_owned but doesn't use std-only features like runtime CPU
+/// feature detection.
+#[cfg(all(feature = "std", feature = "perf-literal"))]
+#[derive(Clone, Debug)]
+struct Memmem(memchr::memmem::Finder<'static>);
+
+#[cfg(all(feature = "std", feature = "perf-literal"))]
+impl PrefilterI for Memmem {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ use crate::util::primitives::PatternID;
+
+ self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
+ let start = span.start + i;
+ let end = start + self.0.needle().len();
+ // N.B. We can declare a match and use a fixed pattern ID here
+ // because a Memmem prefilter is only ever created for searchers
+ // with exactly one pattern. Thus, every match is always a match
+ // and it is always for the first and only pattern.
+ Candidate::Match(Match::new(PatternID::ZERO, start..end))
+ })
+ }
+}
+
+/// A builder for constructing a rare byte prefilter.
+///
+/// A rare byte prefilter attempts to pick out a small set of rare bytes that
+/// occur in the patterns, and then quickly scan for occurrences of those rare
+/// bytes.
+#[derive(Clone, Debug)]
+struct RareBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// A set of rare bytes, indexed by byte value.
+ rare_set: ByteSet,
+ /// A set of byte offsets associated with bytes in a pattern. An entry
+ /// corresponds to a particular byte (its index) and is only non-zero if
+ /// the byte occurred at an offset greater than 0 in at least one pattern.
+ ///
+ /// If a byte's offset is not representable in 8 bits, then the rare bytes
+ /// prefilter becomes inert.
+ byte_offsets: RareByteOffsets,
+ /// Whether this is available as a prefilter or not. This can be set to
+ /// false during construction if a condition is seen that invalidates the
+ /// use of the rare-byte prefilter.
+ available: bool,
+ /// The number of bytes set to an active value in `byte_offsets`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+/// A set of byte offsets, keyed by byte.
+#[derive(Clone, Copy)]
+struct RareByteOffsets {
+ /// Each entry corresponds to the maximum offset of the corresponding
+ /// byte across all patterns seen.
+ set: [RareByteOffset; 256],
+}
+
+impl RareByteOffsets {
+ /// Create a new empty set of rare byte offsets.
+ pub(crate) fn empty() -> RareByteOffsets {
+ RareByteOffsets { set: [RareByteOffset::default(); 256] }
+ }
+
+ /// Add the given offset for the given byte to this set. The offset
+ /// recorded for the byte is the maximum of the given offset and any
+ /// offset previously recorded for that byte.
+ pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
+ self.set[byte as usize].max =
+ cmp::max(self.set[byte as usize].max, off.max);
+ }
+}
+
+impl core::fmt::Debug for RareByteOffsets {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ let mut offsets = vec![];
+ for off in self.set.iter() {
+ if off.max > 0 {
+ offsets.push(off);
+ }
+ }
+ f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
+ }
+}
+
+/// Offsets associated with an occurrence of a "rare" byte in any of the
+/// patterns used to construct a single Aho-Corasick automaton.
+#[derive(Clone, Copy, Debug)]
+struct RareByteOffset {
+ /// The maximum offset at which a particular byte occurs from the start
+ /// of any pattern. This is used as a shift amount. That is, when an
+ /// occurrence of this byte is found, the candidate position reported by
+ /// the prefilter is `position_of_byte - max`, such that the automaton
+ /// will begin its search at a position that is guaranteed to observe a
+ /// match.
+ ///
+ /// To avoid accidentally quadratic behavior, a prefilter is considered
+ /// ineffective when it is asked to start scanning from a position that it
+ /// has already scanned past.
+ ///
+ /// Using a `u8` here means that if we ever see a pattern that's longer
+ /// than 255 bytes, then the entire rare byte prefilter is disabled.
+ max: u8,
+}
+
+impl Default for RareByteOffset {
+ fn default() -> RareByteOffset {
+ RareByteOffset { max: 0 }
+ }
+}
+
+impl RareByteOffset {
+ /// Create a new rare byte offset. If the given offset is too big, then
+ /// None is returned. In that case, callers should render the rare bytes
+ /// prefilter inert.
+ fn new(max: usize) -> Option<RareByteOffset> {
+ if max > u8::MAX as usize {
+ None
+ } else {
+ Some(RareByteOffset { max: max as u8 })
+ }
+ }
+}
+
+impl RareBytesBuilder {
+ /// Create a new builder for constructing a rare byte prefilter.
+ fn new() -> RareBytesBuilder {
+ RareBytesBuilder {
+ ascii_case_insensitive: false,
+ rare_set: ByteSet::empty(),
+ byte_offsets: RareByteOffsets::empty(),
+ available: true,
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the rare bytes prefilter.
+ ///
+ /// If there are more than 3 distinct rare bytes found, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(feature = "perf-literal")]
+ fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
+ if !builder.available || builder.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..=255 {
+ if builder.rare_set.contains(b) {
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ }
+ let finder: Arc<dyn PrefilterI> = match len {
+ 0 => return None,
+ 1 => Arc::new(RareBytesOne {
+ byte1: bytes[0],
+ offset: builder.byte_offsets.set[bytes[0] as usize],
+ }),
+ 2 => Arc::new(RareBytesTwo {
+ offsets: builder.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ }),
+ 3 => Arc::new(RareBytesThree {
+ offsets: builder.byte_offsets,
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ }),
+ _ => unreachable!(),
+ };
+ Some(Prefilter { finder, memory_usage: 0 })
+ }
+
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ // If we've already given up, then do nothing.
+ if !self.available {
+ return;
+ }
+ // If we've already blown our budget, then don't waste time looking
+ // for more rare bytes.
+ if self.count > 3 {
+ self.available = false;
+ return;
+ }
+ // If the pattern is too long, then our offset table is bunk, so
+ // give up.
+ if bytes.len() >= 256 {
+ self.available = false;
+ return;
+ }
+ let mut rarest = match bytes.get(0) {
+ None => return,
+ Some(&b) => (b, freq_rank(b)),
+ };
+ // The idea here is to look for the rarest byte in each pattern, and
+ // add that to our set. As a special exception, if we see a byte that
+ // we've already added, then we immediately stop and choose that byte,
+ // even if there's another rare byte in the pattern. This helps us
+ // apply the rare byte optimization in more cases by attempting to pick
+ // bytes that are in common between patterns. So for example, if we
+ // were searching for `Sherlock` and `lockjaw`, then this would pick
+ // `k` for both patterns, resulting in the use of `memchr` instead of
+ // `memchr2` for `k` and `j`.
+ let mut found = false;
+ for (pos, &b) in bytes.iter().enumerate() {
+ self.set_offset(pos, b);
+ if found {
+ continue;
+ }
+ if self.rare_set.contains(b) {
+ found = true;
+ continue;
+ }
+ let rank = freq_rank(b);
+ if rank < rarest.1 {
+ rarest = (b, rank);
+ }
+ }
+ if !found {
+ self.add_rare_byte(rarest.0);
+ }
+ }
+
+ fn set_offset(&mut self, pos: usize, byte: u8) {
+ // This unwrap is OK because pos is never bigger than our max.
+ let offset = RareByteOffset::new(pos).unwrap();
+ self.byte_offsets.set(byte, offset);
+ if self.ascii_case_insensitive {
+ self.byte_offsets.set(opposite_ascii_case(byte), offset);
+ }
+ }
+
+ fn add_rare_byte(&mut self, byte: u8) {
+ self.add_one_rare_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_rare_byte(opposite_ascii_case(byte));
+ }
+ }
+
+ fn add_one_rare_byte(&mut self, byte: u8) {
+ if !self.rare_set.contains(byte) {
+ self.rare_set.add(byte);
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
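+
+// A sketch of the offset bookkeeping described above, using the
+// "Sherlock"/"lockjaw" example: the offset recorded for a byte is the maximum
+// position at which it occurs in any pattern, so a candidate found at `pos`
+// can safely be rewound to `pos - max`.
+#[cfg(test)]
+mod rare_byte_offset_example {
+    use super::{RareByteOffset, RareByteOffsets};
+
+    #[test]
+    fn keeps_maximum_offset_per_byte() {
+        let mut offsets = RareByteOffsets::empty();
+        // 'k' occurs at offset 7 in "Sherlock" and at offset 3 in "lockjaw".
+        offsets.set(b'k', RareByteOffset::new(7).unwrap());
+        offsets.set(b'k', RareByteOffset::new(3).unwrap());
+        assert_eq!(offsets.set[usize::from(b'k')].max, 7);
+        // Offsets that don't fit in a u8 can't be represented, and render the
+        // rare byte prefilter inert.
+        assert!(RareByteOffset::new(256).is_none());
+    }
+}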
+
+/// A prefilter for scanning for a single "rare" byte.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesOne {
+ byte1: u8,
+ offset: RareByteOffset,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesOne {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr(self.byte1, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ cmp::max(
+ span.start,
+ pos.saturating_sub(usize::from(self.offset.max)),
+ )
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for two "rare" bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesTwo {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesTwo {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr2(self.byte1, self.byte2, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ let offset = self.offsets.set[usize::from(haystack[pos])].max;
+ cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for three "rare" bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct RareBytesThree {
+ offsets: RareByteOffsets,
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for RareBytesThree {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
+ .map(|i| {
+ let pos = span.start + i;
+ let offset = self.offsets.set[usize::from(haystack[pos])].max;
+ cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
+ })
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A builder for constructing a starting byte prefilter.
+///
+/// A starting byte prefilter is a simplistic prefilter that looks for possible
+/// matches by reporting all positions corresponding to a particular byte. This
+/// generally only takes effect when there are at most 3 distinct possible
+/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
+/// distinct starting bytes (`f` and `b`), and this prefilter returns all
+/// occurrences of either `f` or `b`.
+///
+/// In some cases, a heuristic frequency analysis may determine that it would
+/// be better not to use this prefilter even when there are 3 or fewer distinct
+/// starting bytes.
+#[derive(Clone, Debug)]
+struct StartBytesBuilder {
+ /// Whether this prefilter should account for ASCII case insensitivity or
+ /// not.
+ ascii_case_insensitive: bool,
+ /// The set of starting bytes observed.
+ byteset: Vec<bool>,
+ /// The number of bytes set to true in `byteset`.
+ count: usize,
+ /// The sum of frequency ranks for the rare bytes detected. This is
+ /// intended to give a heuristic notion of how rare the bytes are.
+ rank_sum: u16,
+}
+
+impl StartBytesBuilder {
+ /// Create a new builder for constructing a start byte prefilter.
+ fn new() -> StartBytesBuilder {
+ StartBytesBuilder {
+ ascii_case_insensitive: false,
+ byteset: vec![false; 256],
+ count: 0,
+ rank_sum: 0,
+ }
+ }
+
+ /// Enable ASCII case insensitivity. When set, byte strings added to this
+ /// builder will be interpreted without respect to ASCII case.
+ fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
+ self.ascii_case_insensitive = yes;
+ self
+ }
+
+ /// Build the starting bytes prefilter.
+ ///
+ /// If there are more than 3 distinct starting bytes, or if heuristics
+ /// otherwise determine that this prefilter should not be used, then `None`
+ /// is returned.
+ fn build(&self) -> Option<Prefilter> {
+ #[cfg(feature = "perf-literal")]
+ fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
+ if builder.count > 3 {
+ return None;
+ }
+ let (mut bytes, mut len) = ([0; 3], 0);
+ for b in 0..256 {
+ if !builder.byteset[b] {
+ continue;
+ }
+ // We don't handle non-ASCII bytes for now. Getting non-ASCII
+ // bytes right is trickier, since we generally don't want to put
+ // a leading UTF-8 code unit into a prefilter: such bytes tend to
+ // occur quite frequently. Instead, it would be better to use a
+ // continuation byte, but this requires more sophisticated analysis
+ // of the automaton and a richer prefilter API.
+ if b > 0x7F {
+ return None;
+ }
+ bytes[len] = b as u8;
+ len += 1;
+ }
+ let finder: Arc<dyn PrefilterI> = match len {
+ 0 => return None,
+ 1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
+ 2 => Arc::new(StartBytesTwo {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ }),
+ 3 => Arc::new(StartBytesThree {
+ byte1: bytes[0],
+ byte2: bytes[1],
+ byte3: bytes[2],
+ }),
+ _ => unreachable!(),
+ };
+ Some(Prefilter { finder, memory_usage: 0 })
+ }
+
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
+ None
+ }
+
+ imp(self)
+ }
+
+ /// Add a byte string to this builder.
+ ///
+ /// All patterns added to an Aho-Corasick automaton should be added to this
+ /// builder before attempting to construct the prefilter.
+ fn add(&mut self, bytes: &[u8]) {
+ if self.count > 3 {
+ return;
+ }
+ if let Some(&byte) = bytes.get(0) {
+ self.add_one_byte(byte);
+ if self.ascii_case_insensitive {
+ self.add_one_byte(opposite_ascii_case(byte));
+ }
+ }
+ }
+
+ fn add_one_byte(&mut self, byte: u8) {
+ if !self.byteset[byte as usize] {
+ self.byteset[byte as usize] = true;
+ self.count += 1;
+ self.rank_sum += freq_rank(byte) as u16;
+ }
+ }
+}
+
+/// A prefilter for scanning for a single starting byte.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesOne {
+ byte1: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesOne {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr(self.byte1, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for two starting bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesTwo {
+ byte1: u8,
+ byte2: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesTwo {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr2(self.byte1, self.byte2, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// A prefilter for scanning for three starting bytes.
+#[cfg(feature = "perf-literal")]
+#[derive(Clone, Debug)]
+struct StartBytesThree {
+ byte1: u8,
+ byte2: u8,
+ byte3: u8,
+}
+
+#[cfg(feature = "perf-literal")]
+impl PrefilterI for StartBytesThree {
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
+ memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
+ .map(|i| span.start + i)
+ .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
+ }
+}
+
+/// If the given byte is an ASCII letter, then return it in the opposite case.
+/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
+/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
+pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
+ if b'A' <= b && b <= b'Z' {
+ b.to_ascii_lowercase()
+ } else if b'a' <= b && b <= b'z' {
+ b.to_ascii_uppercase()
+ } else {
+ b
+ }
+}
+
+/// Return the frequency rank of the given byte. The higher the rank, the more
+/// common the byte (heuristically speaking).
+fn freq_rank(b: u8) -> u8 {
+ use crate::util::byte_frequencies::BYTE_FREQUENCIES;
+ BYTE_FREQUENCIES[b as usize]
+}
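+
+// A quick sketch of the two helpers above. The only assumption is that the
+// frequency table ranks an ASCII space as far more common than an obscure
+// control byte.
+#[cfg(test)]
+mod case_and_rank_example {
+    use super::{freq_rank, opposite_ascii_case};
+
+    #[test]
+    fn helpers_behave_as_documented() {
+        assert_eq!(opposite_ascii_case(b'A'), b'a');
+        assert_eq!(opposite_ascii_case(b'a'), b'A');
+        // Non-letters are passed through untouched.
+        assert_eq!(opposite_ascii_case(b'7'), b'7');
+        assert!(freq_rank(b' ') > freq_rank(b'\x1f'));
+    }
+}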
diff --git a/third_party/rust/aho-corasick/src/util/primitives.rs b/third_party/rust/aho-corasick/src/util/primitives.rs
new file mode 100644
index 0000000000..784d397171
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/primitives.rs
@@ -0,0 +1,759 @@
+/*!
+Lower level primitive types that are useful in a variety of circumstances.
+
+# Overview
+
+This list represents the principal types in this module and briefly describes
+when you might want to use them.
+
+* [`PatternID`] - A type that represents the identifier of a regex pattern.
+This is probably the most widely used type in this module (which is why it's
+also re-exported in the crate root).
+* [`StateID`] - A type that represents the identifier of a finite automaton
+state. This is used for both NFAs and DFAs, with the notable exception of
+the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
+identifier.)
+* [`SmallIndex`] - The internal representation of both a `PatternID` and a
+`StateID`. Its purpose is to serve as a type that can index memory without
+being as big as a `usize` on 64-bit targets. The main idea behind this type
+is that there are many things in regex engines that will, in practice, never
+overflow a 32-bit integer. (For example, the number of patterns in a regex
+or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
+memory without peppering `as` casts everywhere. Moreover, it forces callers
+to handle errors in the case where, somehow, the value would otherwise overflow
+either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
+*/
+
+// The macro we use to define some types below adds methods that we don't
+// use on some of the types. There isn't much, so we just squash the warning.
+#![allow(dead_code)]
+
+use alloc::vec::Vec;
+
+use crate::util::int::{Usize, U16, U32, U64};
+
+/// A type that represents a "small" index.
+///
+/// The main idea of this type is to provide something that can index memory,
+/// but uses less memory than `usize` on 64-bit systems. Specifically, its
+/// representation is always a `u32` and has `repr(transparent)` enabled. (So
+/// it is safe to transmute between a `u32` and a `SmallIndex`.)
+///
+/// A small index is typically useful in cases where there is no practical way
+/// that the index will overflow a 32-bit integer. A good example of this is
+/// an NFA state. If you could somehow build an NFA with `2^30` states, its
+/// memory usage would be exorbitant and its runtime execution would be so
+/// slow as to be completely worthless. Therefore, this crate generally deems
+/// it acceptable to return an error if it would otherwise build an NFA that
+/// requires a slice longer than what a 32-bit integer can index. In exchange,
+/// we can use 32-bit indices instead of 64-bit indices in various places.
+///
+/// This type ensures this by providing a constructor that will return an error
+/// if its argument cannot fit into the type. This makes it much easier to
+/// handle these sorts of boundary cases that are otherwise extremely subtle.
+///
+/// On all targets, this type guarantees that its value will fit in a `u32`,
+/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
+/// example, this type's maximum value will never overflow an `isize`,
+/// which means it will never overflow an `i16` even though its internal
+/// representation is still a `u32`.
+///
+/// The purpose for making the type fit into even signed integer types like
+/// `isize` is to guarantee that the difference between any two small indices
+/// is itself also a small index. This is useful in certain contexts, e.g.,
+/// for delta encoding.
+///
+/// # Other types
+///
+/// The following types wrap `SmallIndex` to provide a more focused use case:
+///
+/// * [`PatternID`] is for representing the identifiers of patterns.
+/// * [`StateID`] is for representing the identifiers of states in finite
+/// automata. It is used for both NFAs and DFAs.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `SmallIndex` to index slices.
+///
+/// # Safety
+///
+/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `SmallIndex` with
+/// an invalid value can be done in entirely safe code. This may in turn result
+/// in panics or silent logical errors.
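+///
+/// # Example
+///
+/// A minimal sketch of the fallible construction this type encourages, kept
+/// as a non-running snippet since the type is crate-internal:
+///
+/// ```ignore
+/// let idx = SmallIndex::new(5).unwrap();
+/// assert_eq!(idx.as_usize(), 5);
+/// // Values beyond SmallIndex::MAX are rejected instead of being silently
+/// // truncated.
+/// assert!(SmallIndex::new(usize::MAX).is_err());
+/// ```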
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+#[repr(transparent)]
+pub(crate) struct SmallIndex(u32);
+
+impl SmallIndex {
+ /// The maximum index value.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: SmallIndex =
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum index value.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: SmallIndex =
+        SmallIndex::new_unchecked(core::isize::MAX as usize - 1);
+
+ /// The total number of values that can be represented as a small index.
+ pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
+
+ /// The zero index value.
+ pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
+
+ /// The number of bytes that a single small index uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
+
+ /// Create a new small index.
+ ///
+ /// If the given index exceeds [`SmallIndex::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
+ SmallIndex::try_from(index)
+ }
+
+ /// Create a new small index without checking whether the given value
+ /// exceeds [`SmallIndex::MAX`].
+ ///
+ /// Using this routine with an invalid index value will result in
+ /// unspecified behavior, but *not* undefined behavior. In particular, an
+ /// invalid index value is likely to cause panics or possibly even silent
+ /// logical errors.
+ ///
+ /// Callers must never rely on a `SmallIndex` to be within a certain range
+ /// for memory safety.
+ #[inline]
+ pub const fn new_unchecked(index: usize) -> SmallIndex {
+ // FIXME: Use as_u32() once const functions in traits are stable.
+ SmallIndex::from_u32_unchecked(index as u32)
+ }
+
+ /// Create a new small index from a `u32` without checking whether the
+ /// given value exceeds [`SmallIndex::MAX`].
+ ///
+ /// Using this routine with an invalid index value will result in
+ /// unspecified behavior, but *not* undefined behavior. In particular, an
+ /// invalid index value is likely to cause panics or possibly even silent
+ /// logical errors.
+ ///
+ /// Callers must never rely on a `SmallIndex` to be within a certain range
+ /// for memory safety.
+ #[inline]
+ pub const fn from_u32_unchecked(index: u32) -> SmallIndex {
+ SmallIndex(index)
+ }
+
+ /// Like [`SmallIndex::new`], but panics if the given index is not valid.
+ #[inline]
+ pub fn must(index: usize) -> SmallIndex {
+ SmallIndex::new(index).expect("invalid small index")
+ }
+
+ /// Return this small index as a `usize`. This is guaranteed to never
+ /// overflow `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ self.0 as usize
+ }
+
+ /// Return this small index as a `u64`. This is guaranteed to never
+ /// overflow.
+ #[inline]
+ pub const fn as_u64(&self) -> u64 {
+ // FIXME: Use u64::from() once const functions in traits are stable.
+ self.0 as u64
+ }
+
+ /// Return the internal `u32` of this small index. This is guaranteed to
+ /// never overflow `u32`.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal `u32` of this small index represented as an `i32`.
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ // This is OK because we guarantee that our max value is <= i32::MAX.
+ self.0 as i32
+ }
+
+ /// Returns one more than this small index as a usize.
+ ///
+ /// Since a small index has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize`, `isize`, `u32` and an `i32`.
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize() + 1
+ }
+
+ /// Decode this small index from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a small index for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(
+ bytes: [u8; 4],
+ ) -> Result<SmallIndex, SmallIndexError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(id) });
+ }
+ Ok(SmallIndex::new_unchecked(id.as_usize()))
+ }
+
+ /// Decode this small index from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+    /// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
+ /// check whether the decoded integer is representable as a small index.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
+ SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
+ }
+
+ /// Return the underlying small index integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+}
+
+impl<T> core::ops::Index<SmallIndex> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: SmallIndex) -> &T {
+ &self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::IndexMut<SmallIndex> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: SmallIndex) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::Index<SmallIndex> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: SmallIndex) -> &T {
+ &self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: SmallIndex) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+}
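+
+// A minimal usage sketch of the pieces defined above: checked construction,
+// slice indexing via the `Index` impls, and a native-endian round trip. The
+// concrete values are arbitrary and exist only for illustration:
+//
+//     let idx = SmallIndex::new(2).expect("2 fits in a small index");
+//     let xs = [10u8, 20, 30];
+//     assert_eq!(30, xs[idx]);
+//     assert_eq!(idx, SmallIndex::from_ne_bytes(idx.to_ne_bytes()).unwrap());
+//     assert!(SmallIndex::new(usize::MAX).is_err());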
+
+impl From<StateID> for SmallIndex {
+ fn from(sid: StateID) -> SmallIndex {
+ sid.0
+ }
+}
+
+impl From<PatternID> for SmallIndex {
+ fn from(pid: PatternID) -> SmallIndex {
+ pid.0
+ }
+}
+
+impl From<u8> for SmallIndex {
+ fn from(index: u8) -> SmallIndex {
+ SmallIndex::new_unchecked(usize::from(index))
+ }
+}
+
+impl TryFrom<u16> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
+ if u32::from(index) > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(index) });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<u32> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(index) });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<u64> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_u64() {
+ return Err(SmallIndexError { attempted: index });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<usize> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_usize() {
+ return Err(SmallIndexError { attempted: index.as_u64() });
+ }
+ Ok(SmallIndex::new_unchecked(index))
+ }
+}
+
+/// This error occurs when a small index could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum small index value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SmallIndexError {
+ attempted: u64,
+}
+
+impl SmallIndexError {
+ /// Returns the value that could not be converted to a small index.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for SmallIndexError {}
+
+impl core::fmt::Display for SmallIndexError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create small index from {:?}, which exceeds {:?}",
+ self.attempted(),
+ SmallIndex::MAX,
+ )
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct SmallIndexIter {
+ rng: core::ops::Range<usize>,
+}
+
+impl Iterator for SmallIndexIter {
+ type Item = SmallIndex;
+
+ fn next(&mut self) -> Option<SmallIndex> {
+ if self.rng.start >= self.rng.end {
+ return None;
+ }
+ let next_id = self.rng.start + 1;
+ let id = core::mem::replace(&mut self.rng.start, next_id);
+ // new_unchecked is OK since we asserted that the number of
+ // elements in this iterator will fit in an ID at construction.
+ Some(SmallIndex::new_unchecked(id))
+ }
+}
+
+macro_rules! index_type_impls {
+ ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
+ impl $name {
+ /// The maximum value.
+ pub const MAX: $name = $name(SmallIndex::MAX);
+
+ /// The total number of values that can be represented.
+ pub const LIMIT: usize = SmallIndex::LIMIT;
+
+ /// The zero value.
+ pub const ZERO: $name = $name(SmallIndex::ZERO);
+
+ /// The number of bytes that a single value uses in memory.
+ pub const SIZE: usize = SmallIndex::SIZE;
+
+ /// Create a new value that is represented by a "small index."
+ ///
+ /// If the given index exceeds the maximum allowed value, then this
+ /// returns an error.
+ #[inline]
+ pub fn new(value: usize) -> Result<$name, $err> {
+ SmallIndex::new(value).map($name).map_err($err)
+ }
+
+ /// Create a new value without checking whether the given argument
+ /// exceeds the maximum.
+ ///
+ /// Using this routine with an invalid value will result in
+ /// unspecified behavior, but *not* undefined behavior. In
+ /// particular, an invalid ID value is likely to cause panics or
+ /// possibly even silent logical errors.
+ ///
+ /// Callers must never rely on this type to be within a certain
+ /// range for memory safety.
+ #[inline]
+ pub const fn new_unchecked(value: usize) -> $name {
+ $name(SmallIndex::new_unchecked(value))
+ }
+
+ /// Create a new value from a `u32` without checking whether the
+ /// given value exceeds the maximum.
+ ///
+ /// Using this routine with an invalid value will result in
+ /// unspecified behavior, but *not* undefined behavior. In
+ /// particular, an invalid ID value is likely to cause panics or
+ /// possibly even silent logical errors.
+ ///
+ /// Callers must never rely on this type to be within a certain
+ /// range for memory safety.
+ #[inline]
+ pub const fn from_u32_unchecked(index: u32) -> $name {
+ $name(SmallIndex::from_u32_unchecked(index))
+ }
+
+ /// Like `new`, but panics if the given value is not valid.
+ #[inline]
+ pub fn must(value: usize) -> $name {
+ $name::new(value).expect(concat!(
+ "invalid ",
+ stringify!($name),
+ " value"
+ ))
+ }
+
+ /// Return the internal value as a `usize`. This is guaranteed to
+ /// never overflow `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0.as_usize()
+ }
+
+ /// Return the internal value as a `u64`. This is guaranteed to
+ /// never overflow.
+ #[inline]
+ pub const fn as_u64(&self) -> u64 {
+ self.0.as_u64()
+ }
+
+ /// Return the internal value as a `u32`. This is guaranteed to
+ /// never overflow `u32`.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0.as_u32()
+ }
+
+ /// Return the internal value as a `i32`. This is guaranteed to
+ /// never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0.as_i32()
+ }
+
+ /// Returns one more than this value as a usize.
+ ///
+ /// Since values represented by a "small index" have constraints
+ /// on their maximum value, adding `1` to it will always fit in a
+            /// `usize`, `u32` and an `i32`.
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.0.one_more()
+ }
+
+ /// Decode this value from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a small index
+ /// for the current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
+ SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
+ }
+
+ /// Decode this value from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+            /// This is analogous to `new_unchecked` in that it does not check
+ /// whether the decoded integer is representable as a small index.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
+ $name(SmallIndex::from_ne_bytes_unchecked(bytes))
+ }
+
+ /// Return the underlying integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all values from 0 up to and not
+ /// including the given length.
+ ///
+ /// If the given length exceeds this type's limit, then this
+ /// panics.
+ pub(crate) fn iter(len: usize) -> $iter {
+ $iter::new(len)
+ }
+ }
+
+ // We write our own Debug impl so that we get things like PatternID(5)
+ // instead of PatternID(SmallIndex(5)).
+ impl core::fmt::Debug for $name {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
+ }
+ }
+
+ impl<T> core::ops::Index<$name> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $name) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$name> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: $name) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::Index<$name> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $name) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$name> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: $name) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl From<SmallIndex> for $name {
+ fn from(index: SmallIndex) -> $name {
+ $name(index)
+ }
+ }
+
+ impl From<u8> for $name {
+ fn from(value: u8) -> $name {
+ $name(SmallIndex::from(value))
+ }
+ }
+
+ impl TryFrom<u16> for $name {
+ type Error = $err;
+
+ fn try_from(value: u16) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<u32> for $name {
+ type Error = $err;
+
+ fn try_from(value: u32) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<u64> for $name {
+ type Error = $err;
+
+ fn try_from(value: u64) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<usize> for $name {
+ type Error = $err;
+
+ fn try_from(value: usize) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ /// This error occurs when an ID could not be constructed.
+ ///
+ /// This occurs when given an integer exceeding the maximum allowed
+ /// value.
+ ///
+ /// When the `std` feature is enabled, this implements the `Error`
+ /// trait.
+ #[derive(Clone, Debug, Eq, PartialEq)]
+ pub struct $err(SmallIndexError);
+
+ impl $err {
+ /// Returns the value that could not be converted to an ID.
+ pub fn attempted(&self) -> u64 {
+ self.0.attempted()
+ }
+ }
+
+ #[cfg(feature = "std")]
+ impl std::error::Error for $err {}
+
+ impl core::fmt::Display for $err {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create {} from {:?}, which exceeds {:?}",
+ stringify!($name),
+ self.attempted(),
+ $name::MAX,
+ )
+ }
+ }
+
+ #[derive(Clone, Debug)]
+ pub(crate) struct $iter(SmallIndexIter);
+
+ impl $iter {
+ fn new(len: usize) -> $iter {
+ assert!(
+ len <= $name::LIMIT,
+                    "cannot create iterator for {} when the number of \
+                    elements exceeds {:?}",
+ stringify!($name),
+ $name::LIMIT,
+ );
+ $iter(SmallIndexIter { rng: 0..len })
+ }
+ }
+
+ impl Iterator for $iter {
+ type Item = $name;
+
+ fn next(&mut self) -> Option<$name> {
+ self.0.next().map($name)
+ }
+ }
+
+ /// An iterator adapter that is like std::iter::Enumerate, but attaches
+ /// small index values instead. It requires `ExactSizeIterator`. At
+ /// construction, it ensures that the index of each element in the
+ /// iterator is representable in the corresponding small index type.
+ #[derive(Clone, Debug)]
+ pub(crate) struct $withiter<I> {
+ it: I,
+ ids: $iter,
+ }
+
+ impl<I: Iterator + ExactSizeIterator> $withiter<I> {
+ fn new(it: I) -> $withiter<I> {
+ let ids = $name::iter(it.len());
+ $withiter { it, ids }
+ }
+ }
+
+ impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
+ type Item = ($name, I::Item);
+
+ fn next(&mut self) -> Option<($name, I::Item)> {
+ let item = self.it.next()?;
+                // Number of elements in this iterator must match, according
+                // to the contract of ExactSizeIterator.
+ let id = self.ids.next().unwrap();
+ Some((id, item))
+ }
+ }
+ };
+}
+
+/// The identifier of a pattern in an Aho-Corasick automaton.
+///
+/// It is represented by a `u32` even on 64-bit systems in order to conserve
+/// space. Namely, on all targets, this type guarantees that its value will
+/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
+/// targets, for example, this type's maximum value will never overflow an
+/// `isize`, which means it will never overflow a `i16` even though its
+/// internal representation is still a `u32`.
+///
+/// # Safety
+///
+/// While a `PatternID` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `PatternID`
+/// with an invalid value can be done in entirely safe code. This may in turn
+/// result in panics or silent logical errors.
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct PatternID(SmallIndex);
+
+/// The identifier of a finite automaton state.
+///
+/// It is represented by a `u32` even on 64-bit systems in order to conserve
+/// space. Namely, on all targets, this type guarantees that its value will
+/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
+/// targets, for example, this type's maximum value will never overflow an
+/// `isize`, which means it will never overflow an `i16` even though its
+/// internal representation is still a `u32`.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `StateID` with an
+/// invalid value can be done in entirely safe code. This may in turn result in
+/// panics or silent logical errors.
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct StateID(SmallIndex);
+
+index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
+index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as "small index" types. We require ExactSizeIterator so
+/// that iterator construction can do a single check to make sure the index of
+/// each element is representable by its small index type.
+pub(crate) trait IteratorIndexExt: Iterator {
+ fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithPatternIDIter::new(self)
+ }
+
+ fn with_state_ids(self) -> WithStateIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithStateIDIter::new(self)
+ }
+}
+
+impl<I: Iterator> IteratorIndexExt for I {}
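+
+// A short sketch of how the generated index types and the adapters above fit
+// together. The pattern strings are arbitrary and merely serve as an
+// `ExactSizeIterator` to enumerate:
+//
+//     let pid = PatternID::must(1);
+//     assert_eq!(1, pid.as_usize());
+//     assert!(PatternID::try_from(u64::MAX).is_err());
+//
+//     let patterns = ["foo", "bar"];
+//     for (id, pat) in patterns.iter().with_pattern_ids() {
+//         assert_eq!(patterns[id.as_usize()], *pat);
+//     }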
diff --git a/third_party/rust/aho-corasick/src/util/remapper.rs b/third_party/rust/aho-corasick/src/util/remapper.rs
new file mode 100644
index 0000000000..7c47a082cd
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/remapper.rs
@@ -0,0 +1,214 @@
+use alloc::vec::Vec;
+
+use crate::{nfa::noncontiguous, util::primitives::StateID};
+
+/// Remappable is a tightly coupled abstraction that facilitates remapping
+/// state identifiers in DFAs.
+///
+/// The main idea behind remapping state IDs is that DFAs often need to check
+/// if a certain state is a "special" state of some kind (like a match state)
+/// during a search. Since this is extremely perf critical code, we want this
+/// check to be as fast as possible. Partitioning state IDs, for example,
+/// into "non-match" and "match" states means one can tell if a state is a
+/// match state via a simple comparison of the state ID.
+///
+/// The issue is that during the DFA construction process, it's not
+/// particularly easy to partition the states. Instead, the simplest thing is
+/// to often just do a pass over all of the states and shuffle them into their
+/// desired partitionings. To do that, we need a mechanism for swapping states.
+/// Hence, this abstraction.
+///
+/// Normally, for such little code, I would just duplicate it. But this is a
+/// key optimization and the implementation is a bit subtle. So the abstraction
+/// is basically a ham-fisted attempt at DRY. The only place we use this is in
+/// the dense and one-pass DFAs.
+///
+/// See also src/dfa/special.rs for a more detailed explanation of how dense
+/// DFAs are partitioned.
+pub(crate) trait Remappable: core::fmt::Debug {
+ /// Return the total number of states.
+ fn state_len(&self) -> usize;
+
+ /// Swap the states pointed to by the given IDs. The underlying finite
+ /// state machine should be mutated such that all of the transitions in
+ /// `id1` are now in the memory region where the transitions for `id2`
+ /// were, and all of the transitions in `id2` are now in the memory region
+ /// where the transitions for `id1` were.
+ ///
+ /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
+ ///
+ /// It is expected that, after calling this, the underlying state machine
+ /// will be left in an inconsistent state, since any other transitions
+ /// pointing to, e.g., `id1` need to be updated to point to `id2`, since
+ /// that's where `id1` moved to.
+ ///
+ /// In order to "fix" the underlying inconsistent state, a `Remapper`
+ /// should be used to guarantee that `remap` is called at the appropriate
+ /// time.
+ fn swap_states(&mut self, id1: StateID, id2: StateID);
+
+ /// This must remap every single state ID in the underlying value according
+ /// to the function given. For example, in a DFA, this should remap every
+ /// transition and every starting state ID.
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID);
+}
+
+/// Remapper is an abstraction that manages the remapping of state IDs in a
+/// finite state machine. This is useful when one wants to shuffle states into
+/// different positions in the machine.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` must be called, which will rewrite
+/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
+/// will almost certainly result in a corrupt machine.
+#[derive(Debug)]
+pub(crate) struct Remapper {
+ /// A map from the index of a state to its pre-multiplied identifier.
+ ///
+ /// When a state is swapped with another, then their corresponding
+ /// locations in this map are also swapped. Thus, its new position will
+ /// still point to its old pre-multiplied StateID.
+ ///
+ /// While there is a bit more to it, this then allows us to rewrite the
+ /// state IDs in a DFA's transition table in a single pass. This is done
+ /// by iterating over every ID in this map, then iterating over each
+ /// transition for the state at that ID and re-mapping the transition from
+ /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+ /// in this map where `old_id` *started*, and set it to where it ended up
+ /// after all swaps have been completed.
+ map: Vec<StateID>,
+ /// A way to map indices to state IDs (and back).
+ idx: IndexMapper,
+}
+
+impl Remapper {
+ /// Create a new remapper from the given remappable implementation. The
+ /// remapper can then be used to swap states. The remappable value given
+    /// here must be the same one given to `swap` and `remap`.
+ ///
+ /// The given stride should be the stride of the transition table expressed
+ /// as a power of 2. This stride is used to map between state IDs and state
+ /// indices. If state IDs and state indices are equivalent, then provide
+ /// a `stride2` of `0`, which acts as an identity.
+ pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper {
+ let idx = IndexMapper { stride2 };
+ let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect();
+ Remapper { map, idx }
+ }
+
+ /// Swap two states. Once this is called, callers must follow through to
+ /// call `remap`, or else it's possible for the underlying remappable
+ /// value to be in a corrupt state.
+ pub(crate) fn swap(
+ &mut self,
+ r: &mut impl Remappable,
+ id1: StateID,
+ id2: StateID,
+ ) {
+ if id1 == id2 {
+ return;
+ }
+ r.swap_states(id1, id2);
+ self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2));
+ }
+
+ /// Complete the remapping process by rewriting all state IDs in the
+ /// remappable value according to the swaps performed.
+ pub(crate) fn remap(mut self, r: &mut impl Remappable) {
+ // Update the map to account for states that have been swapped
+ // multiple times. For example, if (A, C) and (C, G) are swapped, then
+ // transitions previously pointing to A should now point to G. But if
+ // we don't update our map, they will erroneously be set to C. All we
+ // do is follow the swaps in our map until we see our original state
+ // ID.
+ //
+ // The intuition here is to think about how changes are made to the
+ // map: only through pairwise swaps. That means that starting at any
+ // given state, it is always possible to find the loop back to that
+ // state by following the swaps represented in the map (which might be
+ // 0 swaps).
+ //
+ // We are also careful to clone the map before starting in order to
+ // freeze it. We use the frozen map to find our loops, since we need to
+ // update our map as well. Without freezing it, our updates could break
+ // the loops referenced above and produce incorrect results.
+ let oldmap = self.map.clone();
+ for i in 0..r.state_len() {
+ let cur_id = self.idx.to_state_id(i);
+ let mut new_id = oldmap[i];
+ if cur_id == new_id {
+ continue;
+ }
+ loop {
+ let id = oldmap[self.idx.to_index(new_id)];
+ if cur_id == id {
+ self.map[i] = new_id;
+ break;
+ }
+ new_id = id;
+ }
+ }
+ r.remap(|sid| self.map[self.idx.to_index(sid)]);
+ }
+}
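+
+// A minimal sketch of the swap-then-remap contract described above. `Toy` is
+// a hypothetical `Remappable` whose only "transition data" is a single target
+// state per state; it exists purely to illustrate the workflow:
+//
+//     #[derive(Debug)]
+//     struct Toy {
+//         next: Vec<StateID>,
+//     }
+//
+//     impl Remappable for Toy {
+//         fn state_len(&self) -> usize {
+//             self.next.len()
+//         }
+//         fn swap_states(&mut self, id1: StateID, id2: StateID) {
+//             self.next.swap(id1.as_usize(), id2.as_usize());
+//         }
+//         fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+//             for sid in self.next.iter_mut() {
+//                 *sid = map(*sid);
+//             }
+//         }
+//     }
+//
+//     // Every state points at state 2. After swapping states 1 and 2 and
+//     // then remapping, every transition points at state 1, since that is
+//     // where the old state 2 ended up.
+//     let mut toy = Toy { next: alloc::vec![StateID::must(2); 3] };
+//     let mut remapper = Remapper::new(&toy, 0);
+//     remapper.swap(&mut toy, StateID::must(1), StateID::must(2));
+//     remapper.remap(&mut toy);
+//     assert_eq!(alloc::vec![StateID::must(1); 3], toy.next);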
+
+/// A simple type for mapping between state indices and state IDs.
+///
+/// The reason why this exists is because state IDs are "premultiplied" in a
+/// DFA. That is, in order to get to the transitions for a particular state,
+/// one need only use the state ID as-is, instead of having to multiply it by
+/// the transition table's stride.
+///
+/// The downside of this is that it's inconvenient to map between state IDs
+/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
+/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`,
+/// etc.
+///
+/// Since our state IDs are premultiplied, we can convert back-and-forth
+/// between IDs and indices by simply unmultiplying the IDs and multiplying the
+/// indices.
+///
+/// Note that for a sparse NFA, state IDs and indices are equivalent. In this
+/// case, we set the stride of the index mapper to be `0`, which acts as an
+/// identity.
+#[derive(Debug)]
+struct IndexMapper {
+ /// The power of 2 corresponding to the stride of the corresponding
+ /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
+ /// stride2' pre-multiplies an index to an ID.
+ stride2: usize,
+}
+
+impl IndexMapper {
+ /// Convert a state ID to a state index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert a state index to a state ID.
+ fn to_state_id(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID. We'll "just"
+ // wind up with panics or silent logic errors at some other point. But
+ // this is OK because if Remappable::state_len is correct and so is
+ // 'to_index', then all inputs to 'to_state_id' should be valid indices
+ // and thus transform into valid state IDs.
+ StateID::new_unchecked(index << self.stride2)
+ }
+}
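+
+// A small numeric sketch of the mapping above. With a transition table
+// stride of 4 (i.e., a `stride2` of 2), the state at index 3 has the
+// premultiplied ID 12, and vice versa. The stride chosen here is arbitrary:
+//
+//     let idx = IndexMapper { stride2: 2 };
+//     assert_eq!(StateID::must(12), idx.to_state_id(3));
+//     assert_eq!(3, idx.to_index(StateID::must(12)));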
+
+impl Remappable for noncontiguous::NFA {
+ fn state_len(&self) -> usize {
+ noncontiguous::NFA::states(self).len()
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ noncontiguous::NFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ noncontiguous::NFA::remap(self, map)
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/search.rs b/third_party/rust/aho-corasick/src/util/search.rs
new file mode 100644
index 0000000000..59b7035e1f
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/search.rs
@@ -0,0 +1,1148 @@
+use core::ops::{Range, RangeBounds};
+
+use crate::util::primitives::PatternID;
+
+/// The configuration and the haystack to use for an Aho-Corasick search.
+///
+/// When executing a search, there are a few parameters one might want to
+/// configure:
+///
+/// * The haystack to search, provided to the [`Input::new`] constructor. This
+/// is the only required parameter.
+/// * The span _within_ the haystack to limit a search to. (The default
+/// is the entire haystack.) This is configured via [`Input::span`] or
+/// [`Input::range`].
+/// * Whether to run an unanchored (matches can occur anywhere after the
+/// start of the search) or anchored (matches can only occur beginning at
+/// the start of the search) search. Unanchored search is the default. This is
+/// configured via [`Input::anchored`].
+/// * Whether to quit the search as soon as a match has been found, regardless
+/// of the [`MatchKind`] that the searcher was built with. This is configured
+/// via [`Input::earliest`].
+///
+/// For most cases, the defaults for all optional parameters are appropriate.
+/// The utility of this type is that it keeps the default or common case simple
+/// while permitting tweaking parameters in more niche use cases while reusing
+/// the same search APIs.
+///
+/// # Valid bounds and search termination
+///
+/// An `Input` permits setting the bounds of a search via either
+/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
+/// else a panic will occur. Bounds are valid if and only if:
+///
+/// * The bounds represent a valid range into the input's haystack.
+/// * **or** the end bound is a valid ending bound for the haystack *and*
+/// the start bound is exactly one greater than the end bound.
+///
+/// In the latter case, [`Input::is_done`] will return true, which indicates
+/// that any search receiving such an input should immediately return with no
+/// match.
+///
+/// Other than representing "search is complete," the `Input::span` and
+/// `Input::range` APIs are never necessary. Instead, callers can slice the
+/// haystack instead, e.g., with `&haystack[start..end]`. With that said, they
+/// can be more convenient than slicing because the match positions reported
+/// when using `Input::span` or `Input::range` are in terms of the original
+/// haystack. If you instead use `&haystack[start..end]`, then you'll need to
+/// add `start` to any match position returned in order for it to be a correct
+/// index into `haystack`.
+///
+/// # Example: `&str` and `&[u8]` automatically convert to an `Input`
+///
+/// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`.
+/// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept
+/// a `Into<Input>`. These two things combined together mean you can provide
+/// things like `&str` and `&[u8]` to search APIs when the defaults are
+/// suitable, but also an `Input` when they're not. For example:
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind};
+///
+/// // Build a searcher that supports both unanchored and anchored modes.
+/// let ac = AhoCorasick::builder()
+/// .start_kind(StartKind::Both)
+/// .build(&["abcd", "b"])
+/// .unwrap();
+/// let haystack = "abcd";
+///
+/// // A search using default parameters is unanchored. With standard
+/// // semantics, this finds `b` first.
+/// assert_eq!(
+/// Some(Match::must(1, 1..2)),
+/// ac.find(haystack),
+/// );
+/// // Using the same 'find' routine, we can provide an 'Input' explicitly
+/// // that is configured to do an anchored search. Since 'b' doesn't start
+/// // at the beginning of the search, it is not reported as a match.
+/// assert_eq!(
+/// Some(Match::must(0, 0..4)),
+/// ac.find(Input::new(haystack).anchored(Anchored::Yes)),
+/// );
+/// ```
+#[derive(Clone)]
+pub struct Input<'h> {
+ haystack: &'h [u8],
+ span: Span,
+ anchored: Anchored,
+ earliest: bool,
+}
+
+impl<'h> Input<'h> {
+ /// Create a new search configuration for the given haystack.
+ #[inline]
+ pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
+ Input {
+ haystack: haystack.as_ref(),
+ span: Span { start: 0, end: haystack.as_ref().len() },
+ anchored: Anchored::No,
+ earliest: false,
+ }
+ }
+
+ /// Set the span for this search.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`. To provide anything supported by range
+ /// syntax, use the [`Input::range`] method.
+ ///
+ /// The default span is the entire haystack.
+ ///
+ /// Note that [`Input::range`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the span of the search can impact whether a
+ /// match is reported or not.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind};
+ ///
+ /// let patterns = &["b", "abcd", "abc"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).span(0..3);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// // Without the span stopping the search early, 'abcd' would be reported
+ /// // because it is the correct leftmost-first match.
+ /// assert_eq!("abc", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
+ self.set_span(span);
+ self
+ }
+
+ /// Like `Input::span`, but accepts any range instead.
+ ///
+ /// The default range is the entire haystack.
+ ///
+ /// Note that [`Input::span`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This routine also panics if the given range does not correspond to
+ /// valid bounds in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ ///
+ /// let input = Input::new("foobar").range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
+ self.set_range(range);
+ self
+ }
+
+ /// Sets the anchor mode of a search.
+ ///
+ /// When a search is anchored (via [`Anchored::Yes`]), a match must begin
+ /// at the start of a search. When a search is not anchored (that's
+ /// [`Anchored::No`]), searchers will look for a match anywhere in the
+ /// haystack.
+ ///
+ /// By default, the anchored mode is [`Anchored::No`].
+ ///
+ /// # Support for anchored searches
+ ///
+ /// Anchored or unanchored searches might not always be available,
+ /// depending on the type of searcher used and its configuration:
+ ///
+ /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always
+ /// supports both unanchored and anchored searches.
+ /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both
+ /// unanchored and anchored searches.
+ /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored
+ /// searches by default.
+ /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can
+ /// be used to change the default to supporting both kinds of searches
+ /// or even just anchored searches.
+ /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a
+ /// `DFA`. Namely, it only supports unanchored searches by default, but
+ /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
+ /// can change this.
+ ///
+ /// If you try to execute a search using a `try_` ("fallible") method with
+ /// an unsupported anchor mode, then an error will be returned. For calls
+ /// to infallible search methods, a panic will result.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// an unanchored search. Notice that we build our `AhoCorasick` searcher
+ /// with [`StartKind::Both`] so that it supports both unanchored and
+ /// anchored searches simultaneously.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// AhoCorasick, Anchored, Input, MatchKind, StartKind,
+ /// };
+ ///
+ /// let patterns = &["bcd"];
+ /// let haystack = "abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .start_kind(StartKind::Both)
+ /// .build(patterns)
+ /// .unwrap();
+ ///
+ /// // Note that 'Anchored::No' is the default, so it doesn't need to
+ /// // be explicitly specified here.
+ /// let input = Input::new(haystack);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("bcd", &haystack[mat.span()]);
+ ///
+ /// // While 'bcd' occurs in the haystack, it does not begin where our
+ /// // search begins, so no match is found.
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ /// assert_eq!(None, ac.try_find(input)?);
+ ///
+ /// // However, if we start our search where 'bcd' starts, then we will
+ /// // find a match.
+ /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("bcd", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
+ self.set_anchored(mode);
+ self
+ }
+
+ /// Whether to execute an "earliest" search or not.
+ ///
+ /// When running a non-overlapping search, an "earliest" search will
+ /// return the match location as early as possible. For example, given
+ /// the patterns `abc` and `b`, and a haystack of `abc`, a normal
+ /// leftmost-first search will return `abc` as a match. But an "earliest"
+ /// search will return as soon as it is known that a match occurs, which
+ /// happens once `b` is seen.
+ ///
+ /// Note that when using [`MatchKind::Standard`], the "earliest" option
+ /// has no effect since standard semantics are already "earliest." Note
+ /// also that this has no effect in overlapping searches, since overlapping
+ /// searches also use standard semantics and report all possible matches.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows the difference between "earliest" searching and
+ /// normal leftmost searching.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind};
+ ///
+ /// let patterns = &["abc", "b"];
+ /// let haystack = "abc";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ ///
+ /// // The normal leftmost-first match.
+ /// let input = Input::new(haystack);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("abc", &haystack[mat.span()]);
+ ///
+ /// // The "earliest" possible match, even if it isn't leftmost-first.
+ /// let input = Input::new(haystack).earliest(true);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn earliest(mut self, yes: bool) -> Input<'h> {
+ self.set_earliest(yes);
+ self
+ }
+
+ /// Set the span for this search configuration.
+ ///
+ /// This is like the [`Input::span`] method, except this mutates the
+ /// span in place.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_span(2..4);
+ /// assert_eq!(2..4, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_span<S: Into<Span>>(&mut self, span: S) {
+ let span = span.into();
+ assert!(
+ span.end <= self.haystack.len()
+ && span.start <= span.end.wrapping_add(1),
+ "invalid span {:?} for haystack of length {}",
+ span,
+ self.haystack.len(),
+ );
+ self.span = span;
+ }
+
+ /// Set the span for this search configuration given any range.
+ ///
+ /// This is like the [`Input::range`] method, except this mutates the
+ /// span in place.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This routine also panics if the given range does not correspond to
+ /// valid bounds in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
+ use core::ops::Bound;
+
+ // It's a little weird to convert ranges into spans, and then spans
+ // back into ranges when we actually slice the haystack. Because
+ // of that process, we always represent everything as a half-open
+        // interval. Therefore, handling things like m..=n is a little awkward.
+ let start = match range.start_bound() {
+ Bound::Included(&i) => i,
+ // Can this case ever happen? Range syntax doesn't support it...
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(),
+ Bound::Unbounded => 0,
+ };
+ let end = match range.end_bound() {
+ Bound::Included(&i) => i.checked_add(1).unwrap(),
+ Bound::Excluded(&i) => i,
+ Bound::Unbounded => self.haystack().len(),
+ };
+ self.set_span(Span { start, end });
+ }
+
+ /// Set the starting offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the start of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_start(5);
+ /// assert_eq!(5..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_start(&mut self, start: usize) {
+ self.set_span(Span { start, ..self.get_span() });
+ }
+
+ /// Set the ending offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the end of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_end(5);
+ /// assert_eq!(0..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_end(&mut self, end: usize) {
+ self.set_span(Span { end, ..self.get_span() });
+ }
+
+ /// Set the anchor mode of a search.
+ ///
+ /// This is like [`Input::anchored`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::{Anchored, Input};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// input.set_anchored(Anchored::Yes);
+ /// assert_eq!(Anchored::Yes, input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn set_anchored(&mut self, mode: Anchored) {
+ self.anchored = mode;
+ }
+
+ /// Set whether the search should execute in "earliest" mode or not.
+ ///
+ /// This is like [`Input::earliest`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// input.set_earliest(true);
+ /// assert!(input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn set_earliest(&mut self, yes: bool) {
+ self.earliest = yes;
+ }
+
+ /// Return a borrow of the underlying haystack as a slice of bytes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(b"foobar", input.haystack());
+ /// ```
+ #[inline]
+ pub fn haystack(&self) -> &[u8] {
+ self.haystack
+ }
+
+ /// Return the start position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().start()`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0, input.start());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(2, input.start());
+ /// ```
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.get_span().start
+ }
+
+ /// Return the end position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().end()`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(6, input.end());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(4, input.end());
+ /// ```
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.get_span().end
+ }
+
+ /// Return the span for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::{Input, Span};
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
+ /// ```
+ #[inline]
+ pub fn get_span(&self) -> Span {
+ self.span
+ }
+
+ /// Return the span as a range for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn get_range(&self) -> Range<usize> {
+ self.get_span().range()
+ }
+
+ /// Return the anchored mode for this search configuration.
+ ///
+ /// If no anchored mode was set, then it defaults to [`Anchored::No`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::{Anchored, Input};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// input.set_anchored(Anchored::Yes);
+ /// assert_eq!(Anchored::Yes, input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn get_anchored(&self) -> Anchored {
+ self.anchored
+ }
+
+ /// Return whether this search should execute in "earliest" mode.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn get_earliest(&self) -> bool {
+ self.earliest
+ }
+
+ /// Return true if this input has been exhausted, which in turn means all
+ /// subsequent searches will return no matches.
+ ///
+ /// This occurs precisely when the start position of this search is greater
+ /// than the end position of the search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.is_done());
+ /// input.set_start(6);
+ /// assert!(!input.is_done());
+ /// input.set_start(7);
+ /// assert!(input.is_done());
+ /// ```
+ #[inline]
+ pub fn is_done(&self) -> bool {
+ self.get_span().start > self.get_span().end
+ }
+}
+
+impl<'h> core::fmt::Debug for Input<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmter = f.debug_struct("Input");
+ match core::str::from_utf8(self.haystack()) {
+ Ok(nice) => fmter.field("haystack", &nice),
+ Err(_) => fmter.field("haystack", &self.haystack()),
+ }
+ .field("span", &self.span)
+ .field("anchored", &self.anchored)
+ .field("earliest", &self.earliest)
+ .finish()
+ }
+}
+
+impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
+ #[inline]
+ fn from(haystack: &'h H) -> Input<'h> {
+ Input::new(haystack)
+ }
+}
+
+/// A representation of a range in a haystack.
+///
+/// A span corresponds to the starting and ending _byte offsets_ of a
+/// contiguous region of bytes. The starting offset is inclusive while the
+/// ending offset is exclusive. That is, a span is a half-open interval.
+///
+/// A span is used to report the offsets of a match, but it is also used to
+/// convey which region of a haystack should be searched via routines like
+/// [`Input::span`].
+///
+/// This is basically equivalent to a `std::ops::Range<usize>`, except this
+/// type implements `Copy` which makes it more ergonomic to use in the context
+/// of this crate. Indeed, `Span` exists only because `Range<usize>` does
+/// not implement `Copy`. Like a range, this implements `Index` for `[u8]`
+/// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls
+/// `From<Range>`, which means things like `Span::from(5..10)` work.
+///
+/// There are no constraints on the values of a span. It is, for example, legal
+/// to create a span where `start > end`.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Span {
+ /// The start offset of the span, inclusive.
+ pub start: usize,
+ /// The end offset of the span, exclusive.
+ pub end: usize,
+}
+
+impl Span {
+ /// Returns this span as a range.
+ #[inline]
+ pub fn range(&self) -> Range<usize> {
+ Range::from(*self)
+ }
+
+ /// Returns true when this span is empty. That is, when `start >= end`.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start >= self.end
+ }
+
+ /// Returns the length of this span.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end.saturating_sub(self.start)
+ }
+
+ /// Returns true when the given offset is contained within this span.
+ ///
+ /// Note that an empty span contains no offsets and will always return
+ /// false.
+ #[inline]
+ pub fn contains(&self, offset: usize) -> bool {
+ !self.is_empty() && self.start <= offset && offset <= self.end
+ }
+
+ /// Returns a new span with `offset` added to this span's `start` and `end`
+ /// values.
+ #[inline]
+ pub fn offset(&self, offset: usize) -> Span {
+ Span { start: self.start + offset, end: self.end + offset }
+ }
+}
+
+impl core::fmt::Debug for Span {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}..{}", self.start, self.end)
+ }
+}
+
+impl core::ops::Index<Span> for [u8] {
+ type Output = [u8];
+
+ #[inline]
+ fn index(&self, index: Span) -> &[u8] {
+ &self[index.range()]
+ }
+}
+
+impl core::ops::IndexMut<Span> for [u8] {
+ #[inline]
+ fn index_mut(&mut self, index: Span) -> &mut [u8] {
+ &mut self[index.range()]
+ }
+}
+
+impl core::ops::Index<Span> for str {
+ type Output = str;
+
+ #[inline]
+ fn index(&self, index: Span) -> &str {
+ &self[index.range()]
+ }
+}
+
+impl From<Range<usize>> for Span {
+ #[inline]
+ fn from(range: Range<usize>) -> Span {
+ Span { start: range.start, end: range.end }
+ }
+}
+
+impl From<Span> for Range<usize> {
+ #[inline]
+ fn from(span: Span) -> Range<usize> {
+ Range { start: span.start, end: span.end }
+ }
+}
+
+impl PartialEq<Range<usize>> for Span {
+ #[inline]
+ fn eq(&self, range: &Range<usize>) -> bool {
+ self.start == range.start && self.end == range.end
+ }
+}
+
+impl PartialEq<Span> for Range<usize> {
+ #[inline]
+ fn eq(&self, span: &Span) -> bool {
+ self.start == span.start && self.end == span.end
+ }
+}
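+
+// A brief sketch of the conversions and indexing impls above. The haystack
+// and offsets are arbitrary:
+//
+//     let span = Span::from(2..5);
+//     assert_eq!(3, span.len());
+//     assert_eq!(2..5, span.range());
+//     assert_eq!(span, 2..5);
+//     assert_eq!("oba", &"foobar"[span]);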
+
+/// The type of anchored search to perform.
+///
+/// If an Aho-Corasick searcher does not support the anchored mode selected,
+/// then the search will return an error or panic, depending on whether a
+/// fallible or an infallible routine was called.
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Anchored {
+ /// Run an unanchored search. This means a match may occur anywhere at or
+ /// after the start position of the search up until the end position of the
+ /// search.
+ No,
+ /// Run an anchored search. This means that a match must begin at the start
+ /// position of the search and end before the end position of the search.
+ Yes,
+}
+
+impl Anchored {
+ /// Returns true if and only if this anchor mode corresponds to an anchored
+ /// search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use aho_corasick::Anchored;
+ ///
+ /// assert!(!Anchored::No.is_anchored());
+ /// assert!(Anchored::Yes.is_anchored());
+ /// ```
+ #[inline]
+ pub fn is_anchored(&self) -> bool {
+ matches!(*self, Anchored::Yes)
+ }
+}
+
+/// A representation of a match reported by an Aho-Corasick searcher.
+///
+/// A match has two essential pieces of information: the [`PatternID`] that
+/// matches, and the [`Span`] of the match in a haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding searcher. If only a single pattern is provided, then all
+/// matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every match reported by a searcher guarantees that its span has its start
+/// offset as less than or equal to its end offset.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The underlying match span.
+ span: Span,
+}
+
+impl Match {
+ /// Create a new match from a pattern ID and a span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the first pattern in an
+ /// Aho-Corasick searcher using convenient range syntax.
+ ///
+ /// ```
+ /// use aho_corasick::{Match, PatternID};
+ ///
+ /// let m = Match::new(PatternID::ZERO, 5..10);
+ /// assert_eq!(0, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
+ let span = span.into();
+ assert!(span.start <= span.end, "invalid match span");
+ Match { pattern, span }
+ }
+
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// This is like [`Match::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the third pattern in an
+ /// Aho-Corasick searcher using convenient range syntax.
+ ///
+ /// ```
+ /// use aho_corasick::Match;
+ ///
+ /// let m = Match::must(3, 5..10);
+ /// assert_eq!(3, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
+ Match::new(PatternID::must(pattern), span)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding searcher. The first pattern
+ /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().start`.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.span().start
+ }
+
+ /// The ending position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().end`.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.span().end
+ }
+
+ /// Returns the match span as a range.
+ ///
+ /// This is a convenience routine for `Match::span().range()`.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.span().range()
+ }
+
+ /// Returns the span for this match.
+ #[inline]
+ pub fn span(&self) -> Span {
+ self.span
+ }
+
+ /// Returns true when the span in this match is empty.
+ ///
+ /// An empty match can only be returned when an empty pattern is in the
+ /// Aho-Corasick searcher.
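+ ///
+ /// # Example
+ ///
+ /// A brief illustration, assuming the searcher was built with an empty
+ /// pattern:
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let ac = AhoCorasick::new(&[""]).unwrap();
+ /// let mat = ac.find("foo").expect("the empty pattern matches everywhere");
+ /// assert!(mat.is_empty());
+ /// assert_eq!(0, mat.len());
+ /// ```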
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.span().is_empty()
+ }
+
+ /// Returns the length of this match.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.span().len()
+ }
+
+ /// Returns a new match with `offset` added to its span's `start` and `end`
+ /// values.
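+ ///
+ /// # Example
+ ///
+ /// A small sketch; the offsets here are arbitrary:
+ ///
+ /// ```
+ /// use aho_corasick::Match;
+ ///
+ /// let m = Match::must(0, 2..5);
+ /// let m = m.offset(10);
+ /// assert_eq!(12..15, m.range());
+ /// ```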
+ #[inline]
+ pub fn offset(&self, offset: usize) -> Match {
+ Match {
+ pattern: self.pattern,
+ span: Span {
+ start: self.start() + offset,
+ end: self.end() + offset,
+ },
+ }
+ }
+}
+
+/// A knob for controlling the match semantics of an Aho-Corasick automaton.
+///
+/// There are two generally different ways that Aho-Corasick automatons can
+/// report matches. The first way is the "standard" approach that results from
+/// implementing most textbook explanations of Aho-Corasick. The second way is
+/// to report only the leftmost non-overlapping matches. The leftmost approach
+/// is in turn split into two different ways of resolving ambiguous matches:
+/// leftmost-first and leftmost-longest.
+///
+/// The `Standard` match kind is the default and is the only one that supports
+/// overlapping matches and stream searching. (Trying to find overlapping or
+/// streaming matches using leftmost match semantics will result in an error in
+/// fallible APIs and a panic when using infallible APIs.) The `Standard`
+/// match kind will report matches as they are seen. When searching for
+/// overlapping matches, all possible matches are reported. When searching for
+/// non-overlapping matches, the first match seen is reported. For example, for
+/// non-overlapping matches, given the patterns `abcd` and `b` and the haystack
+/// `abcdef`, only a match for `b` is reported since it is detected first. The
+/// `abcd` match is never reported since it overlaps with the `b` match.
+///
+/// In contrast, the leftmost match kind always prefers the leftmost match
+/// among all possible matches. Given the same example as above with `abcd` and
+/// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd`
+/// since it begins before the `b` match, even though the `b` match is detected
+/// before the `abcd` match. In this case, the `b` match is not reported at all
+/// since it overlaps with the `abcd` match.
+///
+/// The difference between leftmost-first and leftmost-longest is in how they
+/// resolve ambiguous matches when there are multiple leftmost matches to
+/// choose from. Leftmost-first always chooses the pattern that was provided
+/// earliest, whereas leftmost-longest always chooses the longest matching
+/// pattern. For example, given the patterns `a` and `ab` and the subject
+/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
+/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
+/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
+/// would be `ab`. Stated differently, the leftmost-first match depends on the
+/// order in which the patterns were given to the Aho-Corasick automaton.
+/// Because of that, when leftmost-first matching is used, if a pattern `A`
+/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
+/// to ever observe a match of `B`.
+///
+/// If you're not sure which match kind to pick, then stick with the standard
+/// kind, which is the default. In particular, if you need overlapping or
+/// streaming matches, then you _must_ use the standard kind. The leftmost
+/// kinds are useful in specific circumstances. For example, leftmost-first can
+/// be very useful as a way to implement match priority based on the order of
+/// patterns given and leftmost-longest can be useful for dictionary searching
+/// such that only the longest matching words are reported.
+///
+/// # Relationship with regular expression alternations
+///
+/// Understanding match semantics can be a little tricky, and one easy way
+/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
+/// is to think about them as a simple alternation of literals in a regular
+/// expression. For example, let's say we wanted to match the strings
+/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
+/// turns out that regular expression engines have two different ways of
+/// matching this alternation. The first way, leftmost-longest, is commonly
+/// found in POSIX compatible implementations of regular expressions (such as
+/// `grep`). The second way, leftmost-first, is commonly found in backtracking
+/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
+/// regex engine, do not use backtracking, but still implement leftmost-first
+/// semantics in an effort to match the behavior of dominant backtracking
+/// regex engines such as those found in Perl, Ruby, Python, JavaScript and
+/// PHP.)
+///
+/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
+/// will match `Samwise` because it is the longest possible match, but a
+/// Perl-like regex will match `Sam` since it appears earlier in the
+/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
+/// will never match `Samwise` since `Sam` will always have higher priority.
+/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
+/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
+/// still the longest match, but it also appears earlier than `Sam`.
+///
+/// The "standard" match semantics of Aho-Corasick generally don't correspond
+/// to the match semantics of any large group of regex implementations, so
+/// there's no direct analogy that can be made here. Standard match semantics
+/// are generally useful for overlapping matches, or if you just want to see
+/// matches as they are detected.
+///
+/// The main conclusion to draw from this section is that the match semantics
+/// can be tweaked to precisely match either Perl-like regex alternations or
+/// POSIX regex alternations.
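+///
+/// # Example
+///
+/// A sketch of the three match kinds applied to the `Sam`/`Samwise` example
+/// above, using the crate's top-level `AhoCorasick` searcher:
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, MatchKind};
+///
+/// let haystack = "Samwise";
+///
+/// // Standard semantics report the match detected first: `Sam`.
+/// let ac = AhoCorasick::builder()
+///     .match_kind(MatchKind::Standard) // this is the default
+///     .build(&["Samwise", "Sam"])
+///     .unwrap();
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Sam", &haystack[mat.range()]);
+///
+/// // Leftmost-first prefers the pattern given first: `Samwise`.
+/// let ac = AhoCorasick::builder()
+///     .match_kind(MatchKind::LeftmostFirst)
+///     .build(&["Samwise", "Sam"])
+///     .unwrap();
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Samwise", &haystack[mat.range()]);
+///
+/// // Leftmost-longest prefers the longest match, regardless of the order
+/// // in which the patterns were given: `Samwise`.
+/// let ac = AhoCorasick::builder()
+///     .match_kind(MatchKind::LeftmostLongest)
+///     .build(&["Sam", "Samwise"])
+///     .unwrap();
+/// let mat = ac.find(haystack).expect("should have a match");
+/// assert_eq!("Samwise", &haystack[mat.range()]);
+/// ```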
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Use standard match semantics, which support overlapping matches. When
+ /// used with non-overlapping matches, matches are reported as they are
+ /// seen.
+ Standard,
+ /// Use leftmost-first match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the match
+ /// corresponding to the pattern that appeared earlier when constructing
+ /// the automaton is reported.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will fail.
+ LeftmostFirst,
+ /// Use leftmost-longest match semantics, which reports leftmost matches.
+ /// When there are multiple possible leftmost matches, the longest match
+ /// is chosen.
+ ///
+ /// This does **not** support overlapping matches or stream searching. If
+ /// this match kind is used, attempting to find overlapping matches or
+ /// stream matches will fail.
+ LeftmostLongest,
+}
+
+/// The default match kind is `MatchKind::Standard`.
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::Standard
+ }
+}
+
+impl MatchKind {
+ #[inline]
+ pub(crate) fn is_standard(&self) -> bool {
+ matches!(*self, MatchKind::Standard)
+ }
+
+ #[inline]
+ pub(crate) fn is_leftmost(&self) -> bool {
+ matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest)
+ }
+
+ #[inline]
+ pub(crate) fn is_leftmost_first(&self) -> bool {
+ matches!(*self, MatchKind::LeftmostFirst)
+ }
+
+ /// Convert this match kind into a packed match kind. If this match kind
+ /// corresponds to standard semantics, then this returns None, since
+ /// packed searching does not support standard semantics.
+ #[inline]
+ pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> {
+ match *self {
+ MatchKind::Standard => None,
+ MatchKind::LeftmostFirst => {
+ Some(crate::packed::MatchKind::LeftmostFirst)
+ }
+ MatchKind::LeftmostLongest => {
+ Some(crate::packed::MatchKind::LeftmostLongest)
+ }
+ }
+ }
+}
+
+/// The kind of anchored starting configurations to support in an Aho-Corasick
+/// searcher.
+///
+/// Depending on which searcher is used internally by
+/// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored
+/// and anchored searches can be quite costly. For this reason,
+/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
+/// can be used to configure whether your searcher supports unanchored,
+/// anchored or both kinds of searches.
+///
+/// This searcher configuration knob works in concert with the search time
+/// configuration [`Input::anchored`]. Namely, if one requests an unsupported
+/// anchored mode, then the search will either panic or return an error,
+/// depending on whether you're using infallible or fallible APIs, respectively.
+///
+/// `AhoCorasick` by default only supports unanchored searches.
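+///
+/// # Example
+///
+/// A sketch of configuring an anchored-only searcher and then running both
+/// kinds of searches against it (this assumes the builder and `Input` APIs
+/// from the crate root):
+///
+/// ```
+/// use aho_corasick::{AhoCorasick, Anchored, Input, StartKind};
+///
+/// let ac = AhoCorasick::builder()
+///     .start_kind(StartKind::Anchored)
+///     .build(&["b", "abcd"])
+///     .unwrap();
+///
+/// // Anchored searches are supported, and a match must begin at the start
+/// // of the search. Only `abcd` can match here.
+/// let input = Input::new("abcdef").anchored(Anchored::Yes);
+/// let mat = ac.try_find(input).unwrap().expect("should have a match");
+/// assert_eq!(0..4, mat.range());
+///
+/// // But requesting an unanchored search returns an error.
+/// let input = Input::new("abcdef").anchored(Anchored::No);
+/// assert!(ac.try_find(input).is_err());
+/// ```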
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum StartKind {
+ /// Support both anchored and unanchored searches.
+ Both,
+ /// Support only unanchored searches. Requesting an anchored search will
+ /// return an error in fallible APIs and panic in infallible APIs.
+ Unanchored,
+ /// Support only anchored searches. Requesting an unanchored search will
+ /// return an error in fallible APIs and panic in infallible APIs.
+ Anchored,
+}
+
+impl Default for StartKind {
+ fn default() -> StartKind {
+ StartKind::Unanchored
+ }
+}
diff --git a/third_party/rust/aho-corasick/src/util/special.rs b/third_party/rust/aho-corasick/src/util/special.rs
new file mode 100644
index 0000000000..beeba40c89
--- /dev/null
+++ b/third_party/rust/aho-corasick/src/util/special.rs
@@ -0,0 +1,42 @@
+use crate::util::primitives::StateID;
+
+/// A collection of sentinel state IDs for Aho-Corasick automata.
+///
+/// This specifically enables the technique by which we determine which states
+/// are dead, matches or start states. Namely, by arranging states in a
+/// particular order, we can determine the type of a state simply by looking at
+/// its ID.
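+///
+/// As a rough sketch, the checks this layout enables are simple integer
+/// comparisons. (The handling of the dead/fail sentinel states is elided
+/// here; the automaton implementations deal with those separately.)
+///
+/// ```ignore
+/// fn is_special(s: &Special, sid: StateID) -> bool {
+///     sid <= s.max_special_id
+/// }
+///
+/// fn is_match(s: &Special, sid: StateID) -> bool {
+///     sid <= s.max_match_id
+/// }
+///
+/// fn is_start(s: &Special, sid: StateID) -> bool {
+///     sid == s.start_unanchored_id || sid == s.start_anchored_id
+/// }
+/// ```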
+#[derive(Clone, Debug)]
+pub(crate) struct Special {
+ /// The maximum ID of all the "special" states. This corresponds to
+ /// start_anchored_id when a prefilter is active, or to max_match_id when
+ /// a prefilter is not active. The idea here is that if there is no
+ /// prefilter, then there is no point in treating start states as special.
+ pub(crate) max_special_id: StateID,
+ /// The maximum ID of all the match states. Any state ID bigger than this
+ /// is guaranteed to be a non-match ID.
+ ///
+ /// It is possible and legal for max_match_id to be equal to
+ /// start_anchored_id, which occurs precisely in the case where the empty
+ /// string is a pattern that was added to the underlying automaton.
+ pub(crate) max_match_id: StateID,
+ /// The state ID of the start state used for unanchored searches.
+ pub(crate) start_unanchored_id: StateID,
+ /// The state ID of the start state used for anchored searches. This is
+ /// always start_unanchored_id+1.
+ pub(crate) start_anchored_id: StateID,
+}
+
+impl Special {
+ /// Create a new set of "special" state IDs with all IDs initialized to
+ /// zero. The general idea here is that they will be updated and set to
+ /// correct values later.
+ pub(crate) fn zero() -> Special {
+ Special {
+ max_special_id: StateID::ZERO,
+ max_match_id: StateID::ZERO,
+ start_unanchored_id: StateID::ZERO,
+ start_anchored_id: StateID::ZERO,
+ }
+ }
+}