diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:47:55 +0000 |
commit | 2aadc03ef15cb5ca5cc2af8a7c08e070742f0ac4 (patch) | |
tree | 033cc839730fda84ff08db877037977be94e5e3a /vendor/aho-corasick | |
parent | Initial commit. (diff) | |
download | cargo-upstream.tar.xz cargo-upstream.zip |
Adding upstream version 0.70.1+ds1.upstream/0.70.1+ds1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/aho-corasick')
41 files changed, 22078 insertions, 0 deletions
diff --git a/vendor/aho-corasick/.cargo-checksum.json b/vendor/aho-corasick/.cargo-checksum.json new file mode 100644 index 0000000..b0c24a1 --- /dev/null +++ b/vendor/aho-corasick/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{},"package":"b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"}
\ No newline at end of file diff --git a/vendor/aho-corasick/COPYING b/vendor/aho-corasick/COPYING new file mode 100644 index 0000000..bb9c20a --- /dev/null +++ b/vendor/aho-corasick/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/vendor/aho-corasick/Cargo.toml b/vendor/aho-corasick/Cargo.toml new file mode 100644 index 0000000..05e899c --- /dev/null +++ b/vendor/aho-corasick/Cargo.toml @@ -0,0 +1,74 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +rust-version = "1.60.0" +name = "aho-corasick" +version = "1.1.2" +authors = ["Andrew Gallant <jamslam@gmail.com>"] +exclude = [ + "/aho-corasick-debug", + "/benchmarks", + "/tmp", +] +autotests = false +description = "Fast multiple substring searching." 
+homepage = "https://github.com/BurntSushi/aho-corasick" +readme = "README.md" +keywords = [ + "string", + "search", + "text", + "pattern", + "multi", +] +categories = ["text-processing"] +license = "Unlicense OR MIT" +repository = "https://github.com/BurntSushi/aho-corasick" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = [ + "--cfg", + "docsrs", + "--generate-link-to-definition", +] + +[profile.bench] +debug = 2 + +[profile.release] +debug = 2 + +[lib] +name = "aho_corasick" + +[dependencies.log] +version = "0.4.17" +optional = true + +[dependencies.memchr] +version = "2.4.0" +optional = true +default-features = false + +[dev-dependencies.doc-comment] +version = "0.3.3" + +[features] +default = [ + "std", + "perf-literal", +] +logging = ["dep:log"] +perf-literal = ["dep:memchr"] +std = ["memchr?/std"] diff --git a/vendor/aho-corasick/DESIGN.md b/vendor/aho-corasick/DESIGN.md new file mode 100644 index 0000000..f911f0c --- /dev/null +++ b/vendor/aho-corasick/DESIGN.md @@ -0,0 +1,481 @@ +This document describes the internal design of this crate, which is an object +lesson in what happens when you take a fairly simple old algorithm like +Aho-Corasick and make it fast and production ready. + +The target audience of this document is Rust programmers that have some +familiarity with string searching, however, one does not need to know the +Aho-Corasick algorithm in order to read this (it is explained below). One +should, however, know what a trie is. (If you don't, go read its Wikipedia +article.) + +The center-piece of this crate is an implementation of Aho-Corasick. On its +own, Aho-Corasick isn't that complicated. The complex pieces come from the +different variants of Aho-Corasick implemented in this crate. Specifically, +they are: + +* Aho-Corasick as a noncontiguous NFA. States have their transitions + represented sparsely, and each state puts its transitions in its own separate + allocation. Hence the name "noncontiguous." 
+* Aho-Corasick as a contiguous NFA. This NFA uses a single allocation to + represent the transitions of all states. That is, transitions are laid out + contiguously in memory. Moreover, states near the starting state are + represented densely, such that finding the next state ID takes a constant + number of instructions. +* Aho-Corasick as a DFA. In this case, all states are represented densely in + a transition table that uses one allocation. +* Supporting "standard" match semantics, along with its overlapping variant, + in addition to leftmost-first and leftmost-longest semantics. The "standard" + semantics are typically what you see in a textbook description of + Aho-Corasick. However, Aho-Corasick is also useful as an optimization in + regex engines, which often use leftmost-first or leftmost-longest semantics. + Thus, it is useful to implement those semantics here. The "standard" and + "leftmost" search algorithms are subtly different, and also require slightly + different construction algorithms. +* Support for ASCII case insensitive matching. +* Support for accelerating searches when the patterns all start with a small + number of fixed bytes. Or alternatively, when the patterns all contain a + small number of rare bytes. (Searching for these bytes uses SIMD vectorized + code courtesy of `memchr`.) +* Transparent support for alternative SIMD vectorized search routines for + smaller number of literals, such as the Teddy algorithm. We called these + "packed" search routines because they use SIMD. They can often be an order of + magnitude faster than just Aho-Corasick, but don't scale as well. +* Support for searching streams. This can reuse most of the underlying code, + but does require careful buffering support. +* Support for anchored searches, which permit efficient "is prefix" checks for + a large number of patterns. 
+ +When you combine all of this together along with trying to make everything as +fast as possible, what you end up with is entirely too much code with too much +`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead, +we will explain it. + + +# Basics + +The fundamental problem this crate is trying to solve is to determine the +occurrences of possibly many patterns in a haystack. The naive way to solve +this is to look for a match for each pattern at each position in the haystack: + + for i in 0..haystack.len(): + for p in patterns.iter(): + if haystack[i..].starts_with(p.bytes()): + return Match(p.id(), i, i + p.bytes().len()) + +Those four lines are effectively all this crate does. The problem with those +four lines is that they are very slow, especially when you're searching for a +large number of patterns. + +While there are many different algorithms available to solve this, a popular +one is Aho-Corasick. It's a common solution because it's not too hard to +implement, scales quite well even when searching for thousands of patterns and +is generally pretty fast. Aho-Corasick does well here because, regardless of +the number of patterns you're searching for, it always visits each byte in the +haystack exactly once. This means, generally speaking, adding more patterns to +an Aho-Corasick automaton does not make it slower. (Strictly speaking, however, +this is not true, since a larger automaton will make less effective use of the +CPU's cache.) + +Aho-Corasick can be succinctly described as a trie with state transitions +between some of the nodes that efficiently instruct the search algorithm to +try matching alternative keys in the trie. The trick is that these state +transitions are arranged such that each byte of input needs to be inspected +only once. 
These state transitions are typically called "failure transitions," +because they instruct the searcher (the thing traversing the automaton while +reading from the haystack) what to do when a byte in the haystack does not +correspond to a valid transition in the current state of the trie. + +More formally, a failure transition points to a state in the automaton that may +lead to a match whose prefix is a proper suffix of the path traversed through +the trie so far. (If no such proper suffix exists, then the failure transition +points back to the start state of the trie, effectively restarting the search.) +This is perhaps simpler to explain pictorially. For example, let's say we built +an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The +trie looks like this: + + a - S1 - b - S2 - c - S3 - d - S4* + / + S0 - c - S5 - e - S6 - f - S7* + +where states marked with a `*` are match states (meaning, the search algorithm +should stop and report a match to the caller). + +So given this trie, it should be somewhat straight-forward to see how it can +be used to determine whether any particular haystack *starts* with either +`abcd` or `cef`. It's easy to express this in code: + + fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool { + let mut state_id = trie.start(); + // If the empty pattern is in trie, then state_id is a match state. + if trie.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + state_id = match trie.next_state(state_id, b) { + Some(id) => id, + // If there was no transition for this state and byte, then we know + // the haystack does not start with one of the patterns in our trie. + None => return false, + }; + if trie.is_match(state_id) { + return true; + } + } + false + } + +And that's pretty much it. All we do is move through the trie starting with the +bytes at the beginning of the haystack. 
If we find ourselves in a position +where we can't move, or if we've looked through the entire haystack without +seeing a match state, then we know the haystack does not start with any of the +patterns in the trie. + +The meat of the Aho-Corasick algorithm is in how we add failure transitions to +our trie to keep searching efficient. Specifically, it permits us to not only +check whether a haystack *starts* with any one of a number of patterns, but +rather, whether the haystack contains any of a number of patterns *anywhere* in +the haystack. + +As mentioned before, failure transitions connect a proper suffix of the path +traversed through the trie before, with a path that leads to a match that has a +prefix corresponding to that proper suffix. So in our case, for patterns `abcd` +and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from +the diagram above) from `S3` upon seeing that the byte following `c` is not +`d`. Namely, the proper suffix in this example is `c`, which is a prefix of +`cef`. So the modified diagram looks like this: + + + a - S1 - b - S2 - c - S3 - d - S4* + / / + / ---------------- + / / + S0 - c - S5 - e - S6 - f - S7* + +One thing that isn't shown in this diagram is that *all* states have a failure +transition, but only `S3` has a *non-trivial* failure transition. That is, all +other states have a failure transition back to the start state. So if our +haystack was `abzabcd`, then the searcher would transition back to `S0` after +seeing `z`, which effectively restarts the search. (Because there is no pattern +in our trie that has a prefix of `bz` or `z`.) + +The code for traversing this *automaton* or *finite state machine* (it is no +longer just a trie) is not that much different from the `has_prefix` code +above: + + fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool { + let mut state_id = fsm.start(); + // If the empty pattern is in fsm, then state_id is a match state. 
+ if fsm.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + // While the diagram above doesn't show this, we may wind up needing + // to follow multiple failure transitions before we land on a state + // in which we can advance. Therefore, when searching for the next + // state, we need to loop until we don't see a failure transition. + // + // This loop terminates because the start state has no empty + // transitions. Every transition from the start state either points to + // another state, or loops back to the start state. + loop { + match fsm.next_state(state_id, b) { + Some(id) => { + state_id = id; + break; + } + // Unlike our code above, if there was no transition for this + // state, then we don't quit. Instead, we look for this state's + // failure transition and follow that instead. + None => { + state_id = fsm.next_fail_state(state_id); + } + }; + } + if fsm.is_match(state_id) { + return true; + } + } + false + } + +Other than the complication around traversing failure transitions, this code +is still roughly "traverse the automaton with bytes from the haystack, and quit +when a match is seen." + +And that concludes our section on the basics. While we didn't go deep into how +the automaton is built (see `src/nfa/noncontiguous.rs`, which has detailed +comments about that), the basic structure of Aho-Corasick should be reasonably +clear. + + +# NFAs and DFAs + +There are generally two types of finite automata: non-deterministic finite +automata (NFA) and deterministic finite automata (DFA). The difference between +them is, principally, that an NFA can be in multiple states at once. This is +typically accomplished by things called _epsilon_ transitions, where one could +move to a new state without consuming any bytes from the input. (The other +mechanism by which NFAs can be in more than one state is where the same byte in +a particular state transitions to multiple distinct states.) 
In contrast, a DFA +can only ever be in one state at a time. A DFA has no epsilon transitions, and +for any given state, a byte transitions to at most one other state. + +By this formulation, the Aho-Corasick automaton described in the previous +section is an NFA. This is because failure transitions are, effectively, +epsilon transitions. That is, whenever the automaton is in state `S`, it is +actually in the set of states that are reachable by recursively following +failure transitions from `S` until you reach the start state. (This means +that, for example, the start state is always active since the start state is +reachable via failure transitions from any state in the automaton.) + +NFAs have a lot of nice properties. They tend to be easier to construct, and +also tend to use less memory. However, their primary downside is that they are +typically slower to execute a search with. For example, the code above showing +how to search with an Aho-Corasick automaton needs to potentially iterate +through many failure transitions for every byte of input. While this is a +fairly small amount of overhead, this can add up, especially if the automaton +has a lot of overlapping patterns with a lot of failure transitions. + +A DFA's search code, by contrast, looks like this: + + fn contains(dfa: &DFA, haystack: &[u8]) -> bool { + let mut state_id = dfa.start(); + // If the empty pattern is in dfa, then state_id is a match state. + if dfa.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + // An Aho-Corasick DFA *never* has a missing state that requires + // failure transitions to be followed. One byte of input advances the + // automaton by one state. Always. 
+ state_id = dfa.next_state(state_id, b); + if dfa.is_match(state_id) { + return true; + } + } + false + } + +The search logic here is much simpler than for the NFA, and this tends to +translate into significant performance benefits as well, since there's a lot +less work being done for each byte in the haystack. How is this accomplished? +It's done by pre-following all failure transitions for all states for all bytes +in the alphabet, and then building a single state transition table. Building +this DFA can be much more costly than building the NFA, and use much more +memory, but the better performance can be worth it. + +Users of this crate can actually choose between using one of two possible NFAs +(noncontiguous or contiguous) or a DFA. By default, a contiguous NFA is used, +in most circumstances, but if the number of patterns is small enough a DFA will +be used. A contiguous NFA is chosen because it uses orders of magnitude less +memory than a DFA, takes only a little longer to build than a noncontiguous +NFA and usually gets pretty close to the search speed of a DFA. (Callers can +override this automatic selection via the `AhoCorasickBuilder::start_kind` +configuration.) + + +# More DFA tricks + +As described in the previous section, one of the downsides of using a DFA +is that it uses more memory and can take longer to build. One small way of +mitigating these concerns is to map the alphabet used by the automaton into +a smaller space. Typically, the alphabet of a DFA has 256 elements in it: +one element for each possible value that fits into a byte. However, in many +cases, one does not need the full alphabet. For example, if all patterns in an +Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct +bytes. As far as the automaton is concerned, the rest of the 204 bytes are +indistinguishable from one another: they will never discriminate between a +match or a non-match. 
Therefore, in cases like that, the alphabet can be shrunk +to just 53 elements. One for each ASCII letter, and then another to serve as a +placeholder for every other unused byte. + +In practice, this library doesn't quite compute the optimal set of equivalence +classes, but it's close enough in most cases. The key idea is that this then +allows the transition table for the DFA to be potentially much smaller. The +downside of doing this, however, is that since the transition table is defined +in terms of this smaller alphabet space, every byte in the haystack must be +re-mapped to this smaller space. This requires an additional 256-byte table. +In practice, this can lead to a small search time hit, but it can be difficult +to measure. Moreover, it can sometimes lead to faster search times for bigger +automata, since it could be the difference between more parts of the automaton +staying in the CPU cache or not. + +One other trick for DFAs employed by this crate is the notion of premultiplying +state identifiers. Specifically, the normal way to compute the next transition +in a DFA is via the following (assuming that the transition table is laid out +sequentially in memory, in row-major order, where the rows are states): + + next_state_id = dfa.transitions[current_state_id * 256 + current_byte] + +However, since the value `256` is a fixed constant, we can actually premultiply +the state identifiers in the table when we build the table initially. Then, the +next transition computation simply becomes: + + next_state_id = dfa.transitions[current_state_id + current_byte] + +This doesn't seem like much, but when this is being executed for every byte of +input that you're searching, saving that extra multiplication instruction can +add up. + +The same optimization works even when equivalence classes are enabled, as +described above. The only difference is that the premultiplication is by the +total number of equivalence classes instead of 256. 
+ +There isn't much downside to premultiplying state identifiers, other than it +imposes a smaller limit on the total number of states in the DFA. Namely, with +premultiplied state identifiers, you run out of room in your state identifier +representation more rapidly than if the identifiers are just state indices. + +Both equivalence classes and premultiplication are always enabled. There is a +`AhoCorasickBuilder::byte_classes` configuration, but disabling this just makes +it so there are always 256 equivalence classes, i.e., every class corresponds +to precisely one byte. When it's disabled, the equivalence class map itself is +still used. The purpose of disabling it is when one is debugging the underlying +automaton. It can be easier to comprehend when it uses actual byte values for +its transitions instead of equivalence classes. + + +# Match semantics + +One of the more interesting things about this implementation of Aho-Corasick +that (as far as this author knows) separates it from other implementations, is +that it natively supports leftmost-first and leftmost-longest match semantics. +Briefly, match semantics refer to the decision procedure by which searching +will disambiguate matches when there are multiple to choose from: + +* **standard** match semantics emits matches as soon as they are detected by + the automaton. This is typically equivalent to the textbook non-overlapping + formulation of Aho-Corasick. +* **leftmost-first** match semantics means that 1) the next match is the match + starting at the leftmost position and 2) among multiple matches starting at + the same leftmost position, the match corresponding to the pattern provided + first by the caller is reported. +* **leftmost-longest** is like leftmost-first, except when there are multiple + matches starting at the same leftmost position, the pattern corresponding to + the longest match is returned. 
+ +(The crate API documentation discusses these differences, with examples, in +more depth on the `MatchKind` type.) + +The reason why supporting these match semantics is important is because it +gives the user more control over the match procedure. For example, +leftmost-first permits users to implement match priority by simply putting the +higher priority patterns first. Leftmost-longest, on the other hand, permits +finding the longest possible match, which might be useful when trying to find +words matching a dictionary. Additionally, regex engines often want to use +Aho-Corasick as an optimization when searching for an alternation of literals. +In order to preserve correct match semantics, regex engines typically can't use +the standard textbook definition directly, since regex engines will implement +either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics. + +Supporting leftmost semantics requires a couple key changes: + +* Constructing the Aho-Corasick automaton changes a bit in both how the trie is + constructed and how failure transitions are found. Namely, only a subset + of the failure transitions are added. Specifically, only the failure + transitions that either do not occur after a match or do occur after a match + but preserve that match are kept. (More details on this can be found in + `src/nfa/noncontiguous.rs`.) +* The search algorithm changes slightly. Since we are looking for the leftmost + match, we cannot quit as soon as a match is detected. Instead, after a match + is detected, we must keep searching until either the end of the input or + until a dead state is seen. (Dead states are not used for standard match + semantics. Dead states mean that searching should stop after a match has been + found.) 
+ +Most other implementations of Aho-Corasick do support leftmost match semantics, +but they do it with more overhead at search time, or even worse, with a queue +of matches and sophisticated hijinks to disambiguate the matches. While our +construction algorithm becomes a bit more complicated, the correct match +semantics fall out from the structure of the automaton itself. + + +# Overlapping matches + +One of the nice properties of an Aho-Corasick automaton is that it can report +all possible matches, even when they overlap with one another. In this mode, +the match semantics don't matter, since all possible matches are reported. +Overlapping searches work just like regular searches, except the state +identifier at which the previous search left off is carried over to the next +search, so that it can pick up where it left off. If there are additional +matches at that state, then they are reported before resuming the search. + +Enabling leftmost-first or leftmost-longest match semantics causes the +automaton to use a subset of all failure transitions, which means that +overlapping searches cannot be used. Therefore, if leftmost match semantics are +used, attempting to do an overlapping search will return an error (or panic +when using the infallible APIs). Thus, to get overlapping searches, the caller +must use the default standard match semantics. This behavior was chosen because +there are only two alternatives, which were deemed worse: + +* Compile two automatons internally, one for standard semantics and one for + the semantics requested by the caller (if not standard). +* Create a new type, distinct from the `AhoCorasick` type, which has different + capabilities based on the configuration options. + +The first is untenable because of the amount of memory used by the automaton. +The second increases the complexity of the API too much by adding too many +types that do similar things. 
It is conceptually much simpler to keep all +searching isolated to a single type. + + +# Stream searching + +Since Aho-Corasick is an automaton, it is possible to do partial searches on +partial parts of the haystack, and then resume that search on subsequent pieces +of the haystack. This is useful when the haystack you're trying to search is +not stored contiguously in memory, or if one does not want to read the entire +haystack into memory at once. + +Currently, only standard semantics are supported for stream searching. This is +some of the more complicated code in this crate, and is something I would very +much like to improve. In particular, it currently has the restriction that it +must buffer at least enough of the haystack in memory in order to fit the +longest possible match. The difficulty in getting stream searching right is +that the implementation choices (such as the buffer size) often impact what the +API looks like and what it's allowed to do. + + +# Prefilters + +In some cases, Aho-Corasick is not the fastest way to find matches containing +multiple patterns. Sometimes, the search can be accelerated using highly +optimized SIMD routines. For example, consider searching the following +patterns: + + Sherlock + Moriarty + Watson + +It is plausible that it would be much faster to quickly look for occurrences of +the leading bytes, `S`, `M` or `W`, before trying to start searching via the +automaton. Indeed, this is exactly what this crate will do. + +When there are more than three distinct starting bytes, then this crate will +look for three distinct bytes occurring at any position in the patterns, while +preferring bytes that are heuristically determined to be rare over others. For +example: + + Abuzz + Sanchez + Vasquez + Topaz + Waltz + +Here, we have more than 3 distinct starting bytes, but all of the patterns +contain `z`, which is typically a rare byte. 
In this case, the prefilter will +scan for `z`, back up a bit, and then execute the Aho-Corasick automaton. + +If all of that fails, then a packed multiple substring algorithm will be +attempted. Currently, the only algorithm available for this is Teddy, but more +may be added in the future. Teddy is unlike the above prefilters in that it +confirms its own matches, so when Teddy is active, it might not be necessary +for Aho-Corasick to run at all. However, the current Teddy implementation +only works in `x86_64` when SSSE3 or AVX2 are available or in `aarch64` +(using NEON), and moreover, only works _well_ when there are a small number +of patterns (say, less than 100). Teddy also requires the haystack to be of a +certain length (more than 16-34 bytes). When the haystack is shorter than that, +Rabin-Karp is used instead. (See `src/packed/rabinkarp.rs`.) + +There is a more thorough description of Teddy at +[`src/packed/teddy/README.md`](src/packed/teddy/README.md). diff --git a/vendor/aho-corasick/LICENSE-MIT b/vendor/aho-corasick/LICENSE-MIT new file mode 100644 index 0000000..3b0a5dc --- /dev/null +++ b/vendor/aho-corasick/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/aho-corasick/README.md b/vendor/aho-corasick/README.md new file mode 100644 index 0000000..c0f525f --- /dev/null +++ b/vendor/aho-corasick/README.md @@ -0,0 +1,174 @@ +aho-corasick +============ +A library for finding occurrences of many patterns at once with SIMD +acceleration in some cases. This library provides multiple pattern +search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a finite state machine for executing searches in linear time. +Features include case insensitive matching, overlapping matches, fast searching +via SIMD and optional full DFA construction and search & replace in streams. + +[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) +[![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick) + +Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). + + +### Documentation + +https://docs.rs/aho-corasick + + +### Usage + +Run `cargo add aho-corasick` to automatically add this crate as a dependency +in your `Cargo.toml` file. + + +### Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. 
+ +```rust +use aho_corasick::{AhoCorasick, PatternID}; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns).unwrap(); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (PatternID::must(1), 13, 18), + (PatternID::must(0), 28, 33), + (PatternID::must(2), 43, 50), +]); +``` + + +### Example: ASCII case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +```rust +use aho_corasick::{AhoCorasick, PatternID}; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::builder() + .ascii_case_insensitive(true) + .build(patterns) + .unwrap(); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (PatternID::must(1), 13, 18), + (PatternID::must(0), 28, 33), + (PatternID::must(2), 43, 50), +]); +``` + + +### Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +```rust,ignore +use aho_corasick::AhoCorasick; + +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. 
+let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns).unwrap(); +ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) + .expect("stream_replace_all failed"); +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +``` + + +### Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. + +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. 
For example, here's the standard approach: + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns).unwrap(); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +```rust +use aho_corasick::{AhoCorasick, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns) + .unwrap(); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See `MatchKind` in the docs for more details. + + +### Minimum Rust version policy + +This crate's minimum supported `rustc` version is `1.60.0`. + +The current policy is that the minimum Rust version required to use this crate +can be increased in minor version updates. For example, if `crate 1.0` requires +Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust +1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum +version of Rust. + +In general, this crate will be conservative with respect to the minimum +supported version of Rust. + + +### FFI bindings + +* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/) +is a Python wrapper for this library. +* [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go + wrapper for this library. 
diff --git a/vendor/aho-corasick/UNLICENSE b/vendor/aho-corasick/UNLICENSE new file mode 100644 index 0000000..68a49da --- /dev/null +++ b/vendor/aho-corasick/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ +For more information, please refer to <http://unlicense.org/> diff --git a/vendor/aho-corasick/rustfmt.toml b/vendor/aho-corasick/rustfmt.toml new file mode 100644 index 0000000..aa37a21 --- /dev/null +++ b/vendor/aho-corasick/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/vendor/aho-corasick/src/ahocorasick.rs b/vendor/aho-corasick/src/ahocorasick.rs new file mode 100644 index 0000000..2947627 --- /dev/null +++ b/vendor/aho-corasick/src/ahocorasick.rs @@ -0,0 +1,2789 @@ +use core::{ + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +use alloc::{string::String, sync::Arc, vec::Vec}; + +use crate::{ + automaton::{self, Automaton, OverlappingState}, + dfa, + nfa::{contiguous, noncontiguous}, + util::{ + error::{BuildError, MatchError}, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, Input, Match, MatchKind, StartKind}, + }, +}; + +/// An automaton for searching multiple strings in linear time. +/// +/// The `AhoCorasick` type supports a few basic ways of constructing an +/// automaton, with the default being [`AhoCorasick::new`]. However, there +/// are a fair number of configurable options that can be set by using +/// [`AhoCorasickBuilder`] instead. Such options include, but are not limited +/// to, how matches are determined, simple case insensitivity, whether to use a +/// DFA or not and various knobs for controlling the space-vs-time trade offs +/// taken when building the automaton. +/// +/// # Resource usage +/// +/// Aho-Corasick automatons are always constructed in `O(p)` time, where +/// `p` is the combined length of all patterns being searched. With that +/// said, building an automaton can be fairly costly because of high constant +/// factors, particularly when enabling the [DFA](AhoCorasickKind::DFA) option +/// with [`AhoCorasickBuilder::kind`]. For this reason, it's generally a good +/// idea to build an automaton once and reuse it as much as possible. 
+/// +/// Aho-Corasick automatons can also use a fair bit of memory. To get +/// a concrete idea of how much memory is being used, try using the +/// [`AhoCorasick::memory_usage`] method. +/// +/// To give a quick idea of the differences between Aho-Corasick +/// implementations and their resource usage, here's a sample of construction +/// times and heap memory used after building an automaton from 100,000 +/// randomly selected titles from Wikipedia: +/// +/// * 99MB for a [`noncontiguous::NFA`] in 240ms. +/// * 21MB for a [`contiguous::NFA`] in 275ms. +/// * 1.6GB for a [`dfa::DFA`] in 1.88s. +/// +/// (Note that the memory usage above reflects the size of each automaton and +/// not peak memory usage. For example, building a contiguous NFA requires +/// first building a noncontiguous NFA. Once the contiguous NFA is built, the +/// noncontiguous NFA is freed.) +/// +/// This experiment very strongly argues that a contiguous NFA is often the +/// best balance in terms of resource usage. It takes a little longer to build, +/// but its memory usage is quite small. Its search speed (not listed) is +/// also often faster than a noncontiguous NFA, but a little slower than a +/// DFA. Indeed, when no specific [`AhoCorasickKind`] is used (which is the +/// default), a contiguous NFA is used in most cases. +/// +/// The only "catch" to using a contiguous NFA is that, because of its variety +/// of compression tricks, it may not be able to support automatons as large as +/// what the noncontiguous NFA supports. In which case, building a contiguous +/// NFA will fail and (by default) `AhoCorasick` will automatically fall +/// back to a noncontiguous NFA. (This typically only happens when building +/// automatons from millions of patterns.) Otherwise, the small additional time +/// for building a contiguous NFA is almost certainly worth it. +/// +/// # Cloning +/// +/// The `AhoCorasick` type uses thread safe reference counting internally. 
It +/// is guaranteed that it is cheap to clone. +/// +/// # Search configuration +/// +/// Most of the search routines accept anything that can be cheaply converted +/// to an [`Input`]. This includes `&[u8]`, `&str` and `Input` itself. +/// +/// # Construction failure +/// +/// It is generally possible for building an Aho-Corasick automaton to fail. +/// Construction can fail in generally one way: when the inputs provided are +/// too big. Whether that's a pattern that is too long, too many patterns +/// or some combination of both. A first approximation for the scale at which +/// construction can fail is somewhere around "millions of patterns." +/// +/// For that reason, if you're building an Aho-Corasick automaton from +/// untrusted input (or input that doesn't have any reasonable bounds on its +/// size), then it is strongly recommended to handle the possibility of an +/// error. +/// +/// If you're constructing an Aho-Corasick automaton from static or trusted +/// data, then it is likely acceptable to panic (by calling `unwrap()` or +/// `expect()`) if construction fails. +/// +/// # Fallibility +/// +/// The `AhoCorasick` type provides a number of methods for searching, as one +/// might expect. Depending on how the Aho-Corasick automaton was built and +/// depending on the search configuration, it is possible for a search to +/// return an error. Since an error is _never_ dependent on the actual contents +/// of the haystack, this type provides both infallible and fallible methods +/// for searching. The infallible methods panic if an error occurs, and can be +/// used for convenience and when you know the search will never return an +/// error. +/// +/// For example, the [`AhoCorasick::find_iter`] method is the infallible +/// version of the [`AhoCorasick::try_find_iter`] method. 
+/// +/// Examples of errors that can occur: +/// +/// * Running a search that requires [`MatchKind::Standard`] semantics (such +/// as a stream or overlapping search) with an automaton that was built with +/// [`MatchKind::LeftmostFirst`] or [`MatchKind::LeftmostLongest`] semantics. +/// * Running an anchored search with an automaton that only supports +/// unanchored searches. (By default, `AhoCorasick` only supports unanchored +/// searches. But this can be toggled with [`AhoCorasickBuilder::start_kind`].) +/// * Running an unanchored search with an automaton that only supports +/// anchored searches. +/// +/// The common thread between the different types of errors is that they are +/// all rooted in the automaton construction and search configurations. If +/// those configurations are a static property of your program, then it is +/// reasonable to call infallible routines since you know an error will never +/// occur. And if one _does_ occur, then it's a bug in your program. +/// +/// To re-iterate, if the patterns, build or search configuration come from +/// user or untrusted data, then you should handle errors at build or search +/// time. If only the haystack comes from user or untrusted data, then there +/// should be no need to handle errors anywhere and it is generally encouraged +/// to `unwrap()` (or `expect()`) both build and search time calls. +/// +/// # Examples +/// +/// This example shows how to search for occurrences of multiple patterns +/// simultaneously in a case insensitive fashion. Each match includes the +/// pattern that matched along with the byte offsets of the match. 
+/// +/// ``` +/// use aho_corasick::{AhoCorasick, PatternID}; +/// +/// let patterns = &["apple", "maple", "snapple"]; +/// let haystack = "Nobody likes maple in their apple flavored Snapple."; +/// +/// let ac = AhoCorasick::builder() +/// .ascii_case_insensitive(true) +/// .build(patterns) +/// .unwrap(); +/// let mut matches = vec![]; +/// for mat in ac.find_iter(haystack) { +/// matches.push((mat.pattern(), mat.start(), mat.end())); +/// } +/// assert_eq!(matches, vec![ +/// (PatternID::must(1), 13, 18), +/// (PatternID::must(0), 28, 33), +/// (PatternID::must(2), 43, 50), +/// ]); +/// ``` +/// +/// This example shows how to replace matches with some other string: +/// +/// ``` +/// use aho_corasick::AhoCorasick; +/// +/// let patterns = &["fox", "brown", "quick"]; +/// let haystack = "The quick brown fox."; +/// let replace_with = &["sloth", "grey", "slow"]; +/// +/// let ac = AhoCorasick::new(patterns).unwrap(); +/// let result = ac.replace_all(haystack, replace_with); +/// assert_eq!(result, "The slow grey sloth."); +/// ``` +#[derive(Clone)] +pub struct AhoCorasick { + /// The underlying Aho-Corasick automaton. It's one of + /// nfa::noncontiguous::NFA, nfa::contiguous::NFA or dfa::DFA. + aut: Arc<dyn AcAutomaton>, + /// The specific Aho-Corasick kind chosen. This makes it possible to + /// inspect any `AhoCorasick` and know what kind of search strategy it + /// uses. + kind: AhoCorasickKind, + /// The start kind of this automaton as configured by the caller. + /// + /// We don't really *need* to put this here, since the underlying automaton + /// will correctly return errors if the caller requests an unsupported + /// search type. But we do keep this here for API behavior consistency. + /// Namely, the NFAs in this crate support both unanchored and anchored + /// searches unconditionally. There's no way to disable one or the other. + /// They always both work. 
But the DFA in this crate specifically only + /// supports both unanchored and anchored searches if it's configured to + /// do so. Why? Because for the DFA, supporting both essentially requires + /// two copies of the transition table: one generated by following failure + /// transitions from the original NFA and one generated by not following + /// those failure transitions. + /// + /// So why record the start kind here? Well, consider what happens + /// when no specific 'AhoCorasickKind' is selected by the caller and + /// 'StartKind::Unanchored' is used (both are the default). It *might* + /// result in using a DFA or it might pick an NFA. If it picks an NFA, the + /// caller would then be able to run anchored searches, even though the + /// caller only asked for support for unanchored searches. Maybe that's + /// fine, but what if the DFA was chosen instead? Oops, the caller would + /// get an error. + /// + /// Basically, it seems bad to return an error or not based on some + /// internal implementation choice. So we smooth things out and ensure + /// anchored searches *always* report an error when only unanchored support + /// was asked for (and vice versa), even if the underlying automaton + /// supports it. + start_kind: StartKind, +} + +/// Convenience constructors for an Aho-Corasick searcher. To configure the +/// searcher, use an [`AhoCorasickBuilder`] instead. +impl AhoCorasick { + /// Create a new Aho-Corasick automaton using the default configuration. + /// + /// The default configuration optimizes for less space usage, but at the + /// expense of longer search times. To change the configuration, use + /// [`AhoCorasickBuilder`]. + /// + /// This uses the default [`MatchKind::Standard`] match semantics, which + /// reports a match as soon as it is found. This corresponds to the + /// standard match semantics supported by textbook descriptions of the + /// Aho-Corasick algorithm. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, PatternID}; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap(); + /// assert_eq!( + /// Some(PatternID::must(1)), + /// ac.find("xxx bar xxx").map(|m| m.pattern()), + /// ); + /// ``` + pub fn new<I, P>(patterns: I) -> Result<AhoCorasick, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + AhoCorasickBuilder::new().build(patterns) + } + + /// A convenience method for returning a new Aho-Corasick builder. + /// + /// This usually permits one to just import the `AhoCorasick` type. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Match, MatchKind}; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(&["samwise", "sam"]) + /// .unwrap(); + /// assert_eq!(Some(Match::must(0, 0..7)), ac.find("samwise")); + /// ``` + pub fn builder() -> AhoCorasickBuilder { + AhoCorasickBuilder::new() + } +} + +/// Infallible search routines. These APIs panic when the underlying search +/// would otherwise fail. Infallible routines are useful because the errors are +/// a result of both search-time configuration and what configuration is used +/// to build the Aho-Corasick searcher. Both of these things are not usually +/// the result of user input, and thus, an error is typically indicative of a +/// programmer error. In cases where callers want errors instead of panics, use +/// the corresponding `try` method in the section below. +impl AhoCorasick { + /// Returns true if and only if this automaton matches the haystack at any + /// position. + /// + /// `input` may be any type that is cheaply convertible to an `Input`. This + /// includes, but is not limited to, `&str` and `&[u8]`. 
+ /// + /// Aside from convenience, when `AhoCorasick` was built with + /// leftmost-first or leftmost-longest semantics, this might result in a + /// search that visits less of the haystack than [`AhoCorasick::find`] + /// would otherwise. (For standard semantics, matches are always + /// immediately returned once they are seen, so there is no way for this to + /// do less work in that case.) + /// + /// Note that there is no corresponding fallible routine for this method. + /// If you need a fallible version of this, then [`AhoCorasick::try_find`] + /// can be used with [`Input::earliest`] enabled. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "quux", "baz", + /// ]).unwrap(); + /// assert!(ac.is_match("xxx bar xxx")); + /// assert!(!ac.is_match("xxx qux xxx")); + /// ``` + pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { + self.aut + .try_find(&input.into().earliest(true)) + .expect("AhoCorasick::try_find is not expected to fail") + .is_some() + } + + /// Returns the location of the first match according to the match + /// semantics that this automaton was constructed with. + /// + /// `input` may be any type that is cheaply convertible to an `Input`. This + /// includes, but is not limited to, `&str` and `&[u8]`. + /// + /// This is the infallible version of [`AhoCorasick::try_find`]. + /// + /// # Panics + /// + /// This panics when [`AhoCorasick::try_find`] would return an error. 
+ /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Now with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// And finally, leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(haystack).expect("should have a match"); + /// ``` + /// + /// # Example: configuring a search + /// + /// Because this method accepts anything that can be turned into an + /// [`Input`], it's possible to provide an `Input` directly in order to + /// configure the search. In this example, we show how to use the + /// `earliest` option to force the search to return as soon as it knows + /// a match has occurred. 
+ /// + /// ``` + /// use aho_corasick::{AhoCorasick, Input, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(Input::new(haystack).earliest(true)) + /// .expect("should have a match"); + /// // The correct leftmost-longest match here is 'abcd', but since we + /// // told the search to quit as soon as it knows a match has occurred, + /// // we get a different match back. + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { + self.try_find(input) + .expect("AhoCorasick::try_find is not expected to fail") + } + + /// Returns the location of the first overlapping match in the given + /// input with respect to the current state of the underlying searcher. + /// + /// `input` may be any type that is cheaply convertible to an `Input`. This + /// includes, but is not limited to, `&str` and `&[u8]`. + /// + /// Overlapping searches do not report matches in their return value. + /// Instead, matches can be accessed via [`OverlappingState::get_match`] + /// after a search call. + /// + /// This is the infallible version of + /// [`AhoCorasick::try_find_overlapping`]. + /// + /// # Panics + /// + /// This panics when [`AhoCorasick::try_find_overlapping`] would + /// return an error. For example, when the Aho-Corasick searcher + /// doesn't support overlapping searches. (Only searchers built with + /// [`MatchKind::Standard`] semantics support overlapping searches.) + /// + /// # Example + /// + /// This shows how we can repeatedly call an overlapping search without + /// ever needing to explicitly re-slice the haystack. Overlapping search + /// works this way because searches depend on state saved during the + /// previous search. 
+
+    /// ```
+    /// use aho_corasick::{
+    ///     automaton::OverlappingState,
+    ///     AhoCorasick, Input, Match,
+    /// };
+    ///
+    /// let patterns = &["append", "appendage", "app"];
+    /// let haystack = "append the app to the appendage";
+    ///
+    /// let ac = AhoCorasick::new(patterns).unwrap();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match());
+    ///
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match());
+    ///
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match());
+    ///
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match());
+    ///
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match());
+    ///
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match());
+    ///
+    /// // No more matches to be found.
+    /// ac.find_overlapping(haystack, &mut state);
+    /// assert_eq!(None, state.get_match());
+    /// ```
+    pub fn find_overlapping<'h, I: Into<Input<'h>>>(
+        &self,
+        input: I,
+        state: &mut OverlappingState,
+    ) {
+        self.try_find_overlapping(input, state).expect(
+            "AhoCorasick::try_find_overlapping is not expected to fail",
+        )
+    }
+
+    /// Returns an iterator of non-overlapping matches, using the match
+    /// semantics that this automaton was constructed with.
+    ///
+    /// `input` may be any type that is cheaply convertible to an `Input`. This
+    /// includes, but is not limited to, `&str` and `&[u8]`.
+    ///
+    /// This is the infallible version of [`AhoCorasick::try_find_iter`].
+    ///
+    /// # Panics
+    ///
+    /// This panics when [`AhoCorasick::try_find_iter`] would return an error.
+ /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns) + /// .unwrap(); + /// let matches: Vec<PatternID> = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(2), + /// PatternID::must(2), + /// PatternID::must(2), + /// ], matches); + /// ``` + /// + /// Now with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let matches: Vec<PatternID> = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(0), + /// PatternID::must(2), + /// PatternID::must(0), + /// ], matches); + /// ``` + /// + /// And finally, leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns) + /// .unwrap(); + /// let matches: Vec<PatternID> = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(0), + /// PatternID::must(2), + /// PatternID::must(1), + /// ], matches); + /// ``` + pub fn find_iter<'a, 'h, I: Into<Input<'h>>>( + &'a self, + input: I, + ) -> FindIter<'a, 'h> { + 
self.try_find_iter(input) + .expect("AhoCorasick::try_find_iter is not expected to fail") + } + + /// Returns an iterator of overlapping matches. Stated differently, this + /// returns an iterator of all possible matches at every position. + /// + /// `input` may be any type that is cheaply convertible to an `Input`. This + /// includes, but is not limited to, `&str` and `&[u8]`. + /// + /// This is the infallible version of + /// [`AhoCorasick::try_find_overlapping_iter`]. + /// + /// # Panics + /// + /// This panics when `AhoCorasick::try_find_overlapping_iter` would return + /// an error. For example, when the Aho-Corasick searcher is built with + /// either leftmost-first or leftmost-longest match semantics. Stated + /// differently, overlapping searches require one to build the searcher + /// with [`MatchKind::Standard`] (it is the default). + /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns).unwrap(); + /// let matches: Vec<PatternID> = ac + /// .find_overlapping_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(2), + /// PatternID::must(0), + /// PatternID::must(2), + /// PatternID::must(2), + /// PatternID::must(0), + /// PatternID::must(1), + /// ], matches); + /// ``` + pub fn find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>( + &'a self, + input: I, + ) -> FindOverlappingIter<'a, 'h> { + self.try_find_overlapping_iter(input).expect( + "AhoCorasick::try_find_overlapping_iter is not expected to fail", + ) + } + + /// Replace all matches with a corresponding value in the `replace_with` + /// slice given. Matches correspond to the same matches as reported by + /// [`AhoCorasick::find_iter`]. + /// + /// Replacements are determined by the index of the matching pattern. 
+ /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// This is the infallible version of [`AhoCorasick::try_replace_all`]. + /// + /// # Panics + /// + /// This panics when [`AhoCorasick::try_replace_all`] would return an + /// error. + /// + /// This also panics when `replace_with.len()` does not equal + /// [`AhoCorasick::patterns_len`]. + /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let result = ac.replace_all(haystack, &["x", "y", "z"]); + /// assert_eq!("x the z to the xage", result); + /// ``` + pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String + where + B: AsRef<str>, + { + self.try_replace_all(haystack, replace_with) + .expect("AhoCorasick::try_replace_all is not expected to fail") + } + + /// Replace all matches using raw bytes with a corresponding value in the + /// `replace_with` slice given. Matches correspond to the same matches as + /// reported by [`AhoCorasick::find_iter`]. + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// This is the infallible version of + /// [`AhoCorasick::try_replace_all_bytes`]. + /// + /// # Panics + /// + /// This panics when [`AhoCorasick::try_replace_all_bytes`] would return an + /// error. + /// + /// This also panics when `replace_with.len()` does not equal + /// [`AhoCorasick::patterns_len`]. 
+ /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); + /// assert_eq!(b"x the z to the xage".to_vec(), result); + /// ``` + pub fn replace_all_bytes<B>( + &self, + haystack: &[u8], + replace_with: &[B], + ) -> Vec<u8> + where + B: AsRef<[u8]>, + { + self.try_replace_all_bytes(haystack, replace_with) + .expect("AhoCorasick::try_replace_all_bytes should not fail") + } + + /// Replace all matches using a closure called on each match. + /// Matches correspond to the same matches as reported by + /// [`AhoCorasick::find_iter`]. + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a string buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. If the closure returns `false`, then searching is stopped. + /// + /// Note that any matches with boundaries that don't fall on a valid UTF-8 + /// boundary are silently skipped. + /// + /// This is the infallible version of + /// [`AhoCorasick::try_replace_all_with`]. + /// + /// # Panics + /// + /// This panics when [`AhoCorasick::try_replace_all_with`] would return an + /// error. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mut result = String::new(); + /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().as_usize().to_string()); + /// true + /// }); + /// assert_eq!("0 the 2 to the 0age", result); + /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = "append the app to the appendage"; + /// # let ac = AhoCorasick::builder() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns) + /// # .unwrap(); + /// let mut result = String::new(); + /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().as_usize().to_string()); + /// mat.pattern() != PatternID::must(2) + /// }); + /// assert_eq!("0 the 2 to the appendage", result); + /// ``` + pub fn replace_all_with<F>( + &self, + haystack: &str, + dst: &mut String, + replace_with: F, + ) where + F: FnMut(&Match, &str, &mut String) -> bool, + { + self.try_replace_all_with(haystack, dst, replace_with) + .expect("AhoCorasick::try_replace_all_with should not fail") + } + + /// Replace all matches using raw bytes with a closure called on each + /// match. Matches correspond to the same matches as reported by + /// [`AhoCorasick::find_iter`]. + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a byte buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. 
If the closure returns `false`, then searching is stopped. + /// + /// This is the infallible version of + /// [`AhoCorasick::try_replace_all_with_bytes`]. + /// + /// # Panics + /// + /// This panics when [`AhoCorasick::try_replace_all_with_bytes`] would + /// return an error. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mut result = vec![]; + /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().as_usize().to_string().bytes()); + /// true + /// }); + /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); + /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = b"append the app to the appendage"; + /// # let ac = AhoCorasick::builder() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns) + /// # .unwrap(); + /// let mut result = vec![]; + /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().as_usize().to_string().bytes()); + /// mat.pattern() != PatternID::must(2) + /// }); + /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); + /// ``` + pub fn replace_all_with_bytes<F>( + &self, + haystack: &[u8], + dst: &mut Vec<u8>, + replace_with: F, + ) where + F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, + { + self.try_replace_all_with_bytes(haystack, dst, replace_with) + .expect("AhoCorasick::try_replace_all_with_bytes should not fail") + } + + /// Returns an iterator of non-overlapping matches in the given + /// stream. 
Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::find_iter`].
+ ///
+ /// The matches yielded by this iterator use absolute position offsets in
+ /// the stream given, where the first byte has index `0`. Matches are
+ /// yielded until the stream is exhausted.
+ ///
+ /// Each item yielded by the iterator is a `Result<Match,
+ /// std::io::Error>`, where an error is yielded if there was a problem
+ /// reading from the reader given.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible.
+ ///
+ /// This is the infallible version of
+ /// [`AhoCorasick::try_stream_find_iter`]. Note that both methods return
+ /// iterators that produce `Result` values. The difference is that this
+ /// routine panics if _construction_ of the iterator failed. The `Result`
+ /// values yielded by the iterator come from whether the given reader returns
+ /// an error or not during the search.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when [`AhoCorasick::try_stream_find_iter`] would return
+ /// an error. For example, when the Aho-Corasick searcher doesn't support
+ /// stream searches. (Only searchers built with [`MatchKind::Standard`]
+ /// semantics support stream searches.)
+ /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns).unwrap(); + /// let mut matches = vec![]; + /// for result in ac.stream_find_iter(haystack.as_bytes()) { + /// let mat = result?; + /// matches.push(mat.pattern()); + /// } + /// assert_eq!(vec![ + /// PatternID::must(2), + /// PatternID::must(2), + /// PatternID::must(2), + /// ], matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "std")] + pub fn stream_find_iter<'a, R: std::io::Read>( + &'a self, + rdr: R, + ) -> StreamFindIter<'a, R> { + self.try_stream_find_iter(rdr) + .expect("AhoCorasick::try_stream_find_iter should not fail") + } +} + +/// Fallible search routines. These APIs return an error in cases where the +/// infallible routines would panic. +impl AhoCorasick { + /// Returns the location of the first match according to the match + /// semantics that this automaton was constructed with, and according + /// to the given `Input` configuration. + /// + /// This is the fallible version of [`AhoCorasick::find`]. + /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the given `Input` configuration. + /// + /// For example, if the Aho-Corasick searcher only supports anchored + /// searches or only supports unanchored searches, then providing an + /// `Input` that requests an anchored (or unanchored) search when it isn't + /// supported would result in an error. 
+ /// + /// # Example: leftmost-first searching + /// + /// Basic usage with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind, Input}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "foo abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.try_find(haystack)?.expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: anchored leftmost-first searching + /// + /// This shows how to anchor the search, so that even if the haystack + /// contains a match somewhere, a match won't be reported unless one can + /// be found that starts at the beginning of the search: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "foo abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .start_kind(StartKind::Anchored) + /// .build(patterns) + /// .unwrap(); + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// assert_eq!(None, ac.try_find(input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// If the beginning of the search is changed to where a match begins, then + /// it will be found: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "foo abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .start_kind(StartKind::Anchored) + /// .build(patterns) + /// .unwrap(); + /// let input = Input::new(haystack).range(4..).anchored(Anchored::Yes); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.span()]); 
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: earliest leftmost-first searching
+ ///
+ /// This shows how to run an "earliest" search even when the Aho-Corasick
+ /// searcher was compiled with leftmost-first match semantics. In this
+ /// case, the search is stopped as soon as it is known that a match has
+ /// occurred, even if it doesn't correspond to the leftmost-first match.
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, MatchKind};
+ ///
+ /// let patterns = &["b", "abc", "abcd"];
+ /// let haystack = "foo abcd";
+ ///
+ /// let ac = AhoCorasick::builder()
+ /// .match_kind(MatchKind::LeftmostFirst)
+ /// .build(patterns)
+ /// .unwrap();
+ /// let input = Input::new(haystack).earliest(true);
+ /// let mat = ac.try_find(input)?.expect("should have a match");
+ /// assert_eq!("b", &haystack[mat.span()]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_find<'h, I: Into<Input<'h>>>(
+ &self,
+ input: I,
+ ) -> Result<Option<Match>, MatchError> {
+ let input = input.into();
+ enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
+ self.aut.try_find(&input)
+ }
+
+ /// Returns the location of the first overlapping match in the given
+ /// input with respect to the current state of the underlying searcher.
+ ///
+ /// Overlapping searches do not report matches in their return value.
+ /// Instead, matches can be accessed via [`OverlappingState::get_match`]
+ /// after a search call.
+ ///
+ /// This is the fallible version of [`AhoCorasick::find_overlapping`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the given `Input` configuration or if overlapping search is not
+ /// supported.
+ ///
+ /// One example is that only Aho-Corasick searchers built with
+ /// [`MatchKind::Standard`] semantics support overlapping searches. Using
+ /// any other match semantics will result in this returning an error.
+ ///
+ /// # Example: basic usage
+ ///
+ /// This shows how we can repeatedly call an overlapping search without
+ /// ever needing to explicitly re-slice the haystack. Overlapping search
+ /// works this way because searches depend on state saved during the
+ /// previous search.
+ ///
+ /// ```
+ /// use aho_corasick::{
+ /// automaton::OverlappingState,
+ /// AhoCorasick, Input, Match,
+ /// };
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match());
+ ///
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match());
+ ///
+ /// // No more matches to be found.
+ /// ac.try_find_overlapping(haystack, &mut state)?;
+ /// assert_eq!(None, state.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: implementing your own overlapping iteration
+ ///
+ /// The previous example can be easily adapted to implement your own
+ /// iteration by repeatedly calling `try_find_overlapping` until either
+ /// an error occurs or no more matches are reported.
+ /// + /// This is effectively equivalent to the iterator returned by + /// [`AhoCorasick::try_find_overlapping_iter`], with the only difference + /// being that the iterator checks for errors before construction and + /// absolves the caller of needing to check for errors on every search + /// call. (Indeed, if the first `try_find_overlapping` call succeeds and + /// the same `Input` is given to subsequent calls, then all subsequent + /// calls are guaranteed to succeed.) + /// + /// ``` + /// use aho_corasick::{ + /// automaton::OverlappingState, + /// AhoCorasick, Input, Match, + /// }; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns).unwrap(); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// + /// loop { + /// ac.try_find_overlapping(haystack, &mut state)?; + /// let mat = match state.get_match() { + /// None => break, + /// Some(mat) => mat, + /// }; + /// matches.push(mat); + /// } + /// let expected = vec![ + /// Match::must(2, 0..3), + /// Match::must(0, 0..6), + /// Match::must(2, 11..14), + /// Match::must(2, 22..25), + /// Match::must(0, 22..28), + /// Match::must(1, 22..31), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: anchored iteration + /// + /// The previous example can also be adapted to implement + /// iteration over all anchored matches. In particular, + /// [`AhoCorasick::try_find_overlapping_iter`] does not support this + /// because it isn't totally clear what the match semantics ought to be. + /// + /// In this example, we will find all overlapping matches that start at + /// the beginning of our search. 
+ /// + /// ``` + /// use aho_corasick::{ + /// automaton::OverlappingState, + /// AhoCorasick, Anchored, Input, Match, StartKind, + /// }; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .start_kind(StartKind::Anchored) + /// .build(patterns) + /// .unwrap(); + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// + /// loop { + /// ac.try_find_overlapping(input.clone(), &mut state)?; + /// let mat = match state.get_match() { + /// None => break, + /// Some(mat) => mat, + /// }; + /// matches.push(mat); + /// } + /// let expected = vec![ + /// Match::must(2, 0..3), + /// Match::must(0, 0..6), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn try_find_overlapping<'h, I: Into<Input<'h>>>( + &self, + input: I, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let input = input.into(); + enforce_anchored_consistency(self.start_kind, input.get_anchored())?; + self.aut.try_find_overlapping(&input, state) + } + + /// Returns an iterator of non-overlapping matches, using the match + /// semantics that this automaton was constructed with. + /// + /// This is the fallible version of [`AhoCorasick::find_iter`]. + /// + /// Note that the error returned by this method occurs during construction + /// of the iterator. The iterator itself yields `Match` values. That is, + /// once the iterator is constructed, the iteration itself will never + /// report an error. + /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the given `Input` configuration. 
+ /// + /// For example, if the Aho-Corasick searcher only supports anchored + /// searches or only supports unanchored searches, then providing an + /// `Input` that requests an anchored (or unanchored) search when it isn't + /// supported would result in an error. + /// + /// # Example: leftmost-first searching + /// + /// Basic usage with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Input, MatchKind, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let matches: Vec<PatternID> = ac + /// .try_find_iter(Input::new(haystack))? + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(0), + /// PatternID::must(2), + /// PatternID::must(0), + /// ], matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: anchored leftmost-first searching + /// + /// This shows how to anchor the search, such that all matches must begin + /// at the starting location of the search. For an iterator, an anchored + /// search implies that all matches are adjacent. + /// + /// ``` + /// use aho_corasick::{ + /// AhoCorasick, Anchored, Input, MatchKind, PatternID, StartKind, + /// }; + /// + /// let patterns = &["foo", "bar", "quux"]; + /// let haystack = "fooquuxbar foo"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .start_kind(StartKind::Anchored) + /// .build(patterns) + /// .unwrap(); + /// let matches: Vec<PatternID> = ac + /// .try_find_iter(Input::new(haystack).anchored(Anchored::Yes))? 
+ /// .map(|mat| mat.pattern())
+ /// .collect();
+ /// assert_eq!(vec![
+ /// PatternID::must(0),
+ /// PatternID::must(2),
+ /// PatternID::must(1),
+ /// // The final 'foo' is not found because it is not adjacent to the
+ /// // 'bar' match. It needs to be adjacent because our search is
+ /// // anchored.
+ /// ], matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_find_iter<'a, 'h, I: Into<Input<'h>>>(
+ &'a self,
+ input: I,
+ ) -> Result<FindIter<'a, 'h>, MatchError> {
+ let input = input.into();
+ enforce_anchored_consistency(self.start_kind, input.get_anchored())?;
+ Ok(FindIter(self.aut.try_find_iter(input)?))
+ }
+
+ /// Returns an iterator of overlapping matches.
+ ///
+ /// This is the fallible version of [`AhoCorasick::find_overlapping_iter`].
+ ///
+ /// Note that the error returned by this method occurs during construction
+ /// of the iterator. The iterator itself yields `Match` values. That is,
+ /// once the iterator is constructed, the iteration itself will never
+ /// report an error.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the given `Input` configuration or does not support overlapping
+ /// searches.
+ ///
+ /// One example is that only Aho-Corasick searchers built with
+ /// [`MatchKind::Standard`] semantics support overlapping searches. Using
+ /// any other match semantics will result in this returning an error.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::{AhoCorasick, Input, PatternID};
+ ///
+ /// let patterns = &["append", "appendage", "app"];
+ /// let haystack = "append the app to the appendage";
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let matches: Vec<PatternID> = ac
+ /// .try_find_overlapping_iter(Input::new(haystack))?
+ /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(2), + /// PatternID::must(0), + /// PatternID::must(2), + /// PatternID::must(2), + /// PatternID::must(0), + /// PatternID::must(1), + /// ], matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: anchored overlapping search returns an error + /// + /// It isn't clear what the match semantics for anchored overlapping + /// iterators *ought* to be, so currently an error is returned. Callers + /// may use [`AhoCorasick::try_find_overlapping`] to implement their own + /// semantics if desired. + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Anchored, Input, StartKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "appendappendage app"; + /// + /// let ac = AhoCorasick::builder() + /// .start_kind(StartKind::Anchored) + /// .build(patterns) + /// .unwrap(); + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// assert!(ac.try_find_overlapping_iter(input).is_err()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn try_find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>( + &'a self, + input: I, + ) -> Result<FindOverlappingIter<'a, 'h>, MatchError> { + let input = input.into(); + enforce_anchored_consistency(self.start_kind, input.get_anchored())?; + Ok(FindOverlappingIter(self.aut.try_find_overlapping_iter(input)?)) + } + + /// Replace all matches with a corresponding value in the `replace_with` + /// slice given. Matches correspond to the same matches as reported by + /// [`AhoCorasick::try_find_iter`]. + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// # Panics + /// + /// This panics when `replace_with.len()` does not equal + /// [`AhoCorasick::patterns_len`]. 
+ /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the default `Input` configuration. More specifically, this occurs only + /// when the Aho-Corasick searcher does not support unanchored searches + /// since this replacement routine always does an unanchored search. + /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let result = ac.try_replace_all(haystack, &["x", "y", "z"])?; + /// assert_eq!("x the z to the xage", result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn try_replace_all<B>( + &self, + haystack: &str, + replace_with: &[B], + ) -> Result<String, MatchError> + where + B: AsRef<str>, + { + enforce_anchored_consistency(self.start_kind, Anchored::No)?; + self.aut.try_replace_all(haystack, replace_with) + } + + /// Replace all matches using raw bytes with a corresponding value in the + /// `replace_with` slice given. Matches correspond to the same matches as + /// reported by [`AhoCorasick::try_find_iter`]. + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// This is the fallible version of [`AhoCorasick::replace_all_bytes`]. + /// + /// # Panics + /// + /// This panics when `replace_with.len()` does not equal + /// [`AhoCorasick::patterns_len`]. + /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the default `Input` configuration. 
More specifically, this occurs only + /// when the Aho-Corasick searcher does not support unanchored searches + /// since this replacement routine always does an unanchored search. + /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let result = ac.try_replace_all_bytes(haystack, &["x", "y", "z"])?; + /// assert_eq!(b"x the z to the xage".to_vec(), result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn try_replace_all_bytes<B>( + &self, + haystack: &[u8], + replace_with: &[B], + ) -> Result<Vec<u8>, MatchError> + where + B: AsRef<[u8]>, + { + enforce_anchored_consistency(self.start_kind, Anchored::No)?; + self.aut.try_replace_all_bytes(haystack, replace_with) + } + + /// Replace all matches using a closure called on each match. + /// Matches correspond to the same matches as reported by + /// [`AhoCorasick::try_find_iter`]. + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a string buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. If the closure returns `false`, then searching is stopped. + /// + /// Note that any matches with boundaries that don't fall on a valid UTF-8 + /// boundary are silently skipped. + /// + /// This is the fallible version of [`AhoCorasick::replace_all_with`]. + /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the default `Input` configuration. More specifically, this occurs only + /// when the Aho-Corasick searcher does not support unanchored searches + /// since this replacement routine always does an unanchored search. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mut result = String::new(); + /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().as_usize().to_string()); + /// true + /// })?; + /// assert_eq!("0 the 2 to the 0age", result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = "append the app to the appendage"; + /// # let ac = AhoCorasick::builder() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns) + /// # .unwrap(); + /// let mut result = String::new(); + /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().as_usize().to_string()); + /// mat.pattern() != PatternID::must(2) + /// })?; + /// assert_eq!("0 the 2 to the appendage", result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn try_replace_all_with<F>( + &self, + haystack: &str, + dst: &mut String, + replace_with: F, + ) -> Result<(), MatchError> + where + F: FnMut(&Match, &str, &mut String) -> bool, + { + enforce_anchored_consistency(self.start_kind, Anchored::No)?; + self.aut.try_replace_all_with(haystack, dst, replace_with) + } + + /// Replace all matches using raw bytes with a closure called on each + /// match. Matches correspond to the same matches as reported by + /// [`AhoCorasick::try_find_iter`]. 
+ /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a byte buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. If the closure returns `false`, then searching is stopped. + /// + /// This is the fallible version of + /// [`AhoCorasick::replace_all_with_bytes`]. + /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the default `Input` configuration. More specifically, this occurs only + /// when the Aho-Corasick searcher does not support unanchored searches + /// since this replacement routine always does an unanchored search. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mut result = vec![]; + /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().as_usize().to_string().bytes()); + /// true + /// })?; + /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = b"append the app to the appendage"; + /// # let ac = AhoCorasick::builder() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns) + /// # .unwrap(); + /// let mut result = vec![]; + /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().as_usize().to_string().bytes()); + /// 
mat.pattern() != PatternID::must(2)
+ /// })?;
+ /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn try_replace_all_with_bytes<F>(
+ &self,
+ haystack: &[u8],
+ dst: &mut Vec<u8>,
+ replace_with: F,
+ ) -> Result<(), MatchError>
+ where
+ F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)?;
+ self.aut.try_replace_all_with_bytes(haystack, dst, replace_with)
+ }
+
+ /// Returns an iterator of non-overlapping matches in the given
+ /// stream. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// The matches yielded by this iterator use absolute position offsets in
+ /// the stream given, where the first byte has index `0`. Matches are
+ /// yielded until the stream is exhausted.
+ ///
+ /// Each item yielded by the iterator is a `Result<Match,
+ /// std::io::Error>`, where an error is yielded if there was a problem
+ /// reading from the reader given.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible.
+ ///
+ /// This is the fallible version of [`AhoCorasick::stream_find_iter`].
+ /// Note that both methods return iterators that produce `Result` values.
+ /// The difference is that this routine returns an error if _construction_
+ /// of the iterator failed. The `Result` values yielded by the iterator
+ /// come from whether the given reader returns an error or not during the
+ /// search.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the default `Input` configuration. More specifically, this occurs only + /// when the Aho-Corasick searcher does not support unanchored searches + /// since this stream searching routine always does an unanchored search. + /// + /// This also returns an error if the searcher does not support stream + /// searches. Only searchers built with [`MatchKind::Standard`] semantics + /// support stream searches. + /// + /// # Example: basic usage + /// + /// ``` + /// use aho_corasick::{AhoCorasick, PatternID}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns).unwrap(); + /// let mut matches = vec![]; + /// for result in ac.try_stream_find_iter(haystack.as_bytes())? { + /// let mat = result?; + /// matches.push(mat.pattern()); + /// } + /// assert_eq!(vec![ + /// PatternID::must(2), + /// PatternID::must(2), + /// PatternID::must(2), + /// ], matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "std")] + pub fn try_stream_find_iter<'a, R: std::io::Read>( + &'a self, + rdr: R, + ) -> Result<StreamFindIter<'a, R>, MatchError> { + enforce_anchored_consistency(self.start_kind, Anchored::No)?; + self.aut.try_stream_find_iter(rdr).map(StreamFindIter) + } + + /// Search for and replace all matches of this automaton in + /// the given reader, and write the replacements to the given + /// writer. Matches correspond to the same matches as reported by + /// [`AhoCorasick::try_find_iter`]. + /// + /// Replacements are determined by the index of the matching pattern. For + /// example, if the pattern with index `2` is found, then it is replaced by + /// `replace_with[2]`. + /// + /// After all matches are replaced, the writer is _not_ flushed. 
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Note that there is currently no infallible version of this routine.
+ ///
+ /// # Memory usage
+ ///
+ /// In general, searching streams will use a constant amount of memory for
+ /// its internal buffer. The one requirement is that the internal buffer
+ /// must be at least the size of the longest possible match. In most use
+ /// cases, the default buffer size will be much larger than any individual
+ /// match.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `replace_with.len()` does not equal
+ /// [`AhoCorasick::patterns_len`].
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when this Aho-Corasick searcher does not support
+ /// the default `Input` configuration. More specifically, this occurs only
+ /// when the Aho-Corasick searcher does not support unanchored searches
+ /// since this stream searching routine always does an unanchored search.
+ ///
+ /// This also returns an error if the searcher does not support stream
+ /// searches. Only searchers built with [`MatchKind::Standard`] semantics
+ /// support stream searches.
+ ///
+ /// # Example: basic usage
+ ///
+ /// ```
+ /// use aho_corasick::AhoCorasick;
+ ///
+ /// let patterns = &["fox", "brown", "quick"];
+ /// let haystack = "The quick brown fox.";
+ /// let replace_with = &["sloth", "grey", "slow"];
+ ///
+ /// let ac = AhoCorasick::new(patterns).unwrap();
+ /// let mut result = vec![];
+ /// ac.try_stream_replace_all(
+ /// haystack.as_bytes(),
+ /// &mut result,
+ /// replace_with,
+ /// )?;
+ /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "std")]
+ pub fn try_stream_replace_all<R, W, B>(
+ &self,
+ rdr: R,
+ wtr: W,
+ replace_with: &[B],
+ ) -> Result<(), std::io::Error>
+ where
+ R: std::io::Read,
+ W: std::io::Write,
+ B: AsRef<[u8]>,
+ {
+ enforce_anchored_consistency(self.start_kind, Anchored::No)
+ .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+ self.aut.try_stream_replace_all(rdr, wtr, replace_with)
+ }
+
+ /// Search the given reader and replace all matches of this automaton
+ /// using the given closure. The result is written to the given
+ /// writer. Matches correspond to the same matches as reported by
+ /// [`AhoCorasick::try_find_iter`].
+ ///
+ /// The closure accepts three parameters: the match found, the text of
+ /// the match and the writer with which to write the replaced text (if any).
+ ///
+ /// After all matches are replaced, the writer is _not_ flushed.
+ ///
+ /// If there was a problem reading from the given reader or writing to the
+ /// given writer, then the corresponding `io::Error` is returned and all
+ /// replacement is stopped.
+ ///
+ /// When searching a stream, an internal buffer is used. Therefore, callers
+ /// should avoid providing a buffered reader, if possible. However,
+ /// callers may want to provide a buffered writer.
+ ///
+ /// Note that there is currently no infallible version of this routine.
+ /// + /// # Memory usage + /// + /// In general, searching streams will use a constant amount of memory for + /// its internal buffer. The one requirement is that the internal buffer + /// must be at least the size of the longest possible match. In most use + /// cases, the default buffer size will be much larger than any individual + /// match. + /// + /// # Errors + /// + /// This returns an error when this Aho-Corasick searcher does not support + /// the default `Input` configuration. More specifically, this occurs only + /// when the Aho-Corasick searcher does not support unanchored searches + /// since this stream searching routine always does an unanchored search. + /// + /// This also returns an error if the searcher does not support stream + /// searches. Only searchers built with [`MatchKind::Standard`] semantics + /// support stream searches. + /// + /// # Example: basic usage + /// + /// ``` + /// use std::io::Write; + /// use aho_corasick::AhoCorasick; + /// + /// let patterns = &["fox", "brown", "quick"]; + /// let haystack = "The quick brown fox."; + /// + /// let ac = AhoCorasick::new(patterns).unwrap(); + /// let mut result = vec![]; + /// ac.try_stream_replace_all_with( + /// haystack.as_bytes(), + /// &mut result, + /// |mat, _, wtr| { + /// wtr.write_all(mat.pattern().as_usize().to_string().as_bytes()) + /// }, + /// )?; + /// assert_eq!(b"The 2 1 0.".to_vec(), result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "std")] + pub fn try_stream_replace_all_with<R, W, F>( + &self, + rdr: R, + wtr: W, + replace_with: F, + ) -> Result<(), std::io::Error> + where + R: std::io::Read, + W: std::io::Write, + F: FnMut(&Match, &[u8], &mut W) -> Result<(), std::io::Error>, + { + enforce_anchored_consistency(self.start_kind, Anchored::No) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + self.aut.try_stream_replace_all_with(rdr, wtr, replace_with) + } +} + +/// Routines for querying information about 
the Aho-Corasick automaton. +impl AhoCorasick { + /// Returns the kind of the Aho-Corasick automaton used by this searcher. + /// + /// Knowing the Aho-Corasick kind is principally useful for diagnostic + /// purposes. In particular, if no specific kind was given to + /// [`AhoCorasickBuilder::kind`], then one is automatically chosen and + /// this routine will report which one. + /// + /// Note that the heuristics used for choosing which `AhoCorasickKind` + /// may be changed in a semver compatible release. + /// + /// # Examples + /// + /// ``` + /// use aho_corasick::{AhoCorasick, AhoCorasickKind}; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); + /// // The specific Aho-Corasick kind chosen is not guaranteed! + /// assert_eq!(AhoCorasickKind::DFA, ac.kind()); + /// ``` + pub fn kind(&self) -> AhoCorasickKind { + self.kind + } + + /// Returns the type of starting search configuration supported by this + /// Aho-Corasick automaton. + /// + /// # Examples + /// + /// ``` + /// use aho_corasick::{AhoCorasick, StartKind}; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); + /// assert_eq!(StartKind::Unanchored, ac.start_kind()); + /// ``` + pub fn start_kind(&self) -> StartKind { + self.start_kind + } + + /// Returns the match kind used by this automaton. + /// + /// The match kind is important because it determines what kinds of + /// matches are returned. Also, some operations (such as overlapping + /// search and stream searching) are only supported when using the + /// [`MatchKind::Standard`] match kind. + /// + /// # Examples + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); + /// assert_eq!(MatchKind::Standard, ac.match_kind()); + /// ``` + pub fn match_kind(&self) -> MatchKind { + self.aut.match_kind() + } + + /// Returns the length of the shortest pattern matched by this automaton. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); + /// assert_eq!(3, ac.min_pattern_len()); + /// ``` + /// + /// Note that an `AhoCorasick` automaton has a minimum length of `0` if + /// and only if it can match the empty string: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&["foo", "", "quux", "baz"]).unwrap(); + /// assert_eq!(0, ac.min_pattern_len()); + /// ``` + pub fn min_pattern_len(&self) -> usize { + self.aut.min_pattern_len() + } + + /// Returns the length of the longest pattern matched by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); + /// assert_eq!(4, ac.max_pattern_len()); + /// ``` + pub fn max_pattern_len(&self) -> usize { + self.aut.max_pattern_len() + } + + /// Return the total number of patterns matched by this automaton. + /// + /// This includes patterns that may never participate in a match. For + /// example, if [`MatchKind::LeftmostFirst`] match semantics are used, and + /// the patterns `Sam` and `Samwise` were used to build the automaton (in + /// that order), then `Samwise` can never participate in a match because + /// `Sam` will always take priority. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap(); + /// assert_eq!(3, ac.patterns_len()); + /// ``` + pub fn patterns_len(&self) -> usize { + self.aut.patterns_len() + } + + /// Returns the approximate total amount of heap used by this automaton, in + /// units of bytes. 
+ /// + /// # Examples + /// + /// This example shows the difference in heap usage between a few + /// configurations: + /// + /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } + /// use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind}; + /// + /// let ac = AhoCorasick::builder() + /// .kind(None) // default + /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) + /// .unwrap(); + /// assert_eq!(5_632, ac.memory_usage()); + /// + /// let ac = AhoCorasick::builder() + /// .kind(None) // default + /// .ascii_case_insensitive(true) + /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) + /// .unwrap(); + /// assert_eq!(11_136, ac.memory_usage()); + /// + /// let ac = AhoCorasick::builder() + /// .kind(Some(AhoCorasickKind::NoncontiguousNFA)) + /// .ascii_case_insensitive(true) + /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) + /// .unwrap(); + /// assert_eq!(10_879, ac.memory_usage()); + /// + /// let ac = AhoCorasick::builder() + /// .kind(Some(AhoCorasickKind::ContiguousNFA)) + /// .ascii_case_insensitive(true) + /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) + /// .unwrap(); + /// assert_eq!(2_584, ac.memory_usage()); + /// + /// let ac = AhoCorasick::builder() + /// .kind(Some(AhoCorasickKind::DFA)) + /// .ascii_case_insensitive(true) + /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) + /// .unwrap(); + /// // While this shows the DFA being the biggest here by a small margin, + /// // don't let the difference fool you. With such a small number of + /// // patterns, the difference is small, but a bigger number of patterns + /// // will reveal that the rate of growth of the DFA is far bigger than + /// // the NFAs above. For a large number of patterns, it is easy for the + /// // DFA to take an order of magnitude more heap space (or more!). 
+ /// assert_eq!(11_136, ac.memory_usage()); + /// ``` + pub fn memory_usage(&self) -> usize { + self.aut.memory_usage() + } +} + +// We provide a manual debug impl so that we don't include the 'start_kind', +// principally because it's kind of weird to do so and because it screws with +// the carefully curated debug output for the underlying automaton. +impl core::fmt::Debug for AhoCorasick { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("AhoCorasick").field(&self.aut).finish() + } +} + +/// An iterator of non-overlapping matches in a particular haystack. +/// +/// This iterator yields matches according to the [`MatchKind`] used by this +/// automaton. +/// +/// This iterator is constructed via the [`AhoCorasick::find_iter`] and +/// [`AhoCorasick::try_find_iter`] methods. +/// +/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. +/// +/// The lifetime `'h` refers to the lifetime of the haystack being searched. +#[derive(Debug)] +pub struct FindIter<'a, 'h>(automaton::FindIter<'a, 'h, Arc<dyn AcAutomaton>>); + +impl<'a, 'h> Iterator for FindIter<'a, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + self.0.next() + } +} + +/// An iterator of overlapping matches in a particular haystack. +/// +/// This iterator will report all possible matches in a particular haystack, +/// even when the matches overlap. +/// +/// This iterator is constructed via the [`AhoCorasick::find_overlapping_iter`] +/// and [`AhoCorasick::try_find_overlapping_iter`] methods. +/// +/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. +/// +/// The lifetime `'h` refers to the lifetime of the haystack being searched. 
+#[derive(Debug)] +pub struct FindOverlappingIter<'a, 'h>( + automaton::FindOverlappingIter<'a, 'h, Arc<dyn AcAutomaton>>, +); + +impl<'a, 'h> Iterator for FindOverlappingIter<'a, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + self.0.next() + } +} + +/// An iterator that reports Aho-Corasick matches in a stream. +/// +/// This iterator yields elements of type `Result<Match, std::io::Error>`, +/// where an error is reported if there was a problem reading from the +/// underlying stream. The iterator terminates only when the underlying stream +/// reaches `EOF`. +/// +/// This iterator is constructed via the [`AhoCorasick::stream_find_iter`] and +/// [`AhoCorasick::try_stream_find_iter`] methods. +/// +/// The type variable `R` refers to the `io::Read` stream that is being read +/// from. +/// +/// The lifetime `'a` refers to the lifetime of the corresponding +/// [`AhoCorasick`] searcher. +#[cfg(feature = "std")] +#[derive(Debug)] +pub struct StreamFindIter<'a, R>( + automaton::StreamFindIter<'a, Arc<dyn AcAutomaton>, R>, +); + +#[cfg(feature = "std")] +impl<'a, R: std::io::Read> Iterator for StreamFindIter<'a, R> { + type Item = Result<Match, std::io::Error>; + + fn next(&mut self) -> Option<Result<Match, std::io::Error>> { + self.0.next() + } +} + +/// A builder for configuring an Aho-Corasick automaton. +/// +/// # Quick advice +/// +/// * Use [`AhoCorasickBuilder::match_kind`] to configure your searcher +/// with [`MatchKind::LeftmostFirst`] if you want to match how backtracking +/// regex engines execute searches for `pat1|pat2|..|patN`. Use +/// [`MatchKind::LeftmostLongest`] if you want to match how POSIX regex engines +/// do it. +/// * If you need an anchored search, use [`AhoCorasickBuilder::start_kind`] to +/// set the [`StartKind::Anchored`] mode since [`StartKind::Unanchored`] is the +/// default. Or just use [`StartKind::Both`] to support both types of searches. 
+/// * You might want to use [`AhoCorasickBuilder::kind`] to set your searcher +/// to always use a [`AhoCorasickKind::DFA`] if search speed is critical and +/// memory usage isn't a concern. Otherwise, not setting a kind will probably +/// make the right choice for you. Beware that if you use [`StartKind::Both`] +/// to build a searcher that supports both unanchored and anchored searches +/// _and_ you set [`AhoCorasickKind::DFA`], then the DFA will essentially be +/// duplicated to support both simultaneously. This results in very high memory +/// usage. +/// * For all other options, their defaults are almost certainly what you want. +#[derive(Clone, Debug, Default)] +pub struct AhoCorasickBuilder { + nfa_noncontiguous: noncontiguous::Builder, + nfa_contiguous: contiguous::Builder, + dfa: dfa::Builder, + kind: Option<AhoCorasickKind>, + start_kind: StartKind, +} + +impl AhoCorasickBuilder { + /// Create a new builder for configuring an Aho-Corasick automaton. + /// + /// The builder provides a way to configure a number of things, including + /// ASCII case insensitivity and what kind of match semantics are used. + pub fn new() -> AhoCorasickBuilder { + AhoCorasickBuilder::default() + } + + /// Build an Aho-Corasick automaton using the configuration set on this + /// builder. + /// + /// A builder may be reused to create more automatons. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, PatternID}; + /// + /// let patterns = &["foo", "bar", "baz"]; + /// let ac = AhoCorasickBuilder::new().build(patterns).unwrap(); + /// assert_eq!( + /// Some(PatternID::must(1)), + /// ac.find("xxx bar xxx").map(|m| m.pattern()), + /// ); + /// ``` + pub fn build<I, P>(&self, patterns: I) -> Result<AhoCorasick, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + let nfa = self.nfa_noncontiguous.build(patterns)?; + let (aut, kind): (Arc<dyn AcAutomaton>, AhoCorasickKind) = + match self.kind { + None => { + debug!( + "asked for automatic Aho-Corasick implementation, \ + criteria: <patterns: {:?}, max pattern len: {:?}, \ + start kind: {:?}>", + nfa.patterns_len(), + nfa.max_pattern_len(), + self.start_kind, + ); + self.build_auto(nfa) + } + Some(AhoCorasickKind::NoncontiguousNFA) => { + debug!("forcefully chose noncontiguous NFA"); + (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA) + } + Some(AhoCorasickKind::ContiguousNFA) => { + debug!("forcefully chose contiguous NFA"); + let cnfa = + self.nfa_contiguous.build_from_noncontiguous(&nfa)?; + (Arc::new(cnfa), AhoCorasickKind::ContiguousNFA) + } + Some(AhoCorasickKind::DFA) => { + debug!("forcefully chose DFA"); + let dfa = self.dfa.build_from_noncontiguous(&nfa)?; + (Arc::new(dfa), AhoCorasickKind::DFA) + } + }; + Ok(AhoCorasick { aut, kind, start_kind: self.start_kind }) + } + + /// Implements the automatic selection logic for the Aho-Corasick + /// implementation to use. Since all Aho-Corasick automatons are built + /// from a non-contiguous NFA, the caller is responsible for building + /// that first. + fn build_auto( + &self, + nfa: noncontiguous::NFA, + ) -> (Arc<dyn AcAutomaton>, AhoCorasickKind) { + // We try to build a DFA if we have a very small number of patterns, + // otherwise the memory usage just gets too crazy. 
We also only do it + // when the start kind is unanchored or anchored, but not both, because + // both implies two full copies of the transition table. + let try_dfa = !matches!(self.start_kind, StartKind::Both) + && nfa.patterns_len() <= 100; + if try_dfa { + match self.dfa.build_from_noncontiguous(&nfa) { + Ok(dfa) => { + debug!("chose a DFA"); + return (Arc::new(dfa), AhoCorasickKind::DFA); + } + Err(_err) => { + debug!( + "failed to build DFA, trying something else: {}", + _err + ); + } + } + } + // We basically always want a contiguous NFA if the limited + // circumstances in which we use a DFA are not true. It is quite fast + // and has excellent memory usage. The only way we don't use it is if + // there are so many states that it can't fit in a contiguous NFA. + // And the only way to know that is to try to build it. Building a + // contiguous NFA is mostly just reshuffling data from a noncontiguous + // NFA, so it isn't too expensive, especially relative to building a + // noncontiguous NFA in the first place. + match self.nfa_contiguous.build_from_noncontiguous(&nfa) { + Ok(nfa) => { + debug!("chose contiguous NFA"); + return (Arc::new(nfa), AhoCorasickKind::ContiguousNFA); + } + #[allow(unused_variables)] // unused when 'logging' is disabled + Err(_err) => { + debug!( + "failed to build contiguous NFA, \ + trying something else: {}", + _err + ); + } + } + debug!("chose non-contiguous NFA"); + (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA) + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::Standard`], which corresponds to the match + /// semantics supported by the standard textbook description of the + /// Aho-Corasick algorithm. Namely, matches are reported as soon as they + /// are found. Moreover, this is the only way to get overlapping matches + /// or do stream searching. + /// + /// The other kinds of match semantics that are supported are + /// [`MatchKind::LeftmostFirst`] and [`MatchKind::LeftmostLongest`]. 
The + /// former corresponds to the match you would get if you were to try to + /// match each pattern at each position in the haystack in the same order + /// that you give to the automaton. That is, it returns the leftmost match + /// corresponding to the earliest pattern given to the automaton. The + /// latter corresponds to finding the longest possible match among all + /// leftmost matches. + /// + /// For more details on match semantics, see the [documentation for + /// `MatchKind`](MatchKind). + /// + /// Note that setting this to [`MatchKind::LeftmostFirst`] or + /// [`MatchKind::LeftmostLongest`] will cause some search routines on + /// [`AhoCorasick`] to return an error (or panic if you're using the + /// infallible API). Notably, this includes stream and overlapping + /// searches. + /// + /// # Examples + /// + /// In these examples, we demonstrate the differences between match + /// semantics for a particular set of patterns in a specific order: + /// `b`, `abc`, `abcd`. + /// + /// Standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, 
MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns) + /// .unwrap(); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); + /// ``` + pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { + self.nfa_noncontiguous.match_kind(kind); + self.nfa_contiguous.match_kind(kind); + self.dfa.match_kind(kind); + self + } + + /// Sets the starting state configuration for the automaton. + /// + /// Every Aho-Corasick automaton is capable of having two start states: one + /// that is used for unanchored searches and one that is used for anchored + /// searches. Some automatons, like the NFAs, support this with almost zero + /// additional cost. Other automatons, like the DFA, require two copies of + /// the underlying transition table to support both simultaneously. + /// + /// Because there may be an added non-trivial cost to supporting both, it + /// is possible to configure which starting state configuration is needed. + /// + /// Indeed, since anchored searches tend to be somewhat more rare, + /// _only_ unanchored searches are supported by default. Thus, + /// [`StartKind::Unanchored`] is the default. + /// + /// Note that when this is set to [`StartKind::Unanchored`], then + /// running an anchored search will result in an error (or a panic + /// if using the infallible APIs). Similarly, when this is set to + /// [`StartKind::Anchored`], then running an unanchored search will + /// result in an error (or a panic if using the infallible APIs). When + /// [`StartKind::Both`] is used, then both unanchored and anchored searches + /// are always supported. 
+ /// + /// Also note that even if an `AhoCorasick` searcher is using an NFA + /// internally (which always supports both unanchored and anchored + /// searches), an error will still be reported for a search that isn't + /// supported by the configuration set via this method. This means, + /// for example, that an error is never dependent on which internal + /// implementation of Aho-Corasick is used. + /// + /// # Example: anchored search + /// + /// This shows how to build a searcher that only supports anchored + /// searches: + /// + /// ``` + /// use aho_corasick::{ + /// AhoCorasick, Anchored, Input, Match, MatchKind, StartKind, + /// }; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .start_kind(StartKind::Anchored) + /// .build(&["b", "abc", "abcd"]) + /// .unwrap(); + /// + /// // An unanchored search is not supported! An error here is guaranteed + /// // given the configuration above regardless of which kind of + /// // Aho-Corasick implementation ends up being used internally. 
+ /// let input = Input::new("foo abcd").anchored(Anchored::No); + /// assert!(ac.try_find(input).is_err()); + /// + /// let input = Input::new("foo abcd").anchored(Anchored::Yes); + /// assert_eq!(None, ac.try_find(input)?); + /// + /// let input = Input::new("abcd").anchored(Anchored::Yes); + /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: unanchored and anchored searches + /// + /// This shows how to build a searcher that supports both unanchored and + /// anchored searches: + /// + /// ``` + /// use aho_corasick::{ + /// AhoCorasick, Anchored, Input, Match, MatchKind, StartKind, + /// }; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .start_kind(StartKind::Both) + /// .build(&["b", "abc", "abcd"]) + /// .unwrap(); + /// + /// let input = Input::new("foo abcd").anchored(Anchored::No); + /// assert_eq!(Some(Match::must(1, 4..7)), ac.try_find(input)?); + /// + /// let input = Input::new("foo abcd").anchored(Anchored::Yes); + /// assert_eq!(None, ac.try_find(input)?); + /// + /// let input = Input::new("abcd").anchored(Anchored::Yes); + /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn start_kind(&mut self, kind: StartKind) -> &mut AhoCorasickBuilder { + self.dfa.start_kind(kind); + self.start_kind = kind; + self + } + + /// Enable ASCII-aware case insensitive matching. + /// + /// When this option is enabled, searching will be performed without + /// respect to case for ASCII letters (`a-z` and `A-Z`) only. + /// + /// Enabling this option does not change the search algorithm, but it may + /// increase the size of the automaton. + /// + /// **NOTE:** It is unlikely that support for Unicode case folding will + /// be added in the future. 
The ASCII case works via a simple hack to the + /// underlying automaton, but full Unicode handling requires a fair bit of + /// sophistication. If you do need Unicode handling, you might consider + /// using the [`regex` crate](https://docs.rs/regex) or the lower level + /// [`regex-automata` crate](https://docs.rs/regex-automata). + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let patterns = &["FOO", "bAr", "BaZ"]; + /// let haystack = "foo bar baz"; + /// + /// let ac = AhoCorasick::builder() + /// .ascii_case_insensitive(true) + /// .build(patterns) + /// .unwrap(); + /// assert_eq!(3, ac.find_iter(haystack).count()); + /// ``` + pub fn ascii_case_insensitive( + &mut self, + yes: bool, + ) -> &mut AhoCorasickBuilder { + self.nfa_noncontiguous.ascii_case_insensitive(yes); + self.nfa_contiguous.ascii_case_insensitive(yes); + self.dfa.ascii_case_insensitive(yes); + self + } + + /// Choose the type of underlying automaton to use. + /// + /// Currently, there are four choices: + /// + /// * [`AhoCorasickKind::NoncontiguousNFA`] instructs the searcher to + /// use a [`noncontiguous::NFA`]. A noncontiguous NFA is the fastest to + /// be built, has moderate memory usage and is typically the slowest to + /// execute a search. + /// * [`AhoCorasickKind::ContiguousNFA`] instructs the searcher to use a + /// [`contiguous::NFA`]. A contiguous NFA is a little slower to build than + /// a noncontiguous NFA, has excellent memory usage and is typically a + /// little slower than a DFA for a search. + /// * [`AhoCorasickKind::DFA`] instructs the searcher to use a + /// [`dfa::DFA`]. A DFA is very slow to build, uses exorbitant amounts of + /// memory, but will typically execute searches the fastest. + /// * `None` (the default) instructs the searcher to choose the "best" + /// Aho-Corasick implementation. This choice is typically based primarily + /// on the number of patterns. 
+ /// + /// Setting this configuration does not change the time complexity for + /// constructing the Aho-Corasick automaton (which is `O(p)` where `p` + /// is the total number of patterns being compiled). Setting this to + /// [`AhoCorasickKind::DFA`] does however reduce the time complexity of + /// non-overlapping searches from `O(n + p)` to `O(n)`, where `n` is the + /// length of the haystack. + /// + /// In general, you should probably stick to the default unless you have + /// some kind of reason to use a specific Aho-Corasick implementation. For + /// example, you might choose `AhoCorasickKind::DFA` if you don't care + /// about memory usage and want the fastest possible search times. + /// + /// Setting this guarantees that the searcher returned uses the chosen + /// implementation. If that implementation could not be constructed, then + /// an error will be returned. In contrast, when `None` is used, it is + /// possible for it to attempt to construct, for example, a contiguous + /// NFA and have it fail. In which case, it will fall back to using a + /// noncontiguous NFA. + /// + /// If `None` is given, then one may use [`AhoCorasick::kind`] to determine + /// which Aho-Corasick implementation was chosen. + /// + /// Note that the heuristics used for choosing which `AhoCorasickKind` + /// may be changed in a semver compatible release. + pub fn kind( + &mut self, + kind: Option<AhoCorasickKind>, + ) -> &mut AhoCorasickBuilder { + self.kind = kind; + self + } + + /// Enable heuristic prefilter optimizations. + /// + /// When enabled, searching will attempt to quickly skip to match + /// candidates using specialized literal search routines. A prefilter + /// cannot always be used, and is generally treated as a heuristic. It + /// can be useful to disable this if the prefilter is observed to be + /// sub-optimal for a particular workload. 
+ /// + /// Currently, prefilters are typically only active when building searchers + /// with a small (less than 100) number of patterns. + /// + /// This is enabled by default. + pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.nfa_noncontiguous.prefilter(yes); + self.nfa_contiguous.prefilter(yes); + self.dfa.prefilter(yes); + self + } + + /// Set the limit on how many states use a dense representation for their + /// transitions. Other states will generally use a sparse representation. + /// + /// A dense representation uses more memory but is generally faster, since + /// the next transition in a dense representation can be computed in a + /// constant number of instructions. A sparse representation uses less + /// memory but is generally slower, since the next transition in a sparse + /// representation requires executing a variable number of instructions. + /// + /// This setting is only used when an Aho-Corasick implementation is used + /// that supports the dense versus sparse representation trade off. Not all + /// do. + /// + /// This limit is expressed in terms of the depth of a state, i.e., the + /// number of transitions from the starting state of the automaton. The + /// idea is that most of the time searching will be spent near the starting + /// state of the automaton, so states near the start state should use a + /// dense representation. States further away from the start state would + /// then use a sparse representation. + /// + /// By default, this is set to a low but non-zero number. Setting this to + /// `0` is almost never what you want, since it is likely to make searches + /// very slow due to the start state itself being forced to use a sparse + /// representation. However, it is unlikely that increasing this number + /// will help things much, since the most active states have a small depth. + /// More to the point, the memory usage increases superlinearly as this + /// number increases. 
+ pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
+ self.nfa_noncontiguous.dense_depth(depth);
+ self.nfa_contiguous.dense_depth(depth);
+ self
+ }
+
+ /// A debug setting for whether to attempt to shrink the size of the
+ /// automaton's alphabet or not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging the underlying automaton.
+ ///
+ /// When enabled, some (but not all) Aho-Corasick automatons will use a map
+ /// from all possible bytes to their corresponding equivalence class. Each
+ /// equivalence class represents a set of bytes that does not discriminate
+ /// between a match and a non-match in the automaton.
+ ///
+ /// The advantage of this map is that the size of the transition table can
+ /// be reduced drastically from `#states * 256 * sizeof(u32)` to
+ /// `#states * k * sizeof(u32)` where `k` is the number of equivalence
+ /// classes (rounded up to the nearest power of 2). As a result, total
+ /// space usage can decrease substantially. Moreover, since a smaller
+ /// alphabet is used, automaton compilation becomes faster as well.
+ ///
+ /// **WARNING:** This is only useful for debugging automatons. Disabling
+ /// this does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
+ pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
+ self.nfa_contiguous.byte_classes(yes);
+ self.dfa.byte_classes(yes);
+ self
+ }
+ }
+
+ /// The type of Aho-Corasick implementation to use in an [`AhoCorasick`]
+ /// searcher.
+ ///
+ /// This is principally used as an input to the
+ /// [`AhoCorasickBuilder::kind`] method.
Its documentation goes into more +/// detail about each choice. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum AhoCorasickKind { + /// Use a noncontiguous NFA. + NoncontiguousNFA, + /// Use a contiguous NFA. + ContiguousNFA, + /// Use a DFA. Warning: DFAs typically use a large amount of memory. + DFA, +} + +/// A trait that effectively gives us practical dynamic dispatch over anything +/// that impls `Automaton`, but without needing to add a bunch of bounds to +/// the core `Automaton` trait. Basically, we provide all of the marker traits +/// that our automatons have, in addition to `Debug` impls and requiring that +/// there is no borrowed data. Without these, the main `AhoCorasick` type would +/// not be able to meaningfully impl `Debug` or the marker traits without also +/// requiring that all impls of `Automaton` do so, which would be not great. +trait AcAutomaton: + Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static +{ +} + +impl<A> AcAutomaton for A where + A: Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static +{ +} + +impl crate::automaton::private::Sealed for Arc<dyn AcAutomaton> {} + +// I'm not sure why this trait impl shows up in the docs, as the AcAutomaton +// trait is not exported. So we forcefully hide it. +// +// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits +// its safety properties. 
+#[doc(hidden)] +unsafe impl Automaton for Arc<dyn AcAutomaton> { + #[inline(always)] + fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { + (**self).start_state(anchored) + } + + #[inline(always)] + fn next_state( + &self, + anchored: Anchored, + sid: StateID, + byte: u8, + ) -> StateID { + (**self).next_state(anchored, sid, byte) + } + + #[inline(always)] + fn is_special(&self, sid: StateID) -> bool { + (**self).is_special(sid) + } + + #[inline(always)] + fn is_dead(&self, sid: StateID) -> bool { + (**self).is_dead(sid) + } + + #[inline(always)] + fn is_match(&self, sid: StateID) -> bool { + (**self).is_match(sid) + } + + #[inline(always)] + fn is_start(&self, sid: StateID) -> bool { + (**self).is_start(sid) + } + + #[inline(always)] + fn match_kind(&self) -> MatchKind { + (**self).match_kind() + } + + #[inline(always)] + fn match_len(&self, sid: StateID) -> usize { + (**self).match_len(sid) + } + + #[inline(always)] + fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { + (**self).match_pattern(sid, index) + } + + #[inline(always)] + fn patterns_len(&self) -> usize { + (**self).patterns_len() + } + + #[inline(always)] + fn pattern_len(&self, pid: PatternID) -> usize { + (**self).pattern_len(pid) + } + + #[inline(always)] + fn min_pattern_len(&self) -> usize { + (**self).min_pattern_len() + } + + #[inline(always)] + fn max_pattern_len(&self) -> usize { + (**self).max_pattern_len() + } + + #[inline(always)] + fn memory_usage(&self) -> usize { + (**self).memory_usage() + } + + #[inline(always)] + fn prefilter(&self) -> Option<&Prefilter> { + (**self).prefilter() + } + + // Even though 'try_find' and 'try_find_overlapping' each have their + // own default impls, we explicitly define them here to fix a perf bug. 
+ // Without these explicit definitions, the default impl will wind up using + // dynamic dispatch for all 'Automaton' method calls, including things like + // 'next_state' that absolutely must get inlined or else perf is trashed. + // Defining them explicitly here like this still requires dynamic dispatch + // to call 'try_find' itself, but all uses of 'Automaton' within 'try_find' + // are monomorphized. + // + // We don't need to explicitly impl any other methods, I think, because + // they are all implemented themselves in terms of 'try_find' and + // 'try_find_overlapping'. We still might wind up with an extra virtual + // call here or there, but that's okay since it's outside of any perf + // critical areas. + + #[inline(always)] + fn try_find( + &self, + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { + (**self).try_find(input) + } + + #[inline(always)] + fn try_find_overlapping( + &self, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + (**self).try_find_overlapping(input, state) + } +} + +/// Returns an error if the start state configuration does not support the +/// desired search configuration. See the internal 'AhoCorasick::start_kind' +/// field docs for more details. +fn enforce_anchored_consistency( + have: StartKind, + want: Anchored, +) -> Result<(), MatchError> { + match have { + StartKind::Both => Ok(()), + StartKind::Unanchored if !want.is_anchored() => Ok(()), + StartKind::Unanchored => Err(MatchError::invalid_input_anchored()), + StartKind::Anchored if want.is_anchored() => Ok(()), + StartKind::Anchored => Err(MatchError::invalid_input_unanchored()), + } +} diff --git a/vendor/aho-corasick/src/automaton.rs b/vendor/aho-corasick/src/automaton.rs new file mode 100644 index 0000000..c41dc6e --- /dev/null +++ b/vendor/aho-corasick/src/automaton.rs @@ -0,0 +1,1608 @@ +/*! +Provides [`Automaton`] trait for abstracting over Aho-Corasick automata. 
+ +The `Automaton` trait provides a way to write generic code over any +Aho-Corasick automaton. It also provides access to lower level APIs that +permit walking the state transitions of an Aho-Corasick automaton manually. +*/ + +use alloc::{string::String, vec::Vec}; + +use crate::util::{ + error::MatchError, + primitives::PatternID, + search::{Anchored, Input, Match, MatchKind, Span}, +}; + +pub use crate::util::{ + prefilter::{Candidate, Prefilter}, + primitives::{StateID, StateIDError}, +}; + +/// We seal the `Automaton` trait for now. It's a big trait, and it's +/// conceivable that I might want to add new required methods, and sealing the +/// trait permits doing that in a backwards compatible fashion. On other the +/// hand, if you have a solid use case for implementing the trait yourself, +/// please file an issue and we can discuss it. This was *mostly* done as a +/// conservative step. +pub(crate) mod private { + pub trait Sealed {} +} +impl private::Sealed for crate::nfa::noncontiguous::NFA {} +impl private::Sealed for crate::nfa::contiguous::NFA {} +impl private::Sealed for crate::dfa::DFA {} + +impl<'a, T: private::Sealed + ?Sized> private::Sealed for &'a T {} + +/// A trait that abstracts over Aho-Corasick automata. +/// +/// This trait primarily exists for niche use cases such as: +/// +/// * Using an NFA or DFA directly, bypassing the top-level +/// [`AhoCorasick`](crate::AhoCorasick) searcher. Currently, these include +/// [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA), +/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) and +/// [`dfa::DFA`](crate::dfa::DFA). +/// * Implementing your own custom search routine by walking the automaton +/// yourself. This might be useful for implementing search on non-contiguous +/// strings or streams. +/// +/// For most use cases, it is not expected that users will need +/// to use or even know about this trait. 
Indeed, the top level +/// [`AhoCorasick`](crate::AhoCorasick) searcher does not expose any details +/// about this trait, nor does it implement it itself. +/// +/// Note that this trait defines a number of default methods, such as +/// [`Automaton::try_find`] and [`Automaton::try_find_iter`], which implement +/// higher level search routines in terms of the lower level automata API. +/// +/// # Sealed +/// +/// Currently, this trait is sealed. That means users of this crate can write +/// generic routines over this trait but cannot implement it themselves. This +/// restriction may be lifted in the future, but sealing the trait permits +/// adding new required methods in a backwards compatible fashion. +/// +/// # Special states +/// +/// This trait encodes a notion of "special" states in an automaton. Namely, +/// a state is treated as special if it is a dead, match or start state: +/// +/// * A dead state is a state that cannot be left once entered. All transitions +/// on a dead state lead back to itself. The dead state is meant to be treated +/// as a sentinel indicating that the search should stop and return a match if +/// one has been found, and nothing otherwise. +/// * A match state is a state that indicates one or more patterns have +/// matched. Depending on the [`MatchKind`] of the automaton, a search may +/// stop once a match is seen, or it may continue looking for matches until +/// it enters a dead state or sees the end of the haystack. +/// * A start state is a state that a search begins in. It is useful to know +/// when a search enters a start state because it may mean that a prefilter can +/// be used to skip ahead and quickly look for candidate matches. Unlike dead +/// and match states, it is never necessary to explicitly handle start states +/// for correctness. Indeed, in this crate, implementations of `Automaton` +/// will only treat start states as "special" when a prefilter is enabled and +/// active. 
Otherwise, treating it as special has no purpose and winds up +/// slowing down the overall search because it results in ping-ponging between +/// the main state transition and the "special" state logic. +/// +/// Since checking whether a state is special by doing three different +/// checks would be too expensive inside a fast search loop, the +/// [`Automaton::is_special`] method is provided for quickly checking whether +/// the state is special. The `Automaton::is_dead`, `Automaton::is_match` and +/// `Automaton::is_start` predicates can then be used to determine which kind +/// of special state it is. +/// +/// # Panics +/// +/// Most of the APIs on this trait should panic or give incorrect results +/// if invalid inputs are given to it. For example, `Automaton::next_state` +/// has unspecified behavior if the state ID given to it is not a valid +/// state ID for the underlying automaton. Valid state IDs can only be +/// retrieved in one of two ways: calling `Automaton::start_state` or calling +/// `Automaton::next_state` with a valid state ID. +/// +/// # Safety +/// +/// This trait is not safe to implement so that code may rely on the +/// correctness of implementations of this trait to avoid undefined behavior. +/// The primary correctness guarantees are: +/// +/// * `Automaton::start_state` always returns a valid state ID or an error or +/// panics. +/// * `Automaton::next_state`, when given a valid state ID, always returns +/// a valid state ID for all values of `anchored` and `byte`, or otherwise +/// panics. +/// +/// In general, the rest of the methods on `Automaton` need to uphold their +/// contracts as well. For example, `Automaton::is_dead` should only returns +/// true if the given state ID is actually a dead state. +/// +/// Note that currently this crate does not rely on the safety property defined +/// here to avoid undefined behavior. Instead, this was done to make it +/// _possible_ to do in the future. 
+/// +/// # Example +/// +/// This example shows how one might implement a basic but correct search +/// routine. We keep things simple by not using prefilters or worrying about +/// anchored searches, but do make sure our search is correct for all possible +/// [`MatchKind`] semantics. (The comments in the code below note the parts +/// that are needed to support certain `MatchKind` semantics.) +/// +/// ``` +/// use aho_corasick::{ +/// automaton::Automaton, +/// nfa::noncontiguous::NFA, +/// Anchored, Match, MatchError, MatchKind, +/// }; +/// +/// // Run an unanchored search for 'aut' in 'haystack'. Return the first match +/// // seen according to the automaton's match semantics. This returns an error +/// // if the given automaton does not support unanchored searches. +/// fn find<A: Automaton>( +/// aut: A, +/// haystack: &[u8], +/// ) -> Result<Option<Match>, MatchError> { +/// let mut sid = aut.start_state(Anchored::No)?; +/// let mut at = 0; +/// let mut mat = None; +/// let get_match = |sid, at| { +/// let pid = aut.match_pattern(sid, 0); +/// let len = aut.pattern_len(pid); +/// Match::new(pid, (at - len)..at) +/// }; +/// // Start states can be match states! +/// if aut.is_match(sid) { +/// mat = Some(get_match(sid, at)); +/// // Standard semantics require matches to be reported as soon as +/// // they're seen. Otherwise, we continue until we see a dead state +/// // or the end of the haystack. +/// if matches!(aut.match_kind(), MatchKind::Standard) { +/// return Ok(mat); +/// } +/// } +/// while at < haystack.len() { +/// sid = aut.next_state(Anchored::No, sid, haystack[at]); +/// if aut.is_special(sid) { +/// if aut.is_dead(sid) { +/// return Ok(mat); +/// } else if aut.is_match(sid) { +/// mat = Some(get_match(sid, at + 1)); +/// // As above, standard semantics require that we return +/// // immediately once a match is found. 
+/// if matches!(aut.match_kind(), MatchKind::Standard) { +/// return Ok(mat); +/// } +/// } +/// } +/// at += 1; +/// } +/// Ok(mat) +/// } +/// +/// // Show that it works for standard searches. +/// let nfa = NFA::new(&["samwise", "sam"]).unwrap(); +/// assert_eq!(Some(Match::must(1, 0..3)), find(&nfa, b"samwise")?); +/// +/// // But also works when using leftmost-first. Notice how the match result +/// // has changed! +/// let nfa = NFA::builder() +/// .match_kind(MatchKind::LeftmostFirst) +/// .build(&["samwise", "sam"]) +/// .unwrap(); +/// assert_eq!(Some(Match::must(0, 0..7)), find(&nfa, b"samwise")?); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub unsafe trait Automaton: private::Sealed { + /// Returns the starting state for the given anchor mode. + /// + /// Upon success, the state ID returned is guaranteed to be valid for + /// this automaton. + /// + /// # Errors + /// + /// This returns an error when the given search configuration is not + /// supported by the underlying automaton. For example, if the underlying + /// automaton only supports unanchored searches but the given configuration + /// was set to an anchored search, then this must return an error. + fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError>; + + /// Performs a state transition from `sid` for `byte` and returns the next + /// state. + /// + /// `anchored` should be [`Anchored::Yes`] when executing an anchored + /// search and [`Anchored::No`] otherwise. For some implementations of + /// `Automaton`, it is required to know whether the search is anchored + /// or not in order to avoid following failure transitions. Other + /// implementations may ignore `anchored` altogether and depend on + /// `Automaton::start_state` returning a state that walks a different path + /// through the automaton depending on whether the search is anchored or + /// not. 
+ /// + /// # Panics + /// + /// This routine may panic or return incorrect results when the given state + /// ID is invalid. A state ID is valid if and only if: + /// + /// 1. It came from a call to `Automaton::start_state`, or + /// 2. It came from a previous call to `Automaton::next_state` with a + /// valid state ID. + /// + /// Implementations must treat all possible values of `byte` as valid. + /// + /// Implementations may panic on unsupported values of `anchored`, but are + /// not required to do so. + fn next_state( + &self, + anchored: Anchored, + sid: StateID, + byte: u8, + ) -> StateID; + + /// Returns true if the given ID represents a "special" state. A special + /// state is a dead, match or start state. + /// + /// Note that implementations may choose to return false when the given ID + /// corresponds to a start state. Namely, it always correct to treat start + /// states as non-special. Implementations must return true for states that + /// are dead or contain matches. + /// + /// This has unspecified behavior when given an invalid state ID. + fn is_special(&self, sid: StateID) -> bool; + + /// Returns true if the given ID represents a dead state. + /// + /// A dead state is a type of "sink" in a finite state machine. It + /// corresponds to a state whose transitions all loop back to itself. That + /// is, once entered, it can never be left. In practice, it serves as a + /// sentinel indicating that the search should terminate. + /// + /// This has unspecified behavior when given an invalid state ID. + fn is_dead(&self, sid: StateID) -> bool; + + /// Returns true if the given ID represents a match state. + /// + /// A match state is always associated with one or more pattern IDs that + /// matched at the position in the haystack when the match state was + /// entered. 
When a match state is entered, the match semantics dictate + /// whether it should be returned immediately (for `MatchKind::Standard`) + /// or if the search should continue (for `MatchKind::LeftmostFirst` and + /// `MatchKind::LeftmostLongest`) until a dead state is seen or the end of + /// the haystack has been reached. + /// + /// This has unspecified behavior when given an invalid state ID. + fn is_match(&self, sid: StateID) -> bool; + + /// Returns true if the given ID represents a start state. + /// + /// While it is never incorrect to ignore start states during a search + /// (except for the start of the search of course), knowing whether one has + /// entered a start state can be useful for certain classes of performance + /// optimizations. For example, if one is in a start state, it may be legal + /// to try to skip ahead and look for match candidates more quickly than + /// would otherwise be accomplished by walking the automaton. + /// + /// Implementations of `Automaton` in this crate "unspecialize" start + /// states when a prefilter is not active or enabled. In this case, it + /// is possible for `Automaton::is_special(sid)` to return false while + /// `Automaton::is_start(sid)` returns true. + /// + /// This has unspecified behavior when given an invalid state ID. + fn is_start(&self, sid: StateID) -> bool; + + /// Returns the match semantics that this automaton was built with. + fn match_kind(&self) -> MatchKind; + + /// Returns the total number of matches for the given state ID. + /// + /// This has unspecified behavior if the given ID does not refer to a match + /// state. + fn match_len(&self, sid: StateID) -> usize; + + /// Returns the pattern ID for the match state given by `sid` at the + /// `index` given. + /// + /// Typically, `index` is only ever greater than `0` when implementing an + /// overlapping search. Otherwise, it's likely that your search only cares + /// about reporting the first pattern ID in a match state. 
+ /// + /// This has unspecified behavior if the given ID does not refer to a match + /// state, or if the index is greater than or equal to the total number of + /// matches in this match state. + fn match_pattern(&self, sid: StateID, index: usize) -> PatternID; + + /// Returns the total number of patterns compiled into this automaton. + fn patterns_len(&self) -> usize; + + /// Returns the length of the pattern for the given ID. + /// + /// This has unspecified behavior when given an invalid pattern + /// ID. A pattern ID is valid if and only if it is less than + /// `Automaton::patterns_len`. + fn pattern_len(&self, pid: PatternID) -> usize; + + /// Returns the length, in bytes, of the shortest pattern in this + /// automaton. + fn min_pattern_len(&self) -> usize; + + /// Returns the length, in bytes, of the longest pattern in this automaton. + fn max_pattern_len(&self) -> usize; + + /// Returns the heap memory usage, in bytes, used by this automaton. + fn memory_usage(&self) -> usize; + + /// Returns a prefilter, if available, that can be used to accelerate + /// searches for this automaton. + /// + /// The typical way this is used is when the start state is entered during + /// a search. When that happens, one can use a prefilter to skip ahead and + /// look for candidate matches without having to walk the automaton on the + /// bytes between candidates. + /// + /// Typically a prefilter is only available when there are a small (<100) + /// number of patterns built into the automaton. + fn prefilter(&self) -> Option<&Prefilter>; + + /// Executes a non-overlapping search with this automaton using the given + /// configuration. + /// + /// See + /// [`AhoCorasick::try_find`](crate::AhoCorasick::try_find) + /// for more documentation and examples. + fn try_find( + &self, + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { + try_find_fwd(&self, input) + } + + /// Executes a overlapping search with this automaton using the given + /// configuration. 
+ /// + /// See + /// [`AhoCorasick::try_find_overlapping`](crate::AhoCorasick::try_find_overlapping) + /// for more documentation and examples. + fn try_find_overlapping( + &self, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + try_find_overlapping_fwd(&self, input, state) + } + + /// Returns an iterator of non-overlapping matches with this automaton + /// using the given configuration. + /// + /// See + /// [`AhoCorasick::try_find_iter`](crate::AhoCorasick::try_find_iter) + /// for more documentation and examples. + fn try_find_iter<'a, 'h>( + &'a self, + input: Input<'h>, + ) -> Result<FindIter<'a, 'h, Self>, MatchError> + where + Self: Sized, + { + FindIter::new(self, input) + } + + /// Returns an iterator of overlapping matches with this automaton + /// using the given configuration. + /// + /// See + /// [`AhoCorasick::try_find_overlapping_iter`](crate::AhoCorasick::try_find_overlapping_iter) + /// for more documentation and examples. + fn try_find_overlapping_iter<'a, 'h>( + &'a self, + input: Input<'h>, + ) -> Result<FindOverlappingIter<'a, 'h, Self>, MatchError> + where + Self: Sized, + { + if !self.match_kind().is_standard() { + return Err(MatchError::unsupported_overlapping( + self.match_kind(), + )); + } + // We might consider lifting this restriction. The reason why I added + // it was to ban the combination of "anchored search" and "overlapping + // iteration." The match semantics aren't totally clear in that case. + // Should we allow *any* matches that are adjacent to *any* previous + // match? Or only following the most recent one? Or only matches + // that start at the beginning of the search? We might also elect to + // just keep this restriction in place, as callers should be able to + // implement it themselves if they want to. 
+ if input.get_anchored().is_anchored() { + return Err(MatchError::invalid_input_anchored()); + } + let _ = self.start_state(input.get_anchored())?; + let state = OverlappingState::start(); + Ok(FindOverlappingIter { aut: self, input, state }) + } + + /// Replaces all non-overlapping matches in `haystack` with + /// strings from `replace_with` depending on the pattern that + /// matched. The `replace_with` slice must have length equal to + /// `Automaton::patterns_len`. + /// + /// See + /// [`AhoCorasick::try_replace_all`](crate::AhoCorasick::try_replace_all) + /// for more documentation and examples. + fn try_replace_all<B>( + &self, + haystack: &str, + replace_with: &[B], + ) -> Result<String, MatchError> + where + Self: Sized, + B: AsRef<str>, + { + assert_eq!( + replace_with.len(), + self.patterns_len(), + "replace_all requires a replacement for every pattern \ + in the automaton" + ); + let mut dst = String::with_capacity(haystack.len()); + self.try_replace_all_with(haystack, &mut dst, |mat, _, dst| { + dst.push_str(replace_with[mat.pattern()].as_ref()); + true + })?; + Ok(dst) + } + + /// Replaces all non-overlapping matches in `haystack` with + /// strings from `replace_with` depending on the pattern that + /// matched. The `replace_with` slice must have length equal to + /// `Automaton::patterns_len`. + /// + /// See + /// [`AhoCorasick::try_replace_all_bytes`](crate::AhoCorasick::try_replace_all_bytes) + /// for more documentation and examples. 
+ fn try_replace_all_bytes<B>( + &self, + haystack: &[u8], + replace_with: &[B], + ) -> Result<Vec<u8>, MatchError> + where + Self: Sized, + B: AsRef<[u8]>, + { + assert_eq!( + replace_with.len(), + self.patterns_len(), + "replace_all requires a replacement for every pattern \ + in the automaton" + ); + let mut dst = Vec::with_capacity(haystack.len()); + self.try_replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { + dst.extend(replace_with[mat.pattern()].as_ref()); + true + })?; + Ok(dst) + } + + /// Replaces all non-overlapping matches in `haystack` by calling the + /// `replace_with` closure given. + /// + /// See + /// [`AhoCorasick::try_replace_all_with`](crate::AhoCorasick::try_replace_all_with) + /// for more documentation and examples. + fn try_replace_all_with<F>( + &self, + haystack: &str, + dst: &mut String, + mut replace_with: F, + ) -> Result<(), MatchError> + where + Self: Sized, + F: FnMut(&Match, &str, &mut String) -> bool, + { + let mut last_match = 0; + for m in self.try_find_iter(Input::new(haystack))? { + // Since there are no restrictions on what kinds of patterns are + // in an Aho-Corasick automaton, we might get matches that split + // a codepoint, or even matches of a partial codepoint. When that + // happens, we just skip the match. + if !haystack.is_char_boundary(m.start()) + || !haystack.is_char_boundary(m.end()) + { + continue; + } + dst.push_str(&haystack[last_match..m.start()]); + last_match = m.end(); + if !replace_with(&m, &haystack[m.start()..m.end()], dst) { + break; + }; + } + dst.push_str(&haystack[last_match..]); + Ok(()) + } + + /// Replaces all non-overlapping matches in `haystack` by calling the + /// `replace_with` closure given. + /// + /// See + /// [`AhoCorasick::try_replace_all_with_bytes`](crate::AhoCorasick::try_replace_all_with_bytes) + /// for more documentation and examples. 
+ fn try_replace_all_with_bytes<F>( + &self, + haystack: &[u8], + dst: &mut Vec<u8>, + mut replace_with: F, + ) -> Result<(), MatchError> + where + Self: Sized, + F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, + { + let mut last_match = 0; + for m in self.try_find_iter(Input::new(haystack))? { + dst.extend(&haystack[last_match..m.start()]); + last_match = m.end(); + if !replace_with(&m, &haystack[m.start()..m.end()], dst) { + break; + }; + } + dst.extend(&haystack[last_match..]); + Ok(()) + } + + /// Returns an iterator of non-overlapping matches with this automaton + /// from the stream given. + /// + /// See + /// [`AhoCorasick::try_stream_find_iter`](crate::AhoCorasick::try_stream_find_iter) + /// for more documentation and examples. + #[cfg(feature = "std")] + fn try_stream_find_iter<'a, R: std::io::Read>( + &'a self, + rdr: R, + ) -> Result<StreamFindIter<'a, Self, R>, MatchError> + where + Self: Sized, + { + Ok(StreamFindIter { it: StreamChunkIter::new(self, rdr)? }) + } + + /// Replaces all non-overlapping matches in `rdr` with strings from + /// `replace_with` depending on the pattern that matched, and writes the + /// result to `wtr`. The `replace_with` slice must have length equal to + /// `Automaton::patterns_len`. + /// + /// See + /// [`AhoCorasick::try_stream_replace_all`](crate::AhoCorasick::try_stream_replace_all) + /// for more documentation and examples. 
+ #[cfg(feature = "std")] + fn try_stream_replace_all<R, W, B>( + &self, + rdr: R, + wtr: W, + replace_with: &[B], + ) -> std::io::Result<()> + where + Self: Sized, + R: std::io::Read, + W: std::io::Write, + B: AsRef<[u8]>, + { + assert_eq!( + replace_with.len(), + self.patterns_len(), + "streaming replace_all requires a replacement for every pattern \ + in the automaton", + ); + self.try_stream_replace_all_with(rdr, wtr, |mat, _, wtr| { + wtr.write_all(replace_with[mat.pattern()].as_ref()) + }) + } + + /// Replaces all non-overlapping matches in `rdr` by calling the + /// `replace_with` closure given and writing the result to `wtr`. + /// + /// See + /// [`AhoCorasick::try_stream_replace_all_with`](crate::AhoCorasick::try_stream_replace_all_with) + /// for more documentation and examples. + #[cfg(feature = "std")] + fn try_stream_replace_all_with<R, W, F>( + &self, + rdr: R, + mut wtr: W, + mut replace_with: F, + ) -> std::io::Result<()> + where + Self: Sized, + R: std::io::Read, + W: std::io::Write, + F: FnMut(&Match, &[u8], &mut W) -> std::io::Result<()>, + { + let mut it = StreamChunkIter::new(self, rdr).map_err(|e| { + let kind = std::io::ErrorKind::Other; + std::io::Error::new(kind, e) + })?; + while let Some(result) = it.next() { + let chunk = result?; + match chunk { + StreamChunk::NonMatch { bytes, .. } => { + wtr.write_all(bytes)?; + } + StreamChunk::Match { bytes, mat } => { + replace_with(&mat, bytes, &mut wtr)?; + } + } + } + Ok(()) + } +} + +// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits +// its safety properties. 
+unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { + #[inline(always)] + fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { + (**self).start_state(anchored) + } + + #[inline(always)] + fn next_state( + &self, + anchored: Anchored, + sid: StateID, + byte: u8, + ) -> StateID { + (**self).next_state(anchored, sid, byte) + } + + #[inline(always)] + fn is_special(&self, sid: StateID) -> bool { + (**self).is_special(sid) + } + + #[inline(always)] + fn is_dead(&self, sid: StateID) -> bool { + (**self).is_dead(sid) + } + + #[inline(always)] + fn is_match(&self, sid: StateID) -> bool { + (**self).is_match(sid) + } + + #[inline(always)] + fn is_start(&self, sid: StateID) -> bool { + (**self).is_start(sid) + } + + #[inline(always)] + fn match_kind(&self) -> MatchKind { + (**self).match_kind() + } + + #[inline(always)] + fn match_len(&self, sid: StateID) -> usize { + (**self).match_len(sid) + } + + #[inline(always)] + fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { + (**self).match_pattern(sid, index) + } + + #[inline(always)] + fn patterns_len(&self) -> usize { + (**self).patterns_len() + } + + #[inline(always)] + fn pattern_len(&self, pid: PatternID) -> usize { + (**self).pattern_len(pid) + } + + #[inline(always)] + fn min_pattern_len(&self) -> usize { + (**self).min_pattern_len() + } + + #[inline(always)] + fn max_pattern_len(&self) -> usize { + (**self).max_pattern_len() + } + + #[inline(always)] + fn memory_usage(&self) -> usize { + (**self).memory_usage() + } + + #[inline(always)] + fn prefilter(&self) -> Option<&Prefilter> { + (**self).prefilter() + } +} + +/// Represents the current state of an overlapping search. +/// +/// This is used for overlapping searches since they need to know something +/// about the previous search. 
For example, when multiple patterns match at the +/// same position, this state tracks the last reported pattern so that the next +/// search knows whether to report another matching pattern or continue with +/// the search at the next position. Additionally, it also tracks which state +/// the last search call terminated in and the current offset of the search +/// in the haystack. +/// +/// This type provides limited introspection capabilities. The only thing a +/// caller can do is construct it and pass it around to permit search routines +/// to use it to track state, and to ask whether a match has been found. +/// +/// Callers should always provide a fresh state constructed via +/// [`OverlappingState::start`] when starting a new search. That same state +/// should be reused for subsequent searches on the same `Input`. The state +/// given will advance through the haystack itself. Callers can detect the end +/// of a search when neither an error nor a match is returned. +/// +/// # Example +/// +/// This example shows how to manually iterate over all overlapping matches. If +/// you need this, you might consider using +/// [`AhoCorasick::find_overlapping_iter`](crate::AhoCorasick::find_overlapping_iter) +/// instead, but this shows how to correctly use an `OverlappingState`. 
+/// +/// ``` +/// use aho_corasick::{ +/// automaton::OverlappingState, +/// AhoCorasick, Input, Match, +/// }; +/// +/// let patterns = &["append", "appendage", "app"]; +/// let haystack = "append the app to the appendage"; +/// +/// let ac = AhoCorasick::new(patterns).unwrap(); +/// let mut state = OverlappingState::start(); +/// let mut matches = vec![]; +/// +/// loop { +/// ac.find_overlapping(haystack, &mut state); +/// let mat = match state.get_match() { +/// None => break, +/// Some(mat) => mat, +/// }; +/// matches.push(mat); +/// } +/// let expected = vec![ +/// Match::must(2, 0..3), +/// Match::must(0, 0..6), +/// Match::must(2, 11..14), +/// Match::must(2, 22..25), +/// Match::must(0, 22..28), +/// Match::must(1, 22..31), +/// ]; +/// assert_eq!(expected, matches); +/// ``` +#[derive(Clone, Debug)] +pub struct OverlappingState { + /// The match reported by the most recent overlapping search to use this + /// state. + /// + /// If a search does not find any matches, then it is expected to clear + /// this value. + mat: Option<Match>, + /// The state ID of the state at which the search was in when the call + /// terminated. When this is a match state, `last_match` must be set to a + /// non-None value. + /// + /// A `None` value indicates the start state of the corresponding + /// automaton. We cannot use the actual ID, since any one automaton may + /// have many start states, and which one is in use depends on search-time + /// factors (such as whether the search is anchored or not). + id: Option<StateID>, + /// The position of the search. + /// + /// When `id` is None (i.e., we are starting a search), this is set to + /// the beginning of the search as given by the caller regardless of its + /// current value. Subsequent calls to an overlapping search pick up at + /// this offset. + at: usize, + /// The index into the matching patterns of the next match to report if the + /// current state is a match state. 
Note that this may be 1 greater than + /// the total number of matches to report for the current match state. (In + /// which case, no more matches should be reported at the current position + /// and the search should advance to the next position.) + next_match_index: Option<usize>, +} + +impl OverlappingState { + /// Create a new overlapping state that begins at the start state. + pub fn start() -> OverlappingState { + OverlappingState { mat: None, id: None, at: 0, next_match_index: None } + } + + /// Return the match result of the most recent search to execute with this + /// state. + /// + /// Every search will clear this result automatically, such that if no + /// match is found, this will always correctly report `None`. + pub fn get_match(&self) -> Option<Match> { + self.mat + } +} + +/// An iterator of non-overlapping matches in a particular haystack. +/// +/// This iterator yields matches according to the [`MatchKind`] used by this +/// automaton. +/// +/// This iterator is constructed via the [`Automaton::try_find_iter`] method. +/// +/// The type variable `A` refers to the implementation of the [`Automaton`] +/// trait used to execute the search. +/// +/// The lifetime `'a` refers to the lifetime of the [`Automaton`] +/// implementation. +/// +/// The lifetime `'h` refers to the lifetime of the haystack being searched. +#[derive(Debug)] +pub struct FindIter<'a, 'h, A> { + /// The automaton used to drive the search. + aut: &'a A, + /// The input parameters to give to each search call. + /// + /// The start position of the search is mutated during iteration. + input: Input<'h>, + /// Records the end offset of the most recent match. This is necessary to + /// handle a corner case for preventing empty matches from overlapping with + /// the ending bounds of a prior match. + last_match_end: Option<usize>, +} + +impl<'a, 'h, A: Automaton> FindIter<'a, 'h, A> { + /// Creates a new non-overlapping iterator. 
If the given automaton would + /// return an error on a search with the given input configuration, then + /// that error is returned here. + fn new( + aut: &'a A, + input: Input<'h>, + ) -> Result<FindIter<'a, 'h, A>, MatchError> { + // The only way this search can fail is if we cannot retrieve the start + // state. e.g., Asking for an anchored search when only unanchored + // searches are supported. + let _ = aut.start_state(input.get_anchored())?; + Ok(FindIter { aut, input, last_match_end: None }) + } + + /// Executes a search and returns a match if one is found. + /// + /// This does not advance the input forward. It just executes a search + /// based on the current configuration/offsets. + fn search(&self) -> Option<Match> { + // The unwrap is OK here because we check at iterator construction time + // that no subsequent search call (using the same configuration) will + // ever return an error. + self.aut + .try_find(&self.input) + .expect("already checked that no match error can occur") + } + + /// Handles the special case of an empty match by ensuring that 1) the + /// iterator always advances and 2) empty matches never overlap with other + /// matches. + /// + /// (1) is necessary because we principally make progress by setting the + /// starting location of the next search to the ending location of the last + /// match. But if a match is empty, then this results in a search that does + /// not advance and thus does not terminate. + /// + /// (2) is not strictly necessary, but makes intuitive sense and matches + /// the presiding behavior of most general purpose regex engines. + /// (Obviously this crate isn't a regex engine, but we choose to match + /// their semantics.) The "intuitive sense" here is that we want to report + /// NON-overlapping matches. 
So for example, given the patterns 'a' and + /// '' (an empty string) against the haystack 'a', without the special + /// handling, you'd get the matches [0, 1) and [1, 1), where the latter + /// overlaps with the end bounds of the former. + /// + /// Note that we mark this cold and forcefully prevent inlining because + /// handling empty matches like this is extremely rare and does require + /// quite a bit of code, comparatively. Keeping this code out of the main + /// iterator function keeps it smaller and more amenable to inlining + /// itself. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_match( + &mut self, + mut m: Match, + ) -> Option<Match> { + assert!(m.is_empty()); + if Some(m.end()) == self.last_match_end { + self.input.set_start(self.input.start().checked_add(1).unwrap()); + m = self.search()?; + } + Some(m) + } +} + +impl<'a, 'h, A: Automaton> Iterator for FindIter<'a, 'h, A> { + type Item = Match; + + #[inline(always)] + fn next(&mut self) -> Option<Match> { + let mut m = self.search()?; + if m.is_empty() { + m = self.handle_overlapping_empty_match(m)?; + } + self.input.set_start(m.end()); + self.last_match_end = Some(m.end()); + Some(m) + } +} + +/// An iterator of overlapping matches in a particular haystack. +/// +/// This iterator will report all possible matches in a particular haystack, +/// even when the matches overlap. +/// +/// This iterator is constructed via the +/// [`Automaton::try_find_overlapping_iter`] method. +/// +/// The type variable `A` refers to the implementation of the [`Automaton`] +/// trait used to execute the search. +/// +/// The lifetime `'a` refers to the lifetime of the [`Automaton`] +/// implementation. +/// +/// The lifetime `'h` refers to the lifetime of the haystack being searched. 
+#[derive(Debug)] +pub struct FindOverlappingIter<'a, 'h, A> { + aut: &'a A, + input: Input<'h>, + state: OverlappingState, +} + +impl<'a, 'h, A: Automaton> Iterator for FindOverlappingIter<'a, 'h, A> { + type Item = Match; + + #[inline(always)] + fn next(&mut self) -> Option<Match> { + self.aut + .try_find_overlapping(&self.input, &mut self.state) + .expect("already checked that no match error can occur here"); + self.state.get_match() + } +} + +/// An iterator that reports matches in a stream. +/// +/// This iterator yields elements of type `io::Result<Match>`, where an error +/// is reported if there was a problem reading from the underlying stream. +/// The iterator terminates only when the underlying stream reaches `EOF`. +/// +/// This iterator is constructed via the [`Automaton::try_stream_find_iter`] +/// method. +/// +/// The type variable `A` refers to the implementation of the [`Automaton`] +/// trait used to execute the search. +/// +/// The type variable `R` refers to the `io::Read` stream that is being read +/// from. +/// +/// The lifetime `'a` refers to the lifetime of the [`Automaton`] +/// implementation. +#[cfg(feature = "std")] +#[derive(Debug)] +pub struct StreamFindIter<'a, A, R> { + it: StreamChunkIter<'a, A, R>, +} + +#[cfg(feature = "std")] +impl<'a, A: Automaton, R: std::io::Read> Iterator + for StreamFindIter<'a, A, R> +{ + type Item = std::io::Result<Match>; + + fn next(&mut self) -> Option<std::io::Result<Match>> { + loop { + match self.it.next() { + None => return None, + Some(Err(err)) => return Some(Err(err)), + Some(Ok(StreamChunk::NonMatch { .. })) => {} + Some(Ok(StreamChunk::Match { mat, .. })) => { + return Some(Ok(mat)); + } + } + } + } +} + +/// An iterator that reports matches in a stream. +/// +/// (This doesn't actually implement the `Iterator` trait because it returns +/// something with a lifetime attached to a buffer it owns, but that's OK. It +/// still has a `next` method and is iterator-like enough to be fine.) 
+/// +/// This iterator yields elements of type `io::Result<StreamChunk>`, where +/// an error is reported if there was a problem reading from the underlying +/// stream. The iterator terminates only when the underlying stream reaches +/// `EOF`. +/// +/// The idea here is that each chunk represents either a match or a non-match, +/// and if you concatenated all of the chunks together, you'd reproduce the +/// entire contents of the stream, byte-for-byte. +/// +/// This chunk machinery is a bit complicated and it isn't strictly required +/// for a stream searcher that just reports matches. But we do need something +/// like this to deal with the "replacement" API, which needs to know which +/// chunks it can copy and which it needs to replace. +#[cfg(feature = "std")] +#[derive(Debug)] +struct StreamChunkIter<'a, A, R> { + /// The underlying automaton to do the search. + aut: &'a A, + /// The source of bytes we read from. + rdr: R, + /// A roll buffer for managing bytes from `rdr`. Basically, this is used + /// to handle the case of a match that is split by two different + /// calls to `rdr.read()`. This isn't strictly needed if all we needed to + /// do was report matches, but here we are reporting chunks of non-matches + /// and matches and in order to do that, we really just cannot treat our + /// stream as non-overlapping blocks of bytes. We need to permit some + /// overlap while we retain bytes from a previous `read` call in memory. + buf: crate::util::buffer::Buffer, + /// The unanchored starting state of this automaton. + start: StateID, + /// The state of the automaton. + sid: StateID, + /// The absolute position over the entire stream. + absolute_pos: usize, + /// The position we're currently at within `buf`. + buffer_pos: usize, + /// The buffer position of the end of the bytes that we last returned + /// to the caller. 
Basically, whenever we find a match, we look to see if + /// there is a difference between where the match started and the position + /// of the last byte we returned to the caller. If there's a difference, + /// then we need to return a 'NonMatch' chunk. + buffer_reported_pos: usize, +} + +#[cfg(feature = "std")] +impl<'a, A: Automaton, R: std::io::Read> StreamChunkIter<'a, A, R> { + fn new( + aut: &'a A, + rdr: R, + ) -> Result<StreamChunkIter<'a, A, R>, MatchError> { + // This restriction is a carry-over from older versions of this crate. + // I didn't have the bandwidth to think through how to handle, say, + // leftmost-first or leftmost-longest matching, but... it should be + // possible? The main problem is that once you see a match state in + // leftmost-first semantics, you can't just stop at that point and + // report a match. You have to keep going until you either hit a dead + // state or EOF. So how do you know when you'll hit a dead state? Well, + // you don't. With Aho-Corasick, I believe you can put a bound on it + // and say, "once a match has been seen, you'll need to scan forward at + // most N bytes" where N=aut.max_pattern_len(). + // + // Which is fine, but it does mean that state about whether we're still + // looking for a dead state or not needs to persist across buffer + // refills. Which this code doesn't really handle. It does preserve + // *some* state across buffer refills, basically ensuring that a match + // span is always in memory. + if !aut.match_kind().is_standard() { + return Err(MatchError::unsupported_stream(aut.match_kind())); + } + // This is kind of a cop-out, but empty matches are SUPER annoying. + // If we know they can't happen (which is what we enforce here), then + // it makes a lot of logic much simpler. With that said, I'm open to + // supporting this case, but we need to define proper semantics for it + // first. 
It wasn't totally clear to me what it should do at the time + // of writing, so I decided to just be conservative. + // + // It also seems like a very weird case to support anyway. Why search a + // stream if you're just going to get a match at every position? + // + // ¯\_(ツ)_/¯ + if aut.min_pattern_len() == 0 { + return Err(MatchError::unsupported_empty()); + } + let start = aut.start_state(Anchored::No)?; + Ok(StreamChunkIter { + aut, + rdr, + buf: crate::util::buffer::Buffer::new(aut.max_pattern_len()), + start, + sid: start, + absolute_pos: 0, + buffer_pos: 0, + buffer_reported_pos: 0, + }) + } + + fn next(&mut self) -> Option<std::io::Result<StreamChunk>> { + // This code is pretty gnarly. It IS simpler than the equivalent code + // in the previous aho-corasick release, in part because we inline + // automaton traversal here and also in part because we have abdicated + // support for automatons that contain an empty pattern. + // + // I suspect this code could be made a bit simpler by designing a + // better buffer abstraction. + // + // But in general, this code is basically write-only. So you'll need + // to go through it step-by-step to grok it. One of the key bits of + // complexity is tracking a few different offsets. 'buffer_pos' is + // where we are in the buffer for search. 'buffer_reported_pos' is the + // position immediately following the last byte in the buffer that + // we've returned to the caller. And 'absolute_pos' is the overall + // current absolute position of the search in the entire stream, and + // this is what match spans are reported in terms of. 
+ loop { + if self.aut.is_match(self.sid) { + let mat = self.get_match(); + if let Some(r) = self.get_non_match_chunk(mat) { + self.buffer_reported_pos += r.len(); + let bytes = &self.buf.buffer()[r]; + return Some(Ok(StreamChunk::NonMatch { bytes })); + } + self.sid = self.start; + let r = self.get_match_chunk(mat); + self.buffer_reported_pos += r.len(); + let bytes = &self.buf.buffer()[r]; + return Some(Ok(StreamChunk::Match { bytes, mat })); + } + if self.buffer_pos >= self.buf.buffer().len() { + if let Some(r) = self.get_pre_roll_non_match_chunk() { + self.buffer_reported_pos += r.len(); + let bytes = &self.buf.buffer()[r]; + return Some(Ok(StreamChunk::NonMatch { bytes })); + } + if self.buf.buffer().len() >= self.buf.min_buffer_len() { + self.buffer_pos = self.buf.min_buffer_len(); + self.buffer_reported_pos -= + self.buf.buffer().len() - self.buf.min_buffer_len(); + self.buf.roll(); + } + match self.buf.fill(&mut self.rdr) { + Err(err) => return Some(Err(err)), + Ok(true) => {} + Ok(false) => { + // We've hit EOF, but if there are still some + // unreported bytes remaining, return them now. + if let Some(r) = self.get_eof_non_match_chunk() { + self.buffer_reported_pos += r.len(); + let bytes = &self.buf.buffer()[r]; + return Some(Ok(StreamChunk::NonMatch { bytes })); + } + // We've reported everything! + return None; + } + } + } + let start = self.absolute_pos; + for &byte in self.buf.buffer()[self.buffer_pos..].iter() { + self.sid = self.aut.next_state(Anchored::No, self.sid, byte); + self.absolute_pos += 1; + if self.aut.is_match(self.sid) { + break; + } + } + self.buffer_pos += self.absolute_pos - start; + } + } + + /// Return a match chunk for the given match. It is assumed that the match + /// ends at the current `buffer_pos`. 
+ fn get_match_chunk(&self, mat: Match) -> core::ops::Range<usize> { + let start = self.buffer_pos - mat.len(); + let end = self.buffer_pos; + start..end + } + + /// Return a non-match chunk, if necessary, just before reporting a match. + /// This returns `None` if there is nothing to report. Otherwise, this + /// assumes that the given match ends at the current `buffer_pos`. + fn get_non_match_chunk( + &self, + mat: Match, + ) -> Option<core::ops::Range<usize>> { + let buffer_mat_start = self.buffer_pos - mat.len(); + if buffer_mat_start > self.buffer_reported_pos { + let start = self.buffer_reported_pos; + let end = buffer_mat_start; + return Some(start..end); + } + None + } + + /// Look for any bytes that should be reported as a non-match just before + /// rolling the buffer. + /// + /// Note that this only reports bytes up to `buffer.len() - + /// min_buffer_len`, as it's not possible to know whether the bytes + /// following that will participate in a match or not. + fn get_pre_roll_non_match_chunk(&self) -> Option<core::ops::Range<usize>> { + let end = + self.buf.buffer().len().saturating_sub(self.buf.min_buffer_len()); + if self.buffer_reported_pos < end { + return Some(self.buffer_reported_pos..end); + } + None + } + + /// Return any unreported bytes as a non-match up to the end of the buffer. + /// + /// This should only be called when the entire contents of the buffer have + /// been searched and EOF has been hit when trying to fill the buffer. + fn get_eof_non_match_chunk(&self) -> Option<core::ops::Range<usize>> { + if self.buffer_reported_pos < self.buf.buffer().len() { + return Some(self.buffer_reported_pos..self.buf.buffer().len()); + } + None + } + + /// Return the match at the current position for the current state. + /// + /// This panics if `self.aut.is_match(self.sid)` isn't true. + fn get_match(&self) -> Match { + get_match(self.aut, self.sid, 0, self.absolute_pos) + } +} + +/// A single chunk yielded by the stream chunk iterator. 
+/// +/// The `'r` lifetime refers to the lifetime of the stream chunk iterator. +#[cfg(feature = "std")] +#[derive(Debug)] +enum StreamChunk<'r> { + /// A chunk that does not contain any matches. + NonMatch { bytes: &'r [u8] }, + /// A chunk that precisely contains a match. + Match { bytes: &'r [u8], mat: Match }, +} + +#[inline(never)] +pub(crate) fn try_find_fwd<A: Automaton + ?Sized>( + aut: &A, + input: &Input<'_>, +) -> Result<Option<Match>, MatchError> { + if input.is_done() { + return Ok(None); + } + let earliest = aut.match_kind().is_standard() || input.get_earliest(); + if input.get_anchored().is_anchored() { + try_find_fwd_imp(aut, input, None, Anchored::Yes, earliest) + } else if let Some(pre) = aut.prefilter() { + if earliest { + try_find_fwd_imp(aut, input, Some(pre), Anchored::No, true) + } else { + try_find_fwd_imp(aut, input, Some(pre), Anchored::No, false) + } + } else { + if earliest { + try_find_fwd_imp(aut, input, None, Anchored::No, true) + } else { + try_find_fwd_imp(aut, input, None, Anchored::No, false) + } + } +} + +#[inline(always)] +fn try_find_fwd_imp<A: Automaton + ?Sized>( + aut: &A, + input: &Input<'_>, + pre: Option<&Prefilter>, + anchored: Anchored, + earliest: bool, +) -> Result<Option<Match>, MatchError> { + let mut sid = aut.start_state(input.get_anchored())?; + let mut at = input.start(); + let mut mat = None; + if aut.is_match(sid) { + mat = Some(get_match(aut, sid, 0, at)); + if earliest { + return Ok(mat); + } + } + if let Some(pre) = pre { + match pre.find_in(input.haystack(), input.get_span()) { + Candidate::None => return Ok(None), + Candidate::Match(m) => return Ok(Some(m)), + Candidate::PossibleStartOfMatch(i) => { + at = i; + } + } + } + while at < input.end() { + // I've tried unrolling this loop and eliding bounds checks, but no + // matter what I did, I could not observe a consistent improvement on + // any benchmark I could devise. 
(If someone wants to re-litigate this, + // the way to do it is to add an 'next_state_unchecked' method to the + // 'Automaton' trait with a default impl that uses 'next_state'. Then + // use 'aut.next_state_unchecked' here and implement it on DFA using + // unchecked slice index acces.) + sid = aut.next_state(anchored, sid, input.haystack()[at]); + if aut.is_special(sid) { + if aut.is_dead(sid) { + return Ok(mat); + } else if aut.is_match(sid) { + // We use 'at + 1' here because the match state is entered + // at the last byte of the pattern. Since we use half-open + // intervals, the end of the range of the match is one past the + // last byte. + let m = get_match(aut, sid, 0, at + 1); + // For the automata in this crate, we make a size trade off + // where we reuse the same automaton for both anchored and + // unanchored searches. We achieve this, principally, by simply + // not following failure transitions while computing the next + // state. Instead, if we fail to find the next state, we return + // a dead state, which instructs the search to stop. (This + // is why 'next_state' needs to know whether the search is + // anchored or not.) In addition, we have different start + // states for anchored and unanchored searches. The latter has + // a self-loop where as the former does not. + // + // In this way, we can use the same trie to execute both + // anchored and unanchored searches. There is a catch though. + // When building an Aho-Corasick automaton for unanchored + // searches, we copy matches from match states to other states + // (which would otherwise not be match states) if they are + // reachable via a failure transition. In the case of an + // anchored search, we *specifically* do not want to report + // these matches because they represent matches that start past + // the beginning of the search. 
+ // + // Now we could tweak the automaton somehow to differentiate + // anchored from unanchored match states, but this would make + // 'aut.is_match' and potentially 'aut.is_special' slower. And + // also make the automaton itself more complex. + // + // Instead, we insert a special hack: if the search is + // anchored, we simply ignore matches that don't begin at + // the start of the search. This is not quite ideal, but we + // do specialize this function in such a way that unanchored + // searches don't pay for this additional branch. While this + // might cause a search to continue on for more than it + // otherwise optimally would, it will be no more than the + // longest pattern in the automaton. The reason for this is + // that we ensure we don't follow failure transitions during + // an anchored search. Combined with using a different anchored + // starting state with no self-loop, we guarantee that we'll + // at worst move through a number of transitions equal to the + // longest pattern. + // + // Now for DFAs, the whole point of them is to eliminate + // failure transitions entirely. So there is no way to say "if + // it's an anchored search don't follow failure transitions." + // Instead, we actually have to build two entirely separate + // automatons into the transition table. One with failure + // transitions built into it and another that is effectively + // just an encoding of the base trie into a transition table. + // DFAs still need this check though, because the match states + // still carry matches only reachable via a failure transition. + // Why? Because removing them seems difficult, although I + // haven't given it a lot of thought. + if !(anchored.is_anchored() && m.start() > input.start()) { + mat = Some(m); + if earliest { + return Ok(mat); + } + } + } else if let Some(pre) = pre { + // If we're here, we know it's a special state that is not a + // dead or a match state AND that a prefilter is active. 
Thus, + // it must be a start state. + debug_assert!(aut.is_start(sid)); + // We don't care about 'Candidate::Match' here because if such + // a match were possible, it would have been returned above + // when we run the prefilter before walking the automaton. + let span = Span::from(at..input.end()); + match pre.find_in(input.haystack(), span).into_option() { + None => return Ok(None), + Some(i) => { + if i > at { + at = i; + continue; + } + } + } + } else { + // When pre.is_none(), then starting states should not be + // treated as special. That is, without a prefilter, is_special + // should only return true when the state is a dead or a match + // state. + // + // It is possible to execute a search without a prefilter even + // when the underlying searcher has one: an anchored search. + // But in this case, the automaton makes it impossible to move + // back to the start state by construction, and thus, we should + // never reach this branch. + debug_assert!(false, "unreachable"); + } + } + at += 1; + } + Ok(mat) +} + +#[inline(never)] +fn try_find_overlapping_fwd<A: Automaton + ?Sized>( + aut: &A, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + // Searching with a pattern ID is always anchored, so we should only ever + // use a prefilter when no pattern ID is given. + if aut.prefilter().is_some() && !input.get_anchored().is_anchored() { + let pre = aut.prefilter().unwrap(); + try_find_overlapping_fwd_imp(aut, input, Some(pre), state) + } else { + try_find_overlapping_fwd_imp(aut, input, None, state) + } +} + +#[inline(always)] +fn try_find_overlapping_fwd_imp<A: Automaton + ?Sized>( + aut: &A, + input: &Input<'_>, + pre: Option<&Prefilter>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + let mut sid = match state.id { + None => { + let sid = aut.start_state(input.get_anchored())?; + // Handle the case where the start state is a match state. 
That is, + // the empty string is in our automaton. We report every match we + // can here before moving on and updating 'state.at' and 'state.id' + // to find more matches in other parts of the haystack. + if aut.is_match(sid) { + let i = state.next_match_index.unwrap_or(0); + let len = aut.match_len(sid); + if i < len { + state.next_match_index = Some(i + 1); + state.mat = Some(get_match(aut, sid, i, input.start())); + return Ok(()); + } + } + state.at = input.start(); + state.id = Some(sid); + state.next_match_index = None; + state.mat = None; + sid + } + Some(sid) => { + // If we still have matches left to report in this state then + // report them until we've exhausted them. Only after that do we + // advance to the next offset in the haystack. + if let Some(i) = state.next_match_index { + let len = aut.match_len(sid); + if i < len { + state.next_match_index = Some(i + 1); + state.mat = Some(get_match(aut, sid, i, state.at + 1)); + return Ok(()); + } + // Once we've reported all matches at a given position, we need + // to advance the search to the next position. + state.at += 1; + state.next_match_index = None; + state.mat = None; + } + sid + } + }; + while state.at < input.end() { + sid = aut.next_state( + input.get_anchored(), + sid, + input.haystack()[state.at], + ); + if aut.is_special(sid) { + state.id = Some(sid); + if aut.is_dead(sid) { + return Ok(()); + } else if aut.is_match(sid) { + state.next_match_index = Some(1); + state.mat = Some(get_match(aut, sid, 0, state.at + 1)); + return Ok(()); + } else if let Some(pre) = pre { + // If we're here, we know it's a special state that is not a + // dead or a match state AND that a prefilter is active. Thus, + // it must be a start state. 
+ debug_assert!(aut.is_start(sid)); + let span = Span::from(state.at..input.end()); + match pre.find_in(input.haystack(), span).into_option() { + None => return Ok(()), + Some(i) => { + if i > state.at { + state.at = i; + continue; + } + } + } + } else { + // When pre.is_none(), then starting states should not be + // treated as special. That is, without a prefilter, is_special + // should only return true when the state is a dead or a match + // state. + // + // ... except for one special case: in stream searching, we + // currently call overlapping search with a 'None' prefilter, + // regardless of whether one exists or not, because stream + // searching can't currently deal with prefilters correctly in + // all cases. + } + } + state.at += 1; + } + state.id = Some(sid); + Ok(()) +} + +#[inline(always)] +fn get_match<A: Automaton + ?Sized>( + aut: &A, + sid: StateID, + index: usize, + at: usize, +) -> Match { + let pid = aut.match_pattern(sid, index); + let len = aut.pattern_len(pid); + Match::new(pid, (at - len)..at) +} + +/// Write a prefix "state" indicator for fmt::Debug impls. It always writes +/// exactly two printable bytes to the given formatter. +/// +/// Specifically, this tries to succinctly distinguish the different types of +/// states: dead states, start states and match states. It even accounts for +/// the possible overlappings of different state types. (The only possible +/// overlapping is that of match and start states.) +pub(crate) fn fmt_state_indicator<A: Automaton>( + f: &mut core::fmt::Formatter<'_>, + aut: A, + id: StateID, +) -> core::fmt::Result { + if aut.is_dead(id) { + write!(f, "D ")?; + } else if aut.is_match(id) { + if aut.is_start(id) { + write!(f, "*>")?; + } else { + write!(f, "* ")?; + } + } else if aut.is_start(id) { + write!(f, " >")?; + } else { + write!(f, " ")?; + } + Ok(()) +} + +/// Return an iterator of transitions in a sparse format given an iterator +/// of all explicitly defined transitions. 
The iterator yields ranges of +/// transitions, such that any adjacent transitions mapped to the same +/// state are combined into a single range. +pub(crate) fn sparse_transitions<'a>( + mut it: impl Iterator<Item = (u8, StateID)> + 'a, +) -> impl Iterator<Item = (u8, u8, StateID)> + 'a { + let mut cur: Option<(u8, u8, StateID)> = None; + core::iter::from_fn(move || { + while let Some((class, next)) = it.next() { + let (prev_start, prev_end, prev_next) = match cur { + Some(x) => x, + None => { + cur = Some((class, class, next)); + continue; + } + }; + if prev_next == next { + cur = Some((prev_start, class, prev_next)); + } else { + cur = Some((class, class, next)); + return Some((prev_start, prev_end, prev_next)); + } + } + if let Some((start, end, next)) = cur.take() { + return Some((start, end, next)); + } + None + }) +} diff --git a/vendor/aho-corasick/src/dfa.rs b/vendor/aho-corasick/src/dfa.rs new file mode 100644 index 0000000..eabd15b --- /dev/null +++ b/vendor/aho-corasick/src/dfa.rs @@ -0,0 +1,835 @@ +/*! +Provides direct access to a DFA implementation of Aho-Corasick. + +This is a low-level API that generally only needs to be used in niche +circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) +instead of a DFA directly. Using an `DFA` directly is typically only necessary +when one needs access to the [`Automaton`] trait implementation. +*/ + +use alloc::{vec, vec::Vec}; + +use crate::{ + automaton::Automaton, + nfa::noncontiguous, + util::{ + alphabet::ByteClasses, + error::{BuildError, MatchError}, + int::{Usize, U32}, + prefilter::Prefilter, + primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, + search::{Anchored, MatchKind, StartKind}, + special::Special, + }, +}; + +/// A DFA implementation of Aho-Corasick. +/// +/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of +/// this type directly. 
Using a `DFA` directly is typically only necessary when +/// one needs access to the [`Automaton`] trait implementation. +/// +/// This DFA can only be built by first constructing a [`noncontiguous::NFA`]. +/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but +/// [`Builder::build_from_noncontiguous`] permits doing it explicitly. +/// +/// A DFA provides the best possible search performance (in this crate) via two +/// mechanisms: +/// +/// * All states use a dense representation for their transitions. +/// * All failure transitions are pre-computed such that they are never +/// explicitly handled at search time. +/// +/// These two facts combined mean that every state transition is performed +/// using a constant number of instructions. However, this comes at +/// great cost. The memory usage of a DFA can be quite exorbitant. +/// It is potentially multiple orders of magnitude greater than a +/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange, +/// a DFA will typically have better search speed than a `contiguous::NFA`, but +/// not by orders of magnitude. +/// +/// Unless you have a small number of patterns or memory usage is not a concern +/// and search performance is critical, a DFA is usually not the best choice. +/// +/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to +/// support for anchored and unanchored search configurations. Namely, +/// since failure transitions are pre-computed, supporting both anchored +/// and unanchored searches requires a duplication of the transition table, +/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate +/// unconditionally support both anchored and unanchored searches because there +/// is essentially no added cost for doing so.) It is for this reason that +/// a DFA's support for anchored and unanchored searches can be configured +/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored +/// searches. 
+/// +/// # Example +/// +/// This example shows how to build an `DFA` directly and use it to execute +/// [`Automaton::try_find`]: +/// +/// ``` +/// use aho_corasick::{ +/// automaton::Automaton, +/// dfa::DFA, +/// Input, Match, +/// }; +/// +/// let patterns = &["b", "abc", "abcd"]; +/// let haystack = "abcd"; +/// +/// let nfa = DFA::new(patterns).unwrap(); +/// assert_eq!( +/// Some(Match::must(0, 1..2)), +/// nfa.try_find(&Input::new(haystack))?, +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// It is also possible to implement your own version of `try_find`. See the +/// [`Automaton`] documentation for an example. +#[derive(Clone)] +pub struct DFA { + /// The DFA transition table. IDs in this table are pre-multiplied. So + /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride, + /// 2*stride, 3*stride, ... + trans: Vec<StateID>, + /// The matches for every match state in this DFA. This is first indexed by + /// state index (so that's `sid >> stride2`) and then by order in which the + /// matches are meant to occur. + matches: Vec<Vec<PatternID>>, + /// The amount of heap memory used, in bytes, by the inner Vecs of + /// 'matches'. + matches_memory_usage: usize, + /// The length of each pattern. This is used to compute the start offset + /// of a match. + pattern_lens: Vec<SmallIndex>, + /// A prefilter for accelerating searches, if one exists. + prefilter: Option<Prefilter>, + /// The match semantics built into this DFA. + match_kind: MatchKind, + /// The total number of states in this DFA. + state_len: usize, + /// The alphabet size, or total number of equivalence classes, for this + /// DFA. Note that the actual number of transitions in each state is + /// stride=2^stride2, where stride is the smallest power of 2 greater than + /// or equal to alphabet_len. We do things this way so that we can use + /// bitshifting to go from a state ID to an index into 'matches'. 
+ alphabet_len: usize, + /// The exponent with a base 2, such that stride=2^stride2. Given a state + /// index 'i', its state identifier is 'i << stride2'. Given a state + /// identifier 'sid', its state index is 'sid >> stride2'. + stride2: usize, + /// The equivalence classes for this DFA. All transitions are defined on + /// equivalence classes and not on the 256 distinct byte values. + byte_classes: ByteClasses, + /// The length of the shortest pattern in this automaton. + min_pattern_len: usize, + /// The length of the longest pattern in this automaton. + max_pattern_len: usize, + /// The information required to deduce which states are "special" in this + /// DFA. + special: Special, +} + +impl DFA { + /// Create a new Aho-Corasick DFA using the default configuration. + /// + /// Use a [`Builder`] if you want to change the configuration. + pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + DFA::builder().build(patterns) + } + + /// A convenience method for returning a new Aho-Corasick DFA builder. + /// + /// This usually permits one to just import the `DFA` type. + pub fn builder() -> Builder { + Builder::new() + } +} + +impl DFA { + /// A sentinel state ID indicating that a search should stop once it has + /// entered this state. When a search stops, it returns a match if one has + /// been found, otherwise no match. A DFA always has an actual dead state + /// at this ID. + /// + /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. + /// Namely, the whole point of a DFA is that the FAIL state is completely + /// compiled away. That is, DFA construction involves pre-computing the + /// failure transitions everywhere, such that failure transitions are no + /// longer used at search time. This, combined with its uniformly dense + /// representation, are the two most important factors in why it's faster + /// than the NFAs in this crate. 
+ const DEAD: StateID = StateID::new_unchecked(0); + + /// Adds the given pattern IDs as matches to the given state and also + /// records the added memory usage. + fn set_matches( + &mut self, + sid: StateID, + pids: impl Iterator<Item = PatternID>, + ) { + let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); + let mut at_least_one = false; + for pid in pids { + self.matches[index].push(pid); + self.matches_memory_usage += PatternID::SIZE; + at_least_one = true; + } + assert!(at_least_one, "match state must have non-empty pids"); + } +} + +// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always +// returns a valid state ID given a valid state ID. We otherwise claim that +// all other methods are correct as well. +unsafe impl Automaton for DFA { + #[inline(always)] + fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { + // Either of the start state IDs can be DEAD, in which case, support + // for that type of search is not provided by this DFA. Which start + // state IDs are inactive depends on the 'StartKind' configuration at + // DFA construction time. 
+ match anchored { + Anchored::No => { + let start = self.special.start_unanchored_id; + if start == DFA::DEAD { + Err(MatchError::invalid_input_unanchored()) + } else { + Ok(start) + } + } + Anchored::Yes => { + let start = self.special.start_anchored_id; + if start == DFA::DEAD { + Err(MatchError::invalid_input_anchored()) + } else { + Ok(start) + } + } + } + } + + #[inline(always)] + fn next_state( + &self, + _anchored: Anchored, + sid: StateID, + byte: u8, + ) -> StateID { + let class = self.byte_classes.get(byte); + self.trans[(sid.as_u32() + u32::from(class)).as_usize()] + } + + #[inline(always)] + fn is_special(&self, sid: StateID) -> bool { + sid <= self.special.max_special_id + } + + #[inline(always)] + fn is_dead(&self, sid: StateID) -> bool { + sid == DFA::DEAD + } + + #[inline(always)] + fn is_match(&self, sid: StateID) -> bool { + !self.is_dead(sid) && sid <= self.special.max_match_id + } + + #[inline(always)] + fn is_start(&self, sid: StateID) -> bool { + sid == self.special.start_unanchored_id + || sid == self.special.start_anchored_id + } + + #[inline(always)] + fn match_kind(&self) -> MatchKind { + self.match_kind + } + + #[inline(always)] + fn patterns_len(&self) -> usize { + self.pattern_lens.len() + } + + #[inline(always)] + fn pattern_len(&self, pid: PatternID) -> usize { + self.pattern_lens[pid].as_usize() + } + + #[inline(always)] + fn min_pattern_len(&self) -> usize { + self.min_pattern_len + } + + #[inline(always)] + fn max_pattern_len(&self) -> usize { + self.max_pattern_len + } + + #[inline(always)] + fn match_len(&self, sid: StateID) -> usize { + debug_assert!(self.is_match(sid)); + let offset = (sid.as_usize() >> self.stride2) - 2; + self.matches[offset].len() + } + + #[inline(always)] + fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { + debug_assert!(self.is_match(sid)); + let offset = (sid.as_usize() >> self.stride2) - 2; + self.matches[offset][index] + } + + #[inline(always)] + fn memory_usage(&self) -> usize { + 
use core::mem::size_of; + + (self.trans.len() * size_of::<u32>()) + + (self.matches.len() * size_of::<Vec<PatternID>>()) + + self.matches_memory_usage + + (self.pattern_lens.len() * size_of::<SmallIndex>()) + + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) + } + + #[inline(always)] + fn prefilter(&self) -> Option<&Prefilter> { + self.prefilter.as_ref() + } +} + +impl core::fmt::Debug for DFA { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::{ + automaton::{fmt_state_indicator, sparse_transitions}, + util::debug::DebugByte, + }; + + writeln!(f, "dfa::DFA(")?; + for index in 0..self.state_len { + let sid = StateID::new_unchecked(index << self.stride2); + // While we do currently include the FAIL state in the transition + // table (to simplify construction), it is never actually used. It + // poses problems with the code below because it gets treated as + // a match state incidentally when it is, of course, not. So we + // special case it. The fail state is always the first state after + // the dead state. + // + // If the construction is changed to remove the fail state (it + // probably should be), then this special case should be updated. 
+ if index == 1 { + writeln!(f, "F {:06}:", sid.as_usize())?; + continue; + } + fmt_state_indicator(f, self, sid)?; + write!(f, "{:06}: ", sid.as_usize())?; + + let it = (0..self.byte_classes.alphabet_len()).map(|class| { + (class.as_u8(), self.trans[sid.as_usize() + class]) + }); + for (i, (start, end, next)) in sparse_transitions(it).enumerate() { + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!( + f, + "{:?} => {:?}", + DebugByte(start), + next.as_usize() + )?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize() + )?; + } + } + write!(f, "\n")?; + if self.is_match(sid) { + write!(f, " matches: ")?; + for i in 0..self.match_len(sid) { + if i > 0 { + write!(f, ", ")?; + } + let pid = self.match_pattern(sid, i); + write!(f, "{}", pid.as_usize())?; + } + write!(f, "\n")?; + } + } + writeln!(f, "match kind: {:?}", self.match_kind)?; + writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?; + writeln!(f, "state length: {:?}", self.state_len)?; + writeln!(f, "pattern length: {:?}", self.patterns_len())?; + writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?; + writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?; + writeln!(f, "alphabet length: {:?}", self.alphabet_len)?; + writeln!(f, "stride: {:?}", 1 << self.stride2)?; + writeln!(f, "byte classes: {:?}", self.byte_classes)?; + writeln!(f, "memory usage: {:?}", self.memory_usage())?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// A builder for configuring an Aho-Corasick DFA. +/// +/// This builder has a subset of the options available to a +/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, +/// their behavior is identical. 
+#[derive(Clone, Debug)] +pub struct Builder { + noncontiguous: noncontiguous::Builder, + start_kind: StartKind, + byte_classes: bool, +} + +impl Default for Builder { + fn default() -> Builder { + Builder { + noncontiguous: noncontiguous::Builder::new(), + start_kind: StartKind::Unanchored, + byte_classes: true, + } + } +} + +impl Builder { + /// Create a new builder for configuring an Aho-Corasick DFA. + pub fn new() -> Builder { + Builder::default() + } + + /// Build an Aho-Corasick DFA from the given iterator of patterns. + /// + /// A builder may be reused to create more DFAs. + pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + let nnfa = self.noncontiguous.build(patterns)?; + self.build_from_noncontiguous(&nnfa) + } + + /// Build an Aho-Corasick DFA from the given noncontiguous NFA. + /// + /// Note that when this method is used, only the `start_kind` and + /// `byte_classes` settings on this builder are respected. The other + /// settings only apply to the initial construction of the Aho-Corasick + /// automaton. Since using this method requires that initial construction + /// has already completed, all settings impacting only initial construction + /// are no longer relevant. + pub fn build_from_noncontiguous( + &self, + nnfa: &noncontiguous::NFA, + ) -> Result<DFA, BuildError> { + debug!("building DFA"); + let byte_classes = if self.byte_classes { + nnfa.byte_classes().clone() + } else { + ByteClasses::singletons() + }; + let state_len = match self.start_kind { + StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(), + StartKind::Both => { + // These unwraps are OK because we know that the number of + // NFA states is < StateID::LIMIT which is in turn less than + // i32::MAX. Thus, there is always room to multiply by 2. 
+ // Finally, the number of states is always at least 4 in the + // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the + // subtraction of 4 is okay. + // + // Note that we subtract 4 because the "anchored" part of + // the DFA duplicates the unanchored part (without failure + // transitions), but reuses the DEAD, FAIL and START states. + nnfa.states() + .len() + .checked_mul(2) + .unwrap() + .checked_sub(4) + .unwrap() + } + }; + let trans_len = + match state_len.checked_shl(byte_classes.stride2().as_u32()) { + Some(trans_len) => trans_len, + None => { + return Err(BuildError::state_id_overflow( + StateID::MAX.as_u64(), + usize::MAX.as_u64(), + )) + } + }; + StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap()) + .map_err(|e| { + BuildError::state_id_overflow( + StateID::MAX.as_u64(), + e.attempted(), + ) + })?; + let num_match_states = match self.start_kind { + StartKind::Unanchored | StartKind::Anchored => { + nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap() + } + StartKind::Both => nnfa + .special() + .max_match_id + .as_usize() + .checked_sub(1) + .unwrap() + .checked_mul(2) + .unwrap(), + }; + let mut dfa = DFA { + trans: vec![DFA::DEAD; trans_len], + matches: vec![vec![]; num_match_states], + matches_memory_usage: 0, + pattern_lens: nnfa.pattern_lens_raw().to_vec(), + prefilter: nnfa.prefilter().map(|p| p.clone()), + match_kind: nnfa.match_kind(), + state_len, + alphabet_len: byte_classes.alphabet_len(), + stride2: byte_classes.stride2(), + byte_classes, + min_pattern_len: nnfa.min_pattern_len(), + max_pattern_len: nnfa.max_pattern_len(), + // The special state IDs are set later. 
+ special: Special::zero(), + }; + match self.start_kind { + StartKind::Both => { + self.finish_build_both_starts(nnfa, &mut dfa); + } + StartKind::Unanchored => { + self.finish_build_one_start(Anchored::No, nnfa, &mut dfa); + } + StartKind::Anchored => { + self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa) + } + } + debug!( + "DFA built, <states: {:?}, size: {:?}, \ + alphabet len: {:?}, stride: {:?}>", + dfa.state_len, + dfa.memory_usage(), + dfa.byte_classes.alphabet_len(), + dfa.byte_classes.stride(), + ); + // The vectors can grow ~twice as big during construction because a + // Vec amortizes growth. But here, let's shrink things back down to + // what we actually need since we're never going to add more to it. + dfa.trans.shrink_to_fit(); + dfa.pattern_lens.shrink_to_fit(); + dfa.matches.shrink_to_fit(); + // TODO: We might also want to shrink each Vec inside of `dfa.matches`, + // or even better, convert it to one contiguous allocation. But I think + // I went with nested allocs for good reason (can't remember), so this + // may be tricky to do. I decided not to shrink them here because it + // might require a fair bit of work to do. It's unclear whether it's + // worth it. + Ok(dfa) + } + + /// Finishes building a DFA for either unanchored or anchored searches, + /// but NOT both. + fn finish_build_one_start( + &self, + anchored: Anchored, + nnfa: &noncontiguous::NFA, + dfa: &mut DFA, + ) { + // This function always succeeds because we check above that all of the + // states in the NFA can be mapped to DFA state IDs. 
+ let stride2 = dfa.stride2; + let old2new = |oldsid: StateID| { + StateID::new_unchecked(oldsid.as_usize() << stride2) + }; + for (oldsid, state) in nnfa.states().iter().with_state_ids() { + let newsid = old2new(oldsid); + if state.is_match() { + dfa.set_matches(newsid, nnfa.iter_matches(oldsid)); + } + sparse_iter( + nnfa, + oldsid, + &dfa.byte_classes, + |byte, class, mut oldnextsid| { + if oldnextsid == noncontiguous::NFA::FAIL { + if anchored.is_anchored() { + oldnextsid = noncontiguous::NFA::DEAD; + } else if state.fail() == noncontiguous::NFA::DEAD { + // This is a special case that avoids following + // DEAD transitions in a non-contiguous NFA. + // Following these transitions is pretty slow + // because the non-contiguous NFA will always use + // a sparse representation for it (because the + // DEAD state is usually treated as a sentinel). + // The *vast* majority of failure states are DEAD + // states, so this winds up being pretty slow if + // we go through the non-contiguous NFA state + // transition logic. Instead, just do it ourselves. + oldnextsid = noncontiguous::NFA::DEAD; + } else { + oldnextsid = nnfa.next_state( + Anchored::No, + state.fail(), + byte, + ); + } + } + dfa.trans[newsid.as_usize() + usize::from(class)] = + old2new(oldnextsid); + }, + ); + } + // Now that we've remapped all the IDs in our states, all that's left + // is remapping the special state IDs. + let old = nnfa.special(); + let new = &mut dfa.special; + new.max_special_id = old2new(old.max_special_id); + new.max_match_id = old2new(old.max_match_id); + if anchored.is_anchored() { + new.start_unanchored_id = DFA::DEAD; + new.start_anchored_id = old2new(old.start_anchored_id); + } else { + new.start_unanchored_id = old2new(old.start_unanchored_id); + new.start_anchored_id = DFA::DEAD; + } + } + + /// Finishes building a DFA that supports BOTH unanchored and anchored + /// searches. 
It works by inter-leaving unanchored states with anchored + /// states in the same transition table. This way, we avoid needing to + /// re-shuffle states afterward to ensure that our states still look like + /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ... + /// + /// Honestly this is pretty inscrutable... Simplifications are most + /// welcome. + fn finish_build_both_starts( + &self, + nnfa: &noncontiguous::NFA, + dfa: &mut DFA, + ) { + let stride2 = dfa.stride2; + let stride = 1 << stride2; + let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()]; + let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()]; + let mut is_anchored = vec![false; dfa.state_len]; + let mut newsid = DFA::DEAD; + let next_dfa_id = + |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride); + for (oldsid, state) in nnfa.states().iter().with_state_ids() { + if oldsid == noncontiguous::NFA::DEAD + || oldsid == noncontiguous::NFA::FAIL + { + remap_unanchored[oldsid] = newsid; + remap_anchored[oldsid] = newsid; + newsid = next_dfa_id(newsid); + } else if oldsid == nnfa.special().start_unanchored_id + || oldsid == nnfa.special().start_anchored_id + { + if oldsid == nnfa.special().start_unanchored_id { + remap_unanchored[oldsid] = newsid; + remap_anchored[oldsid] = DFA::DEAD; + } else { + remap_unanchored[oldsid] = DFA::DEAD; + remap_anchored[oldsid] = newsid; + is_anchored[newsid.as_usize() >> stride2] = true; + } + if state.is_match() { + dfa.set_matches(newsid, nnfa.iter_matches(oldsid)); + } + sparse_iter( + nnfa, + oldsid, + &dfa.byte_classes, + |_, class, oldnextsid| { + let class = usize::from(class); + if oldnextsid == noncontiguous::NFA::FAIL { + dfa.trans[newsid.as_usize() + class] = DFA::DEAD; + } else { + dfa.trans[newsid.as_usize() + class] = oldnextsid; + } + }, + ); + newsid = next_dfa_id(newsid); + } else { + let unewsid = newsid; + newsid = next_dfa_id(newsid); + let anewsid = newsid; + newsid = next_dfa_id(newsid); + + 
remap_unanchored[oldsid] = unewsid; + remap_anchored[oldsid] = anewsid; + is_anchored[anewsid.as_usize() >> stride2] = true; + if state.is_match() { + dfa.set_matches(unewsid, nnfa.iter_matches(oldsid)); + dfa.set_matches(anewsid, nnfa.iter_matches(oldsid)); + } + sparse_iter( + nnfa, + oldsid, + &dfa.byte_classes, + |byte, class, oldnextsid| { + let class = usize::from(class); + if oldnextsid == noncontiguous::NFA::FAIL { + let oldnextsid = + if state.fail() == noncontiguous::NFA::DEAD { + noncontiguous::NFA::DEAD + } else { + nnfa.next_state( + Anchored::No, + state.fail(), + byte, + ) + }; + dfa.trans[unewsid.as_usize() + class] = oldnextsid; + } else { + dfa.trans[unewsid.as_usize() + class] = oldnextsid; + dfa.trans[anewsid.as_usize() + class] = oldnextsid; + } + }, + ); + } + } + for i in 0..dfa.state_len { + let sid = i << stride2; + if is_anchored[i] { + for next in dfa.trans[sid..][..stride].iter_mut() { + *next = remap_anchored[*next]; + } + } else { + for next in dfa.trans[sid..][..stride].iter_mut() { + *next = remap_unanchored[*next]; + } + } + } + // Now that we've remapped all the IDs in our states, all that's left + // is remapping the special state IDs. + let old = nnfa.special(); + let new = &mut dfa.special; + new.max_special_id = remap_anchored[old.max_special_id]; + new.max_match_id = remap_anchored[old.max_match_id]; + new.start_unanchored_id = remap_unanchored[old.start_unanchored_id]; + new.start_anchored_id = remap_anchored[old.start_anchored_id]; + } + + /// Set the desired match semantics. + /// + /// This only applies when using [`Builder::build`] and not + /// [`Builder::build_from_noncontiguous`]. + /// + /// See + /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) + /// for more documentation and examples. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { + self.noncontiguous.match_kind(kind); + self + } + + /// Enable ASCII-aware case insensitive matching. 
+ /// + /// This only applies when using [`Builder::build`] and not + /// [`Builder::build_from_noncontiguous`]. + /// + /// See + /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) + /// for more documentation and examples. + pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { + self.noncontiguous.ascii_case_insensitive(yes); + self + } + + /// Enable heuristic prefilter optimizations. + /// + /// This only applies when using [`Builder::build`] and not + /// [`Builder::build_from_noncontiguous`]. + /// + /// See + /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) + /// for more documentation and examples. + pub fn prefilter(&mut self, yes: bool) -> &mut Builder { + self.noncontiguous.prefilter(yes); + self + } + + /// Sets the starting state configuration for the automaton. + /// + /// See + /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) + /// for more documentation and examples. + pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder { + self.start_kind = kind; + self + } + + /// A debug setting for whether to attempt to shrink the size of the + /// automaton's alphabet or not. + /// + /// This should never be enabled unless you're debugging an automaton. + /// Namely, disabling byte classes makes transitions easier to reason + /// about, since they use the actual bytes instead of equivalence classes. + /// Disabling this confers no performance benefit at search time. + /// + /// See + /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) + /// for more documentation and examples. + pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { + self.byte_classes = yes; + self + } +} + +/// Iterate over all possible equivalence class transitions in this state. +/// The closure is called for all transitions with a distinct equivalence +/// class, even those not explicitly represented in this sparse state. 
For +/// any implicitly defined transitions, the given closure is called with +/// the fail state ID. +/// +/// The closure is guaranteed to be called precisely +/// `byte_classes.alphabet_len()` times, once for every possible class in +/// ascending order. +fn sparse_iter<F: FnMut(u8, u8, StateID)>( + nnfa: &noncontiguous::NFA, + oldsid: StateID, + classes: &ByteClasses, + mut f: F, +) { + let mut prev_class = None; + let mut byte = 0usize; + for t in nnfa.iter_trans(oldsid) { + while byte < usize::from(t.byte()) { + let rep = byte.as_u8(); + let class = classes.get(rep); + byte += 1; + if prev_class != Some(class) { + f(rep, class, noncontiguous::NFA::FAIL); + prev_class = Some(class); + } + } + let rep = t.byte(); + let class = classes.get(rep); + byte += 1; + if prev_class != Some(class) { + f(rep, class, t.next()); + prev_class = Some(class); + } + } + for b in byte..=255 { + let rep = b.as_u8(); + let class = classes.get(rep); + if prev_class != Some(class) { + f(rep, class, noncontiguous::NFA::FAIL); + prev_class = Some(class); + } + } +} diff --git a/vendor/aho-corasick/src/lib.rs b/vendor/aho-corasick/src/lib.rs new file mode 100644 index 0000000..20e8b81 --- /dev/null +++ b/vendor/aho-corasick/src/lib.rs @@ -0,0 +1,326 @@ +/*! +A library for finding occurrences of many patterns at once. This library +provides multiple pattern search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a fast finite state machine for executing searches in linear time. + +Additionally, this library provides a number of configuration options for +building the automaton that permit controlling the space versus time trade +off. Other features include simple ASCII case insensitive matching, finding +overlapping matches, replacements, searching streams and even searching and +replacing text in streams. 
+ +Finally, unlike most other Aho-Corasick implementations, this one +supports enabling [leftmost-first](MatchKind::LeftmostFirst) or +[leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a +(seemingly) novel alternative construction algorithm. For more details on what +match semantics means, see the [`MatchKind`] type. + +# Overview + +This section gives a brief overview of the primary types in this crate: + +* [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton. +This is the type you use to execute searches. +* [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and +supports configuring a number of options. +* [`Match`] represents a single match reported by an Aho-Corasick automaton. +Each match has two pieces of information: the pattern that matched and the +start and end byte offsets corresponding to the position in the haystack at +which it matched. + +# Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. 
+ +``` +use aho_corasick::{AhoCorasick, PatternID}; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns).unwrap(); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (PatternID::must(1), 13, 18), + (PatternID::must(0), 28, 33), + (PatternID::must(2), 43, 50), +]); +``` + +# Example: case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +``` +use aho_corasick::{AhoCorasick, PatternID}; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::builder() + .ascii_case_insensitive(true) + .build(patterns) + .unwrap(); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (PatternID::must(1), 13, 18), + (PatternID::must(0), 28, 33), + (PatternID::must(2), 43, 50), +]); +``` + +# Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +``` +# #[cfg(feature = "std")] { +use aho_corasick::AhoCorasick; + +# fn example() -> Result<(), std::io::Error> { +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. 
+let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns).unwrap(); +ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +# Ok(()) }; example().unwrap() +# } +``` + +# Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. + +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. 
For example, here's the standard approach: + +``` +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns).unwrap(); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +``` +use aho_corasick::{AhoCorasick, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns) + .unwrap(); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See [`MatchKind`] for more details. + +# Prefilters + +While an Aho-Corasick automaton can perform admirably when compared to more +naive solutions, it is generally slower than more specialized algorithms that +are accelerated using vector instructions such as SIMD. + +For that reason, this library will internally use a "prefilter" to attempt +to accelerate searches when possible. Currently, this library has several +different algorithms it might use depending on the patterns provided. Once the +number of patterns gets too big, prefilters are no longer used. + +While a prefilter is generally good to have on by default since it works +well in the common case, it can lead to less predictable or even sub-optimal +performance in some cases. For that reason, prefilters can be explicitly +disabled via [`AhoCorasickBuilder::prefilter`]. + +# Lower level APIs + +This crate also provides several sub-modules that collectively expose many of +the implementation details of the main [`AhoCorasick`] type. 
Most users of this +library can completely ignore the submodules and their contents, but if you +needed finer grained control, some parts of them may be useful to you. Here is +a brief overview of each and why you might want to use them: + +* The [`packed`] sub-module contains a lower level API for using fast +vectorized routines for finding a small number of patterns in a haystack. +You might want to use this API when you want to completely side-step using +Aho-Corasick automata. Otherwise, the fast vectorized routines are used +automatically as prefilters for `AhoCorasick` searches whenever possible. +* The [`automaton`] sub-module provides a lower level finite state +machine interface that the various Aho-Corasick implementations in +this crate implement. This sub-module's main contribution is the +[`Automaton`](automaton::Automaton) trait, which permits manually walking the +state transitions of an Aho-Corasick automaton. +* The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of +the aforementioned `Automaton` trait. The main reason one might want to use +these sub-modules is to get access to a type that implements the `Automaton` +trait. (The top-level `AhoCorasick` type does not implement the `Automaton` +trait.) + +As mentioned above, if you aren't sure whether you need these sub-modules, +you should be able to safely ignore them and just focus on the [`AhoCorasick`] +type. + +# Crate features + +This crate exposes a few features for controlling dependency usage and whether +this crate can be used without the standard library. + +* **std** - + Enables support for the standard library. This feature is enabled by + default. When disabled, only `core` and `alloc` are used. At an API + level, enabling `std` enables `std::error::Error` trait impls for the + various error types, and higher level stream search routines such as + [`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required + to enable vectorized prefilters. 
Prefilters can greatly accelerate searches,
+  but generally only apply when the number of patterns is small (less than
+  ~100).
+* **perf-literal** -
+  Enables support for literal prefilters that use vectorized routines from
+  external crates. This feature is enabled by default. If you're only using
+  Aho-Corasick for large numbers of patterns or otherwise can abide lower
+  throughput when searching with a small number of patterns, then it is
+  reasonable to disable this feature.
+* **logging** -
+  Enables a dependency on the `log` crate and emits messages to aid in
+  diagnostics. This feature is disabled by default.
+*/
+
+#![no_std]
+#![deny(missing_docs)]
+#![deny(rustdoc::broken_intra_doc_links)]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+
+extern crate alloc;
+#[cfg(any(test, feature = "std"))]
+extern crate std;
+
+#[cfg(doctest)]
+doc_comment::doctest!("../README.md");
+
+#[cfg(feature = "std")]
+pub use crate::ahocorasick::StreamFindIter;
+pub use crate::{
+    ahocorasick::{
+        AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter,
+        FindOverlappingIter,
+    },
+    util::{
+        error::{BuildError, MatchError, MatchErrorKind},
+        primitives::{PatternID, PatternIDError},
+        search::{Anchored, Input, Match, MatchKind, Span, StartKind},
+    },
+};
+
+#[macro_use]
+mod macros;
+
+mod ahocorasick;
+pub mod automaton;
+pub mod dfa;
+pub mod nfa;
+pub mod packed;
+#[cfg(test)]
+mod tests;
+// I wrote out the module for implementing fst::Automaton only to later realize
+// that this would make fst a public dependency and fst is not at 1.0 yet. I
+// decided to just keep the code in tree, but build it only during tests.
+//
+// TODO: I think I've changed my mind again. I'm considering pushing it out
+// into either a separate crate or into 'fst' directly as an optional feature.
+// #[cfg(test)] +// #[allow(dead_code)] +// mod transducer; +pub(crate) mod util; + +#[cfg(test)] +mod testoibits { + use std::panic::{RefUnwindSafe, UnwindSafe}; + + use super::*; + + fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} + + #[test] + fn oibits_main() { + assert_all::<AhoCorasick>(); + assert_all::<AhoCorasickBuilder>(); + assert_all::<AhoCorasickKind>(); + assert_all::<FindIter>(); + assert_all::<FindOverlappingIter>(); + + assert_all::<BuildError>(); + assert_all::<MatchError>(); + assert_all::<MatchErrorKind>(); + + assert_all::<Anchored>(); + assert_all::<Input>(); + assert_all::<Match>(); + assert_all::<MatchKind>(); + assert_all::<Span>(); + assert_all::<StartKind>(); + } + + #[test] + fn oibits_automaton() { + use crate::{automaton, dfa::DFA}; + + assert_all::<automaton::FindIter<DFA>>(); + assert_all::<automaton::FindOverlappingIter<DFA>>(); + #[cfg(feature = "std")] + assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>(); + assert_all::<automaton::OverlappingState>(); + + assert_all::<automaton::Prefilter>(); + assert_all::<automaton::Candidate>(); + } + + #[test] + fn oibits_packed() { + use crate::packed; + + assert_all::<packed::Config>(); + assert_all::<packed::Builder>(); + assert_all::<packed::Searcher>(); + assert_all::<packed::FindIter>(); + assert_all::<packed::MatchKind>(); + } +} diff --git a/vendor/aho-corasick/src/macros.rs b/vendor/aho-corasick/src/macros.rs new file mode 100644 index 0000000..fc73e6e --- /dev/null +++ b/vendor/aho-corasick/src/macros.rs @@ -0,0 +1,18 @@ +#![allow(unused_macros)] + +macro_rules! log { + ($($tt:tt)*) => { + #[cfg(feature = "logging")] + { + $($tt)* + } + } +} + +macro_rules! debug { + ($($tt:tt)*) => { log!(log::debug!($($tt)*)) } +} + +macro_rules! 
trace { + ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } +} diff --git a/vendor/aho-corasick/src/nfa/contiguous.rs b/vendor/aho-corasick/src/nfa/contiguous.rs new file mode 100644 index 0000000..29c1621 --- /dev/null +++ b/vendor/aho-corasick/src/nfa/contiguous.rs @@ -0,0 +1,1141 @@ +/*! +Provides a contiguous NFA implementation of Aho-Corasick. + +This is a low-level API that generally only needs to be used in niche +circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) +instead of a contiguous NFA directly. Using an `NFA` directly is typically only +necessary when one needs access to the [`Automaton`] trait implementation. +*/ + +use alloc::{vec, vec::Vec}; + +use crate::{ + automaton::Automaton, + nfa::noncontiguous, + util::{ + alphabet::ByteClasses, + error::{BuildError, MatchError}, + int::{Usize, U16, U32}, + prefilter::Prefilter, + primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, + search::{Anchored, MatchKind}, + special::Special, + }, +}; + +/// A contiguous NFA implementation of Aho-Corasick. +/// +/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of +/// this type directly. Using an `NFA` directly is typically only necessary +/// when one needs access to the [`Automaton`] trait implementation. +/// +/// This NFA can only be built by first constructing a [`noncontiguous::NFA`]. +/// Both [`NFA::new`] and [`Builder::build`] do this for you automatically, but +/// [`Builder::build_from_noncontiguous`] permits doing it explicitly. +/// +/// The main difference between a noncontiguous NFA and a contiguous NFA is +/// that the latter represents all of its states and transitions in a single +/// allocation, where as the former uses a separate allocation for each state. 
+/// Doing this at construction time while keeping a low memory footprint isn't +/// feasible, which is primarily why there are two different NFA types: one +/// that does the least amount of work possible to build itself, and another +/// that does a little extra work to compact itself and make state transitions +/// faster by making some states use a dense representation. +/// +/// Because a contiguous NFA uses a single allocation, there is a lot more +/// opportunity for compression tricks to reduce the heap memory used. Indeed, +/// it is not uncommon for a contiguous NFA to use an order of magnitude less +/// heap memory than a noncontiguous NFA. Since building a contiguous NFA +/// usually only takes a fraction of the time it takes to build a noncontiguous +/// NFA, the overall build time is not much slower. Thus, in most cases, a +/// contiguous NFA is the best choice. +/// +/// Since a contiguous NFA uses various tricks for compression and to achieve +/// faster state transitions, currently, its limit on the number of states +/// is somewhat smaller than what a noncontiguous NFA can achieve. Generally +/// speaking, you shouldn't expect to run into this limit if the number of +/// patterns is under 1 million. It is plausible that this limit will be +/// increased in the future. If the limit is reached, building a contiguous NFA +/// will return an error. Often, since building a contiguous NFA is relatively +/// cheap, it can make sense to always try it even if you aren't sure if it +/// will fail or not. If it does, you can always fall back to a noncontiguous +/// NFA. (Indeed, the main [`AhoCorasick`](crate::AhoCorasick) type employs a +/// strategy similar to this at construction time.) 
+/// +/// # Example +/// +/// This example shows how to build an `NFA` directly and use it to execute +/// [`Automaton::try_find`]: +/// +/// ``` +/// use aho_corasick::{ +/// automaton::Automaton, +/// nfa::contiguous::NFA, +/// Input, Match, +/// }; +/// +/// let patterns = &["b", "abc", "abcd"]; +/// let haystack = "abcd"; +/// +/// let nfa = NFA::new(patterns).unwrap(); +/// assert_eq!( +/// Some(Match::must(0, 1..2)), +/// nfa.try_find(&Input::new(haystack))?, +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// It is also possible to implement your own version of `try_find`. See the +/// [`Automaton`] documentation for an example. +#[derive(Clone)] +pub struct NFA { + /// The raw NFA representation. Each state is packed with a header + /// (containing the format of the state, the failure transition and, for + /// a sparse state, the number of transitions), its transitions and any + /// matching pattern IDs for match states. + repr: Vec<u32>, + /// The length of each pattern. This is used to compute the start offset + /// of a match. + pattern_lens: Vec<SmallIndex>, + /// The total number of states in this NFA. + state_len: usize, + /// A prefilter for accelerating searches, if one exists. + prefilter: Option<Prefilter>, + /// The match semantics built into this NFA. + match_kind: MatchKind, + /// The alphabet size, or total number of equivalence classes, for this + /// NFA. Dense states always have this many transitions. + alphabet_len: usize, + /// The equivalence classes for this NFA. All transitions, dense and + /// sparse, are defined on equivalence classes and not on the 256 distinct + /// byte values. + byte_classes: ByteClasses, + /// The length of the shortest pattern in this automaton. + min_pattern_len: usize, + /// The length of the longest pattern in this automaton. + max_pattern_len: usize, + /// The information required to deduce which states are "special" in this + /// NFA. 
+ special: Special, +} + +impl NFA { + /// Create a new Aho-Corasick contiguous NFA using the default + /// configuration. + /// + /// Use a [`Builder`] if you want to change the configuration. + pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + NFA::builder().build(patterns) + } + + /// A convenience method for returning a new Aho-Corasick contiguous NFA + /// builder. + /// + /// This usually permits one to just import the `NFA` type. + pub fn builder() -> Builder { + Builder::new() + } +} + +impl NFA { + /// A sentinel state ID indicating that a search should stop once it has + /// entered this state. When a search stops, it returns a match if one + /// has been found, otherwise no match. A contiguous NFA always has an + /// actual dead state at this ID. + const DEAD: StateID = StateID::new_unchecked(0); + /// Another sentinel state ID indicating that a search should move through + /// current state's failure transition. + /// + /// Note that unlike DEAD, this does not actually point to a valid state + /// in a contiguous NFA. (noncontiguous::NFA::FAIL does point to a valid + /// state.) Instead, this points to the position that is guaranteed to + /// never be a valid state ID (by making sure it points to a place in the + /// middle of the encoding of the DEAD state). Since we never need to + /// actually look at the FAIL state itself, this works out. + /// + /// By why do it this way? So that FAIL is a constant. I don't have any + /// concrete evidence that this materially helps matters, but it's easy to + /// do. The alternative would be making the FAIL ID point to the second + /// state, which could be made a constant but is a little trickier to do. + /// The easiest path is to just make the FAIL state a runtime value, but + /// since comparisons with FAIL occur in perf critical parts of the search, + /// we want it to be as tight as possible and not waste any registers. 
+ /// + /// Very hand wavy... But the code complexity that results from this is + /// very mild. + const FAIL: StateID = StateID::new_unchecked(1); +} + +// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always +// returns a valid state ID given a valid state ID. We otherwise claim that +// all other methods are correct as well. +unsafe impl Automaton for NFA { + #[inline(always)] + fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { + match anchored { + Anchored::No => Ok(self.special.start_unanchored_id), + Anchored::Yes => Ok(self.special.start_anchored_id), + } + } + + #[inline(always)] + fn next_state( + &self, + anchored: Anchored, + mut sid: StateID, + byte: u8, + ) -> StateID { + let repr = &self.repr; + let class = self.byte_classes.get(byte); + let u32tosid = StateID::from_u32_unchecked; + loop { + let o = sid.as_usize(); + let kind = repr[o] & 0xFF; + // I tried to encapsulate the "next transition" logic into its own + // function, but it seemed to always result in sub-optimal codegen + // that led to real and significant slowdowns. So we just inline + // the logic here. + // + // I've also tried a lot of different ways to speed up this + // routine, and most of them have failed. + if kind == State::KIND_DENSE { + let next = u32tosid(repr[o + 2 + usize::from(class)]); + if next != NFA::FAIL { + return next; + } + } else if kind == State::KIND_ONE { + if class == repr[o].low_u16().high_u8() { + return u32tosid(repr[o + 2]); + } + } else { + // NOTE: I tried a SWAR technique in the loop below, but found + // it slower. See the 'swar' test in the tests for this module. 
+ let trans_len = kind.as_usize(); + let classes_len = u32_len(trans_len); + let trans_offset = o + 2 + classes_len; + for (i, &chunk) in + repr[o + 2..][..classes_len].iter().enumerate() + { + let classes = chunk.to_ne_bytes(); + if classes[0] == class { + return u32tosid(repr[trans_offset + i * 4]); + } + if classes[1] == class { + return u32tosid(repr[trans_offset + i * 4 + 1]); + } + if classes[2] == class { + return u32tosid(repr[trans_offset + i * 4 + 2]); + } + if classes[3] == class { + return u32tosid(repr[trans_offset + i * 4 + 3]); + } + } + } + // For an anchored search, we never follow failure transitions + // because failure transitions lead us down a path to matching + // a *proper* suffix of the path we were on. Thus, it can only + // produce matches that appear after the beginning of the search. + if anchored.is_anchored() { + return NFA::DEAD; + } + sid = u32tosid(repr[o + 1]); + } + } + + #[inline(always)] + fn is_special(&self, sid: StateID) -> bool { + sid <= self.special.max_special_id + } + + #[inline(always)] + fn is_dead(&self, sid: StateID) -> bool { + sid == NFA::DEAD + } + + #[inline(always)] + fn is_match(&self, sid: StateID) -> bool { + !self.is_dead(sid) && sid <= self.special.max_match_id + } + + #[inline(always)] + fn is_start(&self, sid: StateID) -> bool { + sid == self.special.start_unanchored_id + || sid == self.special.start_anchored_id + } + + #[inline(always)] + fn match_kind(&self) -> MatchKind { + self.match_kind + } + + #[inline(always)] + fn patterns_len(&self) -> usize { + self.pattern_lens.len() + } + + #[inline(always)] + fn pattern_len(&self, pid: PatternID) -> usize { + self.pattern_lens[pid].as_usize() + } + + #[inline(always)] + fn min_pattern_len(&self) -> usize { + self.min_pattern_len + } + + #[inline(always)] + fn max_pattern_len(&self) -> usize { + self.max_pattern_len + } + + #[inline(always)] + fn match_len(&self, sid: StateID) -> usize { + State::match_len(self.alphabet_len, &self.repr[sid.as_usize()..]) + } 
+ + #[inline(always)] + fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { + State::match_pattern( + self.alphabet_len, + &self.repr[sid.as_usize()..], + index, + ) + } + + #[inline(always)] + fn memory_usage(&self) -> usize { + use core::mem::size_of; + + (self.repr.len() * size_of::<u32>()) + + (self.pattern_lens.len() * size_of::<SmallIndex>()) + + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) + } + + #[inline(always)] + fn prefilter(&self) -> Option<&Prefilter> { + self.prefilter.as_ref() + } +} + +impl core::fmt::Debug for NFA { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::automaton::fmt_state_indicator; + + writeln!(f, "contiguous::NFA(")?; + let mut sid = NFA::DEAD; // always the first state and always present + loop { + let raw = &self.repr[sid.as_usize()..]; + if raw.is_empty() { + break; + } + let is_match = self.is_match(sid); + let state = State::read(self.alphabet_len, is_match, raw); + fmt_state_indicator(f, self, sid)?; + write!( + f, + "{:06}({:06}): ", + sid.as_usize(), + state.fail.as_usize() + )?; + state.fmt(f)?; + write!(f, "\n")?; + if self.is_match(sid) { + write!(f, " matches: ")?; + for i in 0..state.match_len { + let pid = State::match_pattern(self.alphabet_len, raw, i); + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}", pid.as_usize())?; + } + write!(f, "\n")?; + } + // The FAIL state doesn't actually have space for a state allocated + // for it, so we have to treat it as a special case. write below + // the DEAD state. 
+ if sid == NFA::DEAD { + writeln!(f, "F {:06}:", NFA::FAIL.as_usize())?; + } + let len = State::len(self.alphabet_len, is_match, raw); + sid = StateID::new(sid.as_usize().checked_add(len).unwrap()) + .unwrap(); + } + writeln!(f, "match kind: {:?}", self.match_kind)?; + writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?; + writeln!(f, "state length: {:?}", self.state_len)?; + writeln!(f, "pattern length: {:?}", self.patterns_len())?; + writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?; + writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?; + writeln!(f, "alphabet length: {:?}", self.alphabet_len)?; + writeln!(f, "byte classes: {:?}", self.byte_classes)?; + writeln!(f, "memory usage: {:?}", self.memory_usage())?; + writeln!(f, ")")?; + + Ok(()) + } +} + +/// The "in memory" representation a single dense or sparse state. +/// +/// A `State`'s in memory representation is not ever actually materialized +/// during a search with a contiguous NFA. Doing so would be too slow. (Indeed, +/// the only time a `State` is actually constructed is in `Debug` impls.) +/// Instead, a `State` exposes a number of static methods for reading certain +/// things from the raw binary encoding of the state. +#[derive(Clone)] +struct State<'a> { + /// The state to transition to when 'class_to_next' yields a transition + /// to the FAIL state. + fail: StateID, + /// The number of pattern IDs in this state. For a non-match state, this is + /// always zero. Otherwise it is always bigger than zero. + match_len: usize, + /// The sparse or dense representation of the transitions for this state. + trans: StateTrans<'a>, +} + +/// The underlying representation of sparse or dense transitions for a state. +/// +/// Note that like `State`, we don't typically construct values of this type +/// during a search since we don't always need all values and thus would +/// represent a lot of wasteful work. 
+#[derive(Clone)] +enum StateTrans<'a> { + /// A sparse representation of transitions for a state, where only non-FAIL + /// transitions are explicitly represented. + Sparse { + classes: &'a [u32], + /// The transitions for this state, where each transition is packed + /// into a u32. The low 8 bits correspond to the byte class for the + /// transition, and the high 24 bits correspond to the next state ID. + /// + /// This packing is why the max state ID allowed for a contiguous + /// NFA is 2^24-1. + nexts: &'a [u32], + }, + /// A "one transition" state that is never a match state. + /// + /// These are by far the most common state, so we use a specialized and + /// very compact representation for them. + One { + /// The element of this NFA's alphabet that this transition is + /// defined for. + class: u8, + /// The state this should transition to if the current symbol is + /// equal to 'class'. + next: u32, + }, + /// A dense representation of transitions for a state, where all + /// transitions are explicitly represented, including transitions to the + /// FAIL state. + Dense { + /// A dense set of transitions to other states. The transitions may + /// point to a FAIL state, in which case, the search should try the + /// same transition lookup at 'fail'. + /// + /// Note that this is indexed by byte equivalence classes and not + /// byte values. That means 'class_to_next[byte]' is wrong and + /// 'class_to_next[classes.get(byte)]' is correct. The number of + /// transitions is always equivalent to 'classes.alphabet_len()'. + class_to_next: &'a [u32], + }, +} + +impl<'a> State<'a> { + /// The offset of where the "kind" of a state is stored. If it isn't one + /// of the sentinel values below, then it's a sparse state and the kind + /// corresponds to the number of transitions in the state. + const KIND: usize = 0; + + /// A sentinel value indicating that the state uses a dense representation. 
+ const KIND_DENSE: u32 = 0xFF; + /// A sentinel value indicating that the state uses a special "one + /// transition" encoding. In practice, non-match states with one transition + /// make up the overwhelming majority of all states in any given + /// Aho-Corasick automaton, so we can specialize them using a very compact + /// representation. + const KIND_ONE: u32 = 0xFE; + + /// The maximum number of transitions to encode as a sparse state. Usually + /// states with a lot of transitions are either very rare, or occur near + /// the start state. In the latter case, they are probably dense already + /// anyway. In the former case, making them dense is fine because they're + /// rare. + /// + /// This needs to be small enough to permit each of the sentinel values for + /// 'KIND' above. Namely, a sparse state embeds the number of transitions + /// into the 'KIND'. Basically, "sparse" is a state kind too, but it's the + /// "else" branch. + /// + /// N.B. There isn't anything particularly magical about 127 here. I + /// just picked it because I figured any sparse state with this many + /// transitions is going to be exceptionally rare, and if it did have this + /// many transitions, then it would be quite slow to do a linear scan on + /// the transitions during a search anyway. + const MAX_SPARSE_TRANSITIONS: usize = 127; + + /// Remap state IDs in-place. + /// + /// `state` should be the the raw binary encoding of a state. (The start + /// of the slice must correspond to the start of the state, but the slice + /// may extend past the end of the encoding of the state.) 
+ fn remap( + alphabet_len: usize, + old_to_new: &[StateID], + state: &mut [u32], + ) -> Result<(), BuildError> { + let kind = State::kind(state); + if kind == State::KIND_DENSE { + state[1] = old_to_new[state[1].as_usize()].as_u32(); + for next in state[2..][..alphabet_len].iter_mut() { + *next = old_to_new[next.as_usize()].as_u32(); + } + } else if kind == State::KIND_ONE { + state[1] = old_to_new[state[1].as_usize()].as_u32(); + state[2] = old_to_new[state[2].as_usize()].as_u32(); + } else { + let trans_len = State::sparse_trans_len(state); + let classes_len = u32_len(trans_len); + state[1] = old_to_new[state[1].as_usize()].as_u32(); + for next in state[2 + classes_len..][..trans_len].iter_mut() { + *next = old_to_new[next.as_usize()].as_u32(); + } + } + Ok(()) + } + + /// Returns the length, in number of u32s, of this state. + /// + /// This is useful for reading states consecutively, e.g., in the Debug + /// impl without needing to store a separate map from state index to state + /// identifier. + /// + /// `state` should be the the raw binary encoding of a state. (The start + /// of the slice must correspond to the start of the state, but the slice + /// may extend past the end of the encoding of the state.) + fn len(alphabet_len: usize, is_match: bool, state: &[u32]) -> usize { + let kind_len = 1; + let fail_len = 1; + let kind = State::kind(state); + let (classes_len, trans_len) = if kind == State::KIND_DENSE { + (0, alphabet_len) + } else if kind == State::KIND_ONE { + (0, 1) + } else { + let trans_len = State::sparse_trans_len(state); + let classes_len = u32_len(trans_len); + (classes_len, trans_len) + }; + let match_len = if !is_match { + 0 + } else if State::match_len(alphabet_len, state) == 1 { + // This is a special case because when there is one pattern ID for + // a match state, it is represented by a single u32 with its high + // bit set (which is impossible for a valid pattern ID). 
+ 1 + } else { + // We add 1 to include the u32 that indicates the number of + // pattern IDs that follow. + 1 + State::match_len(alphabet_len, state) + }; + kind_len + fail_len + classes_len + trans_len + match_len + } + + /// Returns the kind of this state. + /// + /// This only includes the low byte. + #[inline(always)] + fn kind(state: &[u32]) -> u32 { + state[State::KIND] & 0xFF + } + + /// Get the number of sparse transitions in this state. This can never + /// be more than State::MAX_SPARSE_TRANSITIONS, as all states with more + /// transitions are encoded as dense states. + /// + /// `state` should be the the raw binary encoding of a sparse state. (The + /// start of the slice must correspond to the start of the state, but the + /// slice may extend past the end of the encoding of the state.) If this + /// isn't a sparse state, then the return value is unspecified. + /// + /// Do note that this is only legal to call on a sparse state. So for + /// example, "one transition" state is not a sparse state, so it would not + /// be legal to call this method on such a state. + #[inline(always)] + fn sparse_trans_len(state: &[u32]) -> usize { + (state[State::KIND] & 0xFF).as_usize() + } + + /// Returns the total number of matching pattern IDs in this state. Calling + /// this on a state that isn't a match results in unspecified behavior. + /// Thus, the returned number is never 0 for all correct calls. + /// + /// `state` should be the the raw binary encoding of a state. (The start + /// of the slice must correspond to the start of the state, but the slice + /// may extend past the end of the encoding of the state.) + #[inline(always)] + fn match_len(alphabet_len: usize, state: &[u32]) -> usize { + // We don't need to handle KIND_ONE here because it can never be a + // match state. 
+ let packed = if State::kind(state) == State::KIND_DENSE { + let start = 2 + alphabet_len; + state[start].as_usize() + } else { + let trans_len = State::sparse_trans_len(state); + let classes_len = u32_len(trans_len); + let start = 2 + classes_len + trans_len; + state[start].as_usize() + }; + if packed & (1 << 31) == 0 { + packed + } else { + 1 + } + } + + /// Returns the pattern ID corresponding to the given index for the state + /// given. The `index` provided must be less than the number of pattern IDs + /// in this state. + /// + /// `state` should be the the raw binary encoding of a state. (The start of + /// the slice must correspond to the start of the state, but the slice may + /// extend past the end of the encoding of the state.) + /// + /// If the given state is not a match state or if the index is out of + /// bounds, then this has unspecified behavior. + #[inline(always)] + fn match_pattern( + alphabet_len: usize, + state: &[u32], + index: usize, + ) -> PatternID { + // We don't need to handle KIND_ONE here because it can never be a + // match state. + let start = if State::kind(state) == State::KIND_DENSE { + 2 + alphabet_len + } else { + let trans_len = State::sparse_trans_len(state); + let classes_len = u32_len(trans_len); + 2 + classes_len + trans_len + }; + let packed = state[start]; + let pid = if packed & (1 << 31) == 0 { + state[start + 1 + index] + } else { + assert_eq!(0, index); + packed & !(1 << 31) + }; + PatternID::from_u32_unchecked(pid) + } + + /// Read a state's binary encoding to its in-memory representation. + /// + /// `alphabet_len` should be the total number of transitions defined for + /// dense states. + /// + /// `is_match` should be true if this state is a match state and false + /// otherwise. + /// + /// `state` should be the the raw binary encoding of a state. (The start + /// of the slice must correspond to the start of the state, but the slice + /// may extend past the end of the encoding of the state.) 
+ fn read( + alphabet_len: usize, + is_match: bool, + state: &'a [u32], + ) -> State<'a> { + let kind = State::kind(state); + let match_len = + if !is_match { 0 } else { State::match_len(alphabet_len, state) }; + let (trans, fail) = if kind == State::KIND_DENSE { + let fail = StateID::from_u32_unchecked(state[1]); + let class_to_next = &state[2..][..alphabet_len]; + (StateTrans::Dense { class_to_next }, fail) + } else if kind == State::KIND_ONE { + let fail = StateID::from_u32_unchecked(state[1]); + let class = state[State::KIND].low_u16().high_u8(); + let next = state[2]; + (StateTrans::One { class, next }, fail) + } else { + let fail = StateID::from_u32_unchecked(state[1]); + let trans_len = State::sparse_trans_len(state); + let classes_len = u32_len(trans_len); + let classes = &state[2..][..classes_len]; + let nexts = &state[2 + classes_len..][..trans_len]; + (StateTrans::Sparse { classes, nexts }, fail) + }; + State { fail, match_len, trans } + } + + /// Encode the "old" state from a noncontiguous NFA to its binary + /// representation to the given `dst` slice. `classes` should be the byte + /// classes computed for the noncontiguous NFA that the given state came + /// from. + /// + /// This returns an error if `dst` became so big that `StateID`s can no + /// longer be created for new states. Otherwise, it returns the state ID of + /// the new state created. + /// + /// When `force_dense` is true, then the encoded state will always use a + /// dense format. Otherwise, the choice between dense and sparse will be + /// automatically chosen based on the old state. 
+ fn write( + nnfa: &noncontiguous::NFA, + oldsid: StateID, + old: &noncontiguous::State, + classes: &ByteClasses, + dst: &mut Vec<u32>, + force_dense: bool, + ) -> Result<StateID, BuildError> { + let sid = StateID::new(dst.len()).map_err(|e| { + BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) + })?; + let old_len = nnfa.iter_trans(oldsid).count(); + // For states with a lot of transitions, we might as well just make + // them dense. These kinds of hot states tend to be very rare, so we're + // okay with it. This also gives us more sentinels in the state's + // 'kind', which lets us create different state kinds to save on + // space. + let kind = if force_dense || old_len > State::MAX_SPARSE_TRANSITIONS { + State::KIND_DENSE + } else if old_len == 1 && !old.is_match() { + State::KIND_ONE + } else { + // For a sparse state, the kind is just the number of transitions. + u32::try_from(old_len).unwrap() + }; + if kind == State::KIND_DENSE { + dst.push(kind); + dst.push(old.fail().as_u32()); + State::write_dense_trans(nnfa, oldsid, classes, dst)?; + } else if kind == State::KIND_ONE { + let t = nnfa.iter_trans(oldsid).next().unwrap(); + let class = u32::from(classes.get(t.byte())); + dst.push(kind | (class << 8)); + dst.push(old.fail().as_u32()); + dst.push(t.next().as_u32()); + } else { + dst.push(kind); + dst.push(old.fail().as_u32()); + State::write_sparse_trans(nnfa, oldsid, classes, dst)?; + } + // Now finally write the number of matches and the matches themselves. 
+ if old.is_match() { + let matches_len = nnfa.iter_matches(oldsid).count(); + if matches_len == 1 { + let pid = nnfa.iter_matches(oldsid).next().unwrap().as_u32(); + assert_eq!(0, pid & (1 << 31)); + dst.push((1 << 31) | pid); + } else { + assert_eq!(0, matches_len & (1 << 31)); + dst.push(matches_len.as_u32()); + dst.extend(nnfa.iter_matches(oldsid).map(|pid| pid.as_u32())); + } + } + Ok(sid) + } + + /// Encode the "old" state transitions from a noncontiguous NFA to its + /// binary sparse representation to the given `dst` slice. `classes` should + /// be the byte classes computed for the noncontiguous NFA that the given + /// state came from. + /// + /// This returns an error if `dst` became so big that `StateID`s can no + /// longer be created for new states. + fn write_sparse_trans( + nnfa: &noncontiguous::NFA, + oldsid: StateID, + classes: &ByteClasses, + dst: &mut Vec<u32>, + ) -> Result<(), BuildError> { + let (mut chunk, mut len) = ([0; 4], 0); + for t in nnfa.iter_trans(oldsid) { + chunk[len] = classes.get(t.byte()); + len += 1; + if len == 4 { + dst.push(u32::from_ne_bytes(chunk)); + chunk = [0; 4]; + len = 0; + } + } + if len > 0 { + // In the case where the number of transitions isn't divisible + // by 4, the last u32 chunk will have some left over room. In + // this case, we "just" repeat the last equivalence class. By + // doing this, we know the leftover faux transitions will never + // be followed because if they were, it would have been followed + // prior to it in the last equivalence class. This saves us some + // branching in the search time state transition code. + let repeat = chunk[len - 1]; + while len < 4 { + chunk[len] = repeat; + len += 1; + } + dst.push(u32::from_ne_bytes(chunk)); + } + for t in nnfa.iter_trans(oldsid) { + dst.push(t.next().as_u32()); + } + Ok(()) + } + + /// Encode the "old" state transitions from a noncontiguous NFA to its + /// binary dense representation to the given `dst` slice. 
`classes` should + /// be the byte classes computed for the noncontiguous NFA that the given + /// state came from. + /// + /// This returns an error if `dst` became so big that `StateID`s can no + /// longer be created for new states. + fn write_dense_trans( + nnfa: &noncontiguous::NFA, + oldsid: StateID, + classes: &ByteClasses, + dst: &mut Vec<u32>, + ) -> Result<(), BuildError> { + // Our byte classes let us shrink the size of our dense states to the + // number of equivalence classes instead of just fixing it to 256. + // Any non-explicitly defined transition is just a transition to the + // FAIL state, so we fill that in first and then overwrite them with + // explicitly defined transitions. (Most states probably only have one + // or two explicitly defined transitions.) + // + // N.B. Remember that while building the contiguous NFA, we use state + // IDs from the noncontiguous NFA. It isn't until we've added all + // states that we go back and map noncontiguous IDs to contiguous IDs. + let start = dst.len(); + dst.extend( + core::iter::repeat(noncontiguous::NFA::FAIL.as_u32()) + .take(classes.alphabet_len()), + ); + assert!(start < dst.len(), "equivalence classes are never empty"); + for t in nnfa.iter_trans(oldsid) { + dst[start + usize::from(classes.get(t.byte()))] = + t.next().as_u32(); + } + Ok(()) + } + + /// Return an iterator over every explicitly defined transition in this + /// state. 
+ fn transitions<'b>(&'b self) -> impl Iterator<Item = (u8, StateID)> + 'b { + let mut i = 0; + core::iter::from_fn(move || match self.trans { + StateTrans::Sparse { classes, nexts } => { + if i >= nexts.len() { + return None; + } + let chunk = classes[i / 4]; + let class = chunk.to_ne_bytes()[i % 4]; + let next = StateID::from_u32_unchecked(nexts[i]); + i += 1; + Some((class, next)) + } + StateTrans::One { class, next } => { + if i == 0 { + i += 1; + Some((class, StateID::from_u32_unchecked(next))) + } else { + None + } + } + StateTrans::Dense { class_to_next } => { + if i >= class_to_next.len() { + return None; + } + let class = i.as_u8(); + let next = StateID::from_u32_unchecked(class_to_next[i]); + i += 1; + Some((class, next)) + } + }) + } +} + +impl<'a> core::fmt::Debug for State<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use crate::{automaton::sparse_transitions, util::debug::DebugByte}; + + let it = sparse_transitions(self.transitions()) + // Writing out all FAIL transitions is quite noisy. Instead, we + // just require readers of the output to assume anything absent + // maps to the FAIL transition. + .filter(|&(_, _, sid)| sid != NFA::FAIL) + .enumerate(); + for (i, (start, end, sid)) in it { + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + sid.as_usize() + )?; + } + } + Ok(()) + } +} + +/// A builder for configuring an Aho-Corasick contiguous NFA. +/// +/// This builder has a subset of the options available to a +/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, +/// their behavior is identical. 
+#[derive(Clone, Debug)] +pub struct Builder { + noncontiguous: noncontiguous::Builder, + dense_depth: usize, + byte_classes: bool, +} + +impl Default for Builder { + fn default() -> Builder { + Builder { + noncontiguous: noncontiguous::Builder::new(), + dense_depth: 2, + byte_classes: true, + } + } +} + +impl Builder { + /// Create a new builder for configuring an Aho-Corasick contiguous NFA. + pub fn new() -> Builder { + Builder::default() + } + + /// Build an Aho-Corasick contiguous NFA from the given iterator of + /// patterns. + /// + /// A builder may be reused to create more NFAs. + pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + let nnfa = self.noncontiguous.build(patterns)?; + self.build_from_noncontiguous(&nnfa) + } + + /// Build an Aho-Corasick contiguous NFA from the given noncontiguous NFA. + /// + /// Note that when this method is used, only the `dense_depth` and + /// `byte_classes` settings on this builder are respected. The other + /// settings only apply to the initial construction of the Aho-Corasick + /// automaton. Since using this method requires that initial construction + /// has already completed, all settings impacting only initial construction + /// are no longer relevant. 
+ pub fn build_from_noncontiguous( + &self, + nnfa: &noncontiguous::NFA, + ) -> Result<NFA, BuildError> { + debug!("building contiguous NFA"); + let byte_classes = if self.byte_classes { + nnfa.byte_classes().clone() + } else { + ByteClasses::singletons() + }; + let mut index_to_state_id = vec![NFA::DEAD; nnfa.states().len()]; + let mut nfa = NFA { + repr: vec![], + pattern_lens: nnfa.pattern_lens_raw().to_vec(), + state_len: nnfa.states().len(), + prefilter: nnfa.prefilter().map(|p| p.clone()), + match_kind: nnfa.match_kind(), + alphabet_len: byte_classes.alphabet_len(), + byte_classes, + min_pattern_len: nnfa.min_pattern_len(), + max_pattern_len: nnfa.max_pattern_len(), + // The special state IDs are set later. + special: Special::zero(), + }; + for (oldsid, state) in nnfa.states().iter().with_state_ids() { + // We don't actually encode a fail state since it isn't necessary. + // But we still want to make sure any FAIL ids are mapped + // correctly. + if oldsid == noncontiguous::NFA::FAIL { + index_to_state_id[oldsid] = NFA::FAIL; + continue; + } + let force_dense = state.depth().as_usize() < self.dense_depth; + let newsid = State::write( + nnfa, + oldsid, + state, + &nfa.byte_classes, + &mut nfa.repr, + force_dense, + )?; + index_to_state_id[oldsid] = newsid; + } + for &newsid in index_to_state_id.iter() { + if newsid == NFA::FAIL { + continue; + } + let state = &mut nfa.repr[newsid.as_usize()..]; + State::remap(nfa.alphabet_len, &index_to_state_id, state)?; + } + // Now that we've remapped all the IDs in our states, all that's left + // is remapping the special state IDs. 
+ let remap = &index_to_state_id; + let old = nnfa.special(); + let new = &mut nfa.special; + new.max_special_id = remap[old.max_special_id]; + new.max_match_id = remap[old.max_match_id]; + new.start_unanchored_id = remap[old.start_unanchored_id]; + new.start_anchored_id = remap[old.start_anchored_id]; + debug!( + "contiguous NFA built, <states: {:?}, size: {:?}, \ + alphabet len: {:?}>", + nfa.state_len, + nfa.memory_usage(), + nfa.byte_classes.alphabet_len(), + ); + // The vectors can grow ~twice as big during construction because a + // Vec amortizes growth. But here, let's shrink things back down to + // what we actually need since we're never going to add more to it. + nfa.repr.shrink_to_fit(); + nfa.pattern_lens.shrink_to_fit(); + Ok(nfa) + } + + /// Set the desired match semantics. + /// + /// This only applies when using [`Builder::build`] and not + /// [`Builder::build_from_noncontiguous`]. + /// + /// See + /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) + /// for more documentation and examples. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { + self.noncontiguous.match_kind(kind); + self + } + + /// Enable ASCII-aware case insensitive matching. + /// + /// This only applies when using [`Builder::build`] and not + /// [`Builder::build_from_noncontiguous`]. + /// + /// See + /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) + /// for more documentation and examples. + pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { + self.noncontiguous.ascii_case_insensitive(yes); + self + } + + /// Enable heuristic prefilter optimizations. + /// + /// This only applies when using [`Builder::build`] and not + /// [`Builder::build_from_noncontiguous`]. + /// + /// See + /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) + /// for more documentation and examples. 
+ pub fn prefilter(&mut self, yes: bool) -> &mut Builder { + self.noncontiguous.prefilter(yes); + self + } + + /// Set the limit on how many states use a dense representation for their + /// transitions. Other states will generally use a sparse representation. + /// + /// See + /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth) + /// for more documentation and examples. + pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { + self.dense_depth = depth; + self + } + + /// A debug setting for whether to attempt to shrink the size of the + /// automaton's alphabet or not. + /// + /// This should never be enabled unless you're debugging an automaton. + /// Namely, disabling byte classes makes transitions easier to reason + /// about, since they use the actual bytes instead of equivalence classes. + /// Disabling this confers no performance benefit at search time. + /// + /// See + /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) + /// for more documentation and examples. + pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { + self.byte_classes = yes; + self + } +} + +/// Computes the number of u32 values needed to represent one byte per the +/// number of transitions given. +fn u32_len(ntrans: usize) -> usize { + if ntrans % 4 == 0 { + ntrans >> 2 + } else { + (ntrans >> 2) + 1 + } +} + +#[cfg(test)] +mod tests { + // This test demonstrates a SWAR technique I tried in the sparse transition + // code inside of 'next_state'. Namely, sparse transitions work by + // iterating over u32 chunks, with each chunk containing up to 4 classes + // corresponding to 4 transitions. This SWAR technique lets us find a + // matching transition without converting the u32 to a [u8; 4]. + // + // It turned out to be a little slower unfortunately, which isn't too + // surprising, since this is likely a throughput oriented optimization. 
+ // Loop unrolling doesn't really help us because the vast majority of + // states have very few transitions. + // + // Anyway, this code was a little tricky to write, so I converted it to a + // test in case someone figures out how to use it more effectively than + // I could. + // + // (This also only works on little endian. So big endian would need to be + // accounted for if we ever decided to use this I think.) + #[cfg(target_endian = "little")] + #[test] + fn swar() { + use super::*; + + fn has_zero_byte(x: u32) -> u32 { + const LO_U32: u32 = 0x01010101; + const HI_U32: u32 = 0x80808080; + + x.wrapping_sub(LO_U32) & !x & HI_U32 + } + + fn broadcast(b: u8) -> u32 { + (u32::from(b)) * (u32::MAX / 255) + } + + fn index_of(x: u32) -> usize { + let o = + (((x - 1) & 0x01010101).wrapping_mul(0x01010101) >> 24) - 1; + o.as_usize() + } + + let bytes: [u8; 4] = [b'1', b'A', b'a', b'z']; + let chunk = u32::from_ne_bytes(bytes); + + let needle = broadcast(b'1'); + assert_eq!(0, index_of(has_zero_byte(needle ^ chunk))); + let needle = broadcast(b'A'); + assert_eq!(1, index_of(has_zero_byte(needle ^ chunk))); + let needle = broadcast(b'a'); + assert_eq!(2, index_of(has_zero_byte(needle ^ chunk))); + let needle = broadcast(b'z'); + assert_eq!(3, index_of(has_zero_byte(needle ^ chunk))); + } +} diff --git a/vendor/aho-corasick/src/nfa/mod.rs b/vendor/aho-corasick/src/nfa/mod.rs new file mode 100644 index 0000000..93f4dc2 --- /dev/null +++ b/vendor/aho-corasick/src/nfa/mod.rs @@ -0,0 +1,40 @@ +/*! +Provides direct access to NFA implementations of Aho-Corasick. + +The principle characteristic of an NFA in this crate is that it may +transition through multiple states per byte of haystack. In Aho-Corasick +parlance, NFAs follow failure transitions during a search. In contrast, +a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during +compilation at the expense of a much bigger memory footprint. 
+ +Currently, there are two NFA implementations provided: noncontiguous and +contiguous. The names reflect their internal representation, and consequently, +the trade offs associated with them: + +* A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to +represent its transitions in a sparse format. This is ideal for building an +NFA, since it cheaply permits different states to have a different number of +transitions. A noncontiguous NFA is where the main Aho-Corasick construction +algorithm is implemented. All other Aho-Corasick implementations are built by +first constructing a noncontiguous NFA. +* A [`contiguous::NFA`] is uses a single allocation to represent all states, +while still encoding most states as sparse states but permitting states near +the starting state to have a dense representation. The dense representation +uses more memory, but permits computing transitions during a search more +quickly. By only making the most active states dense (the states near the +starting state), a contiguous NFA better balances memory usage with search +speed. The single contiguous allocation also uses less overhead per state and +enables compression tricks where most states only use 8 bytes of heap memory. + +When given the choice between these two, you almost always want to pick a +contiguous NFA. It takes only a little longer to build, but both its memory +usage and search speed are typically much better than a noncontiguous NFA. A +noncontiguous NFA is useful when prioritizing build times, or when there are +so many patterns that a contiguous NFA could not be built. (Currently, because +of both memory and search speed improvements, a contiguous NFA has a smaller +internal limit on the total number of NFA states it can represent. But you +would likely need to have hundreds of thousands or even millions of patterns +before you hit this limit.) 
+*/ +pub mod contiguous; +pub mod noncontiguous; diff --git a/vendor/aho-corasick/src/nfa/noncontiguous.rs b/vendor/aho-corasick/src/nfa/noncontiguous.rs new file mode 100644 index 0000000..af32617 --- /dev/null +++ b/vendor/aho-corasick/src/nfa/noncontiguous.rs @@ -0,0 +1,1762 @@ +/*! +Provides a noncontiguous NFA implementation of Aho-Corasick. + +This is a low-level API that generally only needs to be used in niche +circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) +instead of a noncontiguous NFA directly. Using an `NFA` directly is typically +only necessary when one needs access to the [`Automaton`] trait implementation. +*/ + +use alloc::{ + collections::{BTreeSet, VecDeque}, + vec, + vec::Vec, +}; + +use crate::{ + automaton::Automaton, + util::{ + alphabet::{ByteClassSet, ByteClasses}, + error::{BuildError, MatchError}, + prefilter::{self, opposite_ascii_case, Prefilter}, + primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, + remapper::Remapper, + search::{Anchored, MatchKind}, + special::Special, + }, +}; + +/// A noncontiguous NFA implementation of Aho-Corasick. +/// +/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of +/// this type directly. Using an `NFA` directly is typically only necessary +/// when one needs access to the [`Automaton`] trait implementation. +/// +/// This NFA represents the "core" implementation of Aho-Corasick in this +/// crate. Namely, constructing this NFA involving building a trie and then +/// filling in the failure transitions between states, similar to what is +/// described in any standard textbook description of Aho-Corasick. +/// +/// In order to minimize heap usage and to avoid additional construction costs, +/// this implementation represents the transitions of all states as distinct +/// sparse memory allocations. This is where it gets its name from. That is, +/// this NFA has no contiguous memory allocation for its transition table. 
Each +/// state gets its own allocation. +/// +/// While the sparse representation keeps memory usage to somewhat reasonable +/// levels, it is still quite large and also results in somewhat mediocre +/// search performance. For this reason, it is almost always a good idea to +/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is +/// marginally slower to build, but has higher throughput and can sometimes use +/// an order of magnitude less memory. The main reason to use a noncontiguous +/// NFA is when you need the fastest possible construction time, or when a +/// contiguous NFA does not have the desired capacity. (The total number of NFA +/// states it can have is fewer than a noncontiguous NFA.) +/// +/// # Example +/// +/// This example shows how to build an `NFA` directly and use it to execute +/// [`Automaton::try_find`]: +/// +/// ``` +/// use aho_corasick::{ +/// automaton::Automaton, +/// nfa::noncontiguous::NFA, +/// Input, Match, +/// }; +/// +/// let patterns = &["b", "abc", "abcd"]; +/// let haystack = "abcd"; +/// +/// let nfa = NFA::new(patterns).unwrap(); +/// assert_eq!( +/// Some(Match::must(0, 1..2)), +/// nfa.try_find(&Input::new(haystack))?, +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// It is also possible to implement your own version of `try_find`. See the +/// [`Automaton`] documentation for an example. +#[derive(Clone)] +pub struct NFA { + /// The match semantics built into this NFA. + match_kind: MatchKind, + /// A set of states. Each state defines its own transitions, a fail + /// transition and a set of indices corresponding to matches. + /// + /// The first state is always the fail state, which is used only as a + /// sentinel. Namely, in the final NFA, no transition into the fail state + /// exists. (Well, they do, but they aren't followed. Instead, the state's + /// failure transition is followed.) + /// + /// The second state (index 1) is always the dead state. 
Dead states are + /// in every automaton, but only used when leftmost-{first,longest} match + /// semantics are enabled. Specifically, they instruct search to stop + /// at specific points in order to report the correct match location. In + /// the standard Aho-Corasick construction, there are no transitions to + /// the dead state. + /// + /// The third state (index 2) is generally intended to be the starting or + /// "root" state. + states: Vec<State>, + /// Transitions stored in a sparse representation via a linked list. + /// + /// Each transition contains three pieces of information: the byte it + /// is defined for, the state it transitions to and a link to the next + /// transition in the same state (or `StateID::ZERO` if it is the last + /// transition). + /// + /// The first transition for each state is determined by `State::sparse`. + /// + /// Note that this contains a complete set of all transitions in this NFA, + /// including states that have a dense representation for transitions. + /// (Adding dense transitions for a state doesn't remove its sparse + /// transitions, since deleting transitions from this particular sparse + /// representation would be fairly expensive.) + sparse: Vec<Transition>, + /// Transitions stored in a dense representation. + /// + /// A state has a row in this table if and only if `State::dense` is + /// not equal to `StateID::ZERO`. When not zero, there are precisely + /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense` + /// in this table. + /// + /// Generally a very small minority of states have a dense representation + /// since it uses so much memory. + dense: Vec<StateID>, + /// Matches stored in linked list for each state. + /// + /// Like sparse transitions, each match has a link to the next match in the + /// state. + /// + /// The first match for each state is determined by `State::matches`. + matches: Vec<Match>, + /// The length, in bytes, of each pattern in this NFA. 
This slice is + /// indexed by `PatternID`. + /// + /// The number of entries in this vector corresponds to the total number of + /// patterns in this automaton. + pattern_lens: Vec<SmallIndex>, + /// A prefilter for quickly skipping to candidate matches, if pertinent. + prefilter: Option<Prefilter>, + /// A set of equivalence classes in terms of bytes. We compute this while + /// building the NFA, but don't use it in the NFA's states. Instead, we + /// use this for building the DFA. We store it on the NFA since it's easy + /// to compute while visiting the patterns. + byte_classes: ByteClasses, + /// The length, in bytes, of the shortest pattern in this automaton. This + /// information is useful for detecting whether an automaton matches the + /// empty string or not. + min_pattern_len: usize, + /// The length, in bytes, of the longest pattern in this automaton. This + /// information is useful for keeping correct buffer sizes when searching + /// on streams. + max_pattern_len: usize, + /// The information required to deduce which states are "special" in this + /// NFA. + /// + /// Since the DEAD and FAIL states are always the first two states and + /// there are only ever two start states (which follow all of the match + /// states), it follows that we can determine whether a state is a fail, + /// dead, match or start with just a few comparisons on the ID itself: + /// + /// is_dead(sid): sid == NFA::DEAD + /// is_fail(sid): sid == NFA::FAIL + /// is_match(sid): NFA::FAIL < sid && sid <= max_match_id + /// is_start(sid): sid == start_unanchored_id || sid == start_anchored_id + /// + /// Note that this only applies to the NFA after it has been constructed. + /// During construction, the start states are the first ones added and the + /// match states are inter-leaved with non-match states. Once all of the + /// states have been added, the states are shuffled such that the above + /// predicates hold. 
+ special: Special, +} + +impl NFA { + /// Create a new Aho-Corasick noncontiguous NFA using the default + /// configuration. + /// + /// Use a [`Builder`] if you want to change the configuration. + pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + NFA::builder().build(patterns) + } + + /// A convenience method for returning a new Aho-Corasick noncontiguous NFA + /// builder. + /// + /// This usually permits one to just import the `NFA` type. + pub fn builder() -> Builder { + Builder::new() + } +} + +impl NFA { + /// The DEAD state is a sentinel state like the FAIL state. The DEAD state + /// instructs any search to stop and return any currently recorded match, + /// or no match otherwise. Generally speaking, it is impossible for an + /// unanchored standard search to enter a DEAD state. But an anchored + /// search can, and so to can a leftmost search. + /// + /// We put DEAD before FAIL so that DEAD is always 0. We repeat this + /// decision across the other Aho-Corasicm automata, so that DEAD + /// states there are always 0 too. It's not that we need all of the + /// implementations to agree, but rather, the contiguous NFA and the DFA + /// use a sort of "premultiplied" state identifier where the only state + /// whose ID is always known and constant is the first state. Subsequent + /// state IDs depend on how much space has already been used in the + /// transition table. + pub(crate) const DEAD: StateID = StateID::new_unchecked(0); + /// The FAIL state mostly just corresponds to the ID of any transition on a + /// state that isn't explicitly defined. When one transitions into the FAIL + /// state, one must follow the previous state's failure transition before + /// doing the next state lookup. In this way, FAIL is more of a sentinel + /// than a state that one actually transitions into. In particular, it is + /// never exposed in the `Automaton` interface. 
+ pub(crate) const FAIL: StateID = StateID::new_unchecked(1); + + /// Returns the equivalence classes of bytes found while constructing + /// this NFA. + /// + /// Note that the NFA doesn't actually make use of these equivalence + /// classes. Instead, these are useful for building the DFA when desired. + pub(crate) fn byte_classes(&self) -> &ByteClasses { + &self.byte_classes + } + + /// Returns a slice containing the length of each pattern in this searcher. + /// It is indexed by `PatternID` and has length `NFA::patterns_len`. + /// + /// This is exposed for convenience when building a contiguous NFA. But it + /// can be reconstructed from the `Automaton` API if necessary. + pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] { + &self.pattern_lens + } + + /// Returns a slice of all states in this non-contiguous NFA. + pub(crate) fn states(&self) -> &[State] { + &self.states + } + + /// Returns the underlying "special" state information for this NFA. + pub(crate) fn special(&self) -> &Special { + &self.special + } + + /// Swaps the states at `id1` and `id2`. + /// + /// This does not update the transitions of any state to account for the + /// state swap. + pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { + self.states.swap(id1.as_usize(), id2.as_usize()); + } + + /// Re-maps all state IDs in this NFA according to the `map` function + /// given. + pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + let alphabet_len = self.byte_classes.alphabet_len(); + for state in self.states.iter_mut() { + state.fail = map(state.fail); + let mut link = state.sparse; + while link != StateID::ZERO { + let t = &mut self.sparse[link]; + t.next = map(t.next); + link = t.link; + } + if state.dense != StateID::ZERO { + let start = state.dense.as_usize(); + for next in self.dense[start..][..alphabet_len].iter_mut() { + *next = map(*next); + } + } + } + } + + /// Iterate over all of the transitions for the given state ID. 
+ pub(crate) fn iter_trans( + &self, + sid: StateID, + ) -> impl Iterator<Item = Transition> + '_ { + let mut link = self.states[sid].sparse; + core::iter::from_fn(move || { + if link == StateID::ZERO { + return None; + } + let t = self.sparse[link]; + link = t.link; + Some(t) + }) + } + + /// Iterate over all of the matches for the given state ID. + pub(crate) fn iter_matches( + &self, + sid: StateID, + ) -> impl Iterator<Item = PatternID> + '_ { + let mut link = self.states[sid].matches; + core::iter::from_fn(move || { + if link == StateID::ZERO { + return None; + } + let m = self.matches[link]; + link = m.link; + Some(m.pid) + }) + } + + /// Return the link following the one given. If the one given is the last + /// link for the given state, then return `None`. + /// + /// If no previous link is given, then this returns the first link in the + /// state, if one exists. + /// + /// This is useful for manually iterating over the transitions in a single + /// state without borrowing the NFA. This permits mutating other parts of + /// the NFA during iteration. Namely, one can access the transition pointed + /// to by the link via `self.sparse[link]`. + fn next_link( + &self, + sid: StateID, + prev: Option<StateID>, + ) -> Option<StateID> { + let link = + prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link); + if link == StateID::ZERO { + None + } else { + Some(link) + } + } + + /// Follow the transition for the given byte in the given state. If no such + /// transition exists, then the FAIL state ID is returned. + #[inline(always)] + fn follow_transition(&self, sid: StateID, byte: u8) -> StateID { + let s = &self.states[sid]; + // This is a special case that targets starting states and states + // near a start state. Namely, after the initial trie is constructed, + // we look for states close to the start state to convert to a dense + // representation for their transitions. 
This winds up using a lot more + // memory per state in exchange for faster transition lookups. But + // since we only do this for a small number of states (by default), the + // memory usage is usually minimal. + // + // This has *massive* benefit when executing searches because the + // unanchored starting state is by far the hottest state and is + // frequently visited. Moreover, the 'for' loop below that works + // decently on an actually sparse state is disastrous on a state that + // is nearly or completely dense. + if s.dense == StateID::ZERO { + self.follow_transition_sparse(sid, byte) + } else { + let class = usize::from(self.byte_classes.get(byte)); + self.dense[s.dense.as_usize() + class] + } + } + + /// Like `follow_transition`, but always uses the sparse representation. + #[inline(always)] + fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID { + for t in self.iter_trans(sid) { + if byte <= t.byte { + if byte == t.byte { + return t.next; + } + break; + } + } + NFA::FAIL + } + + /// Set the transition for the given byte to the state ID given. + /// + /// Note that one should not set transitions to the FAIL state. It is not + /// technically incorrect, but it wastes space. If a transition is not + /// defined, then it is automatically assumed to lead to the FAIL state. 
+ fn add_transition( + &mut self, + prev: StateID, + byte: u8, + next: StateID, + ) -> Result<(), BuildError> { + if self.states[prev].dense != StateID::ZERO { + let dense = self.states[prev].dense; + let class = usize::from(self.byte_classes.get(byte)); + self.dense[dense.as_usize() + class] = next; + } + + let head = self.states[prev].sparse; + if head == StateID::ZERO || byte < self.sparse[head].byte { + let new_link = self.alloc_transition()?; + self.sparse[new_link] = Transition { byte, next, link: head }; + self.states[prev].sparse = new_link; + return Ok(()); + } else if byte == self.sparse[head].byte { + self.sparse[head].next = next; + return Ok(()); + } + + // We handled the only cases where the beginning of the transition + // chain needs to change. At this point, we now know that there is + // at least one entry in the transition chain and the byte for that + // transition is less than the byte for the transition we're adding. + let (mut link_prev, mut link_next) = (head, self.sparse[head].link); + while link_next != StateID::ZERO && byte > self.sparse[link_next].byte + { + link_prev = link_next; + link_next = self.sparse[link_next].link; + } + if link_next == StateID::ZERO || byte < self.sparse[link_next].byte { + let link = self.alloc_transition()?; + self.sparse[link] = Transition { byte, next, link: link_next }; + self.sparse[link_prev].link = link; + } else { + assert_eq!(byte, self.sparse[link_next].byte); + self.sparse[link_next].next = next; + } + Ok(()) + } + + /// This sets every possible transition (all 255 of them) for the given + /// state to the name `next` value. + /// + /// This is useful for efficiently initializing start/dead states. + /// + /// # Panics + /// + /// This requires that the state has no transitions added to it already. + /// If it has any transitions, then this panics. It will also panic if + /// the state has been densified prior to calling this. 
+ fn init_full_state( + &mut self, + prev: StateID, + next: StateID, + ) -> Result<(), BuildError> { + assert_eq!( + StateID::ZERO, + self.states[prev].dense, + "state must not be dense yet" + ); + assert_eq!( + StateID::ZERO, + self.states[prev].sparse, + "state must have zero transitions" + ); + let mut prev_link = StateID::ZERO; + for byte in 0..=255 { + let new_link = self.alloc_transition()?; + self.sparse[new_link] = + Transition { byte, next, link: StateID::ZERO }; + if prev_link == StateID::ZERO { + self.states[prev].sparse = new_link; + } else { + self.sparse[prev_link].link = new_link; + } + prev_link = new_link; + } + Ok(()) + } + + /// Add a match for the given pattern ID to the state for the given ID. + fn add_match( + &mut self, + sid: StateID, + pid: PatternID, + ) -> Result<(), BuildError> { + let head = self.states[sid].matches; + let mut link = head; + while self.matches[link].link != StateID::ZERO { + link = self.matches[link].link; + } + let new_match_link = self.alloc_match()?; + self.matches[new_match_link].pid = pid; + if link == StateID::ZERO { + self.states[sid].matches = new_match_link; + } else { + self.matches[link].link = new_match_link; + } + Ok(()) + } + + /// Copy matches from the `src` state to the `dst` state. This is useful + /// when a match state can be reached via a failure transition. In which + /// case, you'll want to copy the matches (if any) from the state reached + /// by the failure transition to the original state you were at. 
+ fn copy_matches( + &mut self, + src: StateID, + dst: StateID, + ) -> Result<(), BuildError> { + let head_dst = self.states[dst].matches; + let mut link_dst = head_dst; + while self.matches[link_dst].link != StateID::ZERO { + link_dst = self.matches[link_dst].link; + } + let mut link_src = self.states[src].matches; + while link_src != StateID::ZERO { + let new_match_link = + StateID::new(self.matches.len()).map_err(|e| { + BuildError::state_id_overflow( + StateID::MAX.as_u64(), + e.attempted(), + ) + })?; + self.matches.push(Match { + pid: self.matches[link_src].pid, + link: StateID::ZERO, + }); + if link_dst == StateID::ZERO { + self.states[dst].matches = new_match_link; + } else { + self.matches[link_dst].link = new_match_link; + } + + link_dst = new_match_link; + link_src = self.matches[link_src].link; + } + Ok(()) + } + + /// Create a new entry in `NFA::trans`, if there's room, and return that + /// entry's ID. If there's no room, then an error is returned. + fn alloc_transition(&mut self) -> Result<StateID, BuildError> { + let id = StateID::new(self.sparse.len()).map_err(|e| { + BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) + })?; + self.sparse.push(Transition::default()); + Ok(id) + } + + /// Create a new entry in `NFA::matches`, if there's room, and return that + /// entry's ID. If there's no room, then an error is returned. + fn alloc_match(&mut self) -> Result<StateID, BuildError> { + let id = StateID::new(self.matches.len()).map_err(|e| { + BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) + })?; + self.matches.push(Match::default()); + Ok(id) + } + + /// Create a new set of `N` transitions in this NFA's dense transition + /// table. The ID return corresponds to the index at which the `N` + /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is + /// the last. + /// + /// `N` is determined via `NFA::byte_classes::alphabet_len`. 
+ fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> { + let id = StateID::new(self.dense.len()).map_err(|e| { + BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) + })?; + // We use FAIL because it's the correct default. If a state doesn't + // have a transition defined for every possible byte value, then the + // transition function should return NFA::FAIL. + self.dense.extend( + core::iter::repeat(NFA::FAIL) + .take(self.byte_classes.alphabet_len()), + ); + Ok(id) + } + + /// Allocate and add a fresh state to the underlying NFA and return its + /// ID (guaranteed to be one more than the ID of the previously allocated + /// state). If the ID would overflow `StateID`, then this returns an error. + fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> { + // This is OK because we error when building the trie if we see a + // pattern whose length cannot fit into a 'SmallIndex', and the longest + // possible depth corresponds to the length of the longest pattern. + let depth = SmallIndex::new(depth) + .expect("patterns longer than SmallIndex::MAX are not allowed"); + let id = StateID::new(self.states.len()).map_err(|e| { + BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) + })?; + self.states.push(State { + sparse: StateID::ZERO, + dense: StateID::ZERO, + matches: StateID::ZERO, + fail: self.special.start_unanchored_id, + depth, + }); + Ok(id) + } +} + +// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always +// returns a valid state ID given a valid state ID. We otherwise claim that +// all other methods are correct as well. 
unsafe impl Automaton for NFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        // Both start states always exist in this NFA, so this never errors.
        match anchored {
            Anchored::No => Ok(self.special.start_unanchored_id),
            Anchored::Yes => Ok(self.special.start_anchored_id),
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        // This terminates since:
        //
        // 1. state.fail never points to the FAIL state.
        // 2. All state.fail values point to a state closer to the start state.
        // 3. The start state has no transitions to the FAIL state.
        loop {
            let next = self.follow_transition(sid, byte);
            if next != NFA::FAIL {
                return next;
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the search.
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            sid = self.states[sid].fail();
        }
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        // Relies on the state shuffle: all special states (dead, match,
        // start) are arranged to have the smallest IDs.
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == NFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        // N.B. This returns true when sid==NFA::FAIL but that's okay because
        // NFA::FAIL is not actually a valid state ID from the perspective of
        // the Automaton trait. Namely, it is never returned by 'start_state'
        // or by 'next_state'. So we don't need to care about it here.
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        // Counting requires walking this state's match linked list, so this
        // is linear in the number of matches for 'sid'.
        self.iter_matches(sid).count()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        // Also a linked-list walk. Panics (via unwrap) if
        // 'index >= match_len(sid)'.
        self.iter_matches(sid).nth(index).unwrap()
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        self.states.len() * core::mem::size_of::<State>()
            + self.sparse.len() * core::mem::size_of::<Transition>()
            + self.matches.len() * core::mem::size_of::<Match>()
            + self.dense.len() * StateID::SIZE
            + self.pattern_lens.len() * SmallIndex::SIZE
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}

/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::trans` corresponding to the head of a linked list
    /// containing all of the transitions for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state has zero transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permits constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
    /// typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked list
    /// containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    depth: SmallIndex,
}

impl State {
    /// Return true if and only if this state is a match state.
    pub(crate) fn is_match(&self) -> bool {
        self.matches != StateID::ZERO
    }

    /// Returns the failure transition for this state.
    pub(crate) fn fail(&self) -> StateID {
        self.fail
    }

    /// Returns the depth of this state. That is, the number of transitions
    /// this state is from the start state of the NFA.
    pub(crate) fn depth(&self) -> SmallIndex {
        self.depth
    }
}

/// A single transition in a non-contiguous NFA.
// NOTE(review): 'repr(packed)' presumably keeps the (often large) transition
// table small. All fields are 'Copy' and accessors return by value, so no
// long-lived references to packed fields are created.
#[derive(Clone, Copy, Default)]
#[repr(packed)]
pub(crate) struct Transition {
    byte: u8,
    next: StateID,
    link: StateID,
}

impl Transition {
    /// Return the byte for which this transition is defined.
    pub(crate) fn byte(&self) -> u8 {
        self.byte
    }

    /// Return the ID of the state that this transition points to.
    pub(crate) fn next(&self) -> StateID {
        self.next
    }

    /// Return the ID of the next transition.
    fn link(&self) -> StateID {
        self.link
    }
}

impl core::fmt::Debug for Transition {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "Transition(byte: {:X?}, next: {:?}, link: {:?})",
            self.byte,
            self.next().as_usize(),
            self.link().as_usize()
        )
    }
}

/// A single match in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
struct Match {
    pid: PatternID,
    link: StateID,
}

impl Match {
    /// Return the pattern ID for this match.
    pub(crate) fn pattern(&self) -> PatternID {
        self.pid
    }

    /// Return the ID of the next match.
    fn link(&self) -> StateID {
        self.link
    }
}

impl core::fmt::Debug for Match {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "Match(pid: {:?}, link: {:?})",
            self.pattern().as_usize(),
            self.link().as_usize()
        )
    }
}

/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    // Match semantics (standard, leftmost-first or leftmost-longest).
    match_kind: MatchKind,
    // Whether to attempt to build a prefilter from the patterns.
    prefilter: bool,
    // Whether 'a'..'z' and 'A'..'Z' are treated as equivalent.
    ascii_case_insensitive: bool,
    // Maximum depth at which states get a dense transition representation.
    dense_depth: usize,
}

impl Default for Builder {
    fn default() -> Builder {
        Builder {
            match_kind: MatchKind::default(),
            prefilter: true,
            ascii_case_insensitive: false,
            dense_depth: 3,
        }
    }
}

impl Builder {
    /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
    /// patterns.
    ///
    /// A builder may be reused to create more NFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        debug!("building non-contiguous NFA");
        let nfa = Compiler::new(self)?.compile(patterns)?;
        debug!(
            "non-contiguous NFA built, <states: {:?}, size: {:?}>",
            nfa.states.len(),
            nfa.memory_usage()
        );
        Ok(nfa)
    }

    /// Set the desired match semantics.
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Set the limit on how many states use a dense representation for their
    /// transitions. Other states will generally use a sparse representation.
    ///
    /// See
    /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
    /// for more documentation and examples.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }
}

/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive(Debug)]
struct Compiler<'a> {
    builder: &'a Builder,
    prefilter: prefilter::Builder,
    nfa: NFA,
    byteset: ByteClassSet,
}

impl<'a> Compiler<'a> {
    fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
        let prefilter = prefilter::Builder::new(builder.match_kind)
            .ascii_case_insensitive(builder.ascii_case_insensitive);
        Ok(Compiler {
            builder,
            prefilter,
            nfa: NFA {
                match_kind: builder.match_kind,
                states: vec![],
                sparse: vec![],
                dense: vec![],
                matches: vec![],
                pattern_lens: vec![],
                prefilter: None,
                byte_classes: ByteClasses::singletons(),
                // min starts at the maximum so that the first pattern seen
                // in 'build_trie' always lowers it.
                min_pattern_len: usize::MAX,
                max_pattern_len: 0,
                special: Special::zero(),
            },
            byteset: ByteClassSet::empty(),
        })
    }

    // NOTE(review): the statement order below is significant; several of the
    // inline comments call out ordering dependencies explicitly.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }

    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            // Pattern length must fit in a SmallIndex, since state depths
            // (bounded by the longest pattern) are stored as SmallIndex.
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }

    /// This routine creates failure transitions according to the standard
    /// textbook formulation of the Aho-Corasick algorithm, with a couple small
    /// tweaks to support "leftmost" semantics.
    ///
    /// Building failure transitions is the most interesting part of building
    /// the Aho-Corasick automaton, because they are what allow searches to
    /// be performed in linear time. Specifically, a failure transition is
    /// a single transition associated with each state that points back to
    /// the longest proper suffix of the pattern being searched. The failure
    /// transition is followed whenever there exists no transition on the
    /// current state for the current input byte. If there is no other proper
    /// suffix, then the failure transition points back to the starting state.
    ///
    /// For example, let's say we built an Aho-Corasick automaton with the
    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// At this point, it should be fairly straight-forward to see how this
    /// trie can be used in a simplistic way. At any given position in the
    /// text we're searching (called the "subject" string), all we need to do
    /// is follow the transitions in the trie by consuming one transition for
    /// each byte in the subject string. If we reach a match state, then we can
    /// report that location as a match.
    ///
    /// The trick comes when searching a subject string like 'abcef'. We'll
    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observng the 'c' byte. At this point, the next byte is 'e' but state
    /// S3 has no transition for 'e', so the search fails. We then would need
    /// to restart the search at the next position in 'abcef', which
    /// corresponds to 'b'. The match would fail, but the next search starting
    /// at 'c' would finally succeed. The problem with this approach is that
    /// we wind up searching the subject string potentially many times. In
    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
    /// like to achieve a `O(n + m)` worst case complexity.
    ///
    /// This is where failure transitions come in. Instead of dying at S3 in
    /// the first search, the automaton can instruct the search to move to
    /// another part of the automaton that corresponds to a suffix of what
    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
    /// and the automaton does indeed have a non-empty suffix, 'c', that could
    /// potentially lead to another match. Thus, the actual Aho-Corasick
    /// automaton for our patterns in this case looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /                      /
    ///        /       ----------------
    ///       /       /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// That is, we have a failure transition from S3 to S5, which is followed
    /// exactly in cases when we are in state S3 but see any byte other than
    /// 'd' (that is, we've "failed" to find a match in this portion of our
    /// trie). We know we can transition back to S5 because we've already seen
    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
    /// with the search starting at S5 and complete our match.
    ///
    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transition that you
    /// need to follow. For example, look at the trie for the patterns
    /// 'abcd', 'b', 'bcd' and 'cd':
    ///
    /// ```ignore
    ///          - a - S1 - b - S2* - c - S3 - d - S4*
    ///         /               /         /
    ///        /         -------   -------
    ///       /         /         /
    ///     S0 --- b - S5* - c - S6 - d - S7*
    ///       \         /
    ///        \       --------
    ///         \             \
    ///          - c - S8 - d - S9*
    /// ```
    ///
    /// The failure transitions for this trie are defined from S2 to S5,
    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
    /// corresponds to a match, since its failure transition to S5 is itself
    /// a match state.
    ///
    /// Perhaps simplest way to think about adding these failure transitions
    /// is recursively. That is, if you know the failure transitions for every
    /// possible previous state that could be visited (e.g., when computing the
    /// failure transition for S3, you already know the failure transitions
    /// for S0, S1 and S2), then you can simply follow the failure transition
    /// of the previous state and check whether the incoming transition is
    /// defined after following the failure transition.
    ///
    /// For example, when determining the failure state for S3, by our
    /// assumptions, we already know that there is a failure transition from
    /// S2 (the previous state) to S5. So we follow that transition and check
    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
    /// transition existed, we could keep following the failure transitions
    /// until we reach the start state, which is the failure transition for
    /// every state that has no corresponding proper suffix.
    ///
    /// We don't actually use recursion to implement this, but instead, use a
    /// breadth first search of the automaton. Our base case is the start
    /// state, whose failure transition is just a transition to itself.
    ///
    /// When building a leftmost automaton, we proceed as above, but only
    /// include a subset of failure transitions. Namely, we omit any failure
    /// transitions that appear after a match state in the trie. This is
    /// because failure transitions always point back to a proper suffix of
    /// what has been seen so far. Thus, following a failure transition after
    /// a match implies looking for a match that starts after the one that has
    /// already been seen, which is of course therefore not the leftmost match.
    ///
    /// N.B. I came up with this algorithm on my own, and after scouring all of
    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
    /// I couldn't find any that implement leftmost semantics like this.
    /// Perl of course needs leftmost-first semantics, but they implement it
    /// with a seeming hack at *search* time instead of encoding it into the
    /// automaton. There are also a couple Java libraries that support leftmost
    /// longest semantics, but they do it by building a queue of matches at
    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on
            // the start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                // Standard BFS failure computation: follow the parent's
                // failure chain until a state with a transition on t.byte is
                // found (the start state always has one, so this terminates).
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }

    /// Shuffle the states so that they appear in this sequence:
    ///
    ///   DEAD, FAIL, MATCH..., START, START, NON-MATCH...
    ///
    /// The idea here is that if we know how special states are laid out in our
    /// transition table, then we can determine what "kind" of state we're in
    /// just by comparing our current state ID with a particular value. In this
    /// way, we avoid doing extra memory lookups.
+ /// + /// Before shuffling begins, our states look something like this: + /// + /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)... + /// + /// So all we need to do is move all of the MATCH states so that they + /// all appear before any NON-MATCH state, like so: + /// + /// DEAD, FAIL, START, START, MATCH... NON-MATCH... + /// + /// Then it's just a simple matter of swapping the two START states with + /// the last two MATCH states. + /// + /// (This is the same technique used for fully compiled DFAs in + /// regex-automata.) + fn shuffle(&mut self) { + let old_start_uid = self.nfa.special.start_unanchored_id; + let old_start_aid = self.nfa.special.start_anchored_id; + assert!(old_start_uid < old_start_aid); + assert_eq!( + 3, + old_start_aid.as_usize(), + "anchored start state should be at index 3" + ); + // We implement shuffling by a sequence of pairwise swaps of states. + // Since we have a number of things referencing states via their + // IDs and swapping them changes their IDs, we need to record every + // swap we make so that we can remap IDs. The remapper handles this + // book-keeping for us. + let mut remapper = Remapper::new(&self.nfa, 0); + // The way we proceed here is by moving all match states so that + // they directly follow the start states. So it will go: DEAD, FAIL, + // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ... + // + // To do that, we proceed forward through all states after + // START-ANCHORED and swap match states so that they appear before all + // non-match states. + let mut next_avail = StateID::from(4u8); + for i in next_avail.as_usize()..self.nfa.states.len() { + let sid = StateID::new(i).unwrap(); + if !self.nfa.states[sid].is_match() { + continue; + } + remapper.swap(&mut self.nfa, sid, next_avail); + // The key invariant here is that only non-match states exist + // between 'next_avail' and 'sid' (with them being potentially + // equivalent). 
Thus, incrementing 'next_avail' by 1 is guaranteed + // to land on the leftmost non-match state. (Unless 'next_avail' + // and 'sid' are equivalent, in which case, a swap will occur but + // it is a no-op.) + next_avail = StateID::new(next_avail.one_more()).unwrap(); + } + // Now we'd like to move the start states to immediately following the + // match states. (The start states may themselves be match states, but + // we'll handle that later.) We arrange the states this way so that we + // don't necessarily need to check whether a state is a start state or + // not before checking whether a state is a match state. For example, + // we'd like to be able to write this as our state machine loop: + // + // sid = start() + // for byte in haystack: + // sid = next(sid, byte) + // if sid <= nfa.max_start_id: + // if sid <= nfa.max_dead_id: + // # search complete + // elif sid <= nfa.max_match_id: + // # found match + // + // The important context here is that we might not want to look for + // start states at all. Namely, if a searcher doesn't have a prefilter, + // then there is no reason to care about whether we're in a start state + // or not. And indeed, if we did check for it, this very hot loop would + // ping pong between the special state handling and the main state + // transition logic. This in turn stalls the CPU by killing branch + // prediction. + // + // So essentially, we really want to be able to "forget" that start + // states even exist and this is why we put them at the end. 
+ let new_start_aid = + StateID::new(next_avail.as_usize().checked_sub(1).unwrap()) + .unwrap(); + remapper.swap(&mut self.nfa, old_start_aid, new_start_aid); + let new_start_uid = + StateID::new(next_avail.as_usize().checked_sub(2).unwrap()) + .unwrap(); + remapper.swap(&mut self.nfa, old_start_uid, new_start_uid); + let new_max_match_id = + StateID::new(next_avail.as_usize().checked_sub(3).unwrap()) + .unwrap(); + self.nfa.special.max_match_id = new_max_match_id; + self.nfa.special.start_unanchored_id = new_start_uid; + self.nfa.special.start_anchored_id = new_start_aid; + // If one start state is a match state, then they both are. + if self.nfa.states[self.nfa.special.start_anchored_id].is_match() { + self.nfa.special.max_match_id = self.nfa.special.start_anchored_id; + } + remapper.remap(&mut self.nfa); + } + + /// Attempts to convert the transition representation of a subset of states + /// in this NFA from sparse to dense. This can greatly improve search + /// performance since states with a higher number of transitions tend to + /// correlate with very active states. + /// + /// We generally only densify states that are close to the start state. + /// These tend to be the most active states and thus benefit from a dense + /// representation more than other states. + /// + /// This tends to best balance between memory usage and performance. In + /// particular, the *vast majority* of all states in a typical Aho-Corasick + /// automaton have only 1 transition and are usually farther from the start + /// state and thus don't get densified. + /// + /// Note that this doesn't remove the sparse representation of transitions + /// for states that are densified. It could be done, but actually removing + /// entries from `NFA::sparse` is likely more expensive than it's worth. 
+ fn densify(&mut self) -> Result<(), BuildError> { + for i in 0..self.nfa.states.len() { + let sid = StateID::new(i).unwrap(); + // Don't bother densifying states that are only used as sentinels. + if sid == NFA::DEAD || sid == NFA::FAIL { + continue; + } + // Only densify states that are "close enough" to the start state. + if self.nfa.states[sid].depth.as_usize() + >= self.builder.dense_depth + { + continue; + } + let dense = self.nfa.alloc_dense_state()?; + let mut prev_link = None; + while let Some(link) = self.nfa.next_link(sid, prev_link) { + prev_link = Some(link); + let t = self.nfa.sparse[link]; + + let class = usize::from(self.nfa.byte_classes.get(t.byte)); + let index = dense.as_usize() + class; + self.nfa.dense[index] = t.next; + } + self.nfa.states[sid].dense = dense; + } + Ok(()) + } + + /// Returns a set that tracked queued states. + /// + /// This is only necessary when ASCII case insensitivity is enabled, since + /// it is the only way to visit the same state twice. Otherwise, this + /// returns an inert set that nevers adds anything and always reports + /// `false` for every member test. + fn queued_set(&self) -> QueuedSet { + if self.builder.ascii_case_insensitive { + QueuedSet::active() + } else { + QueuedSet::inert() + } + } + + /// Initializes the unanchored start state by making it dense. This is + /// achieved by explicitly setting every transition to the FAIL state. + /// This isn't necessary for correctness, since any missing transition is + /// automatically assumed to be mapped to the FAIL state. We do this to + /// make the unanchored starting state dense, and thus in turn make + /// transition lookups on it faster. (Which is worth doing because it's + /// the most active state.) 
+ fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { + let start_uid = self.nfa.special.start_unanchored_id; + let start_aid = self.nfa.special.start_anchored_id; + self.nfa.init_full_state(start_uid, NFA::FAIL)?; + self.nfa.init_full_state(start_aid, NFA::FAIL)?; + Ok(()) + } + + /// Setup the anchored start state by copying all of the transitions and + /// matches from the unanchored starting state with one change: the failure + /// transition is changed to the DEAD state, so that for any undefined + /// transitions, the search will stop. + fn set_anchored_start_state(&mut self) -> Result<(), BuildError> { + let start_uid = self.nfa.special.start_unanchored_id; + let start_aid = self.nfa.special.start_anchored_id; + let (mut uprev_link, mut aprev_link) = (None, None); + loop { + let unext = self.nfa.next_link(start_uid, uprev_link); + let anext = self.nfa.next_link(start_aid, aprev_link); + let (ulink, alink) = match (unext, anext) { + (Some(ulink), Some(alink)) => (ulink, alink), + (None, None) => break, + _ => unreachable!(), + }; + uprev_link = Some(ulink); + aprev_link = Some(alink); + self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next; + } + self.nfa.copy_matches(start_uid, start_aid)?; + // This is the main difference between the unanchored and anchored + // starting states. If a lookup on an anchored starting state fails, + // then the search should stop. + // + // N.B. This assumes that the loop on the unanchored starting state + // hasn't been created yet. + self.nfa.states[start_aid].fail = NFA::DEAD; + Ok(()) + } + + /// Set the failure transitions on the start state to loop back to the + /// start state. This effectively permits the Aho-Corasick automaton to + /// match at any position. This is also required for finding the next + /// state to terminate, namely, finding the next state should never return + /// a fail_id. 
+ /// + /// This must be done after building the initial trie, since trie + /// construction depends on transitions to `fail_id` to determine whether a + /// state already exists or not. + fn add_unanchored_start_state_loop(&mut self) { + let start_uid = self.nfa.special.start_unanchored_id; + let mut prev_link = None; + while let Some(link) = self.nfa.next_link(start_uid, prev_link) { + prev_link = Some(link); + if self.nfa.sparse[link].next() == NFA::FAIL { + self.nfa.sparse[link].next = start_uid; + } + } + } + + /// Remove the start state loop by rewriting any transitions on the start + /// state back to the start state with transitions to the dead state. + /// + /// The loop is only closed when two conditions are met: the start state + /// is a match state and the match kind is leftmost-first or + /// leftmost-longest. + /// + /// The reason for this is that under leftmost semantics, a start state + /// that is also a match implies that we should never restart the search + /// process. We allow normal transitions out of the start state, but if + /// none exist, we transition to the dead state, which signals that + /// searching should stop. + fn close_start_state_loop_for_leftmost(&mut self) { + let start_uid = self.nfa.special.start_unanchored_id; + let start = &mut self.nfa.states[start_uid]; + let dense = start.dense; + if self.builder.match_kind.is_leftmost() && start.is_match() { + let mut prev_link = None; + while let Some(link) = self.nfa.next_link(start_uid, prev_link) { + prev_link = Some(link); + if self.nfa.sparse[link].next() == start_uid { + self.nfa.sparse[link].next = NFA::DEAD; + if dense != StateID::ZERO { + let b = self.nfa.sparse[link].byte; + let class = usize::from(self.nfa.byte_classes.get(b)); + self.nfa.dense[dense.as_usize() + class] = NFA::DEAD; + } + } + } + } + } + + /// Sets all transitions on the dead state to point back to the dead state. 
+ /// Normally, missing transitions map back to the failure state, but the + /// point of the dead state is to act as a sink that can never be escaped. + fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { + self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; + Ok(()) + } +} + +/// A set of state identifiers used to avoid revisiting the same state multiple +/// times when filling in failure transitions. +/// +/// This set has an "inert" and an "active" mode. When inert, the set never +/// stores anything and always returns `false` for every member test. This is +/// useful to avoid the performance and memory overhead of maintaining this +/// set when it is not needed. +#[derive(Debug)] +struct QueuedSet { + set: Option<BTreeSet<StateID>>, +} + +impl QueuedSet { + /// Return an inert set that returns `false` for every state ID membership + /// test. + fn inert() -> QueuedSet { + QueuedSet { set: None } + } + + /// Return an active set that tracks state ID membership. + fn active() -> QueuedSet { + QueuedSet { set: Some(BTreeSet::new()) } + } + + /// Inserts the given state ID into this set. (If the set is inert, then + /// this is a no-op.) + fn insert(&mut self, state_id: StateID) { + if let Some(ref mut set) = self.set { + set.insert(state_id); + } + } + + /// Returns true if and only if the given state ID is in this set. If the + /// set is inert, this always returns false. + fn contains(&self, state_id: StateID) -> bool { + match self.set { + None => false, + Some(ref set) => set.contains(&state_id), + } + } +} + +impl core::fmt::Debug for NFA { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use crate::{ + automaton::{fmt_state_indicator, sparse_transitions}, + util::debug::DebugByte, + }; + + writeln!(f, "noncontiguous::NFA(")?; + for (sid, state) in self.states.iter().with_state_ids() { + // The FAIL state doesn't actually have space for a state allocated + // for it, so we have to treat it as a special case. 
+ if sid == NFA::FAIL { + writeln!(f, "F {:06}:", sid.as_usize())?; + continue; + } + fmt_state_indicator(f, self, sid)?; + write!( + f, + "{:06}({:06}): ", + sid.as_usize(), + state.fail.as_usize() + )?; + + let it = sparse_transitions( + self.iter_trans(sid).map(|t| (t.byte, t.next)), + ) + .enumerate(); + for (i, (start, end, sid)) in it { + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!( + f, + "{:?} => {:?}", + DebugByte(start), + sid.as_usize() + )?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + sid.as_usize() + )?; + } + } + + write!(f, "\n")?; + if self.is_match(sid) { + write!(f, " matches: ")?; + for (i, pid) in self.iter_matches(sid).enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}", pid.as_usize())?; + } + write!(f, "\n")?; + } + } + writeln!(f, "match kind: {:?}", self.match_kind)?; + writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?; + writeln!(f, "state length: {:?}", self.states.len())?; + writeln!(f, "pattern length: {:?}", self.patterns_len())?; + writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?; + writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?; + writeln!(f, "memory usage: {:?}", self.memory_usage())?; + writeln!(f, ")")?; + Ok(()) + } +} diff --git a/vendor/aho-corasick/src/packed/api.rs b/vendor/aho-corasick/src/packed/api.rs new file mode 100644 index 0000000..44f0bc9 --- /dev/null +++ b/vendor/aho-corasick/src/packed/api.rs @@ -0,0 +1,687 @@ +use alloc::sync::Arc; + +use crate::{ + packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy}, + util::search::{Match, Span}, +}; + +/// This is a limit placed on the total number of patterns we're willing to try +/// and match at once. As more sophisticated algorithms are added, this number +/// may be increased. +const PATTERN_LIMIT: usize = 128; + +/// A knob for controlling the match semantics of a packed multiple string +/// searcher. 
+/// +/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level +/// crate module in that it doesn't support "standard" match semantics, +/// and instead only supports leftmost-first or leftmost-longest. Namely, +/// "standard" semantics cannot be easily supported by packed searchers. +/// +/// For more information on the distinction between leftmost-first and +/// leftmost-longest, see the docs on the top-level `MatchKind` type. +/// +/// Unlike the top-level `MatchKind` type, the default match semantics for this +/// type are leftmost-first. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[non_exhaustive] +pub enum MatchKind { + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This is the default. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + LeftmostLongest, +} + +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::LeftmostFirst + } +} + +/// The configuration for a packed multiple pattern searcher. +/// +/// The configuration is currently limited only to being able to select the +/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the +/// future, more knobs may be made available. +/// +/// A configuration produces a [`packed::Builder`](Builder), which in turn can +/// be used to construct a [`packed::Searcher`](Searcher) for searching. +/// +/// # Example +/// +/// This example shows how to use leftmost-longest semantics instead of the +/// default (leftmost-first). 
+/// +/// ``` +/// use aho_corasick::{packed::{Config, MatchKind}, PatternID}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Config::new() +/// .match_kind(MatchKind::LeftmostLongest) +/// .builder() +/// .add("foo") +/// .add("foobar") +/// .build()?; +/// let matches: Vec<PatternID> = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![PatternID::must(1)], matches); +/// # Some(()) } +/// # if cfg!(all(feature = "std", any( +/// # target_arch = "x86_64", target_arch = "aarch64", +/// # ))) { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + kind: MatchKind, + force: Option<ForceAlgorithm>, + only_teddy_fat: Option<bool>, + only_teddy_256bit: Option<bool>, + heuristic_pattern_limits: bool, +} + +/// An internal option for forcing the use of a particular packed algorithm. +/// +/// When an algorithm is forced, if a searcher could not be constructed for it, +/// then no searcher will be returned even if an alternative algorithm would +/// work. +#[derive(Clone, Debug)] +enum ForceAlgorithm { + Teddy, + RabinKarp, +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} + +impl Config { + /// Create a new default configuration. A default configuration uses + /// leftmost-first match semantics. + pub fn new() -> Config { + Config { + kind: MatchKind::LeftmostFirst, + force: None, + only_teddy_fat: None, + only_teddy_256bit: None, + heuristic_pattern_limits: true, + } + } + + /// Create a packed builder from this configuration. The builder can be + /// used to accumulate patterns and create a [`Searcher`] from them. + pub fn builder(&self) -> Builder { + Builder::from_config(self.clone()) + } + + /// Set the match semantics for this configuration. 
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.kind = kind; + self + } + + /// An undocumented method for forcing the use of the Teddy algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn only_teddy(&mut self, yes: bool) -> &mut Config { + if yes { + self.force = Some(ForceAlgorithm::Teddy); + } else { + self.force = None; + } + self + } + + /// An undocumented method for forcing the use of the Fat Teddy algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config { + self.only_teddy_fat = yes; + self + } + + /// An undocumented method for forcing the use of SSE (`Some(false)`) or + /// AVX (`Some(true)`) algorithms. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config { + self.only_teddy_256bit = yes; + self + } + + /// An undocumented method for forcing the use of the Rabin-Karp algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config { + if yes { + self.force = Some(ForceAlgorithm::RabinKarp); + } else { + self.force = None; + } + self + } + + /// Request that heuristic limitations on the number of patterns be + /// employed. 
This useful to disable for benchmarking where one wants to + /// explore how Teddy performs on large number of patterns even if the + /// heuristics would otherwise refuse construction. + /// + /// This is enabled by default. + pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config { + self.heuristic_pattern_limits = yes; + self + } +} + +/// A builder for constructing a packed searcher from a collection of patterns. +/// +/// # Example +/// +/// This example shows how to use a builder to construct a searcher. By +/// default, leftmost-first match semantics are used. +/// +/// ``` +/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Builder::new() +/// .add("foobar") +/// .add("foo") +/// .build()?; +/// let matches: Vec<PatternID> = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![PatternID::ZERO], matches); +/// # Some(()) } +/// # if cfg!(all(feature = "std", any( +/// # target_arch = "x86_64", target_arch = "aarch64", +/// # ))) { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + /// The configuration of this builder and subsequent matcher. + config: Config, + /// Set to true if the builder detects that a matcher cannot be built. + inert: bool, + /// The patterns provided by the caller. + patterns: Patterns, +} + +impl Builder { + /// Create a new builder for constructing a multi-pattern searcher. This + /// constructor uses the default configuration. + pub fn new() -> Builder { + Builder::from_config(Config::new()) + } + + fn from_config(config: Config) -> Builder { + Builder { config, inert: false, patterns: Patterns::new() } + } + + /// Build a searcher from the patterns added to this builder so far. 
+ pub fn build(&self) -> Option<Searcher> { + if self.inert || self.patterns.is_empty() { + return None; + } + let mut patterns = self.patterns.clone(); + patterns.set_match_kind(self.config.kind); + let patterns = Arc::new(patterns); + let rabinkarp = RabinKarp::new(&patterns); + // Effectively, we only want to return a searcher if we can use Teddy, + // since Teddy is our only fast packed searcher at the moment. + // Rabin-Karp is only used when searching haystacks smaller than what + // Teddy can support. Thus, the only way to get a Rabin-Karp searcher + // is to force it using undocumented APIs (for tests/benchmarks). + let (search_kind, minimum_len) = match self.config.force { + None | Some(ForceAlgorithm::Teddy) => { + debug!("trying to build Teddy packed matcher"); + let teddy = match self.build_teddy(Arc::clone(&patterns)) { + None => return None, + Some(teddy) => teddy, + }; + let minimum_len = teddy.minimum_len(); + (SearchKind::Teddy(teddy), minimum_len) + } + Some(ForceAlgorithm::RabinKarp) => { + debug!("using Rabin-Karp packed matcher"); + (SearchKind::RabinKarp, 0) + } + }; + Some(Searcher { patterns, rabinkarp, search_kind, minimum_len }) + } + + fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> { + teddy::Builder::new() + .only_256bit(self.config.only_teddy_256bit) + .only_fat(self.config.only_teddy_fat) + .heuristic_pattern_limits(self.config.heuristic_pattern_limits) + .build(patterns) + } + + /// Add the given pattern to this set to match. + /// + /// The order in which patterns are added is significant. Namely, when + /// using leftmost-first match semantics, then when multiple patterns can + /// match at a particular location, the pattern that was added first is + /// used as the match. + /// + /// If the number of patterns added exceeds the amount supported by packed + /// searchers, then the builder will stop accumulating patterns and render + /// itself inert. 
At this point, constructing a searcher will always return + /// `None`. + pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder { + if self.inert { + return self; + } else if self.patterns.len() >= PATTERN_LIMIT { + self.inert = true; + self.patterns.reset(); + return self; + } + // Just in case PATTERN_LIMIT increases beyond u16::MAX. + assert!(self.patterns.len() <= core::u16::MAX as usize); + + let pattern = pattern.as_ref(); + if pattern.is_empty() { + self.inert = true; + self.patterns.reset(); + return self; + } + self.patterns.add(pattern); + self + } + + /// Add the given iterator of patterns to this set to match. + /// + /// The iterator must yield elements that can be converted into a `&[u8]`. + /// + /// The order in which patterns are added is significant. Namely, when + /// using leftmost-first match semantics, then when multiple patterns can + /// match at a particular location, the pattern that was added first is + /// used as the match. + /// + /// If the number of patterns added exceeds the amount supported by packed + /// searchers, then the builder will stop accumulating patterns and render + /// itself inert. At this point, constructing a searcher will always return + /// `None`. + pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + for p in patterns { + self.add(p); + } + self + } + + /// Returns the number of patterns added to this builder. + pub fn len(&self) -> usize { + self.patterns.len() + } + + /// Returns the length, in bytes, of the shortest pattern added. + pub fn minimum_len(&self) -> usize { + self.patterns.minimum_len() + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +/// A packed searcher for quickly finding occurrences of multiple patterns. 
+/// +/// If callers need more flexible construction, or if one wants to change the +/// match semantics (either leftmost-first or leftmost-longest), then one can +/// use the [`Config`] and/or [`Builder`] types for more fine grained control. +/// +/// # Example +/// +/// This example shows how to create a searcher from an iterator of patterns. +/// By default, leftmost-first match semantics are used. +/// +/// ``` +/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; +/// let matches: Vec<PatternID> = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![PatternID::ZERO], matches); +/// # Some(()) } +/// # if cfg!(all(feature = "std", any( +/// # target_arch = "x86_64", target_arch = "aarch64", +/// # ))) { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Searcher { + patterns: Arc<Patterns>, + rabinkarp: RabinKarp, + search_kind: SearchKind, + minimum_len: usize, +} + +#[derive(Clone, Debug)] +enum SearchKind { + Teddy(teddy::Searcher), + RabinKarp, +} + +impl Searcher { + /// A convenience function for constructing a searcher from an iterator + /// of things that can be converted to a `&[u8]`. + /// + /// If a searcher could not be constructed (either because of an + /// unsupported CPU or because there are too many patterns), then `None` + /// is returned. 
+ /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let matches: Vec<PatternID> = searcher + /// .find_iter("foobar") + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![PatternID::ZERO], matches); + /// # Some(()) } + /// # if cfg!(all(feature = "std", any( + /// # target_arch = "x86_64", target_arch = "aarch64", + /// # ))) { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn new<I, P>(patterns: I) -> Option<Searcher> + where + I: IntoIterator<Item = P>, + P: AsRef<[u8]>, + { + Builder::new().extend(patterns).build() + } + + /// A convenience function for calling `Config::new()`. + /// + /// This is useful for avoiding an additional import. + pub fn config() -> Config { + Config::new() + } + + /// A convenience function for calling `Builder::new()`. + /// + /// This is useful for avoiding an additional import. + pub fn builder() -> Builder { + Builder::new() + } + + /// Return the first occurrence of any of the patterns in this searcher, + /// according to its match semantics, in the given haystack. The `Match` + /// returned will include the identifier of the pattern that matched, which + /// corresponds to the index of the pattern (starting from `0`) in which it + /// was added. 
+ /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let mat = searcher.find("foobar")?; + /// assert_eq!(PatternID::ZERO, mat.pattern()); + /// assert_eq!(0, mat.start()); + /// assert_eq!(6, mat.end()); + /// # Some(()) } + /// # if cfg!(all(feature = "std", any( + /// # target_arch = "x86_64", target_arch = "aarch64", + /// # ))) { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + #[inline] + pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { + let haystack = haystack.as_ref(); + self.find_in(haystack, Span::from(0..haystack.len())) + } + + /// Return the first occurrence of any of the patterns in this searcher, + /// according to its match semantics, in the given haystack starting from + /// the given position. + /// + /// The `Match` returned will include the identifier of the pattern that + /// matched, which corresponds to the index of the pattern (starting from + /// `0`) in which it was added. The offsets in the `Match` will be relative + /// to the start of `haystack` (and not `at`). 
+ /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span}; + /// + /// # fn example() -> Option<()> { + /// let haystack = "foofoobar"; + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?; + /// assert_eq!(PatternID::ZERO, mat.pattern()); + /// assert_eq!(3, mat.start()); + /// assert_eq!(9, mat.end()); + /// # Some(()) } + /// # if cfg!(all(feature = "std", any( + /// # target_arch = "x86_64", target_arch = "aarch64", + /// # ))) { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + #[inline] + pub fn find_in<B: AsRef<[u8]>>( + &self, + haystack: B, + span: Span, + ) -> Option<Match> { + let haystack = haystack.as_ref(); + match self.search_kind { + SearchKind::Teddy(ref teddy) => { + if haystack[span].len() < teddy.minimum_len() { + return self.find_in_slow(haystack, span); + } + teddy.find(&haystack[..span.end], span.start) + } + SearchKind::RabinKarp => { + self.rabinkarp.find_at(&haystack[..span.end], span.start) + } + } + } + + /// Return an iterator of non-overlapping occurrences of the patterns in + /// this searcher, according to its match semantics, in the given haystack. 
+ /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let matches: Vec<PatternID> = searcher + /// .find_iter("foobar fooba foofoo") + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![ + /// PatternID::must(0), + /// PatternID::must(1), + /// PatternID::must(1), + /// PatternID::must(1), + /// ], matches); + /// # Some(()) } + /// # if cfg!(all(feature = "std", any( + /// # target_arch = "x86_64", target_arch = "aarch64", + /// # ))) { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + #[inline] + pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindIter<'a, 'b> { + let haystack = haystack.as_ref(); + let span = Span::from(0..haystack.len()); + FindIter { searcher: self, haystack, span } + } + + /// Returns the match kind used by this packed searcher. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// // leftmost-first is the default. + /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); + /// # Some(()) } + /// # if cfg!(all(feature = "std", any( + /// # target_arch = "x86_64", target_arch = "aarch64", + /// # ))) { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + #[inline] + pub fn match_kind(&self) -> &MatchKind { + self.patterns.match_kind() + } + + /// Returns the minimum length of a haystack that is required in order for + /// packed searching to be effective. + /// + /// In some cases, the underlying packed searcher may not be able to search + /// very short haystacks. 
When that occurs, the implementation will defer + /// to a slower non-packed searcher (which is still generally faster than + /// Aho-Corasick for a small number of patterns). However, callers may + /// want to avoid ever using the slower variant, which one can do by + /// never passing a haystack shorter than the minimum length returned by + /// this method. + #[inline] + pub fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the approximate total amount of heap used by this searcher, in + /// units of bytes. + #[inline] + pub fn memory_usage(&self) -> usize { + self.patterns.memory_usage() + + self.rabinkarp.memory_usage() + + self.search_kind.memory_usage() + } + + /// Use a slow (non-packed) searcher. + /// + /// This is useful when a packed searcher could be constructed, but could + /// not be used to search a specific haystack. For example, if Teddy was + /// built but the haystack is smaller than ~34 bytes, then Teddy might not + /// be able to run. + fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> { + self.rabinkarp.find_at(&haystack[..span.end], span.start) + } +} + +impl SearchKind { + fn memory_usage(&self) -> usize { + match *self { + SearchKind::Teddy(ref ted) => ted.memory_usage(), + SearchKind::RabinKarp => 0, + } + } +} + +/// An iterator over non-overlapping matches from a packed searcher. +/// +/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`], +/// while the lifetime `'h` refers to the lifetime of the haystack being +/// searched. 
+#[derive(Debug)] +pub struct FindIter<'s, 'h> { + searcher: &'s Searcher, + haystack: &'h [u8], + span: Span, +} + +impl<'s, 'h> Iterator for FindIter<'s, 'h> { + type Item = Match; + + fn next(&mut self) -> Option<Match> { + if self.span.start > self.span.end { + return None; + } + match self.searcher.find_in(&self.haystack, self.span) { + None => None, + Some(m) => { + self.span.start = m.end(); + Some(m) + } + } + } +} diff --git a/vendor/aho-corasick/src/packed/ext.rs b/vendor/aho-corasick/src/packed/ext.rs new file mode 100644 index 0000000..b689642 --- /dev/null +++ b/vendor/aho-corasick/src/packed/ext.rs @@ -0,0 +1,39 @@ +/// A trait for adding some helper routines to pointers. +pub(crate) trait Pointer { + /// Returns the distance, in units of `T`, between `self` and `origin`. + /// + /// # Safety + /// + /// Same as `ptr::offset_from` in addition to `self >= origin`. + unsafe fn distance(self, origin: Self) -> usize; + + /// Casts this pointer to `usize`. + /// + /// Callers should not convert the `usize` back to a pointer if at all + /// possible. (And if you believe it's necessary, open an issue to discuss + /// why. Otherwise, it has the potential to violate pointer provenance.) + /// The purpose of this function is just to be able to do arithmetic, i.e., + /// computing offsets or alignments. + fn as_usize(self) -> usize; +} + +impl<T> Pointer for *const T { + unsafe fn distance(self, origin: *const T) -> usize { + // TODO: Replace with `ptr::sub_ptr` once stabilized. 
+        usize::try_from(self.offset_from(origin)).unwrap_unchecked()
+    }
+
+    fn as_usize(self) -> usize {
+        self as usize
+    }
+}
+
+impl<T> Pointer for *mut T {
+    unsafe fn distance(self, origin: *mut T) -> usize {
+        (self as *const T).distance(origin as *const T)
+    }
+
+    fn as_usize(self) -> usize {
+        (self as *const T).as_usize()
+    }
+}
diff --git a/vendor/aho-corasick/src/packed/mod.rs b/vendor/aho-corasick/src/packed/mod.rs
new file mode 100644
index 0000000..3990bc9
--- /dev/null
+++ b/vendor/aho-corasick/src/packed/mod.rs
@@ -0,0 +1,120 @@
+/*!
+Provides packed multiple substring search, principally for a small number of
+patterns.
+
+This sub-module provides vectorized routines for quickly finding
+matches of a small number of patterns. In general, users of this crate
+shouldn't need to interface with this module directly, as the primary
+[`AhoCorasick`](crate::AhoCorasick) searcher will use these routines
+automatically as a prefilter when applicable. However, in some cases, callers
+may want to bypass the Aho-Corasick machinery entirely and use this vectorized
+searcher directly.
+
+# Overview
+
+The primary types in this sub-module are:
+
+* [`Searcher`] executes the actual search algorithm to report matches in a
+haystack.
+* [`Builder`] accumulates patterns incrementally and can construct a
+`Searcher`.
+* [`Config`] permits tuning the searcher, and itself will produce a `Builder`
+(which can then be used to build a `Searcher`). Currently, the only tuneable
+knob is the match semantics, but this may be expanded in the future.
+
+# Examples
+
+This example shows how to create a searcher from an iterator of patterns.
+By default, leftmost-first match semantics are used. (See the top-level
+[`MatchKind`] type for more details about match semantics, which apply
+similarly to packed substring search.)
+
+```
+use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
+
+# fn example() -> Option<()> {
+let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
+let matches: Vec<PatternID> = searcher
+    .find_iter("foobar")
+    .map(|mat| mat.pattern())
+    .collect();
+assert_eq!(vec![PatternID::ZERO], matches);
+# Some(()) }
+# if cfg!(all(feature = "std", any(
+#     target_arch = "x86_64", target_arch = "aarch64",
+# ))) {
+#     example().unwrap()
+# } else {
+#     assert!(example().is_none());
+# }
+```
+
+This example shows how to use [`Config`] to change the match semantics to
+leftmost-longest:
+
+```
+use aho_corasick::{packed::{Config, MatchKind}, PatternID};
+
+# fn example() -> Option<()> {
+let searcher = Config::new()
+    .match_kind(MatchKind::LeftmostLongest)
+    .builder()
+    .add("foo")
+    .add("foobar")
+    .build()?;
+let matches: Vec<PatternID> = searcher
+    .find_iter("foobar")
+    .map(|mat| mat.pattern())
+    .collect();
+assert_eq!(vec![PatternID::must(1)], matches);
+# Some(()) }
+# if cfg!(all(feature = "std", any(
+#     target_arch = "x86_64", target_arch = "aarch64",
+# ))) {
+#     example().unwrap()
+# } else {
+#     assert!(example().is_none());
+# }
+```
+
+# Packed substring searching
+
+Packed substring searching refers to the use of SIMD (Single Instruction,
+Multiple Data) to accelerate the detection of matches in a haystack. Unlike
+conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
+search tend to do better with a small number of patterns, whereas Aho-Corasick
+generally maintains reasonably consistent performance regardless of the number
+of patterns you give it. Because of this, the vectorized searcher in this
+sub-module cannot be used as a general purpose searcher, since building the
+searcher may fail even when given a small number of patterns. However, in
+exchange, when searching for a small number of patterns, searching can be quite
+a bit faster than Aho-Corasick (sometimes by an order of magnitude).
+
+The key takeaway here is that constructing a searcher from a list of patterns
+is a fallible operation with no clear rules for when it will fail. While the
+precise conditions under which building a searcher can fail are specifically an
+implementation detail, here are some common reasons:
+
+* Too many patterns were given. Typically, the limit is on the order of 100 or
+  so, but this limit may fluctuate based on available CPU features.
+* The available packed algorithms require CPU features that aren't available.
+  For example, currently, this crate only provides packed algorithms for
+  `x86_64` and `aarch64`. Therefore, constructing a packed searcher on any
+  other target will always fail.
+* Zero patterns were given, or one of the patterns given was empty. Packed
+  searchers require at least one pattern and that all patterns are non-empty.
+* Something else about the nature of the patterns (typically based on
+  heuristics) suggests that a packed searcher would perform very poorly, so
+  no searcher is built.
+*/
+
+pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
+
+mod api;
+mod ext;
+mod pattern;
+mod rabinkarp;
+mod teddy;
+#[cfg(all(feature = "std", test))]
+mod tests;
+mod vector;
diff --git a/vendor/aho-corasick/src/packed/pattern.rs b/vendor/aho-corasick/src/packed/pattern.rs
new file mode 100644
index 0000000..95aca4d
--- /dev/null
+++ b/vendor/aho-corasick/src/packed/pattern.rs
@@ -0,0 +1,480 @@
+use core::{cmp, fmt, mem, u16, usize};
+
+use alloc::{boxed::Box, string::String, vec, vec::Vec};
+
+use crate::{
+    packed::{api::MatchKind, ext::Pointer},
+    PatternID,
+};
+
+/// A non-empty collection of non-empty patterns to search for.
+///
+/// This collection of patterns is what is passed around to both execute
+/// searches and to construct the searchers themselves. Namely, this permits
+/// searches to avoid copying all of the patterns, and allows us to keep only
+/// one copy throughout all packed searchers.
+/// +/// Note that this collection is not a set. The same pattern can appear more +/// than once. +#[derive(Clone, Debug)] +pub(crate) struct Patterns { + /// The match semantics supported by this collection of patterns. + /// + /// The match semantics determines the order of the iterator over patterns. + /// For leftmost-first, patterns are provided in the same order as were + /// provided by the caller. For leftmost-longest, patterns are provided in + /// descending order of length, with ties broken by the order in which they + /// were provided by the caller. + kind: MatchKind, + /// The collection of patterns, indexed by their identifier. + by_id: Vec<Vec<u8>>, + /// The order of patterns defined for iteration, given by pattern + /// identifiers. The order of `by_id` and `order` is always the same for + /// leftmost-first semantics, but may be different for leftmost-longest + /// semantics. + order: Vec<PatternID>, + /// The length of the smallest pattern, in bytes. + minimum_len: usize, + /// The total number of pattern bytes across the entire collection. This + /// is used for reporting total heap usage in constant time. + total_pattern_bytes: usize, +} + +// BREADCRUMBS: I think we want to experiment with a different bucket +// representation. Basically, each bucket is just a Range<usize> to a single +// contiguous allocation? Maybe length-prefixed patterns or something? The +// idea is to try to get rid of the pointer chasing in verification. I don't +// know that that is the issue, but I suspect it is. + +impl Patterns { + /// Create a new collection of patterns for the given match semantics. The + /// ID of each pattern is the index of the pattern at which it occurs in + /// the `by_id` slice. + /// + /// If any of the patterns in the slice given are empty, then this panics. + /// Similarly, if the number of patterns given is zero, then this also + /// panics. 
+ pub(crate) fn new() -> Patterns { + Patterns { + kind: MatchKind::default(), + by_id: vec![], + order: vec![], + minimum_len: usize::MAX, + total_pattern_bytes: 0, + } + } + + /// Add a pattern to this collection. + /// + /// This panics if the pattern given is empty. + pub(crate) fn add(&mut self, bytes: &[u8]) { + assert!(!bytes.is_empty()); + assert!(self.by_id.len() <= u16::MAX as usize); + + let id = PatternID::new(self.by_id.len()).unwrap(); + self.order.push(id); + self.by_id.push(bytes.to_vec()); + self.minimum_len = cmp::min(self.minimum_len, bytes.len()); + self.total_pattern_bytes += bytes.len(); + } + + /// Set the match kind semantics for this collection of patterns. + /// + /// If the kind is not set, then the default is leftmost-first. + pub(crate) fn set_match_kind(&mut self, kind: MatchKind) { + self.kind = kind; + match self.kind { + MatchKind::LeftmostFirst => { + self.order.sort(); + } + MatchKind::LeftmostLongest => { + let (order, by_id) = (&mut self.order, &mut self.by_id); + order.sort_by(|&id1, &id2| { + by_id[id1].len().cmp(&by_id[id2].len()).reverse() + }); + } + } + } + + /// Return the number of patterns in this collection. + /// + /// This is guaranteed to be greater than zero. + pub(crate) fn len(&self) -> usize { + self.by_id.len() + } + + /// Returns true if and only if this collection of patterns is empty. + pub(crate) fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the approximate total amount of heap used by these patterns, in + /// units of bytes. + pub(crate) fn memory_usage(&self) -> usize { + self.order.len() * mem::size_of::<PatternID>() + + self.by_id.len() * mem::size_of::<Vec<u8>>() + + self.total_pattern_bytes + } + + /// Clears all heap memory associated with this collection of patterns and + /// resets all state such that it is a valid empty collection. 
+ pub(crate) fn reset(&mut self) { + self.kind = MatchKind::default(); + self.by_id.clear(); + self.order.clear(); + self.minimum_len = usize::MAX; + } + + /// Returns the length, in bytes, of the smallest pattern. + /// + /// This is guaranteed to be at least one. + pub(crate) fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the match semantics used by these patterns. + pub(crate) fn match_kind(&self) -> &MatchKind { + &self.kind + } + + /// Return the pattern with the given identifier. If such a pattern does + /// not exist, then this panics. + pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> { + Pattern(&self.by_id[id]) + } + + /// Return the pattern with the given identifier without performing bounds + /// checks. + /// + /// # Safety + /// + /// Callers must ensure that a pattern with the given identifier exists + /// before using this method. + pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> { + Pattern(self.by_id.get_unchecked(id.as_usize())) + } + + /// Return an iterator over all the patterns in this collection, in the + /// order in which they should be matched. + /// + /// Specifically, in a naive multi-pattern matcher, the following is + /// guaranteed to satisfy the match semantics of this collection of + /// patterns: + /// + /// ```ignore + /// for i in 0..haystack.len(): + /// for p in patterns.iter(): + /// if haystack[i..].starts_with(p.bytes()): + /// return Match(p.id(), i, i + p.bytes().len()) + /// ``` + /// + /// Namely, among the patterns in a collection, if they are matched in + /// the order provided by this iterator, then the result is guaranteed + /// to satisfy the correct match semantics. (Either leftmost-first or + /// leftmost-longest.) + pub(crate) fn iter(&self) -> PatternIter<'_> { + PatternIter { patterns: self, i: 0 } + } +} + +/// An iterator over the patterns in the `Patterns` collection. 
+/// +/// The order of the patterns provided by this iterator is consistent with the +/// match semantics of the originating collection of patterns. +/// +/// The lifetime `'p` corresponds to the lifetime of the collection of patterns +/// this is iterating over. +#[derive(Debug)] +pub(crate) struct PatternIter<'p> { + patterns: &'p Patterns, + i: usize, +} + +impl<'p> Iterator for PatternIter<'p> { + type Item = (PatternID, Pattern<'p>); + + fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> { + if self.i >= self.patterns.len() { + return None; + } + let id = self.patterns.order[self.i]; + let p = self.patterns.get(id); + self.i += 1; + Some((id, p)) + } +} + +/// A pattern that is used in packed searching. +#[derive(Clone)] +pub(crate) struct Pattern<'a>(&'a [u8]); + +impl<'a> fmt::Debug for Pattern<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Pattern") + .field("lit", &String::from_utf8_lossy(&self.0)) + .finish() + } +} + +impl<'p> Pattern<'p> { + /// Returns the length of this pattern, in bytes. + pub(crate) fn len(&self) -> usize { + self.0.len() + } + + /// Returns the bytes of this pattern. + pub(crate) fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Returns the first `len` low nybbles from this pattern. If this pattern + /// is shorter than `len`, then this panics. + pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> { + let mut nybs = vec![0; len].into_boxed_slice(); + for (i, byte) in self.bytes().iter().take(len).enumerate() { + nybs[i] = byte & 0xF; + } + nybs + } + + /// Returns true if this pattern is a prefix of the given bytes. + #[inline(always)] + pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool { + is_prefix(bytes, self.bytes()) + } + + /// Returns true if this pattern is a prefix of the haystack given by the + /// raw `start` and `end` pointers. 
+ /// + /// # Safety + /// + /// * It must be the case that `start < end` and that the distance between + /// them is at least equal to `V::BYTES`. That is, it must always be valid + /// to do at least an unaligned load of `V` at `start`. + /// * Both `start` and `end` must be valid for reads. + /// * Both `start` and `end` must point to an initialized value. + /// * Both `start` and `end` must point to the same allocated object and + /// must either be in bounds or at most one byte past the end of the + /// allocated object. + /// * Both `start` and `end` must be _derived from_ a pointer to the same + /// object. + /// * The distance between `start` and `end` must not overflow `isize`. + /// * The distance being in bounds must not rely on "wrapping around" the + /// address space. + #[inline(always)] + pub(crate) unsafe fn is_prefix_raw( + &self, + start: *const u8, + end: *const u8, + ) -> bool { + let patlen = self.bytes().len(); + let haylen = end.distance(start); + if patlen > haylen { + return false; + } + // SAFETY: We've checked that the haystack has length at least equal + // to this pattern. All other safety concerns are the responsibility + // of the caller. + is_equal_raw(start, self.bytes().as_ptr(), patlen) + } +} + +/// Returns true if and only if `needle` is a prefix of `haystack`. +/// +/// This uses a latency optimized variant of `memcmp` internally which *might* +/// make this faster for very short strings. +/// +/// # Inlining +/// +/// This routine is marked `inline(always)`. If you want to call this function +/// in a way that is not always inlined, you'll need to wrap a call to it in +/// another function that is marked as `inline(never)` or just `inline`. +#[inline(always)] +fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool { + if needle.len() > haystack.len() { + return false; + } + // SAFETY: Our pointers are derived directly from borrowed slices which + // uphold all of our safety guarantees except for length. 
We account for + // length with the check above. + unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) } +} + +/// Compare corresponding bytes in `x` and `y` for equality. +/// +/// That is, this returns true if and only if `x.len() == y.len()` and +/// `x[i] == y[i]` for all `0 <= i < x.len()`. +/// +/// Note that this isn't used. We only use it in tests as a convenient way +/// of testing `is_equal_raw`. +/// +/// # Inlining +/// +/// This routine is marked `inline(always)`. If you want to call this function +/// in a way that is not always inlined, you'll need to wrap a call to it in +/// another function that is marked as `inline(never)` or just `inline`. +/// +/// # Motivation +/// +/// Why not use slice equality instead? Well, slice equality usually results in +/// a call out to the current platform's `libc` which might not be inlineable +/// or have other overhead. This routine isn't guaranteed to be a win, but it +/// might be in some cases. +#[cfg(test)] +#[inline(always)] +fn is_equal(x: &[u8], y: &[u8]) -> bool { + if x.len() != y.len() { + return false; + } + // SAFETY: Our pointers are derived directly from borrowed slices which + // uphold all of our safety guarantees except for length. We account for + // length with the check above. + unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) } +} + +/// Compare `n` bytes at the given pointers for equality. +/// +/// This returns true if and only if `*x.add(i) == *y.add(i)` for all +/// `0 <= i < n`. +/// +/// # Inlining +/// +/// This routine is marked `inline(always)`. If you want to call this function +/// in a way that is not always inlined, you'll need to wrap a call to it in +/// another function that is marked as `inline(never)` or just `inline`. +/// +/// # Motivation +/// +/// Why not use slice equality instead? Well, slice equality usually results in +/// a call out to the current platform's `libc` which might not be inlineable +/// or have other overhead. 
This routine isn't guaranteed to be a win, but it +/// might be in some cases. +/// +/// # Safety +/// +/// * Both `x` and `y` must be valid for reads of up to `n` bytes. +/// * Both `x` and `y` must point to an initialized value. +/// * Both `x` and `y` must each point to an allocated object and +/// must either be in bounds or at most one byte past the end of the +/// allocated object. `x` and `y` do not need to point to the same allocated +/// object, but they may. +/// * Both `x` and `y` must be _derived from_ a pointer to their respective +/// allocated objects. +/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly +/// for `y` and `y+n`. +/// * The distance being in bounds must not rely on "wrapping around" the +/// address space. +#[inline(always)] +unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool { + // If we don't have enough bytes to do 4-byte at a time loads, then + // handle each possible length specially. Note that I used to have a + // byte-at-a-time loop here and that turned out to be quite a bit slower + // for the memmem/pathological/defeat-simple-vector-alphabet benchmark. + if n < 4 { + return match n { + 0 => true, + 1 => x.read() == y.read(), + 2 => { + x.cast::<u16>().read_unaligned() + == y.cast::<u16>().read_unaligned() + } + // I also tried copy_nonoverlapping here and it looks like the + // codegen is the same. + 3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(), + _ => unreachable!(), + }; + } + // When we have 4 or more bytes to compare, then proceed in chunks of 4 at + // a time using unaligned loads. + // + // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is + // that this particular version of memcmp is likely to be called with tiny + // needles. That means that if we do 8 byte loads, then a higher proportion + // of memcmp calls will use the slower variant above. With that said, this + // is a hypothesis and is only loosely supported by benchmarks. 
There's + // likely some improvement that could be made here. The main thing here + // though is to optimize for latency, not throughput. + + // SAFETY: The caller is responsible for ensuring the pointers we get are + // valid and readable for at least `n` bytes. We also do unaligned loads, + // so there's no need to ensure we're aligned. (This is justified by this + // routine being specifically for short strings.) + let xend = x.add(n.wrapping_sub(4)); + let yend = y.add(n.wrapping_sub(4)); + while x < xend { + let vx = x.cast::<u32>().read_unaligned(); + let vy = y.cast::<u32>().read_unaligned(); + if vx != vy { + return false; + } + x = x.add(4); + y = y.add(4); + } + let vx = xend.cast::<u32>().read_unaligned(); + let vy = yend.cast::<u32>().read_unaligned(); + vx == vy +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn equals_different_lengths() { + assert!(!is_equal(b"", b"a")); + assert!(!is_equal(b"a", b"")); + assert!(!is_equal(b"ab", b"a")); + assert!(!is_equal(b"a", b"ab")); + } + + #[test] + fn equals_mismatch() { + let one_mismatch = [ + (&b"a"[..], &b"x"[..]), + (&b"ab"[..], &b"ax"[..]), + (&b"abc"[..], &b"abx"[..]), + (&b"abcd"[..], &b"abcx"[..]), + (&b"abcde"[..], &b"abcdx"[..]), + (&b"abcdef"[..], &b"abcdex"[..]), + (&b"abcdefg"[..], &b"abcdefx"[..]), + (&b"abcdefgh"[..], &b"abcdefgx"[..]), + (&b"abcdefghi"[..], &b"abcdefghx"[..]), + (&b"abcdefghij"[..], &b"abcdefghix"[..]), + (&b"abcdefghijk"[..], &b"abcdefghijx"[..]), + (&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]), + (&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]), + (&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]), + ]; + for (x, y) in one_mismatch { + assert_eq!(x.len(), y.len(), "lengths should match"); + assert!(!is_equal(x, y)); + assert!(!is_equal(y, x)); + } + } + + #[test] + fn equals_yes() { + assert!(is_equal(b"", b"")); + assert!(is_equal(b"a", b"a")); + assert!(is_equal(b"ab", b"ab")); + assert!(is_equal(b"abc", b"abc")); + assert!(is_equal(b"abcd", b"abcd")); + 
assert!(is_equal(b"abcde", b"abcde"));
+        assert!(is_equal(b"abcdef", b"abcdef"));
+        assert!(is_equal(b"abcdefg", b"abcdefg"));
+        assert!(is_equal(b"abcdefgh", b"abcdefgh"));
+        assert!(is_equal(b"abcdefghi", b"abcdefghi"));
+    }
+
+    #[test]
+    fn prefix() {
+        assert!(is_prefix(b"", b""));
+        assert!(is_prefix(b"a", b""));
+        assert!(is_prefix(b"ab", b""));
+        assert!(is_prefix(b"foo", b"foo"));
+        assert!(is_prefix(b"foobar", b"foo"));
+
+        assert!(!is_prefix(b"foo", b"fob"));
+        assert!(!is_prefix(b"foobar", b"fob"));
+    }
+}
diff --git a/vendor/aho-corasick/src/packed/rabinkarp.rs b/vendor/aho-corasick/src/packed/rabinkarp.rs
new file mode 100644
index 0000000..fdd8a6f
--- /dev/null
+++ b/vendor/aho-corasick/src/packed/rabinkarp.rs
@@ -0,0 +1,168 @@
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
+
+/// The type of the rolling hash used in the Rabin-Karp algorithm.
+type Hash = usize;
+
+/// The number of buckets to store our patterns in. We don't want this to be
+/// too big in order to avoid wasting memory, but we don't want it to be too
+/// small either to avoid spending too much time confirming literals.
+///
+/// The number of buckets MUST be a power of two. Otherwise, determining the
+/// bucket from a hash will slow down the code considerably. Using a power
+/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
+/// instruction.
+const NUM_BUCKETS: usize = 64;
+
+/// An implementation of the Rabin-Karp algorithm. The main idea of this
+/// algorithm is to maintain a rolling hash as it moves through the input, and
+/// then check whether that hash corresponds to the same hash for any of the
+/// patterns we're looking for.
+///
+/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
+/// it requires all of the patterns to be the same length, which in turn
+/// corresponds to the number of bytes to hash. 
We adapt this to work for +/// multiple patterns of varying size by fixing the number of bytes to hash +/// to be the length of the smallest pattern. We also split the patterns into +/// several buckets to hopefully make the confirmation step faster. +/// +/// Wikipedia has a decent explanation, if a bit heavy on the theory: +/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm +/// +/// But ESMAJ provides something a bit more concrete: +/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html +#[derive(Clone, Debug)] +pub(crate) struct RabinKarp { + /// The patterns we're searching for. + patterns: Arc<Patterns>, + /// The order of patterns in each bucket is significant. Namely, they are + /// arranged such that the first one to match is the correct match. This + /// may not necessarily correspond to the order provided by the caller. + /// For example, if leftmost-longest semantics are used, then the patterns + /// are sorted by their length in descending order. If leftmost-first + /// semantics are used, then the patterns are sorted by their pattern ID + /// in ascending order (which corresponds to the caller's order). + buckets: Vec<Vec<(Hash, PatternID)>>, + /// The length of the hashing window. Generally, this corresponds to the + /// length of the smallest pattern. + hash_len: usize, + /// The factor to subtract out of a hash before updating it with a new + /// byte. + hash_2pow: usize, +} + +impl RabinKarp { + /// Compile a new Rabin-Karp matcher from the patterns given. + /// + /// This panics if any of the patterns in the collection are empty, or if + /// the collection is itself empty. 
+    pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
+        assert!(patterns.len() >= 1);
+        let hash_len = patterns.minimum_len();
+        assert!(hash_len >= 1);
+
+        let mut hash_2pow = 1usize;
+        for _ in 1..hash_len {
+            hash_2pow = hash_2pow.wrapping_shl(1);
+        }
+
+        let mut rk = RabinKarp {
+            patterns: Arc::clone(patterns),
+            buckets: vec![vec![]; NUM_BUCKETS],
+            hash_len,
+            hash_2pow,
+        };
+        for (id, pat) in patterns.iter() {
+            let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
+            let bucket = hash % NUM_BUCKETS;
+            rk.buckets[bucket].push((hash, id));
+        }
+        rk
+    }
+
+    /// Return the first matching pattern in the given haystack, beginning the
+    /// search at `at`.
+    pub(crate) fn find_at(
+        &self,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        assert_eq!(NUM_BUCKETS, self.buckets.len());
+
+        if at + self.hash_len > haystack.len() {
+            return None;
+        }
+        let mut hash = self.hash(&haystack[at..at + self.hash_len]);
+        loop {
+            let bucket = &self.buckets[hash % NUM_BUCKETS];
+            for &(phash, pid) in bucket {
+                if phash == hash {
+                    if let Some(c) = self.verify(pid, haystack, at) {
+                        return Some(c);
+                    }
+                }
+            }
+            if at + self.hash_len >= haystack.len() {
+                return None;
+            }
+            hash = self.update_hash(
+                hash,
+                haystack[at],
+                haystack[at + self.hash_len],
+            );
+            at += 1;
+        }
+    }
+
+    /// Returns the approximate total amount of heap used by this searcher, in
+    /// units of bytes.
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
+            + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
+    }
+
+    /// Verify whether the pattern with the given id matches at
+    /// `haystack[at..]`.
+    ///
+    /// We tag this function as `cold` because it helps improve codegen.
+    /// Intuitively, it would seem like inlining it would be better. 
However,
+    /// the only time this is called and a match is not found is when there
+    /// is a hash collision, or when a prefix of a pattern matches but
+    /// the entire pattern doesn't match. This is hopefully fairly rare, and
+    /// if it does occur a lot, it's going to be slow no matter what we do.
+    #[cold]
+    fn verify(
+        &self,
+        id: PatternID,
+        haystack: &[u8],
+        at: usize,
+    ) -> Option<Match> {
+        let pat = self.patterns.get(id);
+        if pat.is_prefix(&haystack[at..]) {
+            Some(Match::new(id, at..at + pat.len()))
+        } else {
+            None
+        }
+    }
+
+    /// Hash the given bytes.
+    fn hash(&self, bytes: &[u8]) -> Hash {
+        assert_eq!(self.hash_len, bytes.len());
+
+        let mut hash = 0usize;
+        for &b in bytes {
+            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
+        }
+        hash
+    }
+
+    /// Update the hash given based on removing `old_byte` at the beginning
+    /// of some byte string, and appending `new_byte` to the end of that same
+    /// byte string.
+    fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
+        prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
+            .wrapping_shl(1)
+            .wrapping_add(new_byte as usize)
+    }
+}
diff --git a/vendor/aho-corasick/src/packed/teddy/README.md b/vendor/aho-corasick/src/packed/teddy/README.md
new file mode 100644
index 0000000..f0928cb
--- /dev/null
+++ b/vendor/aho-corasick/src/packed/teddy/README.md
@@ -0,0 +1,386 @@
+Teddy is a SIMD accelerated multiple substring matching algorithm. The name
+and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
+project. The implementation in this repository was mostly motivated for use in
+accelerating regex searches by searching for small sets of required literals
+extracted from the regex.
+
+
+# Background
+
+The key idea of Teddy is to do *packed* substring matching. In the literature,
+packed substring matching is the idea of examining multiple bytes in a haystack
+at a time to detect matches. 
Implementations of, for example, memchr (which +detects matches of a single byte) have been doing this for years. Only +recently, with the introduction of various SIMD instructions, has this been +extended to substring matching. The PCMPESTRI instruction (and its relatives), +for example, implements substring matching in hardware. It is, however, limited +to substrings of length 16 bytes or fewer, but this restriction is fine in a +regex engine, since we rarely care about the performance difference between +searching for a 16 byte literal and a 16 + N literal; 16 is already long +enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs +at least, is its latency and throughput. As a result, it is often faster to +do substring search with a Boyer-Moore (or Two-Way) variant and a well placed +memchr to quickly skip through the haystack. + +There are fewer results from the literature on packed substring matching, +and even fewer for packed multiple substring matching. Ben-Kiki et al. [2] +describes use of PCMPESTRI for substring matching, but is mostly theoretical +and hand-waves performance. There is other theoretical work done by Bille [3] +as well. + +The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci +and is generally focused on multiple pattern search. Their first paper [4a] +introduces the concept of a fingerprint, which is computed for every block of +N bytes in every pattern. The haystack is then scanned N bytes at a time and +a fingerprint is computed in the same way it was computed for blocks in the +patterns. If the fingerprint corresponds to one that was found in a pattern, +then a verification step follows to confirm that one of the substrings with the +corresponding fingerprint actually matches at the current location. Various +implementation tricks are employed to make sure the fingerprint lookup is fast; +typically by truncating the fingerprint. 
(This may, of course, provoke more +steps in the verification process, so a balance must be struck.) + +The main downside of [4a] is that the minimum substring length is 32 bytes, +presumably because of how the algorithm uses certain SIMD instructions. This +essentially makes it useless for general purpose regex matching, where a small +number of short patterns is far more likely. + +Faro and Kulekci published another paper [4b] that is conceptually very similar +to [4a]. The key difference is that it uses the CRC32 instruction (introduced +as part of SSE 4.2) to compute fingerprint values. This also enables the +algorithm to work effectively on substrings as short as 7 bytes with 4 byte +windows. 7 bytes is unfortunately still too long. The window could be +technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the +small window size ends up negating most performance benefits—and it's likely +the common case in a general purpose regex engine. + +Faro and Kulekci also published [4c] that appears to be intended as a +replacement to using PCMPESTRI. In particular, it is specifically motivated by +the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD +instructions that are faster. While this approach works for short substrings, +I personally couldn't see a way to generalize it to multiple substring search. + +Faro and Kulekci have another paper [4d] that I haven't been able to read +because it is behind a paywall. + + +# Teddy + +Finally, we get to Teddy. If the above literature review is complete, then it +appears that Teddy is a novel algorithm. More than that, in my experience, it +completely blows away the competition for short substrings, which is exactly +what we want in a general purpose regex engine. Again, the algorithm appears +to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced +late 2015, and no earlier history could be found. 
Therefore, tracking the exact +provenance of the algorithm with respect to the published literature seems +difficult. + +At a high level, Teddy works somewhat similarly to the fingerprint algorithms +published by Faro and Kulekci, but Teddy does it in a way that scales a bit +better. Namely: + +1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX) + byte chunks. 16 (or 32) is significant because it corresponds to the number + of bytes in a SIMD vector. +2. Bitwise operations are performed on each chunk to discover if any region of + it matches a set of precomputed fingerprints from the patterns. If there are + matches, then a verification step is performed. In this implementation, our + verification step is naive. This can be improved upon. + +The details to make this work are quite clever. First, we must choose how to +pick our fingerprints. In Hyperscan's implementation, I *believe* they use the +last N bytes of each substring, where N must be at least the minimum length of +any substring in the set being searched. In this implementation, we use the +first N bytes of each substring. (The tradeoffs between these choices aren't +yet clear to me.) We then must figure out how to quickly test whether an +occurrence of any fingerprint from the set of patterns appears in a 16 byte +block from the haystack. To keep things simple, let's assume N = 1 and examine +some examples to motivate the approach. Here are our patterns: + +```ignore +foo +bar +baz +``` + +The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set +our 16 byte block to: + +```ignore +bat cat foo bump +xxxxxxxxxxxxxxxx +``` + +To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates +a mask that allows us to quickly compute membership of a fingerprint in a 16 +byte block that also tells which pattern the fingerprint corresponds to. 
In +this case, our fingerprint is a single byte, so an appropriate abstraction is +a map from a single byte to a list of patterns that contain that fingerprint: + +```ignore +f |--> foo +b |--> bar, baz +``` + +Now, all we need to do is figure out how to represent this map in vector space +and use normal SIMD operations to perform a lookup. The first simplification +we can make is to represent our patterns as bit fields occupying a single +byte. This is important, because a single SIMD vector can store 16 bytes. + +```ignore +f |--> 00000001 +b |--> 00000010, 00000100 +``` + +How do we perform lookup though? It turns out that SSSE3 introduced a very cool +instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, +and returns a third vector `C`. All vectors are treated as 16 8-bit integers. +`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true +for the purposes of this algorithm. For full details, see [Intel's Intrinsics +Guide][5_u].) This essentially lets us use the values in `B` to lookup values +in `A`. + +If we could somehow cause `B` to contain our 16 byte block from the haystack, +and if `A` could contain our bitmasks, then we'd end up with something like +this for `A`: + +```ignore + 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF +A = 0 0 00000110 00000001 0 +``` + +And if `B` contains our window from our haystack, we could use shuffle to take +the values from `B` and use them to look up our bitsets in `A`. But of course, +we can't do this because `A` in the above example contains 256 bytes, which +is much larger than the size of a SIMD vector. + +Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of +our bitsets, we can use two masks, where one mask corresponds to the lower four +bits of our fingerprint and the other mask corresponds to the upper four bits. 
+So our map now looks like: + +```ignore +'f' & 0xF = 0x6 |--> 00000001 +'f' >> 4 = 0x6 |--> 00000111 +'b' & 0xF = 0x2 |--> 00000110 +'b' >> 4 = 0x6 |--> 00000111 +``` + +Notice that the bitsets for each nybble correspond to the union of all +fingerprints that contain that nybble. For example, both `f` and `b` have the +same upper 4 bits but differ on the lower 4 bits. Putting this together, we +have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is +our mask for the upper nybble and `B` is our 16 byte block from the haystack: + +```ignore + 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF +A0 = 0 0 00000110 0 00000001 0 +A1 = 0 0 0 0 00000111 0 +B = b a t _ t p +B = 0x62 0x61 0x74 0x20 0x74 0x70 +``` + +But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, +and we need indexes that are at most 4 bits (corresponding to one of 16 +values). We can apply the same transformation to split `B` into lower and upper +nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and +`B1` corresponds to the upper nybbles: + +```ignore + b a t _ c a t _ f o o _ b u m p +B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 +B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 +``` + +And now we have a nice correspondence. `B0` can index `A0` and `B1` can index +`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: + +```ignore + b a ... f o ... p + A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] +C0 = 00000110 0 00000001 0 0 +``` + +And `C1 = PSHUFB(A1, B1)`: + +```ignore + b a ... f o ... p + A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] +C1 = 00000111 00000111 00000111 00000111 0 +``` + +Notice how neither one of `C0` or `C1` is guaranteed to report fully correct +results all on its own. For example, `C1` claims that `b` is a fingerprint for +the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint +for all of our patterns. 
But if we combined `C0` and `C1` with an `AND` +operation: + +```ignore + b a ... f o ... p +C = 00000110 0 00000001 0 0 +``` + +Then we now have that `C[i]` contains a bitset corresponding to the matching +fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that +block. + +Once we have that, we can look for the position of the least significant bit +in `C`. (Least significant because we only target little endian here. Thus, +the least significant bytes correspond to bytes in our haystack at a lower +address.) That position, modulo `8`, gives us the pattern that the fingerprint +matches. That position, integer divided by `8`, also gives us the byte offset +that the fingerprint occurs in inside the 16 byte haystack block. Using those +two pieces of information, we can run a verification procedure that tries +to match all substrings containing that fingerprint at that position in the +haystack. + + +# Implementation notes + +The problem with the algorithm as described above is that it uses a single byte +for a fingerprint. This will work well if the fingerprints are rare in the +haystack (e.g., capital letters or special characters in normal English text), +but if the fingerprints are common, you'll wind up spending too much time in +the verification step, which effectively negates the performance benefits of +scanning 16 bytes at a time. Remember, the key to the performance of this +algorithm is to do as little work as possible per 16 (or 32) bytes. + +This algorithm can be extrapolated in a relatively straight-forward way to use +larger fingerprints. That is, instead of a single byte prefix, we might use a +two or three byte prefix. The implementation here implements N = {1, 2, 3} +and always picks the largest N possible. The rationale is that the bigger the +fingerprint, the fewer verification steps we'll do. Of course, if N is too +large, then we'll end up doing too much on each step. + +The way to extend it is: + +1. 
Add a mask for each byte in the fingerprint. (Remember that each mask is
+   composed of two SIMD vectors.) This results in a value of `C` for each byte
+   in the fingerprint while searching.
+2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
+   so that they are aligned. Once aligned, they should all be `AND`'d together.
+   This will give you only the bitsets corresponding to the full match of the
+   fingerprint. To do this, one needs to save the last byte (for N=2) or last
+   two bytes (for N=3) from the previous iteration, and then line them up with
+   the first one or two bytes of the next iteration.
+
+## Verification
+
+Verification generally follows the procedure outlined above. The tricky parts
+are in the right formulation of operations to get our bits out of our vectors.
+We have a limited set of operations available to us on SIMD vectors as 128-bit
+or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
+from our vectors, and then run our verification step on each of those. The
+verification step looks at the least significant bit set, and from its
+position, we can derive the byte offset and bucket. (Again, as described
+above.) Once we know the bucket, we do a fairly naive exhaustive search for
+every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
+table, but I haven't had time to thoroughly explore that. A few initial
+half-hearted attempts resulted in worse performance.)
+
+## AVX
+
+The AVX version of Teddy extrapolates almost perfectly from the SSE version.
+The only hiccup is that PALIGNR is used to align chunks in the 16-bit version,
+and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it
+only works within 128-bit lanes. So there's a bit of tomfoolery to get around
+this by shuffling the vectors before calling VPALIGNR. 
+
+The only other aspect to AVX is that since our masks are still fundamentally
+16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to
+32-byte chunks.
+
+## Fat Teddy
+
+In the version of Teddy described above, 8 buckets are used to group patterns
+that we want to search for. However, when AVX is available, we can extend the
+number of buckets to 16 by permitting each byte in our masks to use 16-bits
+instead of 8-bits to represent the buckets it belongs to. (This variant is also
+in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
+time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
+What we gain, though, is (hopefully) less work in our verification routine.
+If patterns are more spread out across more buckets, then there should overall
+be fewer false positives. In general, Fat Teddy permits us to grow our capacity
+a bit and search for more literals before Teddy gets overwhelmed.
+
+The tricky part of Fat Teddy is in how we adjust our masks and our verification
+procedure. For the masks, we simply represent the first 8 buckets in each of
+the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
+Then, in the search loop, instead of loading 32 bytes from the haystack, we
+load the same 16 bytes from the haystack into both the low and high 16 byte
+portions of our 256-bit vector. So for example, a mask might look like this:
+
+    bits:   00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
+    byte:         31       30           16       15       14            0
+    offset:       15       14            0       15       14            0
+    buckets:    8-15     8-15         8-15      0-7      0-7          0-7
+
+Where `byte` is the position in the vector (higher numbers corresponding to
+more significant bits), `offset` is the corresponding position in the haystack
+chunk, and `buckets` corresponds to the bucket assignments for that particular
+byte.
+
+In particular, notice that the bucket assignments for offset `0` are spread
+out between bytes `0` and `16`. 
This works well for the chunk-by-chunk search +procedure, but verification really wants to process all bucket assignments for +each offset at once. Otherwise, we might wind up finding a match at offset +`1` in one the first 8 buckets, when we really should have reported a match +at offset `0` in one of the second 8 buckets. (Because we want the leftmost +match.) + +Thus, for verification, we rearrange the above vector such that it is a +sequence of 16-bit integers, where the least significant 16-bit integer +corresponds to all of the bucket assignments for offset `0`. So with the +above vector, the least significant 16-bit integer would be + + 11000000 000000 + +which was taken from bytes `16` and `0`. Then the verification step pretty much +runs as described, except with 16 buckets instead of 8. + + +# References + +- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan), + [webpage](https://www.hyperscan.io/) +- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R., + & Weimann, O. (2011). + _Optimal packed string matching_. + In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13). + Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik. + DOI: 10.4230/LIPIcs.FSTTCS.2011.423. + [PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf). +- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R., + & Weimann, O. (2014). + _Towards optimal packed string matching_. + Theoretical Computer Science, 525, 111-129. + DOI: 10.1016/j.tcs.2013.06.013. + [PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf). +- **[3]** Bille, P. (2011). + _Fast searching in packed strings_. + Journal of Discrete Algorithms, 9(1), 49-56. + DOI: 10.1016/j.jda.2010.09.003. + [PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353). +- **[4a]** Faro, S., & Külekci, M. O. (2012, October). + _Fast multiple string matching using streaming SIMD extensions technology_. 
+ In String Processing and Information Retrieval (pp. 217-228). + Springer Berlin Heidelberg. + DOI: 10.1007/978-3-642-34109-0_23. + [PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf). +- **[4b]** Faro, S., & Külekci, M. O. (2013, September). + _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_. + In Stringology (pp. 78-91). + [PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf). +- **[4c]** Faro, S., & Külekci, M. O. (2013, January). + _Fast packed string matching for short patterns_. + In Proceedings of the Meeting on Algorithm Engineering & Expermiments + (pp. 113-121). + Society for Industrial and Applied Mathematics. + [PDF](https://arxiv.org/pdf/1209.6449.pdf). +- **[4d]** Faro, S., & Külekci, M. O. (2014). + _Fast and flexible packed string matching_. + Journal of Discrete Algorithms, 28, 61-72. + DOI: 10.1016/j.jda.2014.07.003. + +[1_u]: https://github.com/intel/hyperscan +[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide diff --git a/vendor/aho-corasick/src/packed/teddy/builder.rs b/vendor/aho-corasick/src/packed/teddy/builder.rs new file mode 100644 index 0000000..be91777 --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/builder.rs @@ -0,0 +1,780 @@ +use core::{ + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +use alloc::sync::Arc; + +use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match}; + +/// A builder for constructing a Teddy matcher. +/// +/// The builder primarily permits fine grained configuration of the Teddy +/// matcher. Most options are made only available for testing/benchmarking +/// purposes. In reality, options are automatically determined by the nature +/// and number of patterns given to the builder. +#[derive(Clone, Debug)] +pub(crate) struct Builder { + /// When none, this is automatically determined. Otherwise, `false` means + /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used + /// (16 buckets). 
Fat Teddy requires AVX2, so if that CPU feature isn't + /// available and Fat Teddy was requested, no matcher will be built. + only_fat: Option<bool>, + /// When none, this is automatically determined. Otherwise, `false` means + /// that 128-bit vectors will be used (up to SSSE3 instructions) where as + /// `true` means that 256-bit vectors will be used. As with `fat`, if + /// 256-bit vectors are requested and they aren't available, then a + /// searcher will not be built. + only_256bit: Option<bool>, + /// When true (the default), the number of patterns will be used as a + /// heuristic for refusing construction of a Teddy searcher. The point here + /// is that too many patterns can overwhelm Teddy. But this can be disabled + /// in cases where the caller knows better. + heuristic_pattern_limits: bool, +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +impl Builder { + /// Create a new builder for configuring a Teddy matcher. + pub(crate) fn new() -> Builder { + Builder { + only_fat: None, + only_256bit: None, + heuristic_pattern_limits: true, + } + } + + /// Build a matcher for the set of patterns given. If a matcher could not + /// be built, then `None` is returned. + /// + /// Generally, a matcher isn't built if the necessary CPU features aren't + /// available, an unsupported target or if the searcher is believed to be + /// slower than standard techniques (i.e., if there are too many literals). + pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> { + self.build_imp(patterns) + } + + /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses + /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful + /// for a larger set of literals. + /// + /// `None` is the default, which results in an automatic selection based + /// on the number of literals and available CPU features. 
+ pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder { + self.only_fat = yes; + self + } + + /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). + /// Generally, a larger vector size is better since it either permits + /// matching more patterns or matching more bytes in the haystack at once. + /// + /// `None` is the default, which results in an automatic selection based on + /// the number of literals and available CPU features. + pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder { + self.only_256bit = yes; + self + } + + /// Request that heuristic limitations on the number of patterns be + /// employed. This useful to disable for benchmarking where one wants to + /// explore how Teddy performs on large number of patterns even if the + /// heuristics would otherwise refuse construction. + /// + /// This is enabled by default. + pub(crate) fn heuristic_pattern_limits( + &mut self, + yes: bool, + ) -> &mut Builder { + self.heuristic_pattern_limits = yes; + self + } + + fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> { + let patlimit = self.heuristic_pattern_limits; + // There's no particular reason why we limit ourselves to little endian + // here, but it seems likely that some parts of Teddy as they are + // currently written (e.g., the uses of `trailing_zeros`) are likely + // wrong on non-little-endian targets. Such things are likely easy to + // fix, but at the time of writing (2023/09/18), I actually do not know + // how to test this code on a big-endian target. So for now, we're + // conservative and just bail out. + if !cfg!(target_endian = "little") { + debug!("skipping Teddy because target isn't little endian"); + return None; + } + // Too many patterns will overwhelm Teddy and likely lead to slow + // downs, typically in the verification step. 
+ if patlimit && patterns.len() > 64 { + debug!("skipping Teddy because of too many patterns"); + return None; + } + + #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] + { + use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3}; + + let mask_len = core::cmp::min(4, patterns.minimum_len()); + let beefy = patterns.len() > 32; + let has_avx2 = self::x86_64::is_available_avx2(); + let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3(); + let use_avx2 = if self.only_256bit == Some(true) { + if !has_avx2 { + debug!( + "skipping Teddy because avx2 was demanded but unavailable" + ); + return None; + } + true + } else if self.only_256bit == Some(false) { + if !has_ssse3 { + debug!( + "skipping Teddy because ssse3 was demanded but unavailable" + ); + return None; + } + false + } else if !has_ssse3 && !has_avx2 { + debug!( + "skipping Teddy because ssse3 and avx2 are unavailable" + ); + return None; + } else { + has_avx2 + }; + let fat = match self.only_fat { + None => use_avx2 && beefy, + Some(false) => false, + Some(true) if !use_avx2 => { + debug!( + "skipping Teddy because fat was demanded, but fat \ + Teddy requires avx2 which is unavailable" + ); + return None; + } + Some(true) => true, + }; + // Just like for aarch64, it's possible that too many patterns will + // overhwelm Teddy. Unlike aarch64 though, we have Fat teddy which + // helps things scale a bit more by spreading patterns over more + // buckets. + // + // These thresholds were determined by looking at the measurements + // for the rust/aho-corasick/packed/leftmost-first and + // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` + // benchmarks. 
+ if patlimit && mask_len == 1 && patterns.len() > 16 { + debug!( + "skipping Teddy (mask len: 1) because there are \ + too many patterns", + ); + return None; + } + match (mask_len, use_avx2, fat) { + (1, false, _) => { + debug!("Teddy choice: 128-bit slim, 1 byte"); + SlimSSSE3::<1>::new(&patterns) + } + (1, true, false) => { + debug!("Teddy choice: 256-bit slim, 1 byte"); + SlimAVX2::<1>::new(&patterns) + } + (1, true, true) => { + debug!("Teddy choice: 256-bit fat, 1 byte"); + FatAVX2::<1>::new(&patterns) + } + (2, false, _) => { + debug!("Teddy choice: 128-bit slim, 2 bytes"); + SlimSSSE3::<2>::new(&patterns) + } + (2, true, false) => { + debug!("Teddy choice: 256-bit slim, 2 bytes"); + SlimAVX2::<2>::new(&patterns) + } + (2, true, true) => { + debug!("Teddy choice: 256-bit fat, 2 bytes"); + FatAVX2::<2>::new(&patterns) + } + (3, false, _) => { + debug!("Teddy choice: 128-bit slim, 3 bytes"); + SlimSSSE3::<3>::new(&patterns) + } + (3, true, false) => { + debug!("Teddy choice: 256-bit slim, 3 bytes"); + SlimAVX2::<3>::new(&patterns) + } + (3, true, true) => { + debug!("Teddy choice: 256-bit fat, 3 bytes"); + FatAVX2::<3>::new(&patterns) + } + (4, false, _) => { + debug!("Teddy choice: 128-bit slim, 4 bytes"); + SlimSSSE3::<4>::new(&patterns) + } + (4, true, false) => { + debug!("Teddy choice: 256-bit slim, 4 bytes"); + SlimAVX2::<4>::new(&patterns) + } + (4, true, true) => { + debug!("Teddy choice: 256-bit fat, 4 bytes"); + FatAVX2::<4>::new(&patterns) + } + _ => { + debug!("no supported Teddy configuration found"); + None + } + } + } + #[cfg(target_arch = "aarch64")] + { + use self::aarch64::SlimNeon; + + let mask_len = core::cmp::min(4, patterns.minimum_len()); + if self.only_256bit == Some(true) { + debug!( + "skipping Teddy because 256-bits were demanded \ + but unavailable" + ); + return None; + } + if self.only_fat == Some(true) { + debug!( + "skipping Teddy because fat was demanded but unavailable" + ); + } + // Since we don't have Fat teddy in aarch64 
(I think we'd want at + // least 256-bit vectors for that), we need to be careful not to + // allow too many patterns as it might overwhelm Teddy. Generally + // speaking, as the mask length goes up, the more patterns we can + // handle because the mask length results in fewer candidates + // generated. + // + // These thresholds were determined by looking at the measurements + // for the rust/aho-corasick/packed/leftmost-first and + // rust/aho-corasick/dfa/leftmost-first engines on the `teddy/` + // benchmarks. + match mask_len { + 1 => { + if patlimit && patterns.len() > 16 { + debug!( + "skipping Teddy (mask len: 1) because there are \ + too many patterns", + ); + } + debug!("Teddy choice: 128-bit slim, 1 byte"); + SlimNeon::<1>::new(&patterns) + } + 2 => { + if patlimit && patterns.len() > 32 { + debug!( + "skipping Teddy (mask len: 2) because there are \ + too many patterns", + ); + } + debug!("Teddy choice: 128-bit slim, 2 bytes"); + SlimNeon::<2>::new(&patterns) + } + 3 => { + if patlimit && patterns.len() > 48 { + debug!( + "skipping Teddy (mask len: 3) because there are \ + too many patterns", + ); + } + debug!("Teddy choice: 128-bit slim, 3 bytes"); + SlimNeon::<3>::new(&patterns) + } + 4 => { + debug!("Teddy choice: 128-bit slim, 4 bytes"); + SlimNeon::<4>::new(&patterns) + } + _ => { + debug!("no supported Teddy configuration found"); + None + } + } + } + #[cfg(not(any( + all(target_arch = "x86_64", target_feature = "sse2"), + target_arch = "aarch64" + )))] + { + None + } + } +} + +/// A searcher that dispatches to one of several possible Teddy variants. +#[derive(Clone, Debug)] +pub(crate) struct Searcher { + /// The Teddy variant we use. We use dynamic dispatch under the theory that + /// it results in better codegen then a enum, although this is a specious + /// claim. + /// + /// This `Searcher` is essentially a wrapper for a `SearcherT` trait + /// object. 
We just make `memory_usage` and `minimum_len` available without + /// going through dynamic dispatch. + imp: Arc<dyn SearcherT>, + /// Total heap memory used by the Teddy variant. + memory_usage: usize, + /// The minimum haystack length this searcher can handle. It is intended + /// for callers to use some other search routine (such as Rabin-Karp) in + /// cases where the haystack (or remainer of the haystack) is too short. + minimum_len: usize, +} + +impl Searcher { + /// Look for the leftmost occurrence of any pattern in this search in the + /// given haystack starting at the given position. + /// + /// # Panics + /// + /// This panics when `haystack[at..].len()` is less than the minimum length + /// for this haystack. + #[inline(always)] + pub(crate) fn find( + &self, + haystack: &[u8], + at: usize, + ) -> Option<crate::Match> { + // SAFETY: The Teddy implementations all require a minimum haystack + // length, and this is required for safety. Therefore, we assert it + // here in order to make this method sound. + assert!(haystack[at..].len() >= self.minimum_len); + let hayptr = haystack.as_ptr(); + // SAFETY: Construction of the searcher guarantees that we are able + // to run it in the current environment (i.e., we won't get an AVX2 + // searcher on a x86-64 CPU without AVX2 support). Also, the pointers + // are valid as they are derived directly from a borrowed slice. + let teddym = unsafe { + self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))? + }; + let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize()); + let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize()); + let span = crate::Span { start, end }; + // OK because we won't permit the construction of a searcher that + // could report a pattern ID bigger than what can fit in the crate-wide + // PatternID type. 
+ let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize()); + let m = crate::Match::new(pid, span); + Some(m) + } + + /// Returns the approximate total amount of heap used by this type, in + /// units of bytes. + #[inline(always)] + pub(crate) fn memory_usage(&self) -> usize { + self.memory_usage + } + + /// Returns the minimum length, in bytes, that a haystack must be in order + /// to use it with this searcher. + #[inline(always)] + pub(crate) fn minimum_len(&self) -> usize { + self.minimum_len + } +} + +/// A trait that provides dynamic dispatch over the different possible Teddy +/// variants on the same algorithm. +/// +/// On `x86_64` for example, it isn't known until runtime which of 12 possible +/// variants will be used. One might use one of the four slim 128-bit vector +/// variants, or one of the four 256-bit vector variants or even one of the +/// four fat 256-bit vector variants. +/// +/// Since this choice is generally made when the Teddy searcher is constructed +/// and this choice is based on the patterns given and what the current CPU +/// supports, it follows that there must be some kind of indirection at search +/// time that "selects" the variant chosen at build time. +/// +/// There are a few different ways to go about this. One approach is to use an +/// enum. It works fine, but in my experiments, this generally results in worse +/// codegen. Another approach, which is what we use here, is dynamic dispatch +/// via a trait object. We basically implement this trait for each possible +/// variant, select the variant we want at build time and convert it to a +/// trait object for use at search time. +/// +/// Another approach is to use function pointers and stick each of the possible +/// variants into a union. This is essentially isomorphic to the dynamic +/// dispatch approach, but doesn't require any allocations. Since this crate +/// requires `alloc`, there's no real reason (AFAIK) to go down this path. 
(The +/// `memchr` crate does this.) +trait SearcherT: + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static +{ + /// Execute a search on the given haystack (identified by `start` and `end` + /// raw pointers). + /// + /// # Safety + /// + /// Essentially, the `start` and `end` pointers must be valid and point + /// to a haystack one can read. As long as you derive them from, for + /// example, a `&[u8]`, they should automatically satisfy all of the safety + /// obligations: + /// + /// * Both `start` and `end` must be valid for reads. + /// * Both `start` and `end` must point to an initialized value. + /// * Both `start` and `end` must point to the same allocated object and + /// must either be in bounds or at most one byte past the end of the + /// allocated object. + /// * Both `start` and `end` must be _derived from_ a pointer to the same + /// object. + /// * The distance between `start` and `end` must not overflow `isize`. + /// * The distance being in bounds must not rely on "wrapping around" the + /// address space. + /// * It must be the case that `start <= end`. + /// * `end - start` must be greater than the minimum length for this + /// searcher. + /// + /// Also, it is expected that implementations of this trait will tag this + /// method with a `target_feature` attribute. Callers must ensure that + /// they are executing this method in an environment where that attribute + /// is valid. + unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>; +} + +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +mod x86_64 { + use core::arch::x86_64::{__m128i, __m256i}; + + use alloc::sync::Arc; + + use crate::packed::{ + ext::Pointer, + pattern::Patterns, + teddy::generic::{self, Match}, + }; + + use super::{Searcher, SearcherT}; + + #[derive(Clone, Debug)] + pub(super) struct SlimSSSE3<const BYTES: usize> { + slim128: generic::Slim<__m128i, BYTES>, + } + + // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. 
+ macro_rules! slim_ssse3 { + ($len:expr) => { + impl SlimSSSE3<$len> { + /// Creates a new searcher using "slim" Teddy with 128-bit + /// vectors. If SSSE3 is not available in the current + /// environment, then this returns `None`. + pub(super) fn new( + patterns: &Arc<Patterns>, + ) -> Option<Searcher> { + if !is_available_ssse3() { + return None; + } + Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) }) + } + + /// Creates a new searcher using "slim" Teddy with 256-bit + /// vectors without checking whether SSSE3 is available or not. + /// + /// # Safety + /// + /// Callers must ensure that SSSE3 is available in the current + /// environment. + #[target_feature(enable = "ssse3")] + unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { + let slim128 = generic::Slim::<__m128i, $len>::new( + Arc::clone(patterns), + ); + let memory_usage = slim128.memory_usage(); + let minimum_len = slim128.minimum_len(); + let imp = Arc::new(SlimSSSE3 { slim128 }); + Searcher { imp, memory_usage, minimum_len } + } + } + + impl SearcherT for SlimSSSE3<$len> { + #[target_feature(enable = "ssse3")] + #[inline] + unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + // SAFETY: All obligations except for `target_feature` are + // passed to the caller. Our use of `target_feature` is + // safe because construction of this type requires that the + // requisite target features are available. + self.slim128.find(start, end) + } + } + }; + } + + slim_ssse3!(1); + slim_ssse3!(2); + slim_ssse3!(3); + slim_ssse3!(4); + + #[derive(Clone, Debug)] + pub(super) struct SlimAVX2<const BYTES: usize> { + slim128: generic::Slim<__m128i, BYTES>, + slim256: generic::Slim<__m256i, BYTES>, + } + + // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. + macro_rules! slim_avx2 { + ($len:expr) => { + impl SlimAVX2<$len> { + /// Creates a new searcher using "slim" Teddy with 256-bit + /// vectors. 
If AVX2 is not available in the current + /// environment, then this returns `None`. + pub(super) fn new( + patterns: &Arc<Patterns>, + ) -> Option<Searcher> { + if !is_available_avx2() { + return None; + } + Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) }) + } + + /// Creates a new searcher using "slim" Teddy with 256-bit + /// vectors without checking whether AVX2 is available or not. + /// + /// # Safety + /// + /// Callers must ensure that AVX2 is available in the current + /// environment. + #[target_feature(enable = "avx2")] + unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { + let slim128 = generic::Slim::<__m128i, $len>::new( + Arc::clone(&patterns), + ); + let slim256 = generic::Slim::<__m256i, $len>::new( + Arc::clone(&patterns), + ); + let memory_usage = + slim128.memory_usage() + slim256.memory_usage(); + let minimum_len = slim128.minimum_len(); + let imp = Arc::new(SlimAVX2 { slim128, slim256 }); + Searcher { imp, memory_usage, minimum_len } + } + } + + impl SearcherT for SlimAVX2<$len> { + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + // SAFETY: All obligations except for `target_feature` are + // passed to the caller. Our use of `target_feature` is + // safe because construction of this type requires that the + // requisite target features are available. + let len = end.distance(start); + if len < self.slim256.minimum_len() { + self.slim128.find(start, end) + } else { + self.slim256.find(start, end) + } + } + } + }; + } + + slim_avx2!(1); + slim_avx2!(2); + slim_avx2!(3); + slim_avx2!(4); + + #[derive(Clone, Debug)] + pub(super) struct FatAVX2<const BYTES: usize> { + fat256: generic::Fat<__m256i, BYTES>, + } + + // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes. + macro_rules! fat_avx2 { + ($len:expr) => { + impl FatAVX2<$len> { + /// Creates a new searcher using "slim" Teddy with 256-bit + /// vectors. 
If AVX2 is not available in the current + /// environment, then this returns `None`. + pub(super) fn new( + patterns: &Arc<Patterns>, + ) -> Option<Searcher> { + if !is_available_avx2() { + return None; + } + Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) }) + } + + /// Creates a new searcher using "slim" Teddy with 256-bit + /// vectors without checking whether AVX2 is available or not. + /// + /// # Safety + /// + /// Callers must ensure that AVX2 is available in the current + /// environment. + #[target_feature(enable = "avx2")] + unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { + let fat256 = generic::Fat::<__m256i, $len>::new( + Arc::clone(&patterns), + ); + let memory_usage = fat256.memory_usage(); + let minimum_len = fat256.minimum_len(); + let imp = Arc::new(FatAVX2 { fat256 }); + Searcher { imp, memory_usage, minimum_len } + } + } + + impl SearcherT for FatAVX2<$len> { + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + // SAFETY: All obligations except for `target_feature` are + // passed to the caller. Our use of `target_feature` is + // safe because construction of this type requires that the + // requisite target features are available. 
+ self.fat256.find(start, end) + } + } + }; + } + + fat_avx2!(1); + fat_avx2!(2); + fat_avx2!(3); + fat_avx2!(4); + + #[inline] + pub(super) fn is_available_ssse3() -> bool { + #[cfg(not(target_feature = "sse2"))] + { + false + } + #[cfg(target_feature = "sse2")] + { + #[cfg(target_feature = "ssse3")] + { + true + } + #[cfg(not(target_feature = "ssse3"))] + { + #[cfg(feature = "std")] + { + std::is_x86_feature_detected!("ssse3") + } + #[cfg(not(feature = "std"))] + { + false + } + } + } + } + + #[inline] + pub(super) fn is_available_avx2() -> bool { + #[cfg(not(target_feature = "sse2"))] + { + false + } + #[cfg(target_feature = "sse2")] + { + #[cfg(target_feature = "avx2")] + { + true + } + #[cfg(not(target_feature = "avx2"))] + { + #[cfg(feature = "std")] + { + std::is_x86_feature_detected!("avx2") + } + #[cfg(not(feature = "std"))] + { + false + } + } + } + } +} + +#[cfg(target_arch = "aarch64")] +mod aarch64 { + use core::arch::aarch64::uint8x16_t; + + use alloc::sync::Arc; + + use crate::packed::{ + pattern::Patterns, + teddy::generic::{self, Match}, + }; + + use super::{Searcher, SearcherT}; + + #[derive(Clone, Debug)] + pub(super) struct SlimNeon<const BYTES: usize> { + slim128: generic::Slim<uint8x16_t, BYTES>, + } + + // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes. + macro_rules! slim_neon { + ($len:expr) => { + impl SlimNeon<$len> { + /// Creates a new searcher using "slim" Teddy with 128-bit + /// vectors. If SSSE3 is not available in the current + /// environment, then this returns `None`. + pub(super) fn new( + patterns: &Arc<Patterns>, + ) -> Option<Searcher> { + Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) }) + } + + /// Creates a new searcher using "slim" Teddy with 256-bit + /// vectors without checking whether SSSE3 is available or not. + /// + /// # Safety + /// + /// Callers must ensure that SSSE3 is available in the current + /// environment. 
+ #[target_feature(enable = "neon")] + unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher { + let slim128 = generic::Slim::<uint8x16_t, $len>::new( + Arc::clone(patterns), + ); + let memory_usage = slim128.memory_usage(); + let minimum_len = slim128.minimum_len(); + let imp = Arc::new(SlimNeon { slim128 }); + Searcher { imp, memory_usage, minimum_len } + } + } + + impl SearcherT for SlimNeon<$len> { + #[target_feature(enable = "neon")] + #[inline] + unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + // SAFETY: All obligations except for `target_feature` are + // passed to the caller. Our use of `target_feature` is + // safe because construction of this type requires that the + // requisite target features are available. + self.slim128.find(start, end) + } + } + }; + } + + slim_neon!(1); + slim_neon!(2); + slim_neon!(3); + slim_neon!(4); +} diff --git a/vendor/aho-corasick/src/packed/teddy/generic.rs b/vendor/aho-corasick/src/packed/teddy/generic.rs new file mode 100644 index 0000000..2aacd00 --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/generic.rs @@ -0,0 +1,1382 @@ +use core::fmt::Debug; + +use alloc::{ + boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec, +}; + +use crate::{ + packed::{ + ext::Pointer, + pattern::Patterns, + vector::{FatVector, Vector}, + }, + util::int::U32, + PatternID, +}; + +/// A match type specialized to the Teddy implementations below. +/// +/// Essentially, instead of representing a match at byte offsets, we use +/// raw pointers. This is because the implementations below operate on raw +/// pointers, and so this is a more natural return type based on how the +/// implementation works. +/// +/// Also, the `PatternID` used here is a `u16`. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Match { + pid: PatternID, + start: *const u8, + end: *const u8, +} + +impl Match { + /// Returns the ID of the pattern that matched. 
+ pub(crate) fn pattern(&self) -> PatternID { + self.pid + } + + /// Returns a pointer into the haystack at which the match starts. + pub(crate) fn start(&self) -> *const u8 { + self.start + } + + /// Returns a pointer into the haystack at which the match ends. + pub(crate) fn end(&self) -> *const u8 { + self.end + } +} + +/// A "slim" Teddy implementation that is generic over both the vector type +/// and the minimum length of the patterns being searched for. +/// +/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. +#[derive(Clone, Debug)] +pub(crate) struct Slim<V, const BYTES: usize> { + /// A generic data structure for doing "slim" Teddy verification. + teddy: Teddy<8>, + /// The masks used as inputs to the shuffle operation to generate + /// candidates (which are fed into the verification routines). + masks: [Mask<V>; BYTES], +} + +impl<V: Vector, const BYTES: usize> Slim<V, BYTES> { + /// Create a new "slim" Teddy searcher for the given patterns. + /// + /// # Panics + /// + /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> { + assert!( + 1 <= BYTES && BYTES <= 4, + "only 1, 2, 3 or 4 bytes are supported" + ); + let teddy = Teddy::new(patterns); + let masks = SlimMaskBuilder::from_teddy(&teddy); + Slim { teddy, masks } + } + + /// Returns the approximate total amount of heap used by this type, in + /// units of bytes. + #[inline(always)] + pub(crate) fn memory_usage(&self) -> usize { + self.teddy.memory_usage() + } + + /// Returns the minimum length, in bytes, that a haystack must be in order + /// to use it with this searcher. 
+ #[inline(always)] + pub(crate) fn minimum_len(&self) -> usize { + V::BYTES + (BYTES - 1) + } +} + +impl<V: Vector> Slim<V, 1> { + /// Look for an occurrences of the patterns in this finder in the haystack + /// given by the `start` and `end` pointers. + /// + /// If no match could be found, then `None` is returned. + /// + /// # Safety + /// + /// The given pointers representing the haystack must be valid to read + /// from. They must also point to a region of memory that is at least the + /// minimum length required by this searcher. + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start; + while cur <= end.sub(V::BYTES) { + if let Some(m) = self.find_one(cur, end) { + return Some(m); + } + cur = cur.add(V::BYTES); + } + if cur < end { + cur = end.sub(V::BYTES); + if let Some(m) = self.find_one(cur, end) { + return Some(m); + } + } + None + } + + /// Look for a match starting at the `V::BYTES` at and after `cur`. If + /// there isn't one, then `None` is returned. + /// + /// # Safety + /// + /// The given pointers representing the haystack must be valid to read + /// from. They must also point to a region of memory that is at least the + /// minimum length required by this searcher. + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + ) -> Option<Match> { + let c = self.candidate(cur); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur, end, c) { + return Some(m); + } + } + None + } + + /// Look for a candidate match (represented as a vector) starting at the + /// `V::BYTES` at and after `cur`. 
If there isn't one, then a vector with + /// all bits set to zero is returned. + /// + /// # Safety + /// + /// The given pointer representing the haystack must be valid to read + /// from. + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn candidate(&self, cur: *const u8) -> V { + let chunk = V::load_unaligned(cur); + Mask::members1(chunk, self.masks) + } +} + +impl<V: Vector> Slim<V, 2> { + /// See Slim<V, 1>::find. + #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start.add(1); + let mut prev0 = V::splat(0xFF); + while cur <= end.sub(V::BYTES) { + if let Some(m) = self.find_one(cur, end, &mut prev0) { + return Some(m); + } + cur = cur.add(V::BYTES); + } + if cur < end { + cur = end.sub(V::BYTES); + prev0 = V::splat(0xFF); + if let Some(m) = self.find_one(cur, end, &mut prev0) { + return Some(m); + } + } + None + } + + /// See Slim<V, 1>::find_one. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + prev0: &mut V, + ) -> Option<Match> { + let c = self.candidate(cur, prev0); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { + return Some(m); + } + } + None + } + + /// See Slim<V, 1>::candidate. + #[inline(always)] + unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { + let chunk = V::load_unaligned(cur); + let (res0, res1) = Mask::members2(chunk, self.masks); + let res0prev0 = res0.shift_in_one_byte(*prev0); + let res = res0prev0.and(res1); + *prev0 = res0; + res + } +} + +impl<V: Vector> Slim<V, 3> { + /// See Slim<V, 1>::find. 
+ #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start.add(2); + let mut prev0 = V::splat(0xFF); + let mut prev1 = V::splat(0xFF); + while cur <= end.sub(V::BYTES) { + if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { + return Some(m); + } + cur = cur.add(V::BYTES); + } + if cur < end { + cur = end.sub(V::BYTES); + prev0 = V::splat(0xFF); + prev1 = V::splat(0xFF); + if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { + return Some(m); + } + } + None + } + + /// See Slim<V, 1>::find_one. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + prev0: &mut V, + prev1: &mut V, + ) -> Option<Match> { + let c = self.candidate(cur, prev0, prev1); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { + return Some(m); + } + } + None + } + + /// See Slim<V, 1>::candidate. + #[inline(always)] + unsafe fn candidate( + &self, + cur: *const u8, + prev0: &mut V, + prev1: &mut V, + ) -> V { + let chunk = V::load_unaligned(cur); + let (res0, res1, res2) = Mask::members3(chunk, self.masks); + let res0prev0 = res0.shift_in_two_bytes(*prev0); + let res1prev1 = res1.shift_in_one_byte(*prev1); + let res = res0prev0.and(res1prev1).and(res2); + *prev0 = res0; + *prev1 = res1; + res + } +} + +impl<V: Vector> Slim<V, 4> { + /// See Slim<V, 1>::find. 
+ #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start.add(3); + let mut prev0 = V::splat(0xFF); + let mut prev1 = V::splat(0xFF); + let mut prev2 = V::splat(0xFF); + while cur <= end.sub(V::BYTES) { + if let Some(m) = + self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) + { + return Some(m); + } + cur = cur.add(V::BYTES); + } + if cur < end { + cur = end.sub(V::BYTES); + prev0 = V::splat(0xFF); + prev1 = V::splat(0xFF); + prev2 = V::splat(0xFF); + if let Some(m) = + self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) + { + return Some(m); + } + } + None + } + + /// See Slim<V, 1>::find_one. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + prev0: &mut V, + prev1: &mut V, + prev2: &mut V, + ) -> Option<Match> { + let c = self.candidate(cur, prev0, prev1, prev2); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { + return Some(m); + } + } + None + } + + /// See Slim<V, 1>::candidate. + #[inline(always)] + unsafe fn candidate( + &self, + cur: *const u8, + prev0: &mut V, + prev1: &mut V, + prev2: &mut V, + ) -> V { + let chunk = V::load_unaligned(cur); + let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); + let res0prev0 = res0.shift_in_three_bytes(*prev0); + let res1prev1 = res1.shift_in_two_bytes(*prev1); + let res2prev2 = res2.shift_in_one_byte(*prev2); + let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); + *prev0 = res0; + *prev1 = res1; + *prev2 = res2; + res + } +} + +/// A "fat" Teddy implementation that is generic over both the vector type +/// and the minimum length of the patterns being searched for. +/// +/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. 
+#[derive(Clone, Debug)] +pub(crate) struct Fat<V, const BYTES: usize> { + /// A generic data structure for doing "fat" Teddy verification. + teddy: Teddy<16>, + /// The masks used as inputs to the shuffle operation to generate + /// candidates (which are fed into the verification routines). + masks: [Mask<V>; BYTES], +} + +impl<V: FatVector, const BYTES: usize> Fat<V, BYTES> { + /// Create a new "fat" Teddy searcher for the given patterns. + /// + /// # Panics + /// + /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Fat<V, BYTES> { + assert!( + 1 <= BYTES && BYTES <= 4, + "only 1, 2, 3 or 4 bytes are supported" + ); + let teddy = Teddy::new(patterns); + let masks = FatMaskBuilder::from_teddy(&teddy); + Fat { teddy, masks } + } + + /// Returns the approximate total amount of heap used by this type, in + /// units of bytes. + #[inline(always)] + pub(crate) fn memory_usage(&self) -> usize { + self.teddy.memory_usage() + } + + /// Returns the minimum length, in bytes, that a haystack must be in order + /// to use it with this searcher. + #[inline(always)] + pub(crate) fn minimum_len(&self) -> usize { + V::Half::BYTES + (BYTES - 1) + } +} + +impl<V: FatVector> Fat<V, 1> { + /// Look for an occurrences of the patterns in this finder in the haystack + /// given by the `start` and `end` pointers. + /// + /// If no match could be found, then `None` is returned. + /// + /// # Safety + /// + /// The given pointers representing the haystack must be valid to read + /// from. They must also point to a region of memory that is at least the + /// minimum length required by this searcher. + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. 
+ #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start; + while cur <= end.sub(V::Half::BYTES) { + if let Some(m) = self.find_one(cur, end) { + return Some(m); + } + cur = cur.add(V::Half::BYTES); + } + if cur < end { + cur = end.sub(V::Half::BYTES); + if let Some(m) = self.find_one(cur, end) { + return Some(m); + } + } + None + } + + /// Look for a match starting at the `V::BYTES` at and after `cur`. If + /// there isn't one, then `None` is returned. + /// + /// # Safety + /// + /// The given pointers representing the haystack must be valid to read + /// from. They must also point to a region of memory that is at least the + /// minimum length required by this searcher. + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + ) -> Option<Match> { + let c = self.candidate(cur); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur, end, c) { + return Some(m); + } + } + None + } + + /// Look for a candidate match (represented as a vector) starting at the + /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with + /// all bits set to zero is returned. + /// + /// # Safety + /// + /// The given pointer representing the haystack must be valid to read + /// from. + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn candidate(&self, cur: *const u8) -> V { + let chunk = V::load_half_unaligned(cur); + Mask::members1(chunk, self.masks) + } +} + +impl<V: FatVector> Fat<V, 2> { + /// See `Fat<V, 1>::find`. 
+ #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start.add(1); + let mut prev0 = V::splat(0xFF); + while cur <= end.sub(V::Half::BYTES) { + if let Some(m) = self.find_one(cur, end, &mut prev0) { + return Some(m); + } + cur = cur.add(V::Half::BYTES); + } + if cur < end { + cur = end.sub(V::Half::BYTES); + prev0 = V::splat(0xFF); + if let Some(m) = self.find_one(cur, end, &mut prev0) { + return Some(m); + } + } + None + } + + /// See `Fat<V, 1>::find_one`. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + prev0: &mut V, + ) -> Option<Match> { + let c = self.candidate(cur, prev0); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { + return Some(m); + } + } + None + } + + /// See `Fat<V, 1>::candidate`. + #[inline(always)] + unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { + let chunk = V::load_half_unaligned(cur); + let (res0, res1) = Mask::members2(chunk, self.masks); + let res0prev0 = res0.half_shift_in_one_byte(*prev0); + let res = res0prev0.and(res1); + *prev0 = res0; + res + } +} + +impl<V: FatVector> Fat<V, 3> { + /// See `Fat<V, 1>::find`. 
+ #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start.add(2); + let mut prev0 = V::splat(0xFF); + let mut prev1 = V::splat(0xFF); + while cur <= end.sub(V::Half::BYTES) { + if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { + return Some(m); + } + cur = cur.add(V::Half::BYTES); + } + if cur < end { + cur = end.sub(V::Half::BYTES); + prev0 = V::splat(0xFF); + prev1 = V::splat(0xFF); + if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { + return Some(m); + } + } + None + } + + /// See `Fat<V, 1>::find_one`. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + prev0: &mut V, + prev1: &mut V, + ) -> Option<Match> { + let c = self.candidate(cur, prev0, prev1); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { + return Some(m); + } + } + None + } + + /// See `Fat<V, 1>::candidate`. + #[inline(always)] + unsafe fn candidate( + &self, + cur: *const u8, + prev0: &mut V, + prev1: &mut V, + ) -> V { + let chunk = V::load_half_unaligned(cur); + let (res0, res1, res2) = Mask::members3(chunk, self.masks); + let res0prev0 = res0.half_shift_in_two_bytes(*prev0); + let res1prev1 = res1.half_shift_in_one_byte(*prev1); + let res = res0prev0.and(res1prev1).and(res2); + *prev0 = res0; + *prev1 = res1; + res + } +} + +impl<V: FatVector> Fat<V, 4> { + /// See `Fat<V, 1>::find`. 
+ #[inline(always)] + pub(crate) unsafe fn find( + &self, + start: *const u8, + end: *const u8, + ) -> Option<Match> { + let len = end.distance(start); + debug_assert!(len >= self.minimum_len()); + let mut cur = start.add(3); + let mut prev0 = V::splat(0xFF); + let mut prev1 = V::splat(0xFF); + let mut prev2 = V::splat(0xFF); + while cur <= end.sub(V::Half::BYTES) { + if let Some(m) = + self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) + { + return Some(m); + } + cur = cur.add(V::Half::BYTES); + } + if cur < end { + cur = end.sub(V::Half::BYTES); + prev0 = V::splat(0xFF); + prev1 = V::splat(0xFF); + prev2 = V::splat(0xFF); + if let Some(m) = + self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) + { + return Some(m); + } + } + None + } + + /// See `Fat<V, 1>::find_one`. + #[inline(always)] + unsafe fn find_one( + &self, + cur: *const u8, + end: *const u8, + prev0: &mut V, + prev1: &mut V, + prev2: &mut V, + ) -> Option<Match> { + let c = self.candidate(cur, prev0, prev1, prev2); + if !c.is_zero() { + if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { + return Some(m); + } + } + None + } + + /// See `Fat<V, 1>::candidate`. + #[inline(always)] + unsafe fn candidate( + &self, + cur: *const u8, + prev0: &mut V, + prev1: &mut V, + prev2: &mut V, + ) -> V { + let chunk = V::load_half_unaligned(cur); + let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); + let res0prev0 = res0.half_shift_in_three_bytes(*prev0); + let res1prev1 = res1.half_shift_in_two_bytes(*prev1); + let res2prev2 = res2.half_shift_in_one_byte(*prev2); + let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); + *prev0 = res0; + *prev1 = res1; + *prev2 = res2; + res + } +} + +/// The common elements of all "slim" and "fat" Teddy search implementations. +/// +/// Essentially, this contains the patterns and the buckets. Namely, it +/// contains enough to implement the verification step after candidates are +/// identified via the shuffle masks. 
+/// +/// It is generic over the number of buckets used. In general, the number of +/// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The generic +/// parameter isn't really meant to be instantiated for any value other than +/// 8 or 16, although it is technically possible. The main hiccup is that there +/// is some bit-shifting done in the critical part of verification that could +/// be quite expensive if `N` is not a multiple of 2. +#[derive(Clone, Debug)] +struct Teddy<const BUCKETS: usize> { + /// The patterns we are searching for. + /// + /// A pattern string can be found by its `PatternID`. + patterns: Arc<Patterns>, + /// The allocation of patterns in buckets. This only contains the IDs of + /// patterns. In order to do full verification, callers must provide the + /// actual patterns when using Teddy. + buckets: [Vec<PatternID>; BUCKETS], + // N.B. The above representation is very simple, but it definitely results + // in ping-ponging between different allocations during verification. I've + // tried experimenting with other representations that flatten the pattern + // strings into a single allocation, but it doesn't seem to help much. + // Probably everything is small enough to fit into cache anyway, and so the + // pointer chasing isn't a big deal? + // + // One other avenue I haven't explored is some kind of hashing trick + // that let's us do another high-confidence check before launching into + // `memcmp`. +} + +impl<const BUCKETS: usize> Teddy<BUCKETS> { + /// Create a new generic data structure for Teddy verification. + fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> { + assert_ne!(0, patterns.len(), "Teddy requires at least one pattern"); + assert_ne!( + 0, + patterns.minimum_len(), + "Teddy does not support zero-length patterns" + ); + assert!( + BUCKETS == 8 || BUCKETS == 16, + "Teddy only supports 8 or 16 buckets" + ); + // MSRV(1.63): Use core::array::from_fn below instead of allocating a + // superfluous outer Vec. 
Not a big deal (especially given the BTreeMap + // allocation below), but nice to not do it. + let buckets = + <[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS]) + .unwrap(); + let mut t = Teddy { patterns, buckets }; + + let mut map: BTreeMap<Box<[u8]>, usize> = BTreeMap::new(); + for (id, pattern) in t.patterns.iter() { + // We try to be slightly clever in how we assign patterns into + // buckets. Generally speaking, we want patterns with the same + // prefix to be in the same bucket, since it minimizes the amount + // of time we spend churning through buckets in the verification + // step. + // + // So we could assign patterns with the same N-prefix (where N is + // the size of the mask, which is one of {1, 2, 3}) to the same + // bucket. However, case insensitive searches are fairly common, so + // we'd for example, ideally want to treat `abc` and `ABC` as if + // they shared the same prefix. ASCII has the nice property that + // the lower 4 bits of A and a are the same, so we therefore group + // patterns with the same low-nybble-N-prefix into the same bucket. + // + // MOREOVER, this is actually necessary for correctness! In + // particular, by grouping patterns with the same prefix into the + // same bucket, we ensure that we preserve correct leftmost-first + // and leftmost-longest match semantics. In addition to the fact + // that `patterns.iter()` iterates in the correct order, this + // guarantees that all possible ambiguous matches will occur in + // the same bucket. The verification routine could be adjusted to + // support correct leftmost match semantics regardless of bucket + // allocation, but that results in a performance hit. It's much + // nicer to be able to just stop as soon as a match is found. + let lonybs = pattern.low_nybbles(t.mask_len()); + if let Some(&bucket) = map.get(&lonybs) { + t.buckets[bucket].push(id); + } else { + // N.B. 
We assign buckets in reverse because it shouldn't have + // any influence on performance, but it does make it harder to + // get leftmost match semantics accidentally correct. + let bucket = (BUCKETS - 1) - (id.as_usize() % BUCKETS); + t.buckets[bucket].push(id); + map.insert(lonybs, bucket); + } + } + t + } + + /// Verify whether there are any matches starting at or after `cur` in the + /// haystack. The candidate chunk given should correspond to 8-bit bitsets + /// for N buckets. + /// + /// # Safety + /// + /// The given pointers representing the haystack must be valid to read + /// from. + #[inline(always)] + unsafe fn verify64( + &self, + cur: *const u8, + end: *const u8, + mut candidate_chunk: u64, + ) -> Option<Match> { + while candidate_chunk != 0 { + let bit = candidate_chunk.trailing_zeros().as_usize(); + candidate_chunk &= !(1 << bit); + + let cur = cur.add(bit / BUCKETS); + let bucket = bit % BUCKETS; + if let Some(m) = self.verify_bucket(cur, end, bucket) { + return Some(m); + } + } + None + } + + /// Verify whether there are any matches starting at `at` in the given + /// `haystack` corresponding only to patterns in the given bucket. + /// + /// # Safety + /// + /// The given pointers representing the haystack must be valid to read + /// from. + /// + /// The bucket index must be less than or equal to `self.buckets.len()`. + #[inline(always)] + unsafe fn verify_bucket( + &self, + cur: *const u8, + end: *const u8, + bucket: usize, + ) -> Option<Match> { + debug_assert!(bucket < self.buckets.len()); + // SAFETY: The caller must ensure that the bucket index is correct. + for pid in self.buckets.get_unchecked(bucket).iter().copied() { + // SAFETY: This is safe because we are guaranteed that every + // index in a Teddy bucket is a valid index into `pats`, by + // construction. 
+ debug_assert!(pid.as_usize() < self.patterns.len()); + let pat = self.patterns.get_unchecked(pid); + if pat.is_prefix_raw(cur, end) { + let start = cur; + let end = start.add(pat.len()); + return Some(Match { pid, start, end }); + } + } + None + } + + /// Returns the total number of masks required by the patterns in this + /// Teddy searcher. + /// + /// Basically, the mask length corresponds to the type of Teddy searcher + /// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the + /// better, typically, since searching for longer substrings usually + /// decreases the rate of false positives. Therefore, the number of masks + /// needed is the length of the shortest pattern in this searcher. If the + /// length of the shortest pattern (in bytes) is bigger than 4, then the + /// mask length is 4 since there are no Teddy searchers for more than 4 + /// bytes. + fn mask_len(&self) -> usize { + core::cmp::min(4, self.patterns.minimum_len()) + } + + /// Returns the approximate total amount of heap used by this type, in + /// units of bytes. + fn memory_usage(&self) -> usize { + // This is an upper bound rather than a precise accounting. No + // particular reason, other than it's probably very close to actual + // memory usage in practice. + self.patterns.len() * core::mem::size_of::<PatternID>() + } +} + +impl Teddy<8> { + /// Runs the verification routine for "slim" Teddy. + /// + /// The candidate given should be a collection of 8-bit bitsets (one bitset + /// per lane), where the ith bit is set in the jth lane if and only if the + /// byte occurring at `at + j` in `cur` is in the bucket `i`. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + /// + /// The given pointers must be valid to read from. 
+ #[inline(always)] + unsafe fn verify<V: Vector>( + &self, + mut cur: *const u8, + end: *const u8, + candidate: V, + ) -> Option<Match> { + debug_assert!(!candidate.is_zero()); + // Convert the candidate into 64-bit chunks, and then verify each of + // those chunks. + candidate.for_each_64bit_lane( + #[inline(always)] + |_, chunk| { + let result = self.verify64(cur, end, chunk); + cur = cur.add(8); + result + }, + ) + } +} + +impl Teddy<16> { + /// Runs the verification routine for "fat" Teddy. + /// + /// The candidate given should be a collection of 8-bit bitsets (one bitset + /// per lane), where the ith bit is set in the jth lane if and only if the + /// byte occurring at `at + (j < 16 ? j : j - 16)` in `cur` is in the + /// bucket `j < 16 ? i : i + 8`. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + /// + /// The given pointers must be valid to read from. + #[inline(always)] + unsafe fn verify<V: FatVector>( + &self, + mut cur: *const u8, + end: *const u8, + candidate: V, + ) -> Option<Match> { + // This is a bit tricky, but we basically want to convert our + // candidate, which looks like this (assuming a 256-bit vector): + // + // a31 a30 ... a17 a16 a15 a14 ... a01 a00 + // + // where each a(i) is an 8-bit bitset corresponding to the activated + // buckets, to this + // + // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 + // + // Namely, for Fat Teddy, the high 128-bits of the candidate correspond + // to the same bytes in the haystack in the low 128-bits (so we only + // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. + // + // The verification routine wants to look at all potentially matching + // buckets before moving on to the next lane. So for example, both + // a16 and a00 both correspond to the first byte in our window; a00 + // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, + // a16 should be checked before a01. 
So the transformation shown above + // allows us to use our normal verification procedure with one small + // change: we treat each bitset as 16 bits instead of 8 bits. + debug_assert!(!candidate.is_zero()); + + // Swap the 128-bit lanes in the candidate vector. + let swapped = candidate.swap_halves(); + // Interleave the bytes from the low 128-bit lanes, starting with + // cand first. + let r1 = candidate.interleave_low_8bit_lanes(swapped); + // Interleave the bytes from the high 128-bit lanes, starting with + // cand first. + let r2 = candidate.interleave_high_8bit_lanes(swapped); + // Now just take the 2 low 64-bit integers from both r1 and r2. We + // can drop the high 64-bit integers because they are a mirror image + // of the low 64-bit integers. All we care about are the low 128-bit + // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets + // laid out in the desired order, as described above. + r1.for_each_low_64bit_lane( + r2, + #[inline(always)] + |_, chunk| { + let result = self.verify64(cur, end, chunk); + cur = cur.add(4); + result + }, + ) + } +} + +/// A vector generic mask for the low and high nybbles in a set of patterns. +/// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit +/// is set if and only if the nybble `j` is in the bucket `i` at a particular +/// position. +/// +/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being +/// used. For Slim Teddy, the bitsets in the lower half are the same as the +/// bitsets in the higher half, so that we can search `V::BYTES` bytes at a +/// time. (Remember, the nybbles in the haystack are used as indices into these +/// masks, and 256-bit shuffles only operate on 128-bit lanes.) +/// +/// For Fat Teddy, the bitsets are not repeated, but instead, the high half +/// bits correspond to an addition 8 buckets. 
So that a bitset `00100010` has +/// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13 set +/// if it's in the higher half. +#[derive(Clone, Copy, Debug)] +struct Mask<V> { + lo: V, + hi: V, +} + +impl<V: Vector> Mask<V> { + /// Return a candidate for Teddy (fat or slim) that is searching for 1-byte + /// candidates. + /// + /// If a candidate is returned, it will be a collection of 8-bit bitsets + /// (one bitset per lane), where the ith bit is set in the jth lane if and + /// only if the byte occurring at the jth lane in `chunk` is in the bucket + /// `i`. If no candidate is found, then the vector returned will have all + /// lanes set to zero. + /// + /// `chunk` should correspond to a `V::BYTES` window of the haystack (where + /// the least significant byte corresponds to the start of the window). For + /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with + /// the window repeated in each half of the vector. + /// + /// `mask1` should correspond to a low/high mask for the first byte of all + /// patterns that are being searched. + #[inline(always)] + unsafe fn members1(chunk: V, masks: [Mask<V>; 1]) -> V { + let lomask = V::splat(0xF); + let hlo = chunk.and(lomask); + let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); + let locand = masks[0].lo.shuffle_bytes(hlo); + let hicand = masks[0].hi.shuffle_bytes(hhi); + locand.and(hicand) + } + + /// Return a candidate for Teddy (fat or slim) that is searching for 2-byte + /// candidates. + /// + /// If candidates are returned, each will be a collection of 8-bit bitsets + /// (one bitset per lane), where the ith bit is set in the jth lane if and + /// only if the byte occurring at the jth lane in `chunk` is in the bucket + /// `i`. Each candidate returned corresponds to the first and second bytes + /// of the patterns being searched. If no candidate is found, then all of + /// the lanes will be set to zero in at least one of the vectors returned. 
+ /// + /// `chunk` should correspond to a `V::BYTES` window of the haystack (where + /// the least significant byte corresponds to the start of the window). For + /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with + /// the window repeated in each half of the vector. + /// + /// The masks should correspond to the masks computed for the first and + /// second bytes of all patterns that are being searched. + #[inline(always)] + unsafe fn members2(chunk: V, masks: [Mask<V>; 2]) -> (V, V) { + let lomask = V::splat(0xF); + let hlo = chunk.and(lomask); + let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); + + let locand1 = masks[0].lo.shuffle_bytes(hlo); + let hicand1 = masks[0].hi.shuffle_bytes(hhi); + let cand1 = locand1.and(hicand1); + + let locand2 = masks[1].lo.shuffle_bytes(hlo); + let hicand2 = masks[1].hi.shuffle_bytes(hhi); + let cand2 = locand2.and(hicand2); + + (cand1, cand2) + } + + /// Return a candidate for Teddy (fat or slim) that is searching for 3-byte + /// candidates. + /// + /// If candidates are returned, each will be a collection of 8-bit bitsets + /// (one bitset per lane), where the ith bit is set in the jth lane if and + /// only if the byte occurring at the jth lane in `chunk` is in the bucket + /// `i`. Each candidate returned corresponds to the first, second and third + /// bytes of the patterns being searched. If no candidate is found, then + /// all of the lanes will be set to zero in at least one of the vectors + /// returned. + /// + /// `chunk` should correspond to a `V::BYTES` window of the haystack (where + /// the least significant byte corresponds to the start of the window). For + /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with + /// the window repeated in each half of the vector. + /// + /// The masks should correspond to the masks computed for the first, second + /// and third bytes of all patterns that are being searched. 
+ #[inline(always)] + unsafe fn members3(chunk: V, masks: [Mask<V>; 3]) -> (V, V, V) { + let lomask = V::splat(0xF); + let hlo = chunk.and(lomask); + let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); + + let locand1 = masks[0].lo.shuffle_bytes(hlo); + let hicand1 = masks[0].hi.shuffle_bytes(hhi); + let cand1 = locand1.and(hicand1); + + let locand2 = masks[1].lo.shuffle_bytes(hlo); + let hicand2 = masks[1].hi.shuffle_bytes(hhi); + let cand2 = locand2.and(hicand2); + + let locand3 = masks[2].lo.shuffle_bytes(hlo); + let hicand3 = masks[2].hi.shuffle_bytes(hhi); + let cand3 = locand3.and(hicand3); + + (cand1, cand2, cand3) + } + + /// Return a candidate for Teddy (fat or slim) that is searching for 4-byte + /// candidates. + /// + /// If candidates are returned, each will be a collection of 8-bit bitsets + /// (one bitset per lane), where the ith bit is set in the jth lane if and + /// only if the byte occurring at the jth lane in `chunk` is in the bucket + /// `i`. Each candidate returned corresponds to the first, second, third + /// and fourth bytes of the patterns being searched. If no candidate is + /// found, then all of the lanes will be set to zero in at least one of the + /// vectors returned. + /// + /// `chunk` should correspond to a `V::BYTES` window of the haystack (where + /// the least significant byte corresponds to the start of the window). For + /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with + /// the window repeated in each half of the vector. + /// + /// The masks should correspond to the masks computed for the first, + /// second, third and fourth bytes of all patterns that are being searched. 
+ #[inline(always)] + unsafe fn members4(chunk: V, masks: [Mask<V>; 4]) -> (V, V, V, V) { + let lomask = V::splat(0xF); + let hlo = chunk.and(lomask); + let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); + + let locand1 = masks[0].lo.shuffle_bytes(hlo); + let hicand1 = masks[0].hi.shuffle_bytes(hhi); + let cand1 = locand1.and(hicand1); + + let locand2 = masks[1].lo.shuffle_bytes(hlo); + let hicand2 = masks[1].hi.shuffle_bytes(hhi); + let cand2 = locand2.and(hicand2); + + let locand3 = masks[2].lo.shuffle_bytes(hlo); + let hicand3 = masks[2].hi.shuffle_bytes(hhi); + let cand3 = locand3.and(hicand3); + + let locand4 = masks[3].lo.shuffle_bytes(hlo); + let hicand4 = masks[3].hi.shuffle_bytes(hhi); + let cand4 = locand4.and(hicand4); + + (cand1, cand2, cand3, cand4) + } +} + +/// Represents the low and high nybble masks that will be used during +/// search. Each mask is 32 bytes wide, although only the first 16 bytes are +/// used for 128-bit vectors. +/// +/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set +/// if and only if the corresponding nybble is in the ith bucket. The index of +/// the byte (0-15, inclusive) corresponds to the nybble. +/// +/// Each mask is used as the target of a shuffle, where the indices for the +/// shuffle are taken from the haystack. AND'ing the shuffles for both the +/// low and high masks together also results in 8-bit bitsets, but where bit +/// `i` is set if and only if the correspond *byte* is in the ith bucket. +#[derive(Clone, Default)] +struct SlimMaskBuilder { + lo: [u8; 32], + hi: [u8; 32], +} + +impl SlimMaskBuilder { + /// Update this mask by adding the given byte to the given bucket. The + /// given bucket must be in the range 0-7. + /// + /// # Panics + /// + /// When `bucket >= 8`. 
+ fn add(&mut self, bucket: usize, byte: u8) { + assert!(bucket < 8); + + let bucket = u8::try_from(bucket).unwrap(); + let byte_lo = usize::from(byte & 0xF); + let byte_hi = usize::from((byte >> 4) & 0xF); + // When using 256-bit vectors, we need to set this bucket assignment in + // the low and high 128-bit portions of the mask. This allows us to + // process 32 bytes at a time. Namely, AVX2 shuffles operate on each + // of the 128-bit lanes, rather than the full 256-bit vector at once. + self.lo[byte_lo] |= 1 << bucket; + self.lo[byte_lo + 16] |= 1 << bucket; + self.hi[byte_hi] |= 1 << bucket; + self.hi[byte_hi + 16] |= 1 << bucket; + } + + /// Turn this builder into a vector mask. + /// + /// # Panics + /// + /// When `V` represents a vector bigger than what `MaskBytes` can contain. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn build<V: Vector>(&self) -> Mask<V> { + assert!(V::BYTES <= self.lo.len()); + assert!(V::BYTES <= self.hi.len()); + Mask { + lo: V::load_unaligned(self.lo[..].as_ptr()), + hi: V::load_unaligned(self.hi[..].as_ptr()), + } + } + + /// A convenience function for building `N` vector masks from a slim + /// `Teddy` value. + /// + /// # Panics + /// + /// When `V` represents a vector bigger than what `MaskBytes` can contain. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn from_teddy<const BYTES: usize, V: Vector>( + teddy: &Teddy<8>, + ) -> [Mask<V>; BYTES] { + // MSRV(1.63): Use core::array::from_fn to just build the array here + // instead of creating a vector and turning it into an array. 
+ let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES]; + for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { + for pid in bucket.iter().copied() { + let pat = teddy.patterns.get(pid); + for (i, builder) in mask_builders.iter_mut().enumerate() { + builder.add(bucket_index, pat.bytes()[i]); + } + } + } + let array = + <[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); + array.map(|builder| builder.build()) + } +} + +impl Debug for SlimMaskBuilder { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let (mut parts_lo, mut parts_hi) = (vec![], vec![]); + for i in 0..32 { + parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); + parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); + } + f.debug_struct("SlimMaskBuilder") + .field("lo", &parts_lo) + .field("hi", &parts_hi) + .finish() + } +} + +/// Represents the low and high nybble masks that will be used during "fat" +/// Teddy search. +/// +/// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors +/// support fat Teddy. +/// +/// A fat Teddy mask is like a slim Teddy mask, except that instead of +/// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the +/// high and low 128-bit halves each represent distinct buckets. (Bringing the +/// total to 16 instead of 8.) This permits spreading the patterns out a bit +/// more and thus putting less pressure on verification to be fast. +/// +/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set +/// if and only if the corresponding nybble is in the ith bucket. The index of +/// the byte (0-15, inclusive) corresponds to the nybble. +#[derive(Clone, Copy, Default)] +struct FatMaskBuilder { + lo: [u8; 32], + hi: [u8; 32], +} + +impl FatMaskBuilder { + /// Update this mask by adding the given byte to the given bucket. The + /// given bucket must be in the range 0-15. + /// + /// # Panics + /// + /// When `bucket >= 16`. 
+ fn add(&mut self, bucket: usize, byte: u8) { + assert!(bucket < 16); + + let bucket = u8::try_from(bucket).unwrap(); + let byte_lo = usize::from(byte & 0xF); + let byte_hi = usize::from((byte >> 4) & 0xF); + // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, + // the high 128 bits of our mask correspond to buckets 8-15, while the + // low 128 bits correspond to buckets 0-7. + if bucket < 8 { + self.lo[byte_lo] |= 1 << bucket; + self.hi[byte_hi] |= 1 << bucket; + } else { + self.lo[byte_lo + 16] |= 1 << (bucket % 8); + self.hi[byte_hi + 16] |= 1 << (bucket % 8); + } + } + + /// Turn this builder into a vector mask. + /// + /// # Panics + /// + /// When `V` represents a vector bigger than what `MaskBytes` can contain. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn build<V: Vector>(&self) -> Mask<V> { + assert!(V::BYTES <= self.lo.len()); + assert!(V::BYTES <= self.hi.len()); + Mask { + lo: V::load_unaligned(self.lo[..].as_ptr()), + hi: V::load_unaligned(self.hi[..].as_ptr()), + } + } + + /// A convenience function for building `N` vector masks from a fat + /// `Teddy` value. + /// + /// # Panics + /// + /// When `V` represents a vector bigger than what `MaskBytes` can contain. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + #[inline(always)] + unsafe fn from_teddy<const BYTES: usize, V: Vector>( + teddy: &Teddy<16>, + ) -> [Mask<V>; BYTES] { + // MSRV(1.63): Use core::array::from_fn to just build the array here + // instead of creating a vector and turning it into an array. 
+ let mut mask_builders = vec![FatMaskBuilder::default(); BYTES]; + for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { + for pid in bucket.iter().copied() { + let pat = teddy.patterns.get(pid); + for (i, builder) in mask_builders.iter_mut().enumerate() { + builder.add(bucket_index, pat.bytes()[i]); + } + } + } + let array = + <[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); + array.map(|builder| builder.build()) + } +} + +impl Debug for FatMaskBuilder { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let (mut parts_lo, mut parts_hi) = (vec![], vec![]); + for i in 0..32 { + parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); + parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); + } + f.debug_struct("FatMaskBuilder") + .field("lo", &parts_lo) + .field("hi", &parts_hi) + .finish() + } +} diff --git a/vendor/aho-corasick/src/packed/teddy/mod.rs b/vendor/aho-corasick/src/packed/teddy/mod.rs new file mode 100644 index 0000000..26cfcdc --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/mod.rs @@ -0,0 +1,9 @@ +// Regrettable, but Teddy stuff just isn't used on all targets. And for some +// targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a +// bunch of dead-code warnings. Just not worth trying to squash them. Blech. +#![allow(dead_code)] + +pub(crate) use self::builder::{Builder, Searcher}; + +mod builder; +mod generic; diff --git a/vendor/aho-corasick/src/packed/tests.rs b/vendor/aho-corasick/src/packed/tests.rs new file mode 100644 index 0000000..2b0d44e --- /dev/null +++ b/vendor/aho-corasick/src/packed/tests.rs @@ -0,0 +1,583 @@ +use std::collections::HashMap; + +use alloc::{ + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + packed::{Config, MatchKind}, + util::search::Match, +}; + +/// A description of a single test against a multi-pattern searcher. +/// +/// A single test may not necessarily pass on every configuration of a +/// searcher. 
The tests are categorized and grouped appropriately below. +#[derive(Clone, Debug, Eq, PartialEq)] +struct SearchTest { + /// The name of this test, for debugging. + name: &'static str, + /// The patterns to search for. + patterns: &'static [&'static str], + /// The text to search. + haystack: &'static str, + /// Each match is a triple of (pattern_index, start, end), where + /// pattern_index is an index into `patterns` and `start`/`end` are indices + /// into `haystack`. + matches: &'static [(usize, usize, usize)], +} + +struct SearchTestOwned { + offset: usize, + name: String, + patterns: Vec<String>, + haystack: String, + matches: Vec<(usize, usize, usize)>, +} + +impl SearchTest { + fn variations(&self) -> Vec<SearchTestOwned> { + let count = if cfg!(miri) { 1 } else { 261 }; + let mut tests = vec![]; + for i in 0..count { + tests.push(self.offset_prefix(i)); + tests.push(self.offset_suffix(i)); + tests.push(self.offset_both(i)); + } + tests + } + + fn offset_both(&self, off: usize) -> SearchTestOwned { + SearchTestOwned { + offset: off, + name: self.name.to_string(), + patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + haystack: format!( + "{}{}{}", + "Z".repeat(off), + self.haystack, + "Z".repeat(off) + ), + matches: self + .matches + .iter() + .map(|&(id, s, e)| (id, s + off, e + off)) + .collect(), + } + } + + fn offset_prefix(&self, off: usize) -> SearchTestOwned { + SearchTestOwned { + offset: off, + name: self.name.to_string(), + patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + haystack: format!("{}{}", "Z".repeat(off), self.haystack), + matches: self + .matches + .iter() + .map(|&(id, s, e)| (id, s + off, e + off)) + .collect(), + } + } + + fn offset_suffix(&self, off: usize) -> SearchTestOwned { + SearchTestOwned { + offset: off, + name: self.name.to_string(), + patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + haystack: format!("{}{}", self.haystack, "Z".repeat(off)), + matches: self.matches.to_vec(), 
+ } + } +} + +/// Short-hand constructor for SearchTest. We use it a lot below. +macro_rules! t { + ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { + SearchTest { + name: stringify!($name), + patterns: $patterns, + haystack: $haystack, + matches: $matches, + } + }; +} + +/// A collection of test groups. +type TestCollection = &'static [&'static [SearchTest]]; + +// Define several collections corresponding to the different type of match +// semantics supported. These collections have some overlap, but each +// collection should have some tests that no other collection has. + +/// Tests for leftmost-first match semantics. +const PACKED_LEFTMOST_FIRST: TestCollection = + &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY]; + +/// Tests for leftmost-longest match semantics. +const PACKED_LEFTMOST_LONGEST: TestCollection = + &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY]; + +// Now define the individual tests that make up the collections above. + +/// A collection of tests for the that should always be true regardless of +/// match semantics. That is, all combinations of leftmost-{first, longest} +/// should produce the same answer. 
+const BASICS: &'static [SearchTest] = &[ + t!(basic001, &["a"], "", &[]), + t!(basic010, &["a"], "a", &[(0, 0, 1)]), + t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), + t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), + t!(basic050, &["a"], "bba", &[(0, 2, 3)]), + t!(basic060, &["a"], "bbb", &[]), + t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), + t!(basic100, &["aa"], "", &[]), + t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), + t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), + t!(basic130, &["aa"], "abbab", &[]), + t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), + t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]), + t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), + t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), + t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), + t!(basic230, &["abcd"], "abcd", &[(0, 0, 4)]), + t!(basic240, &["abcd"], "zazabzabcdz", &[(0, 6, 10)]), + t!(basic250, &["abcd"], "zazabcdzabcdz", &[(0, 3, 7), (0, 8, 12)]), + t!(basic300, &["a", "b"], "", &[]), + t!(basic310, &["a", "b"], "z", &[]), + t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), + t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), + t!( + basic340, + &["a", "b"], + "abba", + &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] + ), + t!( + basic350, + &["b", "a"], + "abba", + &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] + ), + t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), + t!(basic400, &["foo", "bar"], "", &[]), + t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), + t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), + t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), + t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), + t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), + t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), + t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), + t!(basic480, 
&["bar", "foo"], "fobabar", &[(0, 4, 7),]), + t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), + t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), + t!( + basic720, + &["yabcdef", "bcdeyabc", "abcdezghi"], + "yabcdezghi", + &[(2, 1, 10),] + ), + t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), + t!( + basic840, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] + ), + t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), +]; + +/// Tests for leftmost match semantics. These should pass for both +/// leftmost-first and leftmost-longest match kinds. Stated differently, among +/// ambiguous matches, the longest match and the match that appeared first when +/// constructing the automaton should always be the same. +const LEFTMOST: &'static [SearchTest] = &[ + t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), + t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), + t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), + t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + leftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 
8),] + ), + t!( + leftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8), (0, 8, 9),] + ), +]; + +/// Tests for non-overlapping leftmost-first match semantics. These tests +/// should generally be specific to leftmost-first, which means they should +/// generally fail under leftmost-longest semantics. +const LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), + t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), + t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), + t!( + leftfirst310, + &["abcd", "b", "bce", "ce"], + "abce", + &[(1, 1, 2), (3, 2, 4),] + ), + t!( + leftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1), (2, 7, 9),] + ), + t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), + t!( + leftfirst340, + &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"], + "abcdef", + &[(0, 0, 6)] + ), +]; + +/// Tests for non-overlapping leftmost-longest match semantics. These tests +/// should generally be specific to leftmost-longest, which means they should +/// generally fail under leftmost-first semantics. 
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), + t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), + t!( + leftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), + t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), +]; + +/// Regression tests that are applied to all combinations. +/// +/// If regression tests are needed for specific match semantics, then add them +/// to the appropriate group above. 
+const REGRESSION: &'static [SearchTest] = &[ + t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), + t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), + t!( + regression030, + &["libcore/", "libstd/"], + "libcore/char/methods.rs", + &[(0, 0, 8),] + ), + t!( + regression040, + &["libstd/", "libcore/"], + "libcore/char/methods.rs", + &[(1, 0, 8),] + ), + t!( + regression050, + &["\x00\x00\x01", "\x00\x00\x00"], + "\x00\x00\x00", + &[(1, 0, 3),] + ), + t!( + regression060, + &["\x00\x00\x00", "\x00\x00\x01"], + "\x00\x00\x00", + &[(0, 0, 3),] + ), +]; + +const TEDDY: &'static [SearchTest] = &[ + t!( + teddy010, + &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + "abcdefghijk", + &[ + (0, 0, 1), + (1, 1, 2), + (2, 2, 3), + (3, 3, 4), + (4, 4, 5), + (5, 5, 6), + (6, 6, 7), + (7, 7, 8), + (8, 8, 9), + (9, 9, 10), + (10, 10, 11) + ] + ), + t!( + teddy020, + &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"], + "abcdefghijk", + &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),] + ), + t!( + teddy030, + &["abc"], + "abcdefghijklmnopqrstuvwxyzabcdefghijk", + &[(0, 0, 3), (0, 26, 29)] + ), +]; + +// Now define a test for each combination of things above that we want to run. +// Since there are a few different combinations for each collection of tests, +// we define a couple of macros to avoid repetition drudgery. The testconfig +// macro constructs the automaton from a given match kind, and runs the search +// tests one-by-one over the given collection. The `with` parameter allows one +// to configure the config with additional parameters. The testcombo macro +// invokes testconfig in precisely this way: it sets up several tests where +// each one turns a different knob on Config. + +macro_rules! 
testconfig { + ($name:ident, $collection:expr, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut config = Config::new(); + $with(&mut config); + let mut builder = config.builder(); + builder.extend(test.patterns.iter().map(|p| p.as_bytes())); + let searcher = match builder.build() { + Some(searcher) => searcher, + None => { + // For x86-64 and aarch64, not building a searcher is + // probably a bug, so be loud. + if cfg!(any( + target_arch = "x86_64", + target_arch = "aarch64" + )) { + panic!("failed to build packed searcher") + } + return None; + } + }; + Some(searcher.find_iter(&test.haystack).collect()) + }); + } + }; +} + +testconfig!( + search_default_leftmost_first, + PACKED_LEFTMOST_FIRST, + |_: &mut Config| {} +); + +testconfig!( + search_default_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.match_kind(MatchKind::LeftmostLongest); + } +); + +testconfig!( + search_teddy_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.only_teddy(true); + } +); + +testconfig!( + search_teddy_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); + } +); + +testconfig!( + search_teddy_ssse3_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.only_teddy(true); + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("ssse3") { + c.only_teddy_256bit(Some(false)); + } + } +); + +testconfig!( + search_teddy_ssse3_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("ssse3") { + c.only_teddy_256bit(Some(false)); + } + } +); + +testconfig!( + search_teddy_avx2_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.only_teddy(true); + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx2") { + c.only_teddy_256bit(Some(true)); + } + } +); 
+ +testconfig!( + search_teddy_avx2_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx2") { + c.only_teddy_256bit(Some(true)); + } + } +); + +testconfig!( + search_teddy_fat_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.only_teddy(true); + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx2") { + c.only_teddy_fat(Some(true)); + } + } +); + +testconfig!( + search_teddy_fat_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx2") { + c.only_teddy_fat(Some(true)); + } + } +); + +testconfig!( + search_rabinkarp_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.only_rabin_karp(true); + } +); + +testconfig!( + search_rabinkarp_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.only_rabin_karp(true).match_kind(MatchKind::LeftmostLongest); + } +); + +#[test] +fn search_tests_have_unique_names() { + let assert = |constname, tests: &[SearchTest]| { + let mut seen = HashMap::new(); // map from test name to position + for (i, test) in tests.iter().enumerate() { + if !seen.contains_key(test.name) { + seen.insert(test.name, i); + } else { + let last = seen[test.name]; + panic!( + "{} tests have duplicate names at positions {} and {}", + constname, last, i + ); + } + } + }; + assert("BASICS", BASICS); + assert("LEFTMOST", LEFTMOST); + assert("LEFTMOST_FIRST", LEFTMOST_FIRST); + assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); + assert("REGRESSION", REGRESSION); + assert("TEDDY", TEDDY); +} + +fn run_search_tests<F: FnMut(&SearchTestOwned) -> Option<Vec<Match>>>( + which: TestCollection, + mut f: F, +) { + let get_match_triples = + |matches: Vec<Match>| -> Vec<(usize, usize, usize)> { + matches + .into_iter() + .map(|m| 
(m.pattern().as_usize(), m.start(), m.end())) + .collect() + }; + for &tests in which { + for spec in tests { + for test in spec.variations() { + let results = match f(&test) { + None => continue, + Some(results) => results, + }; + assert_eq!( + test.matches, + get_match_triples(results).as_slice(), + "test: {}, patterns: {:?}, haystack(len={:?}): {:?}, \ + offset: {:?}", + test.name, + test.patterns, + test.haystack.len(), + test.haystack, + test.offset, + ); + } + } + } +} diff --git a/vendor/aho-corasick/src/packed/vector.rs b/vendor/aho-corasick/src/packed/vector.rs new file mode 100644 index 0000000..ed3f890 --- /dev/null +++ b/vendor/aho-corasick/src/packed/vector.rs @@ -0,0 +1,1752 @@ +// NOTE: The descriptions for each of the vector methods on the traits below +// are pretty inscrutable. For this reason, there are tests for every method +// on for every trait impl below. If you're confused about what an op does, +// consult its test. (They probably should be doc tests, but I couldn't figure +// out how to write them in a non-annoying way.) + +use core::{ + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +/// A trait for describing vector operations used by vectorized searchers. +/// +/// The trait is highly constrained to low level vector operations needed for +/// the specific algorithms used in this crate. In general, it was invented +/// mostly to be generic over x86's __m128i and __m256i types. At time of +/// writing, it also supports wasm and aarch64 128-bit vector types as well. +/// +/// # Safety +/// +/// All methods are not safe since they are intended to be implemented using +/// vendor intrinsics, which are also not safe. Callers must ensure that +/// the appropriate target features are enabled in the calling function, +/// and that the current CPU supports them. All implementations should +/// avoid marking the routines with `#[target_feature]` and instead mark +/// them as `#[inline(always)]` to ensure they get appropriately inlined. 
+/// (`inline(always)` cannot be used with target_feature.) +pub(crate) trait Vector: + Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe +{ + /// The number of bits in the vector. + const BITS: usize; + /// The number of bytes in the vector. That is, this is the size of the + /// vector in memory. + const BYTES: usize; + + /// Create a vector with 8-bit lanes with the given byte repeated into each + /// lane. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn splat(byte: u8) -> Self; + + /// Read a vector-size number of bytes from the given pointer. The pointer + /// does not need to be aligned. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + /// + /// Callers must guarantee that at least `BYTES` bytes are readable from + /// `data`. + unsafe fn load_unaligned(data: *const u8) -> Self; + + /// Returns true if and only if this vector has zero in all of its lanes. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn is_zero(self) -> bool; + + /// Do an 8-bit pairwise equality check. If lane `i` is equal in this + /// vector and the one given, then lane `i` in the resulting vector is set + /// to `0xFF`. Otherwise, it is set to `0x00`. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn cmpeq(self, vector2: Self) -> Self; + + /// Perform a bitwise 'and' of this vector and the one given and return + /// the result. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn and(self, vector2: Self) -> Self; + + /// Perform a bitwise 'or' of this vector and the one given and return + /// the result. 
+ /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn or(self, vector2: Self) -> Self; + + /// Shift each 8-bit lane in this vector to the right by the number of + /// bits indictated by the `BITS` type parameter. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self; + + /// Shift this vector to the left by one byte and shift the most + /// significant byte of `vector2` into the least significant position of + /// this vector. + /// + /// Stated differently, this behaves as if `self` and `vector2` were + /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted + /// right by `Self::BYTES - 1` bytes. + /// + /// With respect to the Teddy algorithm, `vector2` is usually a previous + /// `Self::BYTES` chunk from the haystack and `self` is the chunk + /// immediately following it. This permits combining the last two bytes + /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1` + /// bytes from the current chunk. This permits aligning the result of + /// various shuffles so that they can be and-ed together and a possible + /// candidate discovered. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn shift_in_one_byte(self, vector2: Self) -> Self; + + /// Shift this vector to the left by two bytes and shift the two most + /// significant bytes of `vector2` into the least significant position of + /// this vector. + /// + /// Stated differently, this behaves as if `self` and `vector2` were + /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted + /// right by `Self::BYTES - 2` bytes. 
+ /// + /// With respect to the Teddy algorithm, `vector2` is usually a previous + /// `Self::BYTES` chunk from the haystack and `self` is the chunk + /// immediately following it. This permits combining the last two bytes + /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2` + /// bytes from the current chunk. This permits aligning the result of + /// various shuffles so that they can be and-ed together and a possible + /// candidate discovered. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self; + + /// Shift this vector to the left by three bytes and shift the three most + /// significant bytes of `vector2` into the least significant position of + /// this vector. + /// + /// Stated differently, this behaves as if `self` and `vector2` were + /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted + /// right by `Self::BYTES - 3` bytes. + /// + /// With respect to the Teddy algorithm, `vector2` is usually a previous + /// `Self::BYTES` chunk from the haystack and `self` is the chunk + /// immediately following it. This permits combining the last three bytes + /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3` + /// bytes from the current chunk. This permits aligning the result of + /// various shuffles so that they can be and-ed together and a possible + /// candidate discovered. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self; + + /// Shuffles the bytes in this vector according to the indices in each of + /// the corresponding lanes in `indices`. + /// + /// If `i` is the index of corresponding lanes, `A` is this vector, `B` is + /// indices and `C` is the resulting vector, then `C = A[B[i]]`. 
+ /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn shuffle_bytes(self, indices: Self) -> Self; + + /// Call the provided function for each 64-bit lane in this vector. The + /// given function is provided the lane index and lane value as a `u64`. + /// + /// If `f` returns `Some`, then iteration over the lanes is stopped and the + /// value is returned. Otherwise, this returns `None`. + /// + /// # Notes + /// + /// Conceptually it would be nice if we could have a + /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is + /// tricky given Rust's [current support for const generics][support]. + /// And even if we could, it would be tricky to write generic code over + /// it. (Not impossible. We could introduce another layer that requires + /// `AsRef<[u64]>` or something.) + /// + /// [support]: https://github.com/rust-lang/rust/issues/60551 + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn for_each_64bit_lane<T>( + self, + f: impl FnMut(usize, u64) -> Option<T>, + ) -> Option<T>; +} + +/// This trait extends the `Vector` trait with additional operations to support +/// Fat Teddy. +/// +/// Fat Teddy uses 16 buckets instead of 8, but reads half as many bytes (as +/// the vector size) instead of the full size of a vector per iteration. For +/// example, when using a 256-bit vector, Slim Teddy reads 32 bytes at a timr +/// but Fat Teddy reads 16 bytes at a time. +/// +/// Fat Teddy is useful when searching for a large number of literals. +/// The extra number of buckets spreads the literals out more and reduces +/// verification time. +/// +/// Currently we only implement this for AVX on x86_64. It would be nice to +/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two +/// only reading 8 bytes at a time. 
It's not clear how well it would work, but +/// there are some tricky things to figure out in terms of implementation. The +/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably +/// the trickiest of the bunch. For AVX2, these are implemented by taking +/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit +/// half instead of the full 256-bit vector. (Where as `_mm_alignr_epi8` +/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't +/// do a careful survey of NEON to see if it could easily support these +/// operations. +pub(crate) trait FatVector: Vector { + type Half: Vector; + + /// Read a half-vector-size number of bytes from the given pointer, and + /// broadcast it across both halfs of a full vector. The pointer does not + /// need to be aligned. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + /// + /// Callers must guarantee that at least `Self::HALF::BYTES` bytes are + /// readable from `data`. + unsafe fn load_half_unaligned(data: *const u8) -> Self; + + /// Like `Vector::shift_in_one_byte`, except this is done for each half + /// of the vector instead. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self; + + /// Like `Vector::shift_in_two_bytes`, except this is done for each half + /// of the vector instead. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self; + + /// Like `Vector::shift_in_two_bytes`, except this is done for each half + /// of the vector instead. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. 
+ unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self; + + /// Swap the 128-bit lanes in this vector. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn swap_halves(self) -> Self; + + /// Unpack and interleave the 8-bit lanes from the low 128 bits of each + /// vector and return the result. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self; + + /// Unpack and interleave the 8-bit lanes from the high 128 bits of each + /// vector and return the result. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. + unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self; + + /// Call the provided function for each 64-bit lane in the lower half + /// of this vector and then in the other vector. The given function is + /// provided the lane index and lane value as a `u64`. (The high 128-bits + /// of each vector are ignored.) + /// + /// If `f` returns `Some`, then iteration over the lanes is stopped and the + /// value is returned. Otherwise, this returns `None`. + /// + /// # Safety + /// + /// Callers must ensure that this is okay to call in the current target for + /// the current CPU. 
+ unsafe fn for_each_low_64bit_lane<T>( + self, + vector2: Self, + f: impl FnMut(usize, u64) -> Option<T>, + ) -> Option<T>; +} + +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +mod x86_64_ssse3 { + use core::arch::x86_64::*; + + use crate::util::int::{I32, I8}; + + use super::Vector; + + impl Vector for __m128i { + const BITS: usize = 128; + const BYTES: usize = 16; + + #[inline(always)] + unsafe fn splat(byte: u8) -> __m128i { + _mm_set1_epi8(i8::from_bits(byte)) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> __m128i { + _mm_loadu_si128(data.cast::<__m128i>()) + } + + #[inline(always)] + unsafe fn is_zero(self) -> bool { + let cmp = self.cmpeq(Self::splat(0)); + _mm_movemask_epi8(cmp).to_bits() == 0xFFFF + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> __m128i { + _mm_cmpeq_epi8(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> __m128i { + _mm_and_si128(self, vector2) + } + + #[inline(always)] + unsafe fn or(self, vector2: Self) -> __m128i { + _mm_or_si128(self, vector2) + } + + #[inline(always)] + unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { + // Apparently there is no _mm_srli_epi8, so we emulate it by + // shifting 16-bit integers and masking out the high nybble of each + // 8-bit lane (since that nybble will contain bits from the low + // nybble of the previous lane). 
+ let lomask = Self::splat(0xF); + _mm_srli_epi16(self, BITS).and(lomask) + } + + #[inline(always)] + unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { + _mm_alignr_epi8(self, vector2, 15) + } + + #[inline(always)] + unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { + _mm_alignr_epi8(self, vector2, 14) + } + + #[inline(always)] + unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { + _mm_alignr_epi8(self, vector2, 13) + } + + #[inline(always)] + unsafe fn shuffle_bytes(self, indices: Self) -> Self { + _mm_shuffle_epi8(self, indices) + } + + #[inline(always)] + unsafe fn for_each_64bit_lane<T>( + self, + mut f: impl FnMut(usize, u64) -> Option<T>, + ) -> Option<T> { + // We could just use _mm_extract_epi64 here, but that requires + // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1, + // but everything else works with SSSE3 so we stick to that subset. + let lanes: [u64; 2] = core::mem::transmute(self); + if let Some(t) = f(0, lanes[0]) { + return Some(t); + } + if let Some(t) = f(1, lanes[1]) { + return Some(t); + } + None + } + } +} + +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +mod x86_64_avx2 { + use core::arch::x86_64::*; + + use crate::util::int::{I32, I64, I8}; + + use super::{FatVector, Vector}; + + impl Vector for __m256i { + const BITS: usize = 256; + const BYTES: usize = 32; + + #[inline(always)] + unsafe fn splat(byte: u8) -> __m256i { + _mm256_set1_epi8(i8::from_bits(byte)) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> __m256i { + _mm256_loadu_si256(data.cast::<__m256i>()) + } + + #[inline(always)] + unsafe fn is_zero(self) -> bool { + let cmp = self.cmpeq(Self::splat(0)); + _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> __m256i { + _mm256_cmpeq_epi8(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> __m256i { + _mm256_and_si256(self, vector2) + } + + 
#[inline(always)] + unsafe fn or(self, vector2: Self) -> __m256i { + _mm256_or_si256(self, vector2) + } + + #[inline(always)] + unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { + let lomask = Self::splat(0xF); + _mm256_srli_epi16(self, BITS).and(lomask) + } + + #[inline(always)] + unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit + // PALIGNR instructions, which is not what we want, so we need to + // do some extra shuffling. + let v = _mm256_permute2x128_si256(vector2, self, 0x21); + _mm256_alignr_epi8(self, v, 15) + } + + #[inline(always)] + unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit + // PALIGNR instructions, which is not what we want, so we need to + // do some extra shuffling. + let v = _mm256_permute2x128_si256(vector2, self, 0x21); + _mm256_alignr_epi8(self, v, 14) + } + + #[inline(always)] + unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit + // PALIGNR instructions, which is not what we want, so we need to + // do some extra shuffling. 
+ let v = _mm256_permute2x128_si256(vector2, self, 0x21); + _mm256_alignr_epi8(self, v, 13) + } + + #[inline(always)] + unsafe fn shuffle_bytes(self, indices: Self) -> Self { + _mm256_shuffle_epi8(self, indices) + } + + #[inline(always)] + unsafe fn for_each_64bit_lane<T>( + self, + mut f: impl FnMut(usize, u64) -> Option<T>, + ) -> Option<T> { + // NOTE: At one point in the past, I used transmute to this to + // get a [u64; 4], but it turned out to lead to worse codegen IIRC. + // I've tried it more recently, and it looks like that's no longer + // the case. But since there's no difference, we stick with the + // slightly more complicated but transmute-free version. + let lane = _mm256_extract_epi64(self, 0).to_bits(); + if let Some(t) = f(0, lane) { + return Some(t); + } + let lane = _mm256_extract_epi64(self, 1).to_bits(); + if let Some(t) = f(1, lane) { + return Some(t); + } + let lane = _mm256_extract_epi64(self, 2).to_bits(); + if let Some(t) = f(2, lane) { + return Some(t); + } + let lane = _mm256_extract_epi64(self, 3).to_bits(); + if let Some(t) = f(3, lane) { + return Some(t); + } + None + } + } + + impl FatVector for __m256i { + type Half = __m128i; + + #[inline(always)] + unsafe fn load_half_unaligned(data: *const u8) -> Self { + let half = Self::Half::load_unaligned(data); + _mm256_broadcastsi128_si256(half) + } + + #[inline(always)] + unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self { + _mm256_alignr_epi8(self, vector2, 15) + } + + #[inline(always)] + unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self { + _mm256_alignr_epi8(self, vector2, 14) + } + + #[inline(always)] + unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self { + _mm256_alignr_epi8(self, vector2, 13) + } + + #[inline(always)] + unsafe fn swap_halves(self) -> Self { + _mm256_permute4x64_epi64(self, 0x4E) + } + + #[inline(always)] + unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self { + _mm256_unpacklo_epi8(self, vector2) + } + + 
#[inline(always)] + unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self { + _mm256_unpackhi_epi8(self, vector2) + } + + #[inline(always)] + unsafe fn for_each_low_64bit_lane<T>( + self, + vector2: Self, + mut f: impl FnMut(usize, u64) -> Option<T>, + ) -> Option<T> { + let lane = _mm256_extract_epi64(self, 0).to_bits(); + if let Some(t) = f(0, lane) { + return Some(t); + } + let lane = _mm256_extract_epi64(self, 1).to_bits(); + if let Some(t) = f(1, lane) { + return Some(t); + } + let lane = _mm256_extract_epi64(vector2, 0).to_bits(); + if let Some(t) = f(2, lane) { + return Some(t); + } + let lane = _mm256_extract_epi64(vector2, 1).to_bits(); + if let Some(t) = f(3, lane) { + return Some(t); + } + None + } + } +} + +#[cfg(target_arch = "aarch64")] +mod aarch64_neon { + use core::arch::aarch64::*; + + use super::Vector; + + impl Vector for uint8x16_t { + const BITS: usize = 128; + const BYTES: usize = 16; + + #[inline(always)] + unsafe fn splat(byte: u8) -> uint8x16_t { + vdupq_n_u8(byte) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> uint8x16_t { + vld1q_u8(data) + } + + #[inline(always)] + unsafe fn is_zero(self) -> bool { + // Could also use vmaxvq_u8. + // ... I tried that and couldn't observe any meaningful difference + // in benchmarks. 
+ let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self)); + vgetq_lane_u64(maxes, 0) == 0 + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t { + vceqq_u8(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> uint8x16_t { + vandq_u8(self, vector2) + } + + #[inline(always)] + unsafe fn or(self, vector2: Self) -> uint8x16_t { + vorrq_u8(self, vector2) + } + + #[inline(always)] + unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { + debug_assert!(BITS <= 7); + vshrq_n_u8(self, BITS) + } + + #[inline(always)] + unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { + vextq_u8(vector2, self, 15) + } + + #[inline(always)] + unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { + vextq_u8(vector2, self, 14) + } + + #[inline(always)] + unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { + vextq_u8(vector2, self, 13) + } + + #[inline(always)] + unsafe fn shuffle_bytes(self, indices: Self) -> Self { + vqtbl1q_u8(self, indices) + } + + #[inline(always)] + unsafe fn for_each_64bit_lane<T>( + self, + mut f: impl FnMut(usize, u64) -> Option<T>, + ) -> Option<T> { + let this = vreinterpretq_u64_u8(self); + let lane = vgetq_lane_u64(this, 0); + if let Some(t) = f(0, lane) { + return Some(t); + } + let lane = vgetq_lane_u64(this, 1); + if let Some(t) = f(1, lane) { + return Some(t); + } + None + } + } +} + +#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))] +mod tests_x86_64_ssse3 { + use core::arch::x86_64::*; + + use crate::util::int::{I32, U32}; + + use super::*; + + fn is_runnable() -> bool { + std::is_x86_feature_detected!("ssse3") + } + + #[target_feature(enable = "ssse3")] + unsafe fn load(lanes: [u8; 16]) -> __m128i { + __m128i::load_unaligned(&lanes as *const u8) + } + + #[target_feature(enable = "ssse3")] + unsafe fn unload(v: __m128i) -> [u8; 16] { + [ + _mm_extract_epi8(v, 0).to_bits().low_u8(), + _mm_extract_epi8(v, 1).to_bits().low_u8(), + _mm_extract_epi8(v, 
2).to_bits().low_u8(), + _mm_extract_epi8(v, 3).to_bits().low_u8(), + _mm_extract_epi8(v, 4).to_bits().low_u8(), + _mm_extract_epi8(v, 5).to_bits().low_u8(), + _mm_extract_epi8(v, 6).to_bits().low_u8(), + _mm_extract_epi8(v, 7).to_bits().low_u8(), + _mm_extract_epi8(v, 8).to_bits().low_u8(), + _mm_extract_epi8(v, 9).to_bits().low_u8(), + _mm_extract_epi8(v, 10).to_bits().low_u8(), + _mm_extract_epi8(v, 11).to_bits().low_u8(), + _mm_extract_epi8(v, 12).to_bits().low_u8(), + _mm_extract_epi8(v, 13).to_bits().low_u8(), + _mm_extract_epi8(v, 14).to_bits().low_u8(), + _mm_extract_epi8(v, 15).to_bits().low_u8(), + ] + } + + #[test] + fn vector_splat() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v = __m128i::splat(0xAF); + assert_eq!( + unload(v), + [ + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_is_zero() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert!(!v.is_zero()); + let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert!(v.is_zero()); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_cmpeq() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]); + let v2 = + load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); + assert_eq!( + unload(v1.cmpeq(v2)), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_and() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let v2 = + load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!( + unload(v1.and(v2)), + [0, 0, 0, 0, 0, 0b1000, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_or() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let v2 = + load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!( + unload(v1.or(v2)), + [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_8bit_lane_right() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v = load([ + 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert_eq!( + unload(v.shift_8bit_lane_right::<2>()), + [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_one_byte() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.shift_in_one_byte(v2)), + [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_two_bytes() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.shift_in_two_bytes(v2)), + [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_three_bytes() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 
23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.shift_in_three_bytes(v2)), + [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shuffle_bytes() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = + load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]); + assert_eq!( + unload(v1.shuffle_bytes(v2)), + [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_for_each_64bit_lane() { + #[target_feature(enable = "ssse3")] + unsafe fn test() { + let v = load([ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, + ]); + let mut lanes = [0u64; 2]; + v.for_each_64bit_lane(|i, lane| { + lanes[i] = lane; + None::<()> + }); + assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],); + } + if !is_runnable() { + return; + } + unsafe { test() } + } +} + +#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))] +mod tests_x86_64_avx2 { + use core::arch::x86_64::*; + + use crate::util::int::{I32, U32}; + + use super::*; + + fn is_runnable() -> bool { + std::is_x86_feature_detected!("avx2") + } + + #[target_feature(enable = "avx2")] + unsafe fn load(lanes: [u8; 32]) -> __m256i { + __m256i::load_unaligned(&lanes as *const u8) + } + + #[target_feature(enable = "avx2")] + unsafe fn load_half(lanes: [u8; 16]) -> __m256i { + __m256i::load_half_unaligned(&lanes as *const u8) + } + + #[target_feature(enable = "avx2")] + unsafe fn unload(v: __m256i) -> [u8; 32] { + [ + _mm256_extract_epi8(v, 0).to_bits().low_u8(), + _mm256_extract_epi8(v, 1).to_bits().low_u8(), + _mm256_extract_epi8(v, 2).to_bits().low_u8(), + _mm256_extract_epi8(v, 3).to_bits().low_u8(), + _mm256_extract_epi8(v, 4).to_bits().low_u8(), + 
_mm256_extract_epi8(v, 5).to_bits().low_u8(), + _mm256_extract_epi8(v, 6).to_bits().low_u8(), + _mm256_extract_epi8(v, 7).to_bits().low_u8(), + _mm256_extract_epi8(v, 8).to_bits().low_u8(), + _mm256_extract_epi8(v, 9).to_bits().low_u8(), + _mm256_extract_epi8(v, 10).to_bits().low_u8(), + _mm256_extract_epi8(v, 11).to_bits().low_u8(), + _mm256_extract_epi8(v, 12).to_bits().low_u8(), + _mm256_extract_epi8(v, 13).to_bits().low_u8(), + _mm256_extract_epi8(v, 14).to_bits().low_u8(), + _mm256_extract_epi8(v, 15).to_bits().low_u8(), + _mm256_extract_epi8(v, 16).to_bits().low_u8(), + _mm256_extract_epi8(v, 17).to_bits().low_u8(), + _mm256_extract_epi8(v, 18).to_bits().low_u8(), + _mm256_extract_epi8(v, 19).to_bits().low_u8(), + _mm256_extract_epi8(v, 20).to_bits().low_u8(), + _mm256_extract_epi8(v, 21).to_bits().low_u8(), + _mm256_extract_epi8(v, 22).to_bits().low_u8(), + _mm256_extract_epi8(v, 23).to_bits().low_u8(), + _mm256_extract_epi8(v, 24).to_bits().low_u8(), + _mm256_extract_epi8(v, 25).to_bits().low_u8(), + _mm256_extract_epi8(v, 26).to_bits().low_u8(), + _mm256_extract_epi8(v, 27).to_bits().low_u8(), + _mm256_extract_epi8(v, 28).to_bits().low_u8(), + _mm256_extract_epi8(v, 29).to_bits().low_u8(), + _mm256_extract_epi8(v, 30).to_bits().low_u8(), + _mm256_extract_epi8(v, 31).to_bits().low_u8(), + ] + } + + #[test] + fn vector_splat() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v = __m256i::splat(0xAF); + assert_eq!( + unload(v), + [ + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_is_zero() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v = load([ + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert!(!v.is_zero()); + let 
v = load([ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert!(v.is_zero()); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_cmpeq() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, + ]); + let v2 = load([ + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ]); + assert_eq!( + unload(v1.cmpeq(v2)), + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_and() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + let v2 = load([ + 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert_eq!( + unload(v1.and(v2)), + [ + 0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_or() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + let v2 = load([ + 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert_eq!( + unload(v1.or(v2)), + [ + 0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_8bit_lane_right() { + 
#[target_feature(enable = "avx2")] + unsafe fn test() { + let v = load([ + 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert_eq!( + unload(v.shift_8bit_lane_right::<2>()), + [ + 0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_one_byte() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + let v2 = load([ + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, + ]); + assert_eq!( + unload(v1.shift_in_one_byte(v2)), + [ + 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_two_bytes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + let v2 = load([ + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, + ]); + assert_eq!( + unload(v1.shift_in_two_bytes(v2)), + [ + 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_three_bytes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 
31, 32, + ]); + let v2 = load([ + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, + ]); + assert_eq!( + unload(v1.shift_in_three_bytes(v2)), + [ + 62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_shuffle_bytes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + let v2 = load([ + 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, + 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28, + ]); + assert_eq!( + unload(v1.shuffle_bytes(v2)), + [ + 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17, + 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29, + 29 + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn vector_for_each_64bit_lane() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v = load([ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, + 0x1F, 0x20, + ]); + let mut lanes = [0u64; 4]; + v.for_each_64bit_lane(|i, lane| { + lanes[i] = lane; + None::<()> + }); + assert_eq!( + lanes, + [ + 0x0807060504030201, + 0x100F0E0D0C0B0A09, + 0x1817161514131211, + 0x201F1E1D1C1B1A19 + ] + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_half_shift_in_one_byte() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load_half([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ]); + let v2 = load_half([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + 
unload(v1.half_shift_in_one_byte(v2)), + [ + 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_half_shift_in_two_bytes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load_half([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ]); + let v2 = load_half([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.half_shift_in_two_bytes(v2)), + [ + 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, + 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_half_shift_in_three_bytes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load_half([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ]); + let v2 = load_half([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.half_shift_in_three_bytes(v2)), + [ + 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30, + 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_swap_halves() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v.swap_halves()), + [ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_interleave_low_8bit_lanes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + let v2 = load([ + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, + ]); + assert_eq!( + unload(v1.interleave_low_8bit_lanes(v2)), + [ + 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, + 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_interleave_high_8bit_lanes() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + let v2 = load([ + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, + ]); + assert_eq!( + unload(v1.interleave_high_8bit_lanes(v2)), + [ + 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, + 48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, + 63, 32, 64, + ], + ); + } + if !is_runnable() { + return; + } + unsafe { test() } + } + + #[test] + fn fat_vector_for_each_low_64bit_lane() { + #[target_feature(enable = "avx2")] + unsafe fn test() { + let v1 = load([ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, + 0x1F, 0x20, + ]); + let v2 = load([ + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, + 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, + 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, + 0x3F, 0x40, + ]); + let mut lanes = [0u64; 4]; + v1.for_each_low_64bit_lane(v2, |i, lane| { + lanes[i] = lane; + None::<()> + }); + assert_eq!( + lanes, + [ + 0x0807060504030201, + 0x100F0E0D0C0B0A09, + 0x2827262524232221, + 0x302F2E2D2C2B2A29 + ] + ); + } + if 
!is_runnable() { + return; + } + unsafe { test() } + } +} + +#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))] +mod tests_aarch64_neon { + use core::arch::aarch64::*; + + use super::*; + + #[target_feature(enable = "neon")] + unsafe fn load(lanes: [u8; 16]) -> uint8x16_t { + uint8x16_t::load_unaligned(&lanes as *const u8) + } + + #[target_feature(enable = "neon")] + unsafe fn unload(v: uint8x16_t) -> [u8; 16] { + [ + vgetq_lane_u8(v, 0), + vgetq_lane_u8(v, 1), + vgetq_lane_u8(v, 2), + vgetq_lane_u8(v, 3), + vgetq_lane_u8(v, 4), + vgetq_lane_u8(v, 5), + vgetq_lane_u8(v, 6), + vgetq_lane_u8(v, 7), + vgetq_lane_u8(v, 8), + vgetq_lane_u8(v, 9), + vgetq_lane_u8(v, 10), + vgetq_lane_u8(v, 11), + vgetq_lane_u8(v, 12), + vgetq_lane_u8(v, 13), + vgetq_lane_u8(v, 14), + vgetq_lane_u8(v, 15), + ] + } + + // Example functions. These don't test the Vector traits, but rather, + // specific NEON instructions. They are basically little experiments I + // wrote to figure out what an instruction does since their descriptions + // are so dense. I decided to keep the experiments around as example tests + // in case there' useful. 
+ + #[test] + fn example_vmaxvq_u8_non_zero() { + #[target_feature(enable = "neon")] + unsafe fn example() { + let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!(vmaxvq_u8(v), 1); + } + unsafe { example() } + } + + #[test] + fn example_vmaxvq_u8_zero() { + #[target_feature(enable = "neon")] + unsafe fn example() { + let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!(vmaxvq_u8(v), 0); + } + unsafe { example() } + } + + #[test] + fn example_vpmaxq_u8_non_zero() { + #[target_feature(enable = "neon")] + unsafe fn example() { + let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let r = vpmaxq_u8(v, v); + assert_eq!( + unload(r), + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] + ); + } + unsafe { example() } + } + + #[test] + fn example_vpmaxq_u8_self() { + #[target_feature(enable = "neon")] + unsafe fn example() { + let v = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let r = vpmaxq_u8(v, v); + assert_eq!( + unload(r), + [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16] + ); + } + unsafe { example() } + } + + #[test] + fn example_vpmaxq_u8_other() { + #[target_feature(enable = "neon")] + unsafe fn example() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + let r = vpmaxq_u8(v1, v2); + assert_eq!( + unload(r), + [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32] + ); + } + unsafe { example() } + } + + // Now we test the actual methods on the Vector trait. 
+ + #[test] + fn vector_splat() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v = uint8x16_t::splat(0xAF); + assert_eq!( + unload(v), + [ + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, + 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF + ] + ); + } + unsafe { test() } + } + + #[test] + fn vector_is_zero() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert!(!v.is_zero()); + let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert!(v.is_zero()); + } + unsafe { test() } + } + + #[test] + fn vector_cmpeq() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]); + let v2 = + load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); + assert_eq!( + unload(v1.cmpeq(v2)), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF] + ); + } + unsafe { test() } + } + + #[test] + fn vector_and() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let v2 = + load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!( + unload(v1.and(v2)), + [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ); + } + unsafe { test() } + } + + #[test] + fn vector_or() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let v2 = + load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!( + unload(v1.or(v2)), + [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ); + } + unsafe { test() } + } + + #[test] + fn vector_shift_8bit_lane_right() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v = load([ + 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + assert_eq!( + unload(v.shift_8bit_lane_right::<2>()), + [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + 
); + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_one_byte() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.shift_in_one_byte(v2)), + [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ); + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_two_bytes() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.shift_in_two_bytes(v2)), + [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ); + } + unsafe { test() } + } + + #[test] + fn vector_shift_in_three_bytes() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = load([ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]); + assert_eq!( + unload(v1.shift_in_three_bytes(v2)), + [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ); + } + unsafe { test() } + } + + #[test] + fn vector_shuffle_bytes() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v1 = + load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); + let v2 = + load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]); + assert_eq!( + unload(v1.shuffle_bytes(v2)), + [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13], + ); + } + unsafe { test() } + } + + #[test] + fn vector_for_each_64bit_lane() { + #[target_feature(enable = "neon")] + unsafe fn test() { + let v = load([ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, + ]); + let mut lanes = [0u64; 2]; + v.for_each_64bit_lane(|i, lane| { + lanes[i] = lane; + None::<()> + }); + 
assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],); + } + unsafe { test() } + } +} diff --git a/vendor/aho-corasick/src/tests.rs b/vendor/aho-corasick/src/tests.rs new file mode 100644 index 0000000..a5276f8 --- /dev/null +++ b/vendor/aho-corasick/src/tests.rs @@ -0,0 +1,1664 @@ +use std::{collections::HashMap, format, string::String, vec::Vec}; + +use crate::{ + AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, Anchored, Input, Match, + MatchKind, StartKind, +}; + +/// A description of a single test against an Aho-Corasick automaton. +/// +/// A single test may not necessarily pass on every configuration of an +/// Aho-Corasick automaton. The tests are categorized and grouped appropriately +/// below. +#[derive(Clone, Debug, Eq, PartialEq)] +struct SearchTest { + /// The name of this test, for debugging. + name: &'static str, + /// The patterns to search for. + patterns: &'static [&'static str], + /// The text to search. + haystack: &'static str, + /// Each match is a triple of (pattern_index, start, end), where + /// pattern_index is an index into `patterns` and `start`/`end` are indices + /// into `haystack`. + matches: &'static [(usize, usize, usize)], +} + +/// Short-hand constructor for SearchTest. We use it a lot below. +macro_rules! t { + ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { + SearchTest { + name: stringify!($name), + patterns: $patterns, + haystack: $haystack, + matches: $matches, + } + }; +} + +/// A collection of test groups. +type TestCollection = &'static [&'static [SearchTest]]; + +// Define several collections corresponding to the different type of match +// semantics supported by Aho-Corasick. These collections have some overlap, +// but each collection should have some tests that no other collection has. + +/// Tests for Aho-Corasick's standard non-overlapping match semantics. 
+const AC_STANDARD_NON_OVERLAPPING: TestCollection = + &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION]; + +/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics. +const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection = + &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED]; + +/// Tests for Aho-Corasick's standard overlapping match semantics. +const AC_STANDARD_OVERLAPPING: TestCollection = + &[BASICS, OVERLAPPING, REGRESSION]; + +/* +Iterators of anchored overlapping searches were removed from the API in +after 0.7, but we leave the tests commented out for posterity. +/// Tests for Aho-Corasick's anchored standard overlapping match semantics. +const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection = + &[ANCHORED_BASICS, ANCHORED_OVERLAPPING]; +*/ + +/// Tests for Aho-Corasick's leftmost-first match semantics. +const AC_LEFTMOST_FIRST: TestCollection = + &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION]; + +/// Tests for Aho-Corasick's anchored leftmost-first match semantics. +const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[ + ANCHORED_BASICS, + ANCHORED_NON_OVERLAPPING, + ANCHORED_LEFTMOST, + ANCHORED_LEFTMOST_FIRST, +]; + +/// Tests for Aho-Corasick's leftmost-longest match semantics. +const AC_LEFTMOST_LONGEST: TestCollection = + &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION]; + +/// Tests for Aho-Corasick's anchored leftmost-longest match semantics. +const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[ + ANCHORED_BASICS, + ANCHORED_NON_OVERLAPPING, + ANCHORED_LEFTMOST, + ANCHORED_LEFTMOST_LONGEST, +]; + +// Now define the individual tests that make up the collections above. + +/// A collection of tests for the Aho-Corasick algorithm that should always be +/// true regardless of match semantics. That is, all combinations of +/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} +/// should produce the same answer. 
+const BASICS: &'static [SearchTest] = &[ + t!(basic000, &[], "", &[]), + t!(basic001, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(basic002, &["a"], "", &[]), + t!(basic010, &["a"], "a", &[(0, 0, 1)]), + t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), + t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), + t!(basic050, &["a"], "bba", &[(0, 2, 3)]), + t!(basic060, &["a"], "bbb", &[]), + t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), + t!(basic100, &["aa"], "", &[]), + t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), + t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), + t!(basic130, &["aa"], "abbab", &[]), + t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), + t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), + t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), + t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), + t!(basic300, &["a", "b"], "", &[]), + t!(basic310, &["a", "b"], "z", &[]), + t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), + t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), + t!( + basic340, + &["a", "b"], + "abba", + &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] + ), + t!( + basic350, + &["b", "a"], + "abba", + &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] + ), + t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), + t!(basic400, &["foo", "bar"], "", &[]), + t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), + t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), + t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), + t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), + t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), + t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), + t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), + t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), + t!(basic600, &[""], "", &[(0, 0, 0)]), + t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), + 
t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), + t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), + t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), + t!( + basic720, + &["yabcdef", "bcdeyabc", "abcdezghi"], + "yabcdezghi", + &[(2, 1, 10),] + ), +]; + +/// A collection of *anchored* tests for the Aho-Corasick algorithm that should +/// always be true regardless of match semantics. That is, all combinations of +/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should +/// produce the same answer. +const ANCHORED_BASICS: &'static [SearchTest] = &[ + t!(abasic000, &[], "", &[]), + t!(abasic001, &[], "a", &[]), + t!(abasic002, &[], "abc", &[]), + t!(abasic010, &[""], "", &[(0, 0, 0)]), + t!(abasic020, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(abasic030, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), + t!(abasic100, &["a"], "a", &[(0, 0, 1)]), + t!(abasic110, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1), (1, 1, 2)]), + t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1), (0, 1, 2)]), + t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]), + t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]), + t!(abasic200, &["foo"], "foofoo foo", &[(0, 0, 3), (0, 3, 6)]), +]; + +/// Tests for non-overlapping standard match semantics. +/// +/// These tests generally shouldn't pass for leftmost-{first,longest}, although +/// some do in order to write clearer tests. For example, standard000 will +/// pass with leftmost-first semantics, but standard010 will not. We write +/// both to emphasize how the match semantics work. 
+const STANDARD: &'static [SearchTest] = &[ + t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), + t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), + t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), + t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), + t!( + standard400, + &["abcd", "bcd", "cd", "b"], + "abcd", + &[(3, 1, 2), (2, 2, 4),] + ), + t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]), + t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]), + t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), +]; + +/// Like STANDARD, but for anchored searches. +const STANDARD_ANCHORED: &'static [SearchTest] = &[ + t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), + t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), + t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), + t!(astandard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), + t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(astandard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(astandard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2)]), + t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1)]), + t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), +]; + +/// Tests for non-overlapping leftmost match semantics. These should pass for +/// both leftmost-first and leftmost-longest match kinds. Stated differently, +/// among ambiguous matches, the longest match and the match that appeared +/// first when constructing the automaton should always be the same. 
+const LEFTMOST: &'static [SearchTest] = &[ + t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftmost010, &["a", ""], "a", &[(0, 0, 1)]), + t!(leftmost011, &["a", ""], "ab", &[(0, 0, 1), (1, 2, 2)]), + t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), + t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), + t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), + t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + leftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8), (0, 8, 9),] + ), +]; + +/// Like LEFTMOST, but for anchored searches. +const ANCHORED_LEFTMOST: &'static [SearchTest] = &[ + t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + // We shouldn't allow an empty match immediately following a match, right? 
+ t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]), + t!(aleftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]), + t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]), + t!(aleftmost301, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]), + t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]), + t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]), + t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + aleftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + aleftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + aleftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghzyz", + &[(3, 0, 8), (0, 8, 9)] + ), +]; + +/// Tests for non-overlapping leftmost-first match semantics. These tests +/// should generally be specific to leftmost-first, which means they should +/// generally fail under leftmost-longest semantics. 
+const LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]), + t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftfirst014, &["a", ""], "a", &[(0, 0, 1)]), + t!(leftfirst015, &["a", ""], "ab", &[(0, 0, 1), (1, 2, 2)]), + t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), + t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), + t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), + t!( + leftfirst310, + &["abcd", "b", "bce", "ce"], + "abce", + &[(1, 1, 2), (3, 2, 4),] + ), + t!( + leftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1), (2, 7, 9),] + ), + t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), + t!(leftfirst400, &["amwix", "samwise", "sam"], "Zsamwix", &[(2, 1, 4)]), +]; + +/// Like LEFTMOST_FIRST, but for anchored searches. 
const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[
    // Each expected match is a (pattern index, start offset, end offset)
    // triple. Anchored variants of the LEFTMOST_FIRST tests: matches that do
    // not begin where the search starts are dropped, so several expected
    // result lists collapse to empty.
    t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
    t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
    t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1)]),
    t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]),
    t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]),
    t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
    t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
    t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]),
    t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
    t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
    t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]),
    t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]),
    t!(
        aleftfirst320,
        &["a", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(0, 0, 1)]
    ),
    t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]),
    t!(aleftfirst400, &["wise", "samwise", "sam"], "samwix", &[(2, 0, 3)]),
];

/// Tests for non-overlapping leftmost-longest match semantics. These tests
/// should generally be specific to leftmost-longest, which means they should
/// generally fail under leftmost-first semantics.
const LEFTMOST_LONGEST: &'static [SearchTest] = &[
    // Each expected match is a (pattern index, start offset, end offset)
    // triple. Where LEFTMOST_FIRST prefers the earlier-listed pattern,
    // these expectations prefer the longer match at the same start offset.
    t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
    t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
    t!(leftlong020, &["", "a"], "a", &[(1, 0, 1)]),
    t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1)]),
    t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1)]),
    t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1)]),
    t!(leftlong024, &["", "a"], "ab", &[(1, 0, 1), (0, 2, 2)]),
    t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
    t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
    t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
    t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
    t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
    t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
    t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
    t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
    t!(
        leftlong310,
        &["a", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8),]
    ),
    t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
    t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
    t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
];

/// Like LEFTMOST_LONGEST, but for anchored searches.
const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[
    // Each expected match is a (pattern index, start offset, end offset)
    // triple. Anchored variants of the LEFTMOST_LONGEST tests: matches not
    // beginning where the search starts are dropped.
    t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
    t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
    t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]),
    t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]),
    t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]),
    t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]),
    t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
    t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
    t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
    t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
    t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
    t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]),
    t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]),
    t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]),
    t!(
        aleftlong310,
        &["a", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8),]
    ),
    t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
    t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]),
    t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]),
];

/// Tests for non-overlapping match semantics.
///
/// Generally these tests shouldn't pass when using overlapping semantics.
/// These should pass for both standard and leftmost match semantics.
const NON_OVERLAPPING: &'static [SearchTest] = &[
    t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
    t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
    t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
    t!(
        nover100,
        &["ab", "ba"],
        "abababa",
        &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
    ),
    t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
    t!(nover300, &["", ""], "", &[(0, 0, 0),]),
    t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]),
];

/// Like NON_OVERLAPPING, but for anchored searches.
+const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(anover030, &["abc", "bc"], "zazabcz", &[]), + t!( + anover100, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (0, 2, 4), (0, 4, 6)] + ), + t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]), + t!(anover300, &["", ""], "", &[(0, 0, 0)]), + t!(anover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), +]; + +/// Tests for overlapping match semantics. +/// +/// This only supports standard match semantics, since leftmost-{first,longest} +/// do not support overlapping matches. +const OVERLAPPING: &'static [SearchTest] = &[ + t!( + over000, + &["abcd", "bcd", "cd", "b"], + "abcd", + &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),] + ), + t!( + over010, + &["bcd", "cd", "b", "abcd"], + "abcd", + &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),] + ), + t!( + over020, + &["abcd", "bcd", "cd"], + "abcd", + &[(0, 0, 4), (1, 1, 4), (2, 2, 4),] + ), + t!( + over030, + &["bcd", "abcd", "cd"], + "abcd", + &[(1, 0, 4), (0, 1, 4), (2, 2, 4),] + ), + t!( + over040, + &["bcd", "cd", "abcd"], + "abcd", + &[(2, 0, 4), (0, 1, 4), (1, 2, 4),] + ), + t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]), + t!( + over100, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),] + ), + t!( + over200, + &["foo", "foo"], + "foobarfoo", + &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),] + ), + t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), + t!( + over310, + &["", ""], + "a", + &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),] + ), + t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]), + t!( + over330, + &["", "a", ""], + "a", + &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),] + ), + t!( + over340, + &["a", "", ""], + "a", + &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),] + ), + t!( + over350, + &["", "", "a"], + "a", + 
&[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),] + ), + t!( + over360, + &["foo", "foofoo"], + "foofoo", + &[(0, 0, 3), (1, 0, 6), (0, 3, 6)] + ), +]; + +/* +Iterators of anchored overlapping searches were removed from the API in +after 0.7, but we leave the tests commented out for posterity. +/// Like OVERLAPPING, but for anchored searches. +const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[ + t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]), + t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]), + t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]), + t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]), + t!(aover050, &["abc", "bc"], "zazabcz", &[]), + t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), + t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]), + t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), + t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]), + t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]), + t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]), + t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]), + t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]), + t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]), +]; +*/ + +/// Tests for ASCII case insensitivity. +/// +/// These tests should all have the same behavior regardless of match semantics +/// or whether the search is overlapping. +const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[ + t!(acasei000, &["a"], "A", &[(0, 0, 1)]), + t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]), + t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]), + t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests. 
+const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]), + t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]), + t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests. +const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + // This is a regression test from: + // https://github.com/BurntSushi/aho-corasick/issues/68 + // Previously, it was reporting a duplicate (1, 3, 6) match. + t!( + acasei010, + &["abc", "def", "abcdef"], + "abcdef", + &[(0, 0, 3), (2, 0, 6), (1, 3, 6)] + ), +]; + +/// Regression tests that are applied to all Aho-Corasick combinations. +/// +/// If regression tests are needed for specific match semantics, then add them +/// to the appropriate group above. +const REGRESSION: &'static [SearchTest] = &[ + t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), + t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), + t!( + regression030, + &["libcore/", "libstd/"], + "libcore/char/methods.rs", + &[(0, 0, 8),] + ), + t!( + regression040, + &["libstd/", "libcore/"], + "libcore/char/methods.rs", + &[(1, 0, 8),] + ), + t!( + regression050, + &["\x00\x00\x01", "\x00\x00\x00"], + "\x00\x00\x00", + &[(1, 0, 3),] + ), + t!( + regression060, + &["\x00\x00\x00", "\x00\x00\x01"], + "\x00\x00\x00", + &[(0, 0, 3),] + ), +]; + +// Now define a test for each combination of things above that we want to run. +// Since there are a few different combinations for each collection of tests, +// we define a couple of macros to avoid repetition drudgery. The testconfig +// macro constructs the automaton from a given match kind, and runs the search +// tests one-by-one over the given collection. 
The `with` parameter allows one +// to configure the builder with additional parameters. The testcombo macro +// invokes testconfig in precisely this way: it sets up several tests where +// each one turns a different knob on AhoCorasickBuilder. + +macro_rules! testconfig { + (anchored, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasick::builder(); + $with(&mut builder); + let input = Input::new(test.haystack).anchored(Anchored::Yes); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .unwrap() + .try_find_iter(input) + .unwrap() + .collect() + }); + } + }; + (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasick::builder(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .unwrap() + .find_overlapping_iter(test.haystack) + .collect() + }); + } + }; + (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_stream_search_tests($collection, |test| { + let buf = std::io::BufReader::with_capacity( + 1, + test.haystack.as_bytes(), + ); + let mut builder = AhoCorasick::builder(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .unwrap() + .stream_find_iter(buf) + .map(|result| result.unwrap()) + .collect() + }); + } + }; + ($name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasick::builder(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .unwrap() + .find_iter(test.haystack) + .collect() + }); + } + }; +} + +macro_rules! 
testcombo { + ($name:ident, $collection:expr, $kind:ident) => { + mod $name { + use super::*; + + testconfig!(default, $collection, $kind, |_| ()); + testconfig!( + nfa_default, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)); + } + ); + testconfig!( + nfa_noncontig_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .prefilter(false); + } + ); + testconfig!( + nfa_noncontig_all_sparse, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .dense_depth(0); + } + ); + testconfig!( + nfa_noncontig_all_dense, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .dense_depth(usize::MAX); + } + ); + testconfig!( + nfa_contig_default, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)); + } + ); + testconfig!( + nfa_contig_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .prefilter(false); + } + ); + testconfig!( + nfa_contig_all_sparse, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .dense_depth(0); + } + ); + testconfig!( + nfa_contig_all_dense, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .dense_depth(usize::MAX); + } + ); + testconfig!( + nfa_contig_no_byte_class, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .byte_classes(false); + } + ); + testconfig!( + dfa_default, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)); + } + ); + testconfig!( + dfa_start_both, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)) + .start_kind(StartKind::Both); + } + 
); + testconfig!( + dfa_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).prefilter(false); + } + ); + testconfig!( + dfa_start_both_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)) + .start_kind(StartKind::Both) + .prefilter(false); + } + ); + testconfig!( + dfa_no_byte_class, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).byte_classes(false); + } + ); + testconfig!( + dfa_start_both_no_byte_class, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)) + .start_kind(StartKind::Both) + .byte_classes(false); + } + ); + } + }; +} + +// Write out the various combinations of match semantics given the variety of +// configurations tested by 'testcombo!'. +testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest); +testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst); +testcombo!( + search_standard_nonoverlapping, + AC_STANDARD_NON_OVERLAPPING, + Standard +); + +// Write out the overlapping combo by hand since there is only one of them. 
+testconfig!( + overlapping, + search_standard_overlapping_default, + AC_STANDARD_OVERLAPPING, + Standard, + |_| () +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_noncontig_default, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_noncontig_no_prefilter, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)).prefilter(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_contig_default, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_contig_no_prefilter, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)).prefilter(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_contig_all_sparse, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)).dense_depth(0); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_contig_all_dense, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)).dense_depth(usize::MAX); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_default, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_start_both, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).start_kind(StartKind::Both); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_prefilter, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut 
AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).prefilter(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_start_both_no_prefilter, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)) + .start_kind(StartKind::Both) + .prefilter(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_byte_class, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).byte_classes(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_start_both_no_byte_class, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)) + .start_kind(StartKind::Both) + .byte_classes(false); + } +); + +// Also write out tests manually for streams, since we only test the standard +// match semantics. We also don't bother testing different automaton +// configurations, since those are well covered by tests above. +#[cfg(feature = "std")] +testconfig!( + stream, + search_standard_stream_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |_| () +); +#[cfg(feature = "std")] +testconfig!( + stream, + search_standard_stream_nfa_noncontig_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)); + } +); +#[cfg(feature = "std")] +testconfig!( + stream, + search_standard_stream_nfa_contig_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)); + } +); +#[cfg(feature = "std")] +testconfig!( + stream, + search_standard_stream_dfa_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)); + } +); + +// Same thing for anchored searches. Write them out manually. 
+testconfig!( + anchored, + search_standard_anchored_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored); + } +); +testconfig!( + anchored, + search_standard_anchored_nfa_noncontig_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored) + .kind(Some(AhoCorasickKind::NoncontiguousNFA)); + } +); +testconfig!( + anchored, + search_standard_anchored_nfa_contig_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored) + .kind(Some(AhoCorasickKind::ContiguousNFA)); + } +); +testconfig!( + anchored, + search_standard_anchored_dfa_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA)); + } +); +testconfig!( + anchored, + search_standard_anchored_dfa_start_both, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA)); + } +); +testconfig!( + anchored, + search_leftmost_first_anchored_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored); + } +); +testconfig!( + anchored, + search_leftmost_first_anchored_nfa_noncontig_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored) + .kind(Some(AhoCorasickKind::NoncontiguousNFA)); + } +); +testconfig!( + anchored, + search_leftmost_first_anchored_nfa_contig_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored) + .kind(Some(AhoCorasickKind::ContiguousNFA)); + } +); +testconfig!( + anchored, + search_leftmost_first_anchored_dfa_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut 
AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA)); + } +); +testconfig!( + anchored, + search_leftmost_first_anchored_dfa_start_both, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA)); + } +); +testconfig!( + anchored, + search_leftmost_longest_anchored_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored); + } +); +testconfig!( + anchored, + search_leftmost_longest_anchored_nfa_noncontig_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored) + .kind(Some(AhoCorasickKind::NoncontiguousNFA)); + } +); +testconfig!( + anchored, + search_leftmost_longest_anchored_nfa_contig_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored) + .kind(Some(AhoCorasickKind::ContiguousNFA)); + } +); +testconfig!( + anchored, + search_leftmost_longest_anchored_dfa_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA)); + } +); +testconfig!( + anchored, + search_leftmost_longest_anchored_dfa_start_both, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA)); + } +); + +// And also write out the test combinations for ASCII case insensitivity. 
+testconfig!( + acasei_standard_default, + &[ASCII_CASE_INSENSITIVE], + Standard, + |b: &mut AhoCorasickBuilder| { + b.prefilter(false).ascii_case_insensitive(true); + } +); +testconfig!( + acasei_standard_nfa_noncontig_default, + &[ASCII_CASE_INSENSITIVE], + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .prefilter(false) + .ascii_case_insensitive(true); + } +); +testconfig!( + acasei_standard_nfa_contig_default, + &[ASCII_CASE_INSENSITIVE], + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .prefilter(false) + .ascii_case_insensitive(true); + } +); +testconfig!( + acasei_standard_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_nfa_noncontig_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .ascii_case_insensitive(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_nfa_contig_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .ascii_case_insensitive(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_first_default, + &[ASCII_CASE_INSENSITIVE, 
ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_first_nfa_noncontig_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_first_nfa_contig_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_first_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_longest_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_longest_nfa_noncontig_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) + .ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_longest_nfa_contig_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::ContiguousNFA)) + .ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_longest_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); + } +); + +fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>( + which: 
TestCollection,
    mut f: F,
) {
    // Project a `Match` into the (pattern index, start, end) triple format
    // used by the expected-result tables above.
    let get_match_triples =
        |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
            matches
                .into_iter()
                .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
                .collect()
        };
    for &tests in which {
        for test in tests {
            assert_eq!(
                test.matches,
                get_match_triples(f(&test)).as_slice(),
                "test: {}, patterns: {:?}, haystack: {:?}",
                test.name,
                test.patterns,
                test.haystack
            );
        }
    }
}

// Like 'run_search_tests', but we skip any tests that contain the empty
// pattern because stream searching doesn't support it.
#[cfg(feature = "std")]
fn run_stream_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
    which: TestCollection,
    mut f: F,
) {
    let get_match_triples =
        |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
            matches
                .into_iter()
                .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
                .collect()
        };
    for &tests in which {
        for test in tests {
            // Skip any test containing an empty pattern, per the comment on
            // this function.
            if test.patterns.iter().any(|p| p.is_empty()) {
                continue;
            }
            assert_eq!(
                test.matches,
                get_match_triples(f(&test)).as_slice(),
                "test: {}, patterns: {:?}, haystack: {:?}",
                test.name,
                test.patterns,
                test.haystack
            );
        }
    }
}

/// Checks that every test in each collection below has a unique name. Test
/// names are used to identify failures, so a duplicate would make diagnostics
/// ambiguous.
#[test]
fn search_tests_have_unique_names() {
    let assert = |constname, tests: &[SearchTest]| {
        // Map from test name to the position at which that name first
        // appeared.
        let mut seen = HashMap::new();
        for (i, test) in tests.iter().enumerate() {
            // IDIOM FIX: `HashMap::insert` returns the previous value for the
            // key, so duplicate detection needs only one map operation
            // instead of a `contains_key` lookup followed by an `insert`.
            if let Some(last) = seen.insert(test.name, i) {
                panic!(
                    "{} tests have duplicate names at positions {} and {}",
                    constname, last, i
                );
            }
        }
    };
    assert("BASICS", BASICS);
    assert("STANDARD", STANDARD);
    assert("LEFTMOST", LEFTMOST);
    assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
    assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
    assert("NON_OVERLAPPING", NON_OVERLAPPING);
    assert("OVERLAPPING", OVERLAPPING);
    assert("REGRESSION", REGRESSION);
}

#[cfg(feature = "std")]
#[test]
#[should_panic]
fn
stream_not_allowed_leftmost_first() { + let fsm = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(None::<String>) + .unwrap(); + assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); +} + +#[cfg(feature = "std")] +#[test] +#[should_panic] +fn stream_not_allowed_leftmost_longest() { + let fsm = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostLongest) + .build(None::<String>) + .unwrap(); + assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); +} + +#[test] +#[should_panic] +fn overlapping_not_allowed_leftmost_first() { + let fsm = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(None::<String>) + .unwrap(); + assert_eq!(fsm.find_overlapping_iter("").count(), 0); +} + +#[test] +#[should_panic] +fn overlapping_not_allowed_leftmost_longest() { + let fsm = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostLongest) + .build(None::<String>) + .unwrap(); + assert_eq!(fsm.find_overlapping_iter("").count(), 0); +} + +// This tests that if we build an AC matcher with an "unanchored" start kind, +// then we can't run an anchored search even if the underlying searcher +// supports it. +// +// The key bit here is that both of the NFAs in this crate unconditionally +// support both unanchored and anchored searches, but the DFA does not because +// of the added cost of doing so. To avoid the top-level AC matcher sometimes +// supporting anchored and sometimes not (depending on which searcher it +// chooses to use internally), we ensure that the given 'StartKind' is always +// respected. 
#[test]
fn anchored_not_allowed_even_if_technically_available() {
    // Built with `StartKind::Unanchored`, so an anchored search request must
    // be rejected regardless of what the underlying searcher could do.
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::NoncontiguousNFA))
        .start_kind(StartKind::Unanchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());

    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::ContiguousNFA))
        .start_kind(StartKind::Unanchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());

    // For completeness, check that the DFA returns an error too.
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::DFA))
        .start_kind(StartKind::Unanchored)
        .build(&["foo"])
        .unwrap()ؘ;
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());
}

// This is like the test above, but with unanchored and anchored flipped. That
// is, we asked for an AC searcher with anchored support and we check that
// unanchored searches return an error even if the underlying searcher would
// technically support it.
#[test]
fn unanchored_not_allowed_even_if_technically_available() {
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::NoncontiguousNFA))
        .start_kind(StartKind::Anchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());

    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::ContiguousNFA))
        .start_kind(StartKind::Anchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());

    // For completeness, check that the DFA returns an error too.
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::DFA))
        .start_kind(StartKind::Anchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());
}

// This tests that a prefilter does not cause a search to report a match
// outside the bounds provided by the caller.
+// +// This is a regression test for a bug I introduced during the rewrite of most +// of the crate after 0.7. It was never released. The tricky part here is +// ensuring we get a prefilter that can report matches on its own (such as the +// packed searcher). Otherwise, prefilters that report false positives might +// have searched past the bounds provided by the caller, but confirming the +// match would subsequently fail. +#[test] +fn prefilter_stays_in_bounds() { + let ac = AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(&["sam", "frodo", "pippin", "merry", "gandalf", "sauron"]) + .unwrap(); + let haystack = "foo gandalf"; + assert_eq!(None, ac.find(Input::new(haystack).range(0..10))); +} + +// See: https://github.com/BurntSushi/aho-corasick/issues/44 +// +// In short, this test ensures that enabling ASCII case insensitivity does not +// visit an exponential number of states when filling in failure transitions. +#[test] +fn regression_ascii_case_insensitive_no_exponential() { + let ac = AhoCorasick::builder() + .ascii_case_insensitive(true) + .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"]) + .unwrap(); + assert!(ac.find("").is_none()); +} + +// See: https://github.com/BurntSushi/aho-corasick/issues/53 +// +// This test ensures that the rare byte prefilter works in a particular corner +// case. In particular, the shift offset detected for '/' in the patterns below +// was incorrect, leading to a false negative. 
+#[test] +fn regression_rare_byte_prefilter() { + use crate::AhoCorasick; + + let ac = AhoCorasick::new(&["ab/j/", "x/"]).unwrap(); + assert!(ac.is_match("ab/j/")); +} + +#[test] +fn regression_case_insensitive_prefilter() { + for c in b'a'..b'z' { + for c2 in b'a'..b'z' { + let c = c as char; + let c2 = c2 as char; + let needle = format!("{}{}", c, c2).to_lowercase(); + let haystack = needle.to_uppercase(); + let ac = AhoCorasick::builder() + .ascii_case_insensitive(true) + .prefilter(true) + .build(&[&needle]) + .unwrap(); + assert_eq!( + 1, + ac.find_iter(&haystack).count(), + "failed to find {:?} in {:?}\n\nautomaton:\n{:?}", + needle, + haystack, + ac, + ); + } + } +} + +// See: https://github.com/BurntSushi/aho-corasick/issues/64 +// +// This occurs when the rare byte prefilter is active. +#[cfg(feature = "std")] +#[test] +fn regression_stream_rare_byte_prefilter() { + use std::io::Read; + + // NOTE: The test only fails if this ends with j. + const MAGIC: [u8; 5] = *b"1234j"; + + // NOTE: The test fails for value in 8188..=8191 These value put the string + // to search accross two call to read because the buffer size is 64KB by + // default. + const BEGIN: usize = 65_535; + + /// This is just a structure that implements Reader. The reader + /// implementation will simulate a file filled with 0, except for the MAGIC + /// string at offset BEGIN. 
+ #[derive(Default)] + struct R { + read: usize, + } + + impl Read for R { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { + if self.read > 100000 { + return Ok(0); + } + let mut from = 0; + if self.read < BEGIN { + from = buf.len().min(BEGIN - self.read); + for x in 0..from { + buf[x] = 0; + } + self.read += from; + } + if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() { + let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from); + if to > from { + buf[from..to].copy_from_slice( + &MAGIC + [self.read - BEGIN..self.read - BEGIN + to - from], + ); + self.read += to - from; + from = to; + } + } + for x in from..buf.len() { + buf[x] = 0; + self.read += 1; + } + Ok(buf.len()) + } + } + + fn run() -> std::io::Result<()> { + let aut = AhoCorasick::builder() + // Enable byte classes to make debugging the automaton easier. It + // should have no effect on the test result. + .byte_classes(false) + .build(&[&MAGIC]) + .unwrap(); + + // While reading from a vector, it works: + let mut buf = alloc::vec![]; + R::default().read_to_end(&mut buf)?; + let from_whole = aut.find_iter(&buf).next().unwrap().start(); + + // But using stream_find_iter fails! + let mut file = std::io::BufReader::new(R::default()); + let begin = aut + .stream_find_iter(&mut file) + .next() + .expect("NOT FOUND!!!!")? // Panic here + .start(); + assert_eq!(from_whole, begin); + Ok(()) + } + + run().unwrap() +} diff --git a/vendor/aho-corasick/src/transducer.rs b/vendor/aho-corasick/src/transducer.rs new file mode 100644 index 0000000..39bb240 --- /dev/null +++ b/vendor/aho-corasick/src/transducer.rs @@ -0,0 +1,270 @@ +/*! +Provides implementations of `fst::Automaton` for Aho-Corasick automata. + +This works by providing two wrapper types, [`Anchored`] and [`Unanchored`]. +The former executes an anchored search on an FST while the latter executes +an unanchored search. 
Building these wrappers is fallible and will fail if +the underlying Aho-Corasick automaton does not support the type of search it +represents. +*/ + +use crate::{ + automaton::{Automaton, StateID}, + Anchored as AcAnchored, Input, MatchError, +}; + +/// Represents an unanchored Aho-Corasick search of a finite state transducer. +/// +/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the +/// underlying automaton does not support unanchored searches. +/// +/// # Example +/// +/// This shows how to build an FST of keys and then run an unanchored search on +/// those keys using an Aho-Corasick automaton. +/// +/// ``` +/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored}; +/// use fst::{Automaton, IntoStreamer, Set, Streamer}; +/// +/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); +/// let nfa = NFA::new(&["bcd", "x"]).unwrap(); +/// // NFAs always support both unanchored and anchored searches. +/// let searcher = Unanchored::new(&nfa).unwrap(); +/// +/// let mut stream = set.search(searcher).into_stream(); +/// let mut results = vec![]; +/// while let Some(key) = stream.next() { +/// results.push(std::str::from_utf8(key).unwrap().to_string()); +/// } +/// assert_eq!(vec!["abcd", "bcd", "xyz"], results); +/// ``` +#[derive(Clone, Debug)] +pub struct Unanchored<A>(A); + +impl<A: Automaton> Unanchored<A> { + /// Create a new `Unanchored` implementation of the `fst::Automaton` trait. + /// + /// If the given Aho-Corasick automaton does not support unanchored + /// searches, then this returns an error. + pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> { + let input = Input::new("").anchored(AcAnchored::No); + let _ = aut.start_state(&input)?; + Ok(Unanchored(aut)) + } + + /// Returns a borrow to the underlying automaton. + pub fn as_ref(&self) -> &A { + &self.0 + } + + /// Unwrap this value and return the inner automaton. 
+ pub fn into_inner(self) -> A { + self.0 + } +} + +impl<A: Automaton> fst::Automaton for Unanchored<A> { + type State = StateID; + + #[inline] + fn start(&self) -> StateID { + let input = Input::new("").anchored(AcAnchored::No); + self.0.start_state(&input).expect("support for unanchored searches") + } + + #[inline] + fn is_match(&self, state: &StateID) -> bool { + self.0.is_match(*state) + } + + #[inline] + fn accept(&self, state: &StateID, byte: u8) -> StateID { + if fst::Automaton::is_match(self, state) { + return *state; + } + self.0.next_state(AcAnchored::No, *state, byte) + } + + #[inline] + fn can_match(&self, state: &StateID) -> bool { + !self.0.is_dead(*state) + } +} + +/// Represents an anchored Aho-Corasick search of a finite state transducer. +/// +/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the +/// underlying automaton does not support unanchored searches. +/// +/// # Example +/// +/// This shows how to build an FST of keys and then run an anchored search on +/// those keys using an Aho-Corasick automaton. +/// +/// ``` +/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored}; +/// use fst::{Automaton, IntoStreamer, Set, Streamer}; +/// +/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); +/// let nfa = NFA::new(&["bcd", "x"]).unwrap(); +/// // NFAs always support both unanchored and anchored searches. +/// let searcher = Anchored::new(&nfa).unwrap(); +/// +/// let mut stream = set.search(searcher).into_stream(); +/// let mut results = vec![]; +/// while let Some(key) = stream.next() { +/// results.push(std::str::from_utf8(key).unwrap().to_string()); +/// } +/// assert_eq!(vec!["bcd", "xyz"], results); +/// ``` +/// +/// This is like the example above, except we use an Aho-Corasick DFA, which +/// requires explicitly configuring it to support anchored searches. (NFAs +/// unconditionally support both unanchored and anchored searches.) 
+/// +/// ``` +/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind}; +/// use fst::{Automaton, IntoStreamer, Set, Streamer}; +/// +/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap(); +/// let dfa = DFA::builder() +/// .start_kind(StartKind::Anchored) +/// .build(&["bcd", "x"]) +/// .unwrap(); +/// // We've explicitly configured our DFA to support anchored searches. +/// let searcher = Anchored::new(&dfa).unwrap(); +/// +/// let mut stream = set.search(searcher).into_stream(); +/// let mut results = vec![]; +/// while let Some(key) = stream.next() { +/// results.push(std::str::from_utf8(key).unwrap().to_string()); +/// } +/// assert_eq!(vec!["bcd", "xyz"], results); +/// ``` +#[derive(Clone, Debug)] +pub struct Anchored<A>(A); + +impl<A: Automaton> Anchored<A> { + /// Create a new `Anchored` implementation of the `fst::Automaton` trait. + /// + /// If the given Aho-Corasick automaton does not support anchored searches, + /// then this returns an error. + pub fn new(aut: A) -> Result<Anchored<A>, MatchError> { + let input = Input::new("").anchored(AcAnchored::Yes); + let _ = aut.start_state(&input)?; + Ok(Anchored(aut)) + } + + /// Returns a borrow to the underlying automaton. + pub fn as_ref(&self) -> &A { + &self.0 + } + + /// Unwrap this value and return the inner automaton. 
+ pub fn into_inner(self) -> A { + self.0 + } +} + +impl<A: Automaton> fst::Automaton for Anchored<A> { + type State = StateID; + + #[inline] + fn start(&self) -> StateID { + let input = Input::new("").anchored(AcAnchored::Yes); + self.0.start_state(&input).expect("support for unanchored searches") + } + + #[inline] + fn is_match(&self, state: &StateID) -> bool { + self.0.is_match(*state) + } + + #[inline] + fn accept(&self, state: &StateID, byte: u8) -> StateID { + if fst::Automaton::is_match(self, state) { + return *state; + } + self.0.next_state(AcAnchored::Yes, *state, byte) + } + + #[inline] + fn can_match(&self, state: &StateID) -> bool { + !self.0.is_dead(*state) + } +} + +#[cfg(test)] +mod tests { + use alloc::{string::String, vec, vec::Vec}; + + use fst::{Automaton, IntoStreamer, Set, Streamer}; + + use crate::{ + dfa::DFA, + nfa::{contiguous, noncontiguous}, + StartKind, + }; + + use super::*; + + fn search<A: Automaton, D: AsRef<[u8]>>( + set: &Set<D>, + aut: A, + ) -> Vec<String> { + let mut stream = set.search(aut).into_stream(); + let mut results = vec![]; + while let Some(key) = stream.next() { + results.push(String::from(core::str::from_utf8(key).unwrap())); + } + results + } + + #[test] + fn unanchored() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let patterns = vec!["baz", "bax"]; + let expected = vec!["baz", "xbax"]; + + let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap()); + let got = search(&set, &aut); + assert_eq!(got, expected); + + let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap()); + let got = search(&set, &aut); + assert_eq!(got, expected); + + let aut = Unanchored(DFA::new(&patterns).unwrap()); + let got = search(&set, &aut); + assert_eq!(got, expected); + } + + #[test] + fn anchored() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let patterns = vec!["baz", "bax"]; + let expected = vec!["baz"]; + + let aut = 
Anchored(noncontiguous::NFA::new(&patterns).unwrap()); + let got = search(&set, &aut); + assert_eq!(got, expected); + + let aut = Anchored(contiguous::NFA::new(&patterns).unwrap()); + let got = search(&set, &aut); + assert_eq!(got, expected); + + let aut = Anchored( + DFA::builder() + .start_kind(StartKind::Anchored) + .build(&patterns) + .unwrap(), + ); + let got = search(&set, &aut); + assert_eq!(got, expected); + } +} diff --git a/vendor/aho-corasick/src/util/alphabet.rs b/vendor/aho-corasick/src/util/alphabet.rs new file mode 100644 index 0000000..69724fa --- /dev/null +++ b/vendor/aho-corasick/src/util/alphabet.rs @@ -0,0 +1,409 @@ +use crate::util::int::Usize; + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in finite state machines to reduce the size of the transition +/// table. This can have a particularly large impact not only on the total size +/// of an FSM, but also on FSM build times because it reduces the number of +/// transitions that need to be visited/set. +#[derive(Clone, Copy)] +pub(crate) struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + pub(crate) fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + pub(crate) fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for b in 0..=255 { + classes.set(b, b); + } + classes + } + + /// Set the equivalence class for the given byte. + #[inline] + pub(crate) fn set(&mut self, byte: u8, class: u8) { + self.0[usize::from(byte)] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub(crate) fn get(&self, byte: u8) -> u8 { + self.0[usize::from(byte)] + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. 
Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub(crate) fn alphabet_len(&self) -> usize { + // Add one since the number of equivalence classes is one bigger than + // the last one. + usize::from(self.0[255]) + 1 + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length. This is done so that converting between + /// state IDs and indices can be done with shifts alone, which is much + /// faster than integer division. The "stride2" is the exponent. i.e., + /// `2^stride2 = stride`. + pub(crate) fn stride2(&self) -> usize { + let zeros = self.alphabet_len().next_power_of_two().trailing_zeros(); + usize::try_from(zeros).unwrap() + } + + /// Returns the stride for these equivalence classes, which corresponds + /// to the smallest power of 2 greater than or equal to the number of + /// equivalence classes. + pub(crate) fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 257 equivalence classes + /// and each class contains exactly one byte (plus the special EOI class). + #[inline] + pub(crate) fn is_singleton(&self) -> bool { + self.alphabet_len() == 256 + } + + /// Returns an iterator over all equivalence classes in this set. + pub(crate) fn iter(&self) -> ByteClassIter { + ByteClassIter { it: 0..self.alphabet_len() } + } + + /// Returns an iterator of the bytes in the given equivalence class. + pub(crate) fn elements(&self, class: u8) -> ByteClassElements { + ByteClassElements { classes: self, class, bytes: 0..=255 } + } + + /// Returns an iterator of byte ranges in the given equivalence class. + /// + /// That is, a sequence of contiguous ranges are returned. Typically, every + /// class maps to a single contiguous range. 
+ fn element_ranges(&self, class: u8) -> ByteClassElementRanges { + ByteClassElementRanges { elements: self.elements(class), range: None } + } +} + +impl core::fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses(<one-class-per-byte>)") + } else { + write!(f, "ByteClasses(")?; + for (i, class) in self.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?} => [", class)?; + for (start, end) in self.element_ranges(class) { + if start == end { + write!(f, "{:?}", start)?; + } else { + write!(f, "{:?}-{:?}", start, end)?; + } + } + write!(f, "]")?; + } + write!(f, ")") + } + } +} + +/// An iterator over each equivalence class. +#[derive(Debug)] +pub(crate) struct ByteClassIter { + it: core::ops::Range<usize>, +} + +impl Iterator for ByteClassIter { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + self.it.next().map(|class| class.as_u8()) + } +} + +/// An iterator over all elements in a specific equivalence class. +#[derive(Debug)] +pub(crate) struct ByteClassElements<'a> { + classes: &'a ByteClasses, + class: u8, + bytes: core::ops::RangeInclusive<u8>, +} + +impl<'a> Iterator for ByteClassElements<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while let Some(byte) = self.bytes.next() { + if self.class == self.classes.get(byte) { + return Some(byte); + } + } + None + } +} + +/// An iterator over all elements in an equivalence class expressed as a +/// sequence of contiguous ranges. 
+#[derive(Debug)] +pub(crate) struct ByteClassElementRanges<'a> { + elements: ByteClassElements<'a>, + range: Option<(u8, u8)>, +} + +impl<'a> Iterator for ByteClassElementRanges<'a> { + type Item = (u8, u8); + + fn next(&mut self) -> Option<(u8, u8)> { + loop { + let element = match self.elements.next() { + None => return self.range.take(), + Some(element) => element, + }; + match self.range.take() { + None => { + self.range = Some((element, element)); + } + Some((start, end)) => { + if usize::from(end) + 1 != usize::from(element) { + self.range = Some((element, element)); + return Some((start, end)); + } + self.range = Some((start, element)); + } + } + } + } +} + +/// A partitioning of bytes into equivalence classes. +/// +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. +/// +/// Note that this may not compute the minimal set of equivalence classes. +/// Basically, any byte in a pattern given to the noncontiguous NFA builder +/// will automatically be treated as its own equivalence class. All other +/// bytes---any byte not in any pattern---will be treated as their own +/// equivalence classes. In theory, all bytes not in any pattern should +/// be part of a single equivalence class, but in practice, we only treat +/// contiguous ranges of bytes as an equivalence class. So the number of +/// classes computed may be bigger than necessary. This usually doesn't make +/// much of a difference, and keeps the implementation simple. +#[derive(Clone, Debug)] +pub(crate) struct ByteClassSet(ByteSet); + +impl Default for ByteClassSet { + fn default() -> ByteClassSet { + ByteClassSet::empty() + } +} + +impl ByteClassSet { + /// Create a new set of byte classes where all bytes are part of the same + /// equivalence class. 
+ pub(crate) fn empty() -> Self { + ByteClassSet(ByteSet::empty()) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub(crate) fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0.add(start - 1); + } + self.0.add(end); + } + + /// Convert this boolean set to a map that maps all byte values to their + /// corresponding equivalence class. The last mapping indicates the largest + /// equivalence class identifier (which is never bigger than 255). + pub(crate) fn byte_classes(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut b = 0u8; + loop { + classes.set(b, class); + if b == 255 { + break; + } + if self.0.contains(b) { + class = class.checked_add(1).unwrap(); + } + b = b.checked_add(1).unwrap(); + } + classes + } +} + +/// A simple set of bytes that is reasonably cheap to copy and allocation free. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct ByteSet { + bits: BitSet, +} + +/// The representation of a byte set. Split out so that we can define a +/// convenient Debug impl for it while keeping "ByteSet" in the output. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +struct BitSet([u128; 2]); + +impl ByteSet { + /// Create an empty set of bytes. + pub(crate) fn empty() -> ByteSet { + ByteSet { bits: BitSet([0; 2]) } + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. + pub(crate) fn add(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] |= 1 << bit; + } + + /// Return true if and only if the given byte is in this set. 
+ pub(crate) fn contains(&self, byte: u8) -> bool { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] & (1 << bit) > 0 + } +} + +impl core::fmt::Debug for BitSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmtd = f.debug_set(); + for b in 0u8..=255 { + if (ByteSet { bits: *self }).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +#[cfg(test)] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::empty(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::empty(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::empty(); + for b in 0u8..=255 { + set.set_range(b, b); + } + assert_eq!(set.byte_classes().alphabet_len(), 256); + } + + #[test] + fn elements_typical() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + // class 0: \x00-a + // class 1: b-d + // class 2: e-f + // class 3: g-m + // class 4: n-y + // class 5: z-z + // class 6: \x7B-\xFF + assert_eq!(classes.alphabet_len(), 7); + + let elements = 
classes.elements(0).collect::<Vec<_>>(); + assert_eq!(elements.len(), 98); + assert_eq!(elements[0], b'\x00'); + assert_eq!(elements[97], b'a'); + + let elements = classes.elements(1).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'b', b'c', b'd'],); + + let elements = classes.elements(2).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'e', b'f'],); + + let elements = classes.elements(3).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],); + + let elements = classes.elements(4).collect::<Vec<_>>(); + assert_eq!(elements.len(), 12); + assert_eq!(elements[0], b'n'); + assert_eq!(elements[11], b'y'); + + let elements = classes.elements(5).collect::<Vec<_>>(); + assert_eq!(elements, vec![b'z']); + + let elements = classes.elements(6).collect::<Vec<_>>(); + assert_eq!(elements.len(), 133); + assert_eq!(elements[0], b'\x7B'); + assert_eq!(elements[132], b'\xFF'); + } + + #[test] + fn elements_singletons() { + let classes = ByteClasses::singletons(); + assert_eq!(classes.alphabet_len(), 256); + + let elements = classes.elements(b'a').collect::<Vec<_>>(); + assert_eq!(elements, vec![b'a']); + } + + #[test] + fn elements_empty() { + let classes = ByteClasses::empty(); + assert_eq!(classes.alphabet_len(), 1); + + let elements = classes.elements(0).collect::<Vec<_>>(); + assert_eq!(elements.len(), 256); + assert_eq!(elements[0], b'\x00'); + assert_eq!(elements[255], b'\xFF'); + } +} diff --git a/vendor/aho-corasick/src/util/buffer.rs b/vendor/aho-corasick/src/util/buffer.rs new file mode 100644 index 0000000..e9e982a --- /dev/null +++ b/vendor/aho-corasick/src/util/buffer.rs @@ -0,0 +1,124 @@ +use alloc::{vec, vec::Vec}; + +/// The default buffer capacity that we use for the stream buffer. +const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB + +/// A fairly simple roll buffer for supporting stream searches. +/// +/// This buffer acts as a temporary place to store a fixed amount of data when +/// reading from a stream. 
Its central purpose is to allow "rolling" some +/// suffix of the data to the beginning of the buffer before refilling it with +/// more data from the stream. For example, let's say we are trying to match +/// "foobar" on a stream. When we report the match, we'd like to not only +/// report the correct offsets at which the match occurs, but also the matching +/// bytes themselves. So let's say our stream is a file with the following +/// contents: `test test foobar test test`. Now assume that we happen to read +/// the aforementioned file in two chunks: `test test foo` and `bar test test`. +/// Naively, it would not be possible to report a single contiguous `foobar` +/// match, but this roll buffer allows us to do that. Namely, after the second +/// read, the contents of the buffer should be `st foobar test test`, where the +/// search should ultimately resume immediately after `foo`. (The prefix `st ` +/// is included because the roll buffer saves N bytes at the end of the buffer, +/// where N is the maximum possible length of a match.) +/// +/// A lot of the logic for dealing with this is unfortunately split out between +/// this roll buffer and the `StreamChunkIter`. +/// +/// Note also that this buffer is not actually required to just report matches. +/// Because a `Match` is just some offsets. But it *is* required for supporting +/// things like `try_stream_replace_all` because that needs some mechanism for +/// knowing which bytes in the stream correspond to a match and which don't. So +/// when a match occurs across two `read` calls, *something* needs to retain +/// the bytes from the previous `read` call because you don't know before the +/// second read call whether a match exists or not. +#[derive(Debug)] +pub(crate) struct Buffer { + /// The raw buffer contents. This has a fixed size and never increases. + buf: Vec<u8>, + /// The minimum size of the buffer, which is equivalent to the maximum + /// possible length of a match. 
This corresponds to the amount that we + /// roll + min: usize, + /// The end of the contents of this buffer. + end: usize, +} + +impl Buffer { + /// Create a new buffer for stream searching. The minimum buffer length + /// given should be the size of the maximum possible match length. + pub(crate) fn new(min_buffer_len: usize) -> Buffer { + let min = core::cmp::max(1, min_buffer_len); + // The minimum buffer amount is also the amount that we roll our + // buffer in order to support incremental searching. To this end, + // our actual capacity needs to be at least 1 byte bigger than our + // minimum amount, otherwise we won't have any overlap. In actuality, + // we want our buffer to be a bit bigger than that for performance + // reasons, so we set a lower bound of `8 * min`. + // + // TODO: It would be good to find a way to test the streaming + // implementation with the minimal buffer size. For now, we just + // uncomment out the next line and comment out the subsequent line. + // let capacity = 1 + min; + let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); + Buffer { buf: vec![0; capacity], min, end: 0 } + } + + /// Return the contents of this buffer. + #[inline] + pub(crate) fn buffer(&self) -> &[u8] { + &self.buf[..self.end] + } + + /// Return the minimum size of the buffer. The only way a buffer may be + /// smaller than this is if the stream itself contains less than the + /// minimum buffer amount. + #[inline] + pub(crate) fn min_buffer_len(&self) -> usize { + self.min + } + + /// Return all free capacity in this buffer. + fn free_buffer(&mut self) -> &mut [u8] { + &mut self.buf[self.end..] + } + + /// Refill the contents of this buffer by reading as much as possible into + /// this buffer's free capacity. If no more bytes could be read, then this + /// returns false. Otherwise, this reads until it has filled the buffer + /// past the minimum amount. 
+ pub(crate) fn fill<R: std::io::Read>( + &mut self, + mut rdr: R, + ) -> std::io::Result<bool> { + let mut readany = false; + loop { + let readlen = rdr.read(self.free_buffer())?; + if readlen == 0 { + return Ok(readany); + } + readany = true; + self.end += readlen; + if self.buffer().len() >= self.min { + return Ok(true); + } + } + } + + /// Roll the contents of the buffer so that the suffix of this buffer is + /// moved to the front and all other contents are dropped. The size of the + /// suffix corresponds precisely to the minimum buffer length. + /// + /// This should only be called when the entire contents of this buffer have + /// been searched. + pub(crate) fn roll(&mut self) { + let roll_start = self + .end + .checked_sub(self.min) + .expect("buffer capacity should be bigger than minimum amount"); + let roll_end = roll_start + self.min; + + assert!(roll_end <= self.end); + self.buf.copy_within(roll_start..roll_end, 0); + self.end = self.min; + } +} diff --git a/vendor/aho-corasick/src/util/byte_frequencies.rs b/vendor/aho-corasick/src/util/byte_frequencies.rs new file mode 100644 index 0000000..c313b62 --- /dev/null +++ b/vendor/aho-corasick/src/util/byte_frequencies.rs @@ -0,0 +1,258 @@ +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' 
+ 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 
203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/vendor/aho-corasick/src/util/debug.rs b/vendor/aho-corasick/src/util/debug.rs new file mode 100644 index 0000000..22b5f22 --- /dev/null +++ b/vendor/aho-corasick/src/util/debug.rs @@ -0,0 +1,26 @@ +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct DebugByte(pub(crate) u8); + +impl core::fmt::Debug for DebugByte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... + if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. 
+ let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} diff --git a/vendor/aho-corasick/src/util/error.rs b/vendor/aho-corasick/src/util/error.rs new file mode 100644 index 0000000..326d046 --- /dev/null +++ b/vendor/aho-corasick/src/util/error.rs @@ -0,0 +1,259 @@ +use crate::util::{ + primitives::{PatternID, SmallIndex}, + search::MatchKind, +}; + +/// An error that occurred during the construction of an Aho-Corasick +/// automaton. +/// +/// Build errors occur when some kind of limit has been exceeded, either in the +/// number of states, the number of patterns of the length of a pattern. These +/// limits aren't part of the public API, but they should generally be large +/// enough to handle most use cases. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: ErrorKind, +} + +/// The kind of error that occurred. +#[derive(Clone, Debug)] +enum ErrorKind { + /// An error that occurs when allocating a new state would result in an + /// identifier that exceeds the capacity of a `StateID`. + StateIDOverflow { + /// The maximum possible id. + max: u64, + /// The maximum ID requested. + requested_max: u64, + }, + /// An error that occurs when adding a pattern to an Aho-Corasick + /// automaton would result in an identifier that exceeds the capacity of a + /// `PatternID`. + PatternIDOverflow { + /// The maximum possible id. + max: u64, + /// The maximum ID requested. + requested_max: u64, + }, + /// Occurs when a pattern string is given to the Aho-Corasick constructor + /// that is too long. + PatternTooLong { + /// The ID of the pattern that was too long. + pattern: PatternID, + /// The length that was too long. 
+ len: usize, + }, +} + +impl BuildError { + pub(crate) fn state_id_overflow( + max: u64, + requested_max: u64, + ) -> BuildError { + BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } } + } + + pub(crate) fn pattern_id_overflow( + max: u64, + requested_max: u64, + ) -> BuildError { + BuildError { + kind: ErrorKind::PatternIDOverflow { max, requested_max }, + } + } + + pub(crate) fn pattern_too_long( + pattern: PatternID, + len: usize, + ) -> BuildError { + BuildError { kind: ErrorKind::PatternTooLong { pattern, len } } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError {} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind { + ErrorKind::StateIDOverflow { max, requested_max } => { + write!( + f, + "state identifier overflow: failed to create state ID \ + from {}, which exceeds the max of {}", + requested_max, max, + ) + } + ErrorKind::PatternIDOverflow { max, requested_max } => { + write!( + f, + "pattern identifier overflow: failed to create pattern ID \ + from {}, which exceeds the max of {}", + requested_max, max, + ) + } + ErrorKind::PatternTooLong { pattern, len } => { + write!( + f, + "pattern {} with length {} exceeds \ + the maximum pattern length of {}", + pattern.as_usize(), + len, + SmallIndex::MAX.as_usize(), + ) + } + } + } +} + +/// An error that occurred during an Aho-Corasick search. +/// +/// An error that occurs during a search is limited to some kind of +/// misconfiguration that resulted in an illegal call. Stated differently, +/// whether an error occurs is not dependent on the specific bytes in the +/// haystack. +/// +/// Examples of misconfiguration: +/// +/// * Executing a stream or overlapping search on a searcher that was built was +/// something other than [`MatchKind::Standard`](crate::MatchKind::Standard) +/// semantics. 
+/// * Requested an anchored or an unanchored search on a searcher that doesn't +/// support unanchored or anchored searches, respectively. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct MatchError(alloc::boxed::Box<MatchErrorKind>); + +impl MatchError { + /// Create a new error value with the given kind. + /// + /// This is a more verbose version of the kind-specific constructors, e.g., + /// `MatchError::unsupported_stream`. + pub fn new(kind: MatchErrorKind) -> MatchError { + MatchError(alloc::boxed::Box::new(kind)) + } + + /// Returns a reference to the underlying error kind. + pub fn kind(&self) -> &MatchErrorKind { + &self.0 + } + + /// Create a new "invalid anchored search" error. This occurs when the + /// caller requests an anchored search but where anchored searches aren't + /// supported. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::InvalidInputAnchored`] kind. + pub fn invalid_input_anchored() -> MatchError { + MatchError::new(MatchErrorKind::InvalidInputAnchored) + } + + /// Create a new "invalid unanchored search" error. This occurs when the + /// caller requests an unanchored search but where unanchored searches + /// aren't supported. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::InvalidInputUnanchored`] kind. + pub fn invalid_input_unanchored() -> MatchError { + MatchError::new(MatchErrorKind::InvalidInputUnanchored) + } + + /// Create a new "unsupported stream search" error. This occurs when the + /// caller requests a stream search while using an Aho-Corasick automaton + /// with a match kind other than [`MatchKind::Standard`]. + /// + /// The match kind given should be the match kind of the automaton. It + /// should never be `MatchKind::Standard`. 
+ pub fn unsupported_stream(got: MatchKind) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedStream { got }) + } + + /// Create a new "unsupported overlapping search" error. This occurs when + /// the caller requests an overlapping search while using an Aho-Corasick + /// automaton with a match kind other than [`MatchKind::Standard`]. + /// + /// The match kind given should be the match kind of the automaton. It + /// should never be `MatchKind::Standard`. + pub fn unsupported_overlapping(got: MatchKind) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedOverlapping { got }) + } + + /// Create a new "unsupported empty pattern" error. This occurs when the + /// caller requests a search for which matching an automaton that contains + /// an empty pattern string is not supported. + pub fn unsupported_empty() -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedEmpty) + } +} + +/// The underlying kind of a [`MatchError`]. +/// +/// This is a **non-exhaustive** enum. That means new variants may be added in +/// a semver-compatible release. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MatchErrorKind { + /// An error indicating that an anchored search was requested, but from a + /// searcher that was built without anchored support. + InvalidInputAnchored, + /// An error indicating that an unanchored search was requested, but from a + /// searcher that was built without unanchored support. + InvalidInputUnanchored, + /// An error indicating that a stream search was attempted on an + /// Aho-Corasick automaton with an unsupported `MatchKind`. + UnsupportedStream { + /// The match semantics for the automaton that was used. + got: MatchKind, + }, + /// An error indicating that an overlapping search was attempted on an + /// Aho-Corasick automaton with an unsupported `MatchKind`. + UnsupportedOverlapping { + /// The match semantics for the automaton that was used. 
+ got: MatchKind, + }, + /// An error indicating that the operation requested doesn't support + /// automatons that contain an empty pattern string. + UnsupportedEmpty, +} + +#[cfg(feature = "std")] +impl std::error::Error for MatchError {} + +impl core::fmt::Display for MatchError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self.kind() { + MatchErrorKind::InvalidInputAnchored => { + write!(f, "anchored searches are not supported or enabled") + } + MatchErrorKind::InvalidInputUnanchored => { + write!(f, "unanchored searches are not supported or enabled") + } + MatchErrorKind::UnsupportedStream { got } => { + write!( + f, + "match kind {:?} does not support stream searching", + got, + ) + } + MatchErrorKind::UnsupportedOverlapping { got } => { + write!( + f, + "match kind {:?} does not support overlapping searches", + got, + ) + } + MatchErrorKind::UnsupportedEmpty => { + write!( + f, + "matching with an empty pattern string is not \ + supported for this operation", + ) + } + } + } +} diff --git a/vendor/aho-corasick/src/util/int.rs b/vendor/aho-corasick/src/util/int.rs new file mode 100644 index 0000000..28ede7a --- /dev/null +++ b/vendor/aho-corasick/src/util/int.rs @@ -0,0 +1,284 @@ +/*! +This module provides several integer oriented traits for converting between +both fixed size integers and integers whose size varies based on the target +(like `usize`). + +The main design principle for this module is to centralize all uses of `as`. +The thinking here is that `as` makes it very easy to perform accidental lossy +conversions, and if we centralize all its uses here under more descriptive +higher level operations, its use and correctness becomes easier to audit. + +This was copied mostly wholesale from `regex-automata`. + +NOTE: for simplicity, we don't take target pointer width into account here for +`usize` conversions. 
Since we currently only panic in debug mode, skipping the +check when it can be proven it isn't needed at compile time doesn't really +matter. Now, if we wind up wanting to do as many checks as possible in release +mode, then we would want to skip those when we know the conversions are always +non-lossy. +*/ + +pub(crate) trait U8 { + fn as_usize(self) -> usize; +} + +impl U8 for u8 { + fn as_usize(self) -> usize { + usize::from(self) + } +} + +pub(crate) trait U16 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn high_u8(self) -> u8; +} + +impl U16 for u16 { + fn as_usize(self) -> usize { + usize::from(self) + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn high_u8(self) -> u8 { + (self >> 8) as u8 + } +} + +pub(crate) trait U32 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn high_u16(self) -> u16; +} + +impl U32 for u32 { + #[inline] + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn high_u16(self) -> u16 { + (self >> 16) as u16 + } +} + +pub(crate) trait U64 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn low_u32(self) -> u32; + fn high_u32(self) -> u32; +} + +impl U64 for u64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn low_u32(self) -> u32 { + self as u32 + } + + fn high_u32(self) -> u32 { + (self >> 32) as u32 + } +} + +pub(crate) trait I8 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u8; + fn from_bits(n: u8) -> i8; +} + +impl I8 for i8 { + fn as_usize(self) -> usize { + 
#[cfg(debug_assertions)] + { + usize::try_from(self).expect("i8 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u8 { + self as u8 + } + + fn from_bits(n: u8) -> i8 { + n as i8 + } +} + +pub(crate) trait I32 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u32; + fn from_bits(n: u32) -> i32; +} + +impl I32 for i32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u32 { + self as u32 + } + + fn from_bits(n: u32) -> i32 { + n as i32 + } +} + +pub(crate) trait I64 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u64; + fn from_bits(n: u64) -> i64; +} + +impl I64 for i64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u64 { + self as u64 + } + + fn from_bits(n: u64) -> i64 { + n as i64 + } +} + +pub(crate) trait Usize { + fn as_u8(self) -> u8; + fn as_u16(self) -> u16; + fn as_u32(self) -> u32; + fn as_u64(self) -> u64; +} + +impl Usize for usize { + fn as_u8(self) -> u8 { + #[cfg(debug_assertions)] + { + u8::try_from(self).expect("usize overflowed u8") + } + #[cfg(not(debug_assertions))] + { + self as u8 + } + } + + fn as_u16(self) -> u16 { + #[cfg(debug_assertions)] + { + u16::try_from(self).expect("usize overflowed u16") + } + #[cfg(not(debug_assertions))] + { + self as u16 + } + } + + fn as_u32(self) -> u32 { + #[cfg(debug_assertions)] + { + u32::try_from(self).expect("usize overflowed u32") + } + #[cfg(not(debug_assertions))] + { + self as u32 + } + } + + fn as_u64(self) -> u64 { + #[cfg(debug_assertions)] + { + u64::try_from(self).expect("usize overflowed u64") + } + #[cfg(not(debug_assertions))] + { + self as u64 + } + } +} + +// Pointers aren't integers, but we convert pointers to 
integers to perform +// offset arithmetic in some places. (And no, we don't convert the integers +// back to pointers.) So add 'as_usize' conversions here too for completeness. +// +// These 'as' casts are actually okay because they're always non-lossy. But the +// idea here is to just try and remove as much 'as' as possible, particularly +// in this crate where we are being really paranoid about offsets and making +// sure we don't panic on inputs that might be untrusted. This way, the 'as' +// casts become easier to audit if they're all in one place, even when some of +// them are actually okay 100% of the time. + +pub(crate) trait Pointer { + fn as_usize(self) -> usize; +} + +impl<T> Pointer for *const T { + fn as_usize(self) -> usize { + self as usize + } +} + +pub(crate) trait PointerMut { + fn as_usize(self) -> usize; +} + +impl<T> PointerMut for *mut T { + fn as_usize(self) -> usize { + self as usize + } +} diff --git a/vendor/aho-corasick/src/util/mod.rs b/vendor/aho-corasick/src/util/mod.rs new file mode 100644 index 0000000..f7a1ddd --- /dev/null +++ b/vendor/aho-corasick/src/util/mod.rs @@ -0,0 +1,12 @@ +pub(crate) mod alphabet; +#[cfg(feature = "std")] +pub(crate) mod buffer; +pub(crate) mod byte_frequencies; +pub(crate) mod debug; +pub(crate) mod error; +pub(crate) mod int; +pub(crate) mod prefilter; +pub(crate) mod primitives; +pub(crate) mod remapper; +pub(crate) mod search; +pub(crate) mod special; diff --git a/vendor/aho-corasick/src/util/prefilter.rs b/vendor/aho-corasick/src/util/prefilter.rs new file mode 100644 index 0000000..f5ddc75 --- /dev/null +++ b/vendor/aho-corasick/src/util/prefilter.rs @@ -0,0 +1,924 @@ +use core::{ + cmp, + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, + u8, +}; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use crate::{ + packed, + util::{ + alphabet::ByteSet, + search::{Match, MatchKind, Span}, + }, +}; + +/// A prefilter for accelerating a search. 
+/// +/// This crate uses prefilters in the core search implementations to accelerate +/// common cases. They typically only apply to cases where there are a small +/// number of patterns (less than 100 or so), but when they do, thoughput can +/// be boosted considerably, perhaps by an order of magnitude. When a prefilter +/// is active, it is used whenever a search enters an automaton's start state. +/// +/// Currently, prefilters cannot be constructed by +/// callers. A `Prefilter` can only be accessed via the +/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter) +/// method and used to execute a search. In other words, a prefilter can be +/// used to optimize your own search implementation if necessary, but cannot do +/// much else. If you have a use case for more APIs, please submit an issue. +#[derive(Clone, Debug)] +pub struct Prefilter { + finder: Arc<dyn PrefilterI>, + memory_usage: usize, +} + +impl Prefilter { + /// Execute a search in the haystack within the span given. If a match or + /// a possible match is returned, then it is guaranteed to occur within + /// the bounds of the span. + /// + /// If the span provided is invalid for the given haystack, then behavior + /// is unspecified. + #[inline] + pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + self.finder.find_in(haystack, span) + } + + #[inline] + pub(crate) fn memory_usage(&self) -> usize { + self.memory_usage + } +} + +/// A candidate is the result of running a prefilter on a haystack at a +/// particular position. +/// +/// The result is either no match, a confirmed match or a possible match. +/// +/// When no match is returned, the prefilter is guaranteeing that no possible +/// match can be found in the haystack, and the caller may trust this. That is, +/// all correct prefilters must never report false negatives. 
+/// +/// In some cases, a prefilter can confirm a match very quickly, in which case, +/// the caller may use this to stop what it's doing and report the match. In +/// this case, prefilter implementations must never report a false positive. +/// In other cases, the prefilter can only report a potential match, in which +/// case the callers must attempt to confirm the match. In this case, prefilter +/// implementations are permitted to return false positives. +#[derive(Clone, Debug)] +pub enum Candidate { + /// No match was found. Since false negatives are not possible, this means + /// the search can quit as it is guaranteed not to find another match. + None, + /// A confirmed match was found. Callers do not need to confirm it. + Match(Match), + /// The start of a possible match was found. Callers must confirm it before + /// reporting it as a match. + PossibleStartOfMatch(usize), +} + +impl Candidate { + /// Convert this candidate into an option. This is useful when callers + /// do not distinguish between true positives and false positives (i.e., + /// the caller must always confirm the match). + pub fn into_option(self) -> Option<usize> { + match self { + Candidate::None => None, + Candidate::Match(ref m) => Some(m.start()), + Candidate::PossibleStartOfMatch(start) => Some(start), + } + } +} + +/// A prefilter describes the behavior of fast literal scanners for quickly +/// skipping past bytes in the haystack that we know cannot possibly +/// participate in a match. +trait PrefilterI: + Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static +{ + /// Returns the next possible match candidate. This may yield false + /// positives, so callers must confirm a match starting at the position + /// returned. This, however, must never produce false negatives. That is, + /// this must, at minimum, return the starting position of the next match + /// in the given haystack after or at the given position. 
+ fn find_in(&self, haystack: &[u8], span: Span) -> Candidate; +} + +impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { + #[inline(always)] + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + (**self).find_in(haystack, span) + } +} + +/// A builder for constructing the best possible prefilter. When constructed, +/// this builder will heuristically select the best prefilter it can build, +/// if any, and discard the rest. +#[derive(Debug)] +pub(crate) struct Builder { + count: usize, + ascii_case_insensitive: bool, + start_bytes: StartBytesBuilder, + rare_bytes: RareBytesBuilder, + memmem: MemmemBuilder, + packed: Option<packed::Builder>, + // If we run across a condition that suggests we shouldn't use a prefilter + // at all (like an empty pattern), then disable prefilters entirely. + enabled: bool, +} + +impl Builder { + /// Create a new builder for constructing the best possible prefilter. + pub(crate) fn new(kind: MatchKind) -> Builder { + let pbuilder = kind + .as_packed() + .map(|kind| packed::Config::new().match_kind(kind).builder()); + Builder { + count: 0, + ascii_case_insensitive: false, + start_bytes: StartBytesBuilder::new(), + rare_bytes: RareBytesBuilder::new(), + memmem: MemmemBuilder::default(), + packed: pbuilder, + enabled: true, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder { + self.ascii_case_insensitive = yes; + self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); + self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); + self + } + + /// Return a prefilter suitable for quickly finding potential matches. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. 
+ pub(crate) fn build(&self) -> Option<Prefilter> { + if !self.enabled { + debug!("prefilter not enabled, skipping"); + return None; + } + // If we only have one pattern, then deferring to memmem is always + // the best choice. This is kind of a weird case, because, well, why + // use Aho-Corasick if you only have one pattern? But maybe you don't + // know exactly how many patterns you'll get up front, and you need to + // support the option of multiple patterns. So instead of relying on + // the caller to branch and use memmem explicitly, we just do it for + // them. + if !self.ascii_case_insensitive { + if let Some(pre) = self.memmem.build() { + debug!("using memmem prefilter"); + return Some(pre); + } + } + let (packed, patlen, minlen) = if self.ascii_case_insensitive { + (None, usize::MAX, 0) + } else { + let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len()); + let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len()); + let packed = + self.packed.as_ref().and_then(|b| b.build()).map(|s| { + let memory_usage = s.memory_usage(); + debug!( + "built packed prefilter (len: {}, \ + minimum pattern len: {}, memory usage: {}) \ + for consideration", + patlen, minlen, memory_usage, + ); + Prefilter { finder: Arc::new(Packed(s)), memory_usage } + }); + (packed, patlen, minlen) + }; + match (self.start_bytes.build(), self.rare_bytes.build()) { + // If we could build both start and rare prefilters, then there are + // a few cases in which we'd want to use the start-byte prefilter + // over the rare-byte prefilter, since the former has lower + // overhead. 
+ (prestart @ Some(_), prerare @ Some(_)) => { + debug!( + "both start (len={}, rank={}) and \ + rare (len={}, rank={}) byte prefilters \ + are available", + self.start_bytes.count, + self.start_bytes.rank_sum, + self.rare_bytes.count, + self.rare_bytes.rank_sum, + ); + if patlen <= 16 + && minlen >= 2 + && self.start_bytes.count >= 3 + && self.rare_bytes.count >= 3 + { + debug!( + "start and rare byte prefilters available, but \ + they're probably slower than packed so using \ + packed" + ); + return packed; + } + // If the start-byte prefilter can scan for a smaller number + // of bytes than the rare-byte prefilter, then it's probably + // faster. + let has_fewer_bytes = + self.start_bytes.count < self.rare_bytes.count; + // Otherwise, if the combined frequency rank of the detected + // bytes in the start-byte prefilter is "close" to the combined + // frequency rank of the rare-byte prefilter, then we pick + // the start-byte prefilter even if the rare-byte prefilter + // heuristically searches for rare bytes. This is because the + // rare-byte prefilter has higher constant costs, so we tend to + // prefer the start-byte prefilter when we can. 
+ let has_rarer_bytes = + self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; + if has_fewer_bytes { + debug!( + "using start byte prefilter because it has fewer + bytes to search for than the rare byte prefilter", + ); + prestart + } else if has_rarer_bytes { + debug!( + "using start byte prefilter because its byte \ + frequency rank was determined to be \ + \"good enough\" relative to the rare byte prefilter \ + byte frequency rank", + ); + prestart + } else { + debug!("using rare byte prefilter"); + prerare + } + } + (prestart @ Some(_), None) => { + if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 { + debug!( + "start byte prefilter available, but \ + it's probably slower than packed so using \ + packed" + ); + return packed; + } + debug!( + "have start byte prefilter but not rare byte prefilter, \ + so using start byte prefilter", + ); + prestart + } + (None, prerare @ Some(_)) => { + if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 { + debug!( + "rare byte prefilter available, but \ + it's probably slower than packed so using \ + packed" + ); + return packed; + } + debug!( + "have rare byte prefilter but not start byte prefilter, \ + so using rare byte prefilter", + ); + prerare + } + (None, None) if self.ascii_case_insensitive => { + debug!( + "no start or rare byte prefilter and ASCII case \ + insensitivity was enabled, so skipping prefilter", + ); + None + } + (None, None) => { + if packed.is_some() { + debug!("falling back to packed prefilter"); + } else { + debug!("no prefilter available"); + } + packed + } + } + } + + /// Add a literal string to this prefilter builder. 
+ pub(crate) fn add(&mut self, bytes: &[u8]) { + if bytes.is_empty() { + self.enabled = false; + } + if !self.enabled { + return; + } + self.count += 1; + self.start_bytes.add(bytes); + self.rare_bytes.add(bytes); + self.memmem.add(bytes); + if let Some(ref mut pbuilder) = self.packed { + pbuilder.add(bytes); + } + } +} + +/// A type that wraps a packed searcher and implements the `Prefilter` +/// interface. +#[derive(Clone, Debug)] +struct Packed(packed::Searcher); + +impl PrefilterI for Packed { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + self.0 + .find_in(&haystack, span) + .map_or(Candidate::None, Candidate::Match) + } +} + +/// A builder for constructing a prefilter that uses memmem. +#[derive(Debug, Default)] +struct MemmemBuilder { + /// The number of patterns that have been added. + count: usize, + /// The singular pattern to search for. This is only set when count==1. + one: Option<Vec<u8>>, +} + +impl MemmemBuilder { + fn build(&self) -> Option<Prefilter> { + #[cfg(all(feature = "std", feature = "perf-literal"))] + fn imp(builder: &MemmemBuilder) -> Option<Prefilter> { + let pattern = builder.one.as_ref()?; + assert_eq!(1, builder.count); + let finder = Arc::new(Memmem( + memchr::memmem::Finder::new(pattern).into_owned(), + )); + let memory_usage = pattern.len(); + Some(Prefilter { finder, memory_usage }) + } + + #[cfg(not(all(feature = "std", feature = "perf-literal")))] + fn imp(_: &MemmemBuilder) -> Option<Prefilter> { + None + } + + imp(self) + } + + fn add(&mut self, bytes: &[u8]) { + self.count += 1; + if self.count == 1 { + self.one = Some(bytes.to_vec()); + } else { + self.one = None; + } + } +} + +/// A type that wraps a SIMD accelerated single substring search from the +/// `memchr` crate for use as a prefilter. +/// +/// Currently, this prefilter is only active for Aho-Corasick searchers with +/// a single pattern. 
In theory, this could be extended to support searchers +/// that have a common prefix of more than one byte (for one byte, we would use +/// memchr), but it's not clear if it's worth it or not. +/// +/// Also, unfortunately, this currently also requires the 'std' feature to +/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc +/// mode, and so APIs like Finder::into_owned aren't available when 'std' is +/// disabled. But there should be an 'alloc' feature that brings in APIs like +/// Finder::into_owned but doesn't use std-only features like runtime CPU +/// feature detection. +#[cfg(all(feature = "std", feature = "perf-literal"))] +#[derive(Clone, Debug)] +struct Memmem(memchr::memmem::Finder<'static>); + +#[cfg(all(feature = "std", feature = "perf-literal"))] +impl PrefilterI for Memmem { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + use crate::util::primitives::PatternID; + + self.0.find(&haystack[span]).map_or(Candidate::None, |i| { + let start = span.start + i; + let end = start + self.0.needle().len(); + // N.B. We can declare a match and use a fixed pattern ID here + // because a Memmem prefilter is only ever created for searchers + // with exactly one pattern. Thus, every match is always a match + // and it is always for the first and only pattern. + Candidate::Match(Match::new(PatternID::ZERO, start..end)) + }) + } +} + +/// A builder for constructing a rare byte prefilter. +/// +/// A rare byte prefilter attempts to pick out a small set of rare bytes that +/// occurr in the patterns, and then quickly scan to matches of those rare +/// bytes. +#[derive(Clone, Debug)] +struct RareBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// A set of rare bytes, indexed by byte value. + rare_set: ByteSet, + /// A set of byte offsets associated with bytes in a pattern. 
An entry
    /// corresponds to a particular byte (its index) and is only non-zero if
    /// the byte occurred at an offset greater than 0 in at least one pattern.
    ///
    /// If a byte's offset is not representable in 8 bits, then the rare bytes
    /// prefilter becomes inert.
    byte_offsets: RareByteOffsets,
    /// Whether this is available as a prefilter or not. This can be set to
    /// false during construction if a condition is seen that invalidates the
    /// use of the rare-byte prefilter.
    available: bool,
    /// The number of bytes set to an active value in `byte_offsets`.
    count: usize,
    /// The sum of frequency ranks for the rare bytes detected. This is
    /// intended to give a heuristic notion of how rare the bytes are.
    rank_sum: u16,
}

/// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
    /// Each entry corresponds to the maximum offset of the corresponding
    /// byte across all patterns seen.
    set: [RareByteOffset; 256],
}

impl RareByteOffsets {
    /// Create a new empty set of rare byte offsets.
    pub(crate) fn empty() -> RareByteOffsets {
        RareByteOffsets { set: [RareByteOffset::default(); 256] }
    }

    /// Add the given offset for the given byte to this set. The stored
    /// offset for `byte` becomes the maximum of the previously stored offset
    /// and `off.max`, since the shift used by the prefilter must cover the
    /// deepest occurrence of the byte across all patterns.
    pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
        self.set[byte as usize].max =
            cmp::max(self.set[byte as usize].max, off.max);
    }
}

impl core::fmt::Debug for RareByteOffsets {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut offsets = vec![];
        for off in self.set.iter() {
            if off.max > 0 {
                offsets.push(off);
            }
        }
        f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
    }
}

/// Offsets associated with an occurrence of a "rare" byte in any of the
/// patterns used to construct a single Aho-Corasick automaton.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
    /// The maximum offset at which a particular byte occurs from the start
    /// of any pattern. This is used as a shift amount. That is, when an
    /// occurrence of this byte is found, the candidate position reported by
    /// the prefilter is `position_of_byte - max`, such that the automaton
    /// will begin its search at a position that is guaranteed to observe a
    /// match.
    ///
    /// To avoid accidentally quadratic behavior, a prefilter is considered
    /// ineffective when it is asked to start scanning from a position that it
    /// has already scanned past.
    ///
    /// Using a `u8` here means that if we ever see a pattern that's longer
    /// than 255 bytes, then the entire rare byte prefilter is disabled.
    max: u8,
}

impl Default for RareByteOffset {
    fn default() -> RareByteOffset {
        RareByteOffset { max: 0 }
    }
}

impl RareByteOffset {
    /// Create a new rare byte offset. If the given offset is too big, then
    /// None is returned. In that case, callers should render the rare bytes
    /// prefilter inert.
    fn new(max: usize) -> Option<RareByteOffset> {
        if max > u8::MAX as usize {
            None
        } else {
            Some(RareByteOffset { max: max as u8 })
        }
    }
}

impl RareBytesBuilder {
    /// Create a new builder for constructing a rare byte prefilter.
    fn new() -> RareBytesBuilder {
        RareBytesBuilder {
            ascii_case_insensitive: false,
            rare_set: ByteSet::empty(),
            byte_offsets: RareByteOffsets::empty(),
            available: true,
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the rare bytes prefilter.
    ///
    /// If there are more than 3 distinct rare bytes found, or if heuristics
    /// otherwise determine that this prefilter should not be used, then `None`
    /// is returned.
    fn build(&self) -> Option<Prefilter> {
        #[cfg(feature = "perf-literal")]
        fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
            if !builder.available || builder.count > 3 {
                return None;
            }
            let (mut bytes, mut len) = ([0; 3], 0);
            for b in 0..=255 {
                if builder.rare_set.contains(b) {
                    bytes[len] = b as u8;
                    len += 1;
                }
            }
            let finder: Arc<dyn PrefilterI> = match len {
                0 => return None,
                1 => Arc::new(RareBytesOne {
                    byte1: bytes[0],
                    offset: builder.byte_offsets.set[bytes[0] as usize],
                }),
                2 => Arc::new(RareBytesTwo {
                    offsets: builder.byte_offsets,
                    byte1: bytes[0],
                    byte2: bytes[1],
                }),
                3 => Arc::new(RareBytesThree {
                    offsets: builder.byte_offsets,
                    byte1: bytes[0],
                    byte2: bytes[1],
                    byte3: bytes[2],
                }),
                _ => unreachable!(),
            };
            Some(Prefilter { finder, memory_usage: 0 })
        }

        #[cfg(not(feature = "perf-literal"))]
        fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
            None
        }

        imp(self)
    }

    /// Add a byte string to this builder.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    fn add(&mut self, bytes: &[u8]) {
        // If we've already given up, then do nothing.
        if !self.available {
            return;
        }
        // If we've already blown our budget, then don't waste time looking
        // for more rare bytes.
        if self.count > 3 {
            self.available = false;
            return;
        }
        // If the pattern is too long, then our offset table is bunk, so
        // give up.
        if bytes.len() >= 256 {
            self.available = false;
            return;
        }
        let mut rarest = match bytes.get(0) {
            None => return,
            Some(&b) => (b, freq_rank(b)),
        };
        // The idea here is to look for the rarest byte in each pattern, and
        // add that to our set. As a special exception, if we see a byte that
        // we've already added, then we immediately stop and choose that byte,
        // even if there's another rare byte in the pattern. This helps us
        // apply the rare byte optimization in more cases by attempting to pick
        // bytes that are in common between patterns. So for example, if we
        // were searching for `Sherlock` and `lockjaw`, then this would pick
        // `k` for both patterns, resulting in the use of `memchr` instead of
        // `memchr2` for `k` and `j`.
        let mut found = false;
        for (pos, &b) in bytes.iter().enumerate() {
            self.set_offset(pos, b);
            if found {
                continue;
            }
            if self.rare_set.contains(b) {
                found = true;
                continue;
            }
            let rank = freq_rank(b);
            if rank < rarest.1 {
                rarest = (b, rank);
            }
        }
        if !found {
            self.add_rare_byte(rarest.0);
        }
    }

    /// Record that `byte` (and, when ASCII case insensitivity is enabled, its
    /// opposite-cased form) occurs at offset `pos` in some pattern.
    fn set_offset(&mut self, pos: usize, byte: u8) {
        // This unwrap is OK because pos is never bigger than our max.
        let offset = RareByteOffset::new(pos).unwrap();
        self.byte_offsets.set(byte, offset);
        if self.ascii_case_insensitive {
            self.byte_offsets.set(opposite_ascii_case(byte), offset);
        }
    }

    /// Add `byte` (and, when ASCII case insensitivity is enabled, its
    /// opposite-cased form) to the set of rare bytes.
    fn add_rare_byte(&mut self, byte: u8) {
        self.add_one_rare_byte(byte);
        if self.ascii_case_insensitive {
            self.add_one_rare_byte(opposite_ascii_case(byte));
        }
    }

    /// Add a single byte to the rare set, bumping the distinct-byte count and
    /// the frequency rank sum only if the byte wasn't already present.
    fn add_one_rare_byte(&mut self, byte: u8) {
        if !self.rare_set.contains(byte) {
            self.rare_set.add(byte);
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}

/// A prefilter for scanning for a single "rare" byte.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesOne {
    byte1: u8,
    offset: RareByteOffset,
}

#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesOne {
    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
        memchr::memchr(self.byte1, &haystack[span])
            .map(|i| {
                let pos = span.start + i;
                cmp::max(
                    span.start,
                    pos.saturating_sub(usize::from(self.offset.max)),
                )
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }
}

/// A prefilter for scanning for two "rare" bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesTwo {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
}

#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesTwo {
    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
        memchr::memchr2(self.byte1, self.byte2, &haystack[span])
            .map(|i| {
                let pos = span.start + i;
                let offset = self.offsets.set[usize::from(haystack[pos])].max;
                cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }
}

/// A prefilter for scanning for three "rare" bytes.
+#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct RareBytesThree { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, + byte3: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for RareBytesThree { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) + .map(|i| { + let pos = span.start + i; + let offset = self.offsets.set[usize::from(haystack[pos])].max; + cmp::max(span.start, pos.saturating_sub(usize::from(offset))) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A builder for constructing a starting byte prefilter. +/// +/// A starting byte prefilter is a simplistic prefilter that looks for possible +/// matches by reporting all positions corresponding to a particular byte. This +/// generally only takes affect when there are at most 3 distinct possible +/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two +/// distinct starting bytes (`f` and `b`), and this prefilter returns all +/// occurrences of either `f` or `b`. +/// +/// In some cases, a heuristic frequency analysis may determine that it would +/// be better not to use this prefilter even when there are 3 or fewer distinct +/// starting bytes. +#[derive(Clone, Debug)] +struct StartBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// The set of starting bytes observed. + byteset: Vec<bool>, + /// The number of bytes set to true in `byteset`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +impl StartBytesBuilder { + /// Create a new builder for constructing a start byte prefilter. 
+ fn new() -> StartBytesBuilder { + StartBytesBuilder { + ascii_case_insensitive: false, + byteset: vec![false; 256], + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the starting bytes prefilter. + /// + /// If there are more than 3 distinct starting bytes, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. + fn build(&self) -> Option<Prefilter> { + #[cfg(feature = "perf-literal")] + fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> { + if builder.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..256 { + if !builder.byteset[b] { + continue; + } + // We don't handle non-ASCII bytes for now. Getting non-ASCII + // bytes right is trickier, since we generally don't want to put + // a leading UTF-8 code unit into a prefilter that isn't ASCII, + // since they can frequently. Instead, it would be better to use a + // continuation byte, but this requires more sophisticated analysis + // of the automaton and a richer prefilter API. + if b > 0x7F { + return None; + } + bytes[len] = b as u8; + len += 1; + } + let finder: Arc<dyn PrefilterI> = match len { + 0 => return None, + 1 => Arc::new(StartBytesOne { byte1: bytes[0] }), + 2 => Arc::new(StartBytesTwo { + byte1: bytes[0], + byte2: bytes[1], + }), + 3 => Arc::new(StartBytesThree { + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + }), + _ => unreachable!(), + }; + Some(Prefilter { finder, memory_usage: 0 }) + } + + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &StartBytesBuilder) -> Option<Prefilter> { + None + } + + imp(self) + } + + /// Add a byte string to this builder. 
+ /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + if self.count > 3 { + return; + } + if let Some(&byte) = bytes.get(0) { + self.add_one_byte(byte); + if self.ascii_case_insensitive { + self.add_one_byte(opposite_ascii_case(byte)); + } + } + } + + fn add_one_byte(&mut self, byte: u8) { + if !self.byteset[byte as usize] { + self.byteset[byte as usize] = true; + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single starting byte. +#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct StartBytesOne { + byte1: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for StartBytesOne { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr(self.byte1, &haystack[span]) + .map(|i| span.start + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A prefilter for scanning for two starting bytes. +#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct StartBytesTwo { + byte1: u8, + byte2: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for StartBytesTwo { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr2(self.byte1, self.byte2, &haystack[span]) + .map(|i| span.start + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// A prefilter for scanning for three starting bytes. 
+#[cfg(feature = "perf-literal")] +#[derive(Clone, Debug)] +struct StartBytesThree { + byte1: u8, + byte2: u8, + byte3: u8, +} + +#[cfg(feature = "perf-literal")] +impl PrefilterI for StartBytesThree { + fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { + memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) + .map(|i| span.start + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } +} + +/// If the given byte is an ASCII letter, then return it in the opposite case. +/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns +/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. +pub(crate) fn opposite_ascii_case(b: u8) -> u8 { + if b'A' <= b && b <= b'Z' { + b.to_ascii_lowercase() + } else if b'a' <= b && b <= b'z' { + b.to_ascii_uppercase() + } else { + b + } +} + +/// Return the frequency rank of the given byte. The higher the rank, the more +/// common the byte (heuristically speaking). +fn freq_rank(b: u8) -> u8 { + use crate::util::byte_frequencies::BYTE_FREQUENCIES; + BYTE_FREQUENCIES[b as usize] +} diff --git a/vendor/aho-corasick/src/util/primitives.rs b/vendor/aho-corasick/src/util/primitives.rs new file mode 100644 index 0000000..784d397 --- /dev/null +++ b/vendor/aho-corasick/src/util/primitives.rs @@ -0,0 +1,759 @@ +/*! +Lower level primitive types that are useful in a variety of circumstances. + +# Overview + +This list represents the principle types in this module and briefly describes +when you might want to use them. + +* [`PatternID`] - A type that represents the identifier of a regex pattern. +This is probably the most widely used type in this module (which is why it's +also re-exported in the crate root). +* [`StateID`] - A type the represents the identifier of a finite automaton +state. This is used for both NFAs and DFAs, with the notable exception of +the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state +identifier.) 
+* [`SmallIndex`] - The internal representation of both a `PatternID` and a +`StateID`. Its purpose is to serve as a type that can index memory without +being as big as a `usize` on 64-bit targets. The main idea behind this type +is that there are many things in regex engines that will, in practice, never +overflow a 32-bit integer. (For example, like the number of patterns in a regex +or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index +memory without peppering `as` casts everywhere. Moreover, it forces callers +to handle errors in the case where, somehow, the value would otherwise overflow +either a 32-bit integer or a `usize` (e.g., on 16-bit targets). +*/ + +// The macro we use to define some types below adds methods that we don't +// use on some of the types. There isn't much, so we just squash the warning. +#![allow(dead_code)] + +use alloc::vec::Vec; + +use crate::util::int::{Usize, U16, U32, U64}; + +/// A type that represents a "small" index. +/// +/// The main idea of this type is to provide something that can index memory, +/// but uses less memory than `usize` on 64-bit systems. Specifically, its +/// representation is always a `u32` and has `repr(transparent)` enabled. (So +/// it is safe to transmute between a `u32` and a `SmallIndex`.) +/// +/// A small index is typically useful in cases where there is no practical way +/// that the index will overflow a 32-bit integer. A good example of this is +/// an NFA state. If you could somehow build an NFA with `2^30` states, its +/// memory usage would be exorbitant and its runtime execution would be so +/// slow as to be completely worthless. Therefore, this crate generally deems +/// it acceptable to return an error if it would otherwise build an NFA that +/// requires a slice longer than what a 32-bit integer can index. In exchange, +/// we can use 32-bit indices instead of 64-bit indices in various places. 
+/// +/// This type ensures this by providing a constructor that will return an error +/// if its argument cannot fit into the type. This makes it much easier to +/// handle these sorts of boundary cases that are otherwise extremely subtle. +/// +/// On all targets, this type guarantees that its value will fit in a `u32`, +/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for +/// example, this type's maximum value will never overflow an `isize`, +/// which means it will never overflow a `i16` even though its internal +/// representation is still a `u32`. +/// +/// The purpose for making the type fit into even signed integer types like +/// `isize` is to guarantee that the difference between any two small indices +/// is itself also a small index. This is useful in certain contexts, e.g., +/// for delta encoding. +/// +/// # Other types +/// +/// The following types wrap `SmallIndex` to provide a more focused use case: +/// +/// * [`PatternID`] is for representing the identifiers of patterns. +/// * [`StateID`] is for representing the identifiers of states in finite +/// automata. It is used for both NFAs and DFAs. +/// +/// # Representation +/// +/// This type is always represented internally by a `u32` and is marked as +/// `repr(transparent)`. Thus, this type always has the same representation as +/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. +/// +/// # Indexing +/// +/// For convenience, callers may use a `SmallIndex` to index slices. +/// +/// # Safety +/// +/// While a `SmallIndex` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `SmallIndex` with +/// an invalid value can be done in entirely safe code. This may in turn result +/// in panics or silent logical errors. 
+#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +#[repr(transparent)] +pub(crate) struct SmallIndex(u32); + +impl SmallIndex { + /// The maximum index value. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub const MAX: SmallIndex = + // FIXME: Use as_usize() once const functions in traits are stable. + SmallIndex::new_unchecked(core::i32::MAX as usize - 1); + + /// The maximum index value. + #[cfg(target_pointer_width = "16")] + pub const MAX: SmallIndex = + SmallIndex::new_unchecked(core::isize::MAX - 1); + + /// The total number of values that can be represented as a small index. + pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; + + /// The zero index value. + pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); + + /// The number of bytes that a single small index uses in memory. + pub const SIZE: usize = core::mem::size_of::<SmallIndex>(); + + /// Create a new small index. + /// + /// If the given index exceeds [`SmallIndex::MAX`], then this returns + /// an error. + #[inline] + pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> { + SmallIndex::try_from(index) + } + + /// Create a new small index without checking whether the given value + /// exceeds [`SmallIndex::MAX`]. + /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn new_unchecked(index: usize) -> SmallIndex { + // FIXME: Use as_u32() once const functions in traits are stable. + SmallIndex::from_u32_unchecked(index as u32) + } + + /// Create a new small index from a `u32` without checking whether the + /// given value exceeds [`SmallIndex::MAX`]. 
+ /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn from_u32_unchecked(index: u32) -> SmallIndex { + SmallIndex(index) + } + + /// Like [`SmallIndex::new`], but panics if the given index is not valid. + #[inline] + pub fn must(index: usize) -> SmallIndex { + SmallIndex::new(index).expect("invalid small index") + } + + /// Return this small index as a `usize`. This is guaranteed to never + /// overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + // FIXME: Use as_usize() once const functions in traits are stable. + self.0 as usize + } + + /// Return this small index as a `u64`. This is guaranteed to never + /// overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + // FIXME: Use u64::from() once const functions in traits are stable. + self.0 as u64 + } + + /// Return the internal `u32` of this small index. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0 + } + + /// Return the internal `u32` of this small index represented as an `i32`. + /// This is guaranteed to never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + // This is OK because we guarantee that our max value is <= i32::MAX. + self.0 as i32 + } + + /// Returns one more than this small index as a usize. + /// + /// Since a small index has constraints on its maximum value, adding `1` to + /// it will always fit in a `usize`, `isize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.as_usize() + 1 + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. 
+ /// + /// If the decoded integer is not representable as a small index for the + /// current target, then this returns an error. + #[inline] + pub fn from_ne_bytes( + bytes: [u8; 4], + ) -> Result<SmallIndex, SmallIndexError> { + let id = u32::from_ne_bytes(bytes); + if id > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(id) }); + } + Ok(SmallIndex::new_unchecked(id.as_usize())) + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to [`SmallIndex::new_unchecked`] in that is does not + /// check whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { + SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) + } + + /// Return the underlying small index integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } +} + +impl<T> core::ops::Index<SmallIndex> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for [T] { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl<T> core::ops::Index<SmallIndex> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl From<StateID> for SmallIndex { + fn from(sid: StateID) -> SmallIndex { + sid.0 + } +} + +impl From<PatternID> for SmallIndex { + fn from(pid: PatternID) -> SmallIndex { + pid.0 + } +} + +impl From<u8> for SmallIndex { + fn from(index: u8) -> SmallIndex { + SmallIndex::new_unchecked(usize::from(index)) + } +} + +impl 
TryFrom<u16> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> { + if u32::from(index) > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u32> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u64> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u64() { + return Err(SmallIndexError { attempted: index }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<usize> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_usize() { + return Err(SmallIndexError { attempted: index.as_u64() }); + } + Ok(SmallIndex::new_unchecked(index)) + } +} + +/// This error occurs when a small index could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum small index value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SmallIndexError { + attempted: u64, +} + +impl SmallIndexError { + /// Returns the value that could not be converted to a small index. 
    pub fn attempted(&self) -> u64 {
        self.attempted
    }
}

#[cfg(feature = "std")]
impl std::error::Error for SmallIndexError {}

impl core::fmt::Display for SmallIndexError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "failed to create small index from {:?}, which exceeds {:?}",
            self.attempted(),
            SmallIndex::MAX,
        )
    }
}

/// An iterator over all small index values from 0 up to (but not including)
/// some length established at construction.
#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
    rng: core::ops::Range<usize>,
}

impl Iterator for SmallIndexIter {
    type Item = SmallIndex;

    fn next(&mut self) -> Option<SmallIndex> {
        if self.rng.start >= self.rng.end {
            return None;
        }
        let next_id = self.rng.start + 1;
        let id = core::mem::replace(&mut self.rng.start, next_id);
        // new_unchecked is OK since we asserted that the number of
        // elements in this iterator will fit in an ID at construction.
        Some(SmallIndex::new_unchecked(id))
    }
}

// Generates the shared API surface (constants, constructors, conversions,
// slice/Vec indexing, a dedicated error type and iterator types) for newtype
// wrappers around SmallIndex, e.g. PatternID and StateID.
macro_rules! index_type_impls {
    ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
        impl $name {
            /// The maximum value.
            pub const MAX: $name = $name(SmallIndex::MAX);

            /// The total number of values that can be represented.
            pub const LIMIT: usize = SmallIndex::LIMIT;

            /// The zero value.
            pub const ZERO: $name = $name(SmallIndex::ZERO);

            /// The number of bytes that a single value uses in memory.
            pub const SIZE: usize = SmallIndex::SIZE;

            /// Create a new value that is represented by a "small index."
            ///
            /// If the given index exceeds the maximum allowed value, then this
            /// returns an error.
            #[inline]
            pub fn new(value: usize) -> Result<$name, $err> {
                SmallIndex::new(value).map($name).map_err($err)
            }

            /// Create a new value without checking whether the given argument
            /// exceeds the maximum.
            ///
            /// Using this routine with an invalid value will result in
            /// unspecified behavior, but *not* undefined behavior. In
            /// particular, an invalid ID value is likely to cause panics or
            /// possibly even silent logical errors.
            ///
            /// Callers must never rely on this type to be within a certain
            /// range for memory safety.
            #[inline]
            pub const fn new_unchecked(value: usize) -> $name {
                $name(SmallIndex::new_unchecked(value))
            }

            /// Create a new value from a `u32` without checking whether the
            /// given value exceeds the maximum.
            ///
            /// Using this routine with an invalid value will result in
            /// unspecified behavior, but *not* undefined behavior. In
            /// particular, an invalid ID value is likely to cause panics or
            /// possibly even silent logical errors.
            ///
            /// Callers must never rely on this type to be within a certain
            /// range for memory safety.
            #[inline]
            pub const fn from_u32_unchecked(index: u32) -> $name {
                $name(SmallIndex::from_u32_unchecked(index))
            }

            /// Like `new`, but panics if the given value is not valid.
            #[inline]
            pub fn must(value: usize) -> $name {
                $name::new(value).expect(concat!(
                    "invalid ",
                    stringify!($name),
                    " value"
                ))
            }

            /// Return the internal value as a `usize`. This is guaranteed to
            /// never overflow `usize`.
            #[inline]
            pub const fn as_usize(&self) -> usize {
                self.0.as_usize()
            }

            /// Return the internal value as a `u64`. This is guaranteed to
            /// never overflow.
            #[inline]
            pub const fn as_u64(&self) -> u64 {
                self.0.as_u64()
            }

            /// Return the internal value as a `u32`. This is guaranteed to
            /// never overflow `u32`.
            #[inline]
            pub const fn as_u32(&self) -> u32 {
                self.0.as_u32()
            }

            /// Return the internal value as a `i32`. This is guaranteed to
            /// never overflow an `i32`.
            #[inline]
            pub const fn as_i32(&self) -> i32 {
                self.0.as_i32()
            }

            /// Returns one more than this value as a usize.
            ///
            /// Since values represented by a "small index" have constraints
            /// on their maximum value, adding `1` to it will always fit in a
            /// `usize`, `u32` and a `i32`.
            #[inline]
            pub fn one_more(&self) -> usize {
                self.0.one_more()
            }

            /// Decode this value from the bytes given using the native endian
            /// byte order for the current target.
            ///
            /// If the decoded integer is not representable as a small index
            /// for the current target, then this returns an error.
            #[inline]
            pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
                SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
            }

            /// Decode this value from the bytes given using the native endian
            /// byte order for the current target.
            ///
            /// This is analogous to `new_unchecked` in that it does not check
            /// whether the decoded integer is representable as a small index.
            #[inline]
            pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
                $name(SmallIndex::from_ne_bytes_unchecked(bytes))
            }

            /// Return the underlying integer as raw bytes in native endian
            /// format.
            #[inline]
            pub fn to_ne_bytes(&self) -> [u8; 4] {
                self.0.to_ne_bytes()
            }

            /// Returns an iterator over all values from 0 up to and not
            /// including the given length.
            ///
            /// If the given length exceeds this type's limit, then this
            /// panics.
            pub(crate) fn iter(len: usize) -> $iter {
                $iter::new(len)
            }
        }

        // We write our own Debug impl so that we get things like PatternID(5)
        // instead of PatternID(SmallIndex(5)).
        impl core::fmt::Debug for $name {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
            }
        }

        impl<T> core::ops::Index<$name> for [T] {
            type Output = T;

            #[inline]
            fn index(&self, index: $name) -> &T {
                &self[index.as_usize()]
            }
        }

        impl<T> core::ops::IndexMut<$name> for [T] {
            #[inline]
            fn index_mut(&mut self, index: $name) -> &mut T {
                &mut self[index.as_usize()]
            }
        }

        impl<T> core::ops::Index<$name> for Vec<T> {
            type Output = T;

            #[inline]
            fn index(&self, index: $name) -> &T {
                &self[index.as_usize()]
            }
        }

        impl<T> core::ops::IndexMut<$name> for Vec<T> {
            #[inline]
            fn index_mut(&mut self, index: $name) -> &mut T {
                &mut self[index.as_usize()]
            }
        }

        impl From<SmallIndex> for $name {
            fn from(index: SmallIndex) -> $name {
                $name(index)
            }
        }

        impl From<u8> for $name {
            fn from(value: u8) -> $name {
                $name(SmallIndex::from(value))
            }
        }

        impl TryFrom<u16> for $name {
            type Error = $err;

            fn try_from(value: u16) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<u32> for $name {
            type Error = $err;

            fn try_from(value: u32) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<u64> for $name {
            type Error = $err;

            fn try_from(value: u64) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<usize> for $name {
            type Error = $err;

            fn try_from(value: usize) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        /// This error occurs when an ID could not be constructed.
        ///
        /// This occurs when given an integer exceeding the maximum allowed
        /// value.
        ///
        /// When the `std` feature is enabled, this implements the `Error`
        /// trait.
        #[derive(Clone, Debug, Eq, PartialEq)]
        pub struct $err(SmallIndexError);

        impl $err {
            /// Returns the value that could not be converted to an ID.
            pub fn attempted(&self) -> u64 {
                self.0.attempted()
            }
        }

        #[cfg(feature = "std")]
        impl std::error::Error for $err {}

        impl core::fmt::Display for $err {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(
                    f,
                    "failed to create {} from {:?}, which exceeds {:?}",
                    stringify!($name),
                    self.attempted(),
                    $name::MAX,
                )
            }
        }

        #[derive(Clone, Debug)]
        pub(crate) struct $iter(SmallIndexIter);

        impl $iter {
            fn new(len: usize) -> $iter {
                assert!(
                    len <= $name::LIMIT,
                    "cannot create iterator for {} when number of \
                     elements exceed {:?}",
                    stringify!($name),
                    $name::LIMIT,
                );
                $iter(SmallIndexIter { rng: 0..len })
            }
        }

        impl Iterator for $iter {
            type Item = $name;

            fn next(&mut self) -> Option<$name> {
                self.0.next().map($name)
            }
        }

        /// An iterator adapter that is like std::iter::Enumerate, but attaches
        /// small index values instead. It requires `ExactSizeIterator`. At
        /// construction, it ensures that the index of each element in the
        /// iterator is representable in the corresponding small index type.
        #[derive(Clone, Debug)]
        pub(crate) struct $withiter<I> {
            it: I,
            ids: $iter,
        }

        impl<I: Iterator + ExactSizeIterator> $withiter<I> {
            fn new(it: I) -> $withiter<I> {
                let ids = $name::iter(it.len());
                $withiter { it, ids }
            }
        }

        impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
            type Item = ($name, I::Item);

            fn next(&mut self) -> Option<($name, I::Item)> {
                let item = self.it.next()?;
                // Number of elements in this iterator must match, according
                // to contract of ExactSizeIterator.
                let id = self.ids.next().unwrap();
                Some((id, item))
            }
        }
    };
}

/// The identifier of a pattern in an Aho-Corasick automaton.
+/// +/// It is represented by a `u32` even on 64-bit systems in order to conserve +/// space. Namely, on all targets, this type guarantees that its value will +/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit +/// targets, for example, this type's maximum value will never overflow an +/// `isize`, which means it will never overflow a `i16` even though its +/// internal representation is still a `u32`. +/// +/// # Safety +/// +/// While a `PatternID` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `StateID` with an +/// invalid value can be done in entirely safe code. This may in turn result in +/// panics or silent logical errors. +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct PatternID(SmallIndex); + +/// The identifier of a finite automaton state. +/// +/// It is represented by a `u32` even on 64-bit systems in order to conserve +/// space. Namely, on all targets, this type guarantees that its value will +/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit +/// targets, for example, this type's maximum value will never overflow an +/// `isize`, which means it will never overflow a `i16` even though its +/// internal representation is still a `u32`. +/// +/// # Safety +/// +/// While a `StateID` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `StateID` with an +/// invalid value can be done in entirely safe code. This may in turn result in +/// panics or silent logical errors. 
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct StateID(SmallIndex); + +index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); +index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); + +/// A utility trait that defines a couple of adapters for making it convenient +/// to access indices as "small index" types. We require ExactSizeIterator so +/// that iterator construction can do a single check to make sure the index of +/// each element is representable by its small index type. +pub(crate) trait IteratorIndexExt: Iterator { + fn with_pattern_ids(self) -> WithPatternIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithPatternIDIter::new(self) + } + + fn with_state_ids(self) -> WithStateIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithStateIDIter::new(self) + } +} + +impl<I: Iterator> IteratorIndexExt for I {} diff --git a/vendor/aho-corasick/src/util/remapper.rs b/vendor/aho-corasick/src/util/remapper.rs new file mode 100644 index 0000000..7c47a08 --- /dev/null +++ b/vendor/aho-corasick/src/util/remapper.rs @@ -0,0 +1,214 @@ +use alloc::vec::Vec; + +use crate::{nfa::noncontiguous, util::primitives::StateID}; + +/// Remappable is a tightly coupled abstraction that facilitates remapping +/// state identifiers in DFAs. +/// +/// The main idea behind remapping state IDs is that DFAs often need to check +/// if a certain state is a "special" state of some kind (like a match state) +/// during a search. Since this is extremely perf critical code, we want this +/// check to be as fast as possible. Partitioning state IDs into, for example, +/// into "non-match" and "match" states means one can tell if a state is a +/// match state via a simple comparison of the state ID. +/// +/// The issue is that during the DFA construction process, it's not +/// particularly easy to partition the states. 
Instead, the simplest thing is +/// to often just do a pass over all of the states and shuffle them into their +/// desired partitionings. To do that, we need a mechanism for swapping states. +/// Hence, this abstraction. +/// +/// Normally, for such little code, I would just duplicate it. But this is a +/// key optimization and the implementation is a bit subtle. So the abstraction +/// is basically a ham-fisted attempt at DRY. The only place we use this is in +/// the dense and one-pass DFAs. +/// +/// See also src/dfa/special.rs for a more detailed explanation of how dense +/// DFAs are partitioned. +pub(crate) trait Remappable: core::fmt::Debug { + /// Return the total number of states. + fn state_len(&self) -> usize; + + /// Swap the states pointed to by the given IDs. The underlying finite + /// state machine should be mutated such that all of the transitions in + /// `id1` are now in the memory region where the transitions for `id2` + /// were, and all of the transitions in `id2` are now in the memory region + /// where the transitions for `id1` were. + /// + /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. + /// + /// It is expected that, after calling this, the underlying state machine + /// will be left in an inconsistent state, since any other transitions + /// pointing to, e.g., `id1` need to be updated to point to `id2`, since + /// that's where `id1` moved to. + /// + /// In order to "fix" the underlying inconsistent state, a `Remapper` + /// should be used to guarantee that `remap` is called at the appropriate + /// time. + fn swap_states(&mut self, id1: StateID, id2: StateID); + + /// This must remap every single state ID in the underlying value according + /// to the function given. For example, in a DFA, this should remap every + /// transition and every starting state ID. 
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID); +} + +/// Remapper is an abstraction the manages the remapping of state IDs in a +/// finite state machine. This is useful when one wants to shuffle states into +/// different positions in the machine. +/// +/// One of the key complexities this manages is the ability to correctly move +/// one state multiple times. +/// +/// Once shuffling is complete, `remap` must be called, which will rewrite +/// all pertinent transitions to updated state IDs. Neglecting to call `remap` +/// will almost certainly result in a corrupt machine. +#[derive(Debug)] +pub(crate) struct Remapper { + /// A map from the index of a state to its pre-multiplied identifier. + /// + /// When a state is swapped with another, then their corresponding + /// locations in this map are also swapped. Thus, its new position will + /// still point to its old pre-multiplied StateID. + /// + /// While there is a bit more to it, this then allows us to rewrite the + /// state IDs in a DFA's transition table in a single pass. This is done + /// by iterating over every ID in this map, then iterating over each + /// transition for the state at that ID and re-mapping the transition from + /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position + /// in this map where `old_id` *started*, and set it to where it ended up + /// after all swaps have been completed. + map: Vec<StateID>, + /// A way to map indices to state IDs (and back). + idx: IndexMapper, +} + +impl Remapper { + /// Create a new remapper from the given remappable implementation. The + /// remapper can then be used to swap states. The remappable value given + /// here must the same one given to `swap` and `remap`. + /// + /// The given stride should be the stride of the transition table expressed + /// as a power of 2. This stride is used to map between state IDs and state + /// indices. 
If state IDs and state indices are equivalent, then provide + /// a `stride2` of `0`, which acts as an identity. + pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper { + let idx = IndexMapper { stride2 }; + let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect(); + Remapper { map, idx } + } + + /// Swap two states. Once this is called, callers must follow through to + /// call `remap`, or else it's possible for the underlying remappable + /// value to be in a corrupt state. + pub(crate) fn swap( + &mut self, + r: &mut impl Remappable, + id1: StateID, + id2: StateID, + ) { + if id1 == id2 { + return; + } + r.swap_states(id1, id2); + self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2)); + } + + /// Complete the remapping process by rewriting all state IDs in the + /// remappable value according to the swaps performed. + pub(crate) fn remap(mut self, r: &mut impl Remappable) { + // Update the map to account for states that have been swapped + // multiple times. For example, if (A, C) and (C, G) are swapped, then + // transitions previously pointing to A should now point to G. But if + // we don't update our map, they will erroneously be set to C. All we + // do is follow the swaps in our map until we see our original state + // ID. + // + // The intuition here is to think about how changes are made to the + // map: only through pairwise swaps. That means that starting at any + // given state, it is always possible to find the loop back to that + // state by following the swaps represented in the map (which might be + // 0 swaps). + // + // We are also careful to clone the map before starting in order to + // freeze it. We use the frozen map to find our loops, since we need to + // update our map as well. Without freezing it, our updates could break + // the loops referenced above and produce incorrect results. 
+ let oldmap = self.map.clone(); + for i in 0..r.state_len() { + let cur_id = self.idx.to_state_id(i); + let mut new_id = oldmap[i]; + if cur_id == new_id { + continue; + } + loop { + let id = oldmap[self.idx.to_index(new_id)]; + if cur_id == id { + self.map[i] = new_id; + break; + } + new_id = id; + } + } + r.remap(|sid| self.map[self.idx.to_index(sid)]); + } +} + +/// A simple type for mapping between state indices and state IDs. +/// +/// The reason why this exists is because state IDs are "premultiplied" in a +/// DFA. That is, in order to get to the transitions for a particular state, +/// one need only use the state ID as-is, instead of having to multiply it by +/// transition table's stride. +/// +/// The downside of this is that it's inconvenient to map between state IDs +/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like +/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`, +/// etc. +/// +/// Since our state IDs are premultiplied, we can convert back-and-forth +/// between IDs and indices by simply unmultiplying the IDs and multiplying the +/// indices. +/// +/// Note that for a sparse NFA, state IDs and indices are equivalent. In this +/// case, we set the stride of the index mapped to be `0`, which acts as an +/// identity. +#[derive(Debug)] +struct IndexMapper { + /// The power of 2 corresponding to the stride of the corresponding + /// transition table. 'id >> stride2' de-multiplies an ID while 'index << + /// stride2' pre-multiplies an index to an ID. + stride2: usize, +} + +impl IndexMapper { + /// Convert a state ID to a state index. + fn to_index(&self, id: StateID) -> usize { + id.as_usize() >> self.stride2 + } + + /// Convert a state index to a state ID. + fn to_state_id(&self, index: usize) -> StateID { + // CORRECTNESS: If the given index is not valid, then it is not + // required for this to panic or return a valid state ID. 
We'll "just" + // wind up with panics or silent logic errors at some other point. But + // this is OK because if Remappable::state_len is correct and so is + // 'to_index', then all inputs to 'to_state_id' should be valid indices + // and thus transform into valid state IDs. + StateID::new_unchecked(index << self.stride2) + } +} + +impl Remappable for noncontiguous::NFA { + fn state_len(&self) -> usize { + noncontiguous::NFA::states(self).len() + } + + fn swap_states(&mut self, id1: StateID, id2: StateID) { + noncontiguous::NFA::swap_states(self, id1, id2) + } + + fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + noncontiguous::NFA::remap(self, map) + } +} diff --git a/vendor/aho-corasick/src/util/search.rs b/vendor/aho-corasick/src/util/search.rs new file mode 100644 index 0000000..59b7035 --- /dev/null +++ b/vendor/aho-corasick/src/util/search.rs @@ -0,0 +1,1148 @@ +use core::ops::{Range, RangeBounds}; + +use crate::util::primitives::PatternID; + +/// The configuration and the haystack to use for an Aho-Corasick search. +/// +/// When executing a search, there are a few parameters one might want to +/// configure: +/// +/// * The haystack to search, provided to the [`Input::new`] constructor. This +/// is the only required parameter. +/// * The span _within_ the haystack to limit a search to. (The default +/// is the entire haystack.) This is configured via [`Input::span`] or +/// [`Input::range`]. +/// * Whether to run an unanchored (matches can occur anywhere after the +/// start of the search) or anchored (matches can only occur beginning at +/// the start of the search) search. Unanchored search is the default. This is +/// configured via [`Input::anchored`]. +/// * Whether to quit the search as soon as a match has been found, regardless +/// of the [`MatchKind`] that the searcher was built with. This is configured +/// via [`Input::earliest`]. +/// +/// For most cases, the defaults for all optional parameters are appropriate. 
+/// The utility of this type is that it keeps the default or common case simple +/// while permitting tweaking parameters in more niche use cases while reusing +/// the same search APIs. +/// +/// # Valid bounds and search termination +/// +/// An `Input` permits setting the bounds of a search via either +/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or +/// else a panic will occur. Bounds are valid if and only if: +/// +/// * The bounds represent a valid range into the input's haystack. +/// * **or** the end bound is a valid ending bound for the haystack *and* +/// the start bound is exactly one greater than the end bound. +/// +/// In the latter case, [`Input::is_done`] will return true and indicates any +/// search receiving such an input should immediately return with no match. +/// +/// Other than representing "search is complete," the `Input::span` and +/// `Input::range` APIs are never necessary. Instead, callers can slice the +/// haystack instead, e.g., with `&haystack[start..end]`. With that said, they +/// can be more convenient than slicing because the match positions reported +/// when using `Input::span` or `Input::range` are in terms of the original +/// haystack. If you instead use `&haystack[start..end]`, then you'll need to +/// add `start` to any match position returned in order for it to be a correct +/// index into `haystack`. +/// +/// # Example: `&str` and `&[u8]` automatically convert to an `Input` +/// +/// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`. +/// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept +/// a `Into<Input>`. These two things combined together mean you can provide +/// things like `&str` and `&[u8]` to search APIs when the defaults are +/// suitable, but also an `Input` when they're not. 
For example: +/// +/// ``` +/// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind}; +/// +/// // Build a searcher that supports both unanchored and anchored modes. +/// let ac = AhoCorasick::builder() +/// .start_kind(StartKind::Both) +/// .build(&["abcd", "b"]) +/// .unwrap(); +/// let haystack = "abcd"; +/// +/// // A search using default parameters is unanchored. With standard +/// // semantics, this finds `b` first. +/// assert_eq!( +/// Some(Match::must(1, 1..2)), +/// ac.find(haystack), +/// ); +/// // Using the same 'find' routine, we can provide an 'Input' explicitly +/// // that is configured to do an anchored search. Since 'b' doesn't start +/// // at the beginning of the search, it is not reported as a match. +/// assert_eq!( +/// Some(Match::must(0, 0..4)), +/// ac.find(Input::new(haystack).anchored(Anchored::Yes)), +/// ); +/// ``` +#[derive(Clone)] +pub struct Input<'h> { + haystack: &'h [u8], + span: Span, + anchored: Anchored, + earliest: bool, +} + +impl<'h> Input<'h> { + /// Create a new search configuration for the given haystack. + #[inline] + pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { + Input { + haystack: haystack.as_ref(), + span: Span { start: 0, end: haystack.as_ref().len() }, + anchored: Anchored::No, + earliest: false, + } + } + + /// Set the span for this search. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. To provide anything supported by range + /// syntax, use the [`Input::range`] method. + /// + /// The default span is the entire haystack. + /// + /// Note that [`Input::range`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. 
+ /// + /// # Example + /// + /// This example shows how the span of the search can impact whether a + /// match is reported or not. + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Input, MatchKind}; + /// + /// let patterns = &["b", "abcd", "abc"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// let input = Input::new(haystack).span(0..3); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// // Without the span stopping the search early, 'abcd' would be reported + /// // because it is the correct leftmost-first match. + /// assert_eq!("abc", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { + self.set_span(span); + self + } + + /// Like `Input::span`, but accepts any range instead. + /// + /// The default range is the entire haystack. + /// + /// Note that [`Input::span`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This routine also panics if the given range does not correspond to + /// valid bounds in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// + /// let input = Input::new("foobar").range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { + self.set_range(range); + self + } + + /// Sets the anchor mode of a search. 
+ /// + /// When a search is anchored (via [`Anchored::Yes`]), a match must begin + /// at the start of a search. When a search is not anchored (that's + /// [`Anchored::No`]), searchers will look for a match anywhere in the + /// haystack. + /// + /// By default, the anchored mode is [`Anchored::No`]. + /// + /// # Support for anchored searches + /// + /// Anchored or unanchored searches might not always be available, + /// depending on the type of searcher used and its configuration: + /// + /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always + /// supports both unanchored and anchored searches. + /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both + /// unanchored and anchored searches. + /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored + /// searches by default. + /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can + /// be used to change the default to supporting both kinds of searches + /// or even just anchored searches. + /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a + /// `DFA`. Namely, it only supports unanchored searches by default, but + /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) + /// can change this. + /// + /// If you try to execute a search using a `try_` ("fallible") method with + /// an unsupported anchor mode, then an error will be returned. For calls + /// to infallible search methods, a panic will result. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// an unanchored search. Notice that we build our `AhoCorasick` searcher + /// with [`StartKind::Both`] so that it supports both unanchored and + /// anchored searches simultaneously. 
+ /// + /// ``` + /// use aho_corasick::{ + /// AhoCorasick, Anchored, Input, MatchKind, StartKind, + /// }; + /// + /// let patterns = &["bcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasick::builder() + /// .start_kind(StartKind::Both) + /// .build(patterns) + /// .unwrap(); + /// + /// // Note that 'Anchored::No' is the default, so it doesn't need to + /// // be explicitly specified here. + /// let input = Input::new(haystack); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("bcd", &haystack[mat.span()]); + /// + /// // While 'bcd' occurs in the haystack, it does not begin where our + /// // search begins, so no match is found. + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// assert_eq!(None, ac.try_find(input)?); + /// + /// // However, if we start our search where 'bcd' starts, then we will + /// // find a match. + /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("bcd", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn anchored(mut self, mode: Anchored) -> Input<'h> { + self.set_anchored(mode); + self + } + + /// Whether to execute an "earliest" search or not. + /// + /// When running a non-overlapping search, an "earliest" search will + /// return the match location as early as possible. For example, given + /// the patterns `abc` and `b`, and a haystack of `abc`, a normal + /// leftmost-first search will return `abc` as a match. But an "earliest" + /// search will return as soon as it is known that a match occurs, which + /// happens once `b` is seen. + /// + /// Note that when using [`MatchKind::Standard`], the "earliest" option + /// has no effect since standard semantics are already "earliest." 
Note + /// also that this has no effect in overlapping searches, since overlapping + /// searches also use standard semantics and report all possible matches. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows the difference between "earliest" searching and + /// normal leftmost searching. + /// + /// ``` + /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; + /// + /// let patterns = &["abc", "b"]; + /// let haystack = "abc"; + /// + /// let ac = AhoCorasick::builder() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns) + /// .unwrap(); + /// + /// // The normal leftmost-first match. + /// let input = Input::new(haystack); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.span()]); + /// + /// // The "earliest" possible match, even if it isn't leftmost-first. + /// let input = Input::new(haystack).earliest(true); + /// let mat = ac.try_find(input)?.expect("should have a match"); + /// assert_eq!("b", &haystack[mat.span()]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn earliest(mut self, yes: bool) -> Input<'h> { + self.set_earliest(yes); + self + } + + /// Set the span for this search configuration. + /// + /// This is like the [`Input::span`] method, except this mutates the + /// span in place. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_span(2..4); + /// assert_eq!(2..4, input.get_range()); + /// ``` + #[inline] + pub fn set_span<S: Into<Span>>(&mut self, span: S) { + let span = span.into(); + assert!( + span.end <= self.haystack.len() + && span.start <= span.end.wrapping_add(1), + "invalid span {:?} for haystack of length {}", + span, + self.haystack.len(), + ); + self.span = span; + } + + /// Set the span for this search configuration given any range. + /// + /// This is like the [`Input::range`] method, except this mutates the + /// span in place. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This routine also panics if the given range does not correspond to + /// valid bounds in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { + use core::ops::Bound; + + // It's a little weird to convert ranges into spans, and then spans + // back into ranges when we actually slice the haystack. Because + // of that process, we always represent everything as a half-open + // internal. Therefore, handling things like m..=n is a little awkward. + let start = match range.start_bound() { + Bound::Included(&i) => i, + // Can this case ever happen? Range syntax doesn't support it... 
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&i) => i.checked_add(1).unwrap(), + Bound::Excluded(&i) => i, + Bound::Unbounded => self.haystack().len(), + }; + self.set_span(Span { start, end }); + } + + /// Set the starting offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the start of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_start(5); + /// assert_eq!(5..6, input.get_range()); + /// ``` + #[inline] + pub fn set_start(&mut self, start: usize) { + self.set_span(Span { start, ..self.get_span() }); + } + + /// Set the ending offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the end of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_end(5); + /// assert_eq!(0..5, input.get_range()); + /// ``` + #[inline] + pub fn set_end(&mut self, end: usize) { + self.set_span(Span { end, ..self.get_span() }); + } + + /// Set the anchor mode of a search. + /// + /// This is like [`Input::anchored`], except it mutates the search + /// configuration in place. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::{Anchored, Input}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// input.set_anchored(Anchored::Yes); + /// assert_eq!(Anchored::Yes, input.get_anchored()); + /// ``` + #[inline] + pub fn set_anchored(&mut self, mode: Anchored) { + self.anchored = mode; + } + + /// Set whether the search should execute in "earliest" mode or not. + /// + /// This is like [`Input::earliest`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// input.set_earliest(true); + /// assert!(input.get_earliest()); + /// ``` + #[inline] + pub fn set_earliest(&mut self, yes: bool) { + self.earliest = yes; + } + + /// Return a borrow of the underlying haystack as a slice of bytes. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(b"foobar", input.haystack()); + /// ``` + #[inline] + pub fn haystack(&self) -> &[u8] { + self.haystack + } + + /// Return the start position of this search. + /// + /// This is a convenience routine for `search.get_span().start()`. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0, input.start()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(2, input.start()); + /// ``` + #[inline] + pub fn start(&self) -> usize { + self.get_span().start + } + + /// Return the end position of this search. + /// + /// This is a convenience routine for `search.get_span().end()`. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(6, input.end()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(4, input.end()); + /// ``` + #[inline] + pub fn end(&self) -> usize { + self.get_span().end + } + + /// Return the span for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::{Input, Span}; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); + /// ``` + #[inline] + pub fn get_span(&self) -> Span { + self.span + } + + /// Return the span as a range for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// ``` + #[inline] + pub fn get_range(&self) -> Range<usize> { + self.get_span().range() + } + + /// Return the anchored mode for this search configuration. + /// + /// If no anchored mode was set, then it defaults to [`Anchored::No`]. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::{Anchored, Input}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// input.set_anchored(Anchored::Yes); + /// assert_eq!(Anchored::Yes, input.get_anchored()); + /// ``` + #[inline] + pub fn get_anchored(&self) -> Anchored { + self.anchored + } + + /// Return whether this search should execute in "earliest" mode. 
+ /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// ``` + #[inline] + pub fn get_earliest(&self) -> bool { + self.earliest + } + + /// Return true if this input has been exhausted, which in turn means all + /// subsequent searches will return no matches. + /// + /// This occurs precisely when the start position of this search is greater + /// than the end position of the search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.is_done()); + /// input.set_start(6); + /// assert!(!input.is_done()); + /// input.set_start(7); + /// assert!(input.is_done()); + /// ``` + #[inline] + pub fn is_done(&self) -> bool { + self.get_span().start > self.get_span().end + } +} + +impl<'h> core::fmt::Debug for Input<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmter = f.debug_struct("Input"); + match core::str::from_utf8(self.haystack()) { + Ok(nice) => fmter.field("haystack", &nice), + Err(_) => fmter.field("haystack", &self.haystack()), + } + .field("span", &self.span) + .field("anchored", &self.anchored) + .field("earliest", &self.earliest) + .finish() + } +} + +impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { + #[inline] + fn from(haystack: &'h H) -> Input<'h> { + Input::new(haystack) + } +} + +/// A representation of a range in a haystack. +/// +/// A span corresponds to the starting and ending _byte offsets_ of a +/// contiguous region of bytes. The starting offset is inclusive while the +/// ending offset is exclusive. That is, a span is a half-open interval. +/// +/// A span is used to report the offsets of a match, but it is also used to +/// convey which region of a haystack should be searched via routines like +/// [`Input::span`]. 
+/// +/// This is basically equivalent to a `std::ops::Range<usize>`, except this +/// type implements `Copy` which makes it more ergonomic to use in the context +/// of this crate. Indeed, `Span` exists only because `Range<usize>` does +/// not implement `Copy`. Like a range, this implements `Index` for `[u8]` +/// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls +/// `From<Range>`, which means things like `Span::from(5..10)` work. +/// +/// There are no constraints on the values of a span. It is, for example, legal +/// to create a span where `start > end`. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Span { + /// The start offset of the span, inclusive. + pub start: usize, + /// The end offset of the span, exclusive. + pub end: usize, +} + +impl Span { + /// Returns this span as a range. + #[inline] + pub fn range(&self) -> Range<usize> { + Range::from(*self) + } + + /// Returns true when this span is empty. That is, when `start >= end`. + #[inline] + pub fn is_empty(&self) -> bool { + self.start >= self.end + } + + /// Returns the length of this span. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.end.saturating_sub(self.start) + } + + /// Returns true when the given offset is contained within this span. + /// + /// Note that an empty span contains no offsets and will always return + /// false. + #[inline] + pub fn contains(&self, offset: usize) -> bool { + !self.is_empty() && self.start <= offset && offset <= self.end + } + + /// Returns a new span with `offset` added to this span's `start` and `end` + /// values. 
+ #[inline] + pub fn offset(&self, offset: usize) -> Span { + Span { start: self.start + offset, end: self.end + offset } + } +} + +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl core::ops::Index<Span> for [u8] { + type Output = [u8]; + + #[inline] + fn index(&self, index: Span) -> &[u8] { + &self[index.range()] + } +} + +impl core::ops::IndexMut<Span> for [u8] { + #[inline] + fn index_mut(&mut self, index: Span) -> &mut [u8] { + &mut self[index.range()] + } +} + +impl core::ops::Index<Span> for str { + type Output = str; + + #[inline] + fn index(&self, index: Span) -> &str { + &self[index.range()] + } +} + +impl From<Range<usize>> for Span { + #[inline] + fn from(range: Range<usize>) -> Span { + Span { start: range.start, end: range.end } + } +} + +impl From<Span> for Range<usize> { + #[inline] + fn from(span: Span) -> Range<usize> { + Range { start: span.start, end: span.end } + } +} + +impl PartialEq<Range<usize>> for Span { + #[inline] + fn eq(&self, range: &Range<usize>) -> bool { + self.start == range.start && self.end == range.end + } +} + +impl PartialEq<Span> for Range<usize> { + #[inline] + fn eq(&self, span: &Span) -> bool { + self.start == span.start && self.end == span.end + } +} + +/// The type of anchored search to perform. +/// +/// If an Aho-Corasick searcher does not support the anchored mode selected, +/// then the search will return an error or panic, depending on whether a +/// fallible or an infallible routine was called. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Anchored { + /// Run an unanchored search. This means a match may occur anywhere at or + /// after the start position of the search up until the end position of the + /// search. + No, + /// Run an anchored search. This means that a match must begin at the start + /// position of the search and end before the end position of the search. 
+ Yes, +} + +impl Anchored { + /// Returns true if and only if this anchor mode corresponds to an anchored + /// search. + /// + /// # Example + /// + /// ``` + /// use aho_corasick::Anchored; + /// + /// assert!(!Anchored::No.is_anchored()); + /// assert!(Anchored::Yes.is_anchored()); + /// ``` + #[inline] + pub fn is_anchored(&self) -> bool { + matches!(*self, Anchored::Yes) + } +} + +/// A representation of a match reported by an Aho-Corasick searcher. +/// +/// A match has two essential pieces of information: the [`PatternID`] that +/// matches, and the [`Span`] of the match in a haystack. +/// +/// The pattern is identified by an ID, which corresponds to its position +/// (starting from `0`) relative to other patterns used to construct the +/// corresponding searcher. If only a single pattern is provided, then all +/// matches are guaranteed to have a pattern ID of `0`. +/// +/// Every match reported by a searcher guarantees that its span has its start +/// offset as less than or equal to its end offset. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern ID. + pattern: PatternID, + /// The underlying match span. + span: Span, +} + +impl Match { + /// Create a new match from a pattern ID and a span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if `end < start`. + /// + /// # Example + /// + /// This shows how to create a match for the first pattern in an + /// Aho-Corasick searcher using convenient range syntax. 
+ /// + /// ``` + /// use aho_corasick::{Match, PatternID}; + /// + /// let m = Match::new(PatternID::ZERO, 5..10); + /// assert_eq!(0, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { + let span = span.into(); + assert!(span.start <= span.end, "invalid match span"); + Match { pattern, span } + } + + /// Create a new match from a pattern ID and a byte offset span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// This is like [`Match::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + /// + /// # Panics + /// + /// This panics if `end < start` or if `pattern > PatternID::MAX`. + /// + /// # Example + /// + /// This shows how to create a match for the third pattern in an + /// Aho-Corasick searcher using convenient range syntax. + /// + /// ``` + /// use aho_corasick::Match; + /// + /// let m = Match::must(3, 5..10); + /// assert_eq!(3, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { + Match::new(PatternID::must(pattern), span) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding searcher. The first pattern + /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The starting position of the match. + /// + /// This is a convenience routine for `Match::span().start`. 
+ #[inline] + pub fn start(&self) -> usize { + self.span().start + } + + /// The ending position of the match. + /// + /// This is a convenience routine for `Match::span().end`. + #[inline] + pub fn end(&self) -> usize { + self.span().end + } + + /// Returns the match span as a range. + /// + /// This is a convenience routine for `Match::span().range()`. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.span().range() + } + + /// Returns the span for this match. + #[inline] + pub fn span(&self) -> Span { + self.span + } + + /// Returns true when the span in this match is empty. + /// + /// An empty match can only be returned when empty pattern is in the + /// Aho-Corasick searcher. + #[inline] + pub fn is_empty(&self) -> bool { + self.span().is_empty() + } + + /// Returns the length of this match. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.span().len() + } + + /// Returns a new match with `offset` added to its span's `start` and `end` + /// values. + #[inline] + pub fn offset(&self, offset: usize) -> Match { + Match { + pattern: self.pattern, + span: Span { + start: self.start() + offset, + end: self.end() + offset, + }, + } + } +} + +/// A knob for controlling the match semantics of an Aho-Corasick automaton. +/// +/// There are two generally different ways that Aho-Corasick automatons can +/// report matches. The first way is the "standard" approach that results from +/// implementing most textbook explanations of Aho-Corasick. The second way is +/// to report only the leftmost non-overlapping matches. The leftmost approach +/// is in turn split into two different ways of resolving ambiguous matches: +/// leftmost-first and leftmost-longest. +/// +/// The `Standard` match kind is the default and is the only one that supports +/// overlapping matches and stream searching. 
(Trying to find overlapping or +/// streaming matches using leftmost match semantics will result in an error in +/// fallible APIs and a panic when using infallibe APIs.) The `Standard` match +/// kind will report matches as they are seen. When searching for overlapping +/// matches, then all possible matches are reported. When searching for +/// non-overlapping matches, the first match seen is reported. For example, for +/// non-overlapping matches, given the patterns `abcd` and `b` and the haystack +/// `abcdef`, only a match for `b` is reported since it is detected first. The +/// `abcd` match is never reported since it overlaps with the `b` match. +/// +/// In contrast, the leftmost match kind always prefers the leftmost match +/// among all possible matches. Given the same example as above with `abcd` and +/// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd` +/// since it begins before the `b` match, even though the `b` match is detected +/// before the `abcd` match. In this case, the `b` match is not reported at all +/// since it overlaps with the `abcd` match. +/// +/// The difference between leftmost-first and leftmost-longest is in how they +/// resolve ambiguous matches when there are multiple leftmost matches to +/// choose from. Leftmost-first always chooses the pattern that was provided +/// earliest, where as leftmost-longest always chooses the longest matching +/// pattern. For example, given the patterns `a` and `ab` and the subject +/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match +/// is `ab`. Conversely, if the patterns were given in reverse order, i.e., +/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches +/// would be `ab`. Stated differently, the leftmost-first match depends on the +/// order in which the patterns were given to the Aho-Corasick automaton. 
+/// Because of that, when leftmost-first matching is used, if a pattern `A` +/// that appears before a pattern `B` is a prefix of `B`, then it is impossible +/// to ever observe a match of `B`. +/// +/// If you're not sure which match kind to pick, then stick with the standard +/// kind, which is the default. In particular, if you need overlapping or +/// streaming matches, then you _must_ use the standard kind. The leftmost +/// kinds are useful in specific circumstances. For example, leftmost-first can +/// be very useful as a way to implement match priority based on the order of +/// patterns given and leftmost-longest can be useful for dictionary searching +/// such that only the longest matching words are reported. +/// +/// # Relationship with regular expression alternations +/// +/// Understanding match semantics can be a little tricky, and one easy way +/// to conceptualize non-overlapping matches from an Aho-Corasick automaton +/// is to think about them as a simple alternation of literals in a regular +/// expression. For example, let's say we wanted to match the strings +/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It +/// turns out that regular expression engines have two different ways of +/// matching this alternation. The first way, leftmost-longest, is commonly +/// found in POSIX compatible implementations of regular expressions (such as +/// `grep`). The second way, leftmost-first, is commonly found in backtracking +/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's +/// regex engine do not use backtracking, but still implement leftmost-first +/// semantics in an effort to match the behavior of dominant backtracking +/// regex engines such as those found in Perl, Ruby, Python, Javascript and +/// PHP.) 
+/// +/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex +/// will match `Samwise` because it is the longest possible match, but a +/// Perl-like regex will match `Sam` since it appears earlier in the +/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine +/// will never match `Samwise` since `Sam` will always have higher priority. +/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to +/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is +/// still longest match, but it also appears earlier than `Sam`. +/// +/// The "standard" match semantics of Aho-Corasick generally don't correspond +/// to the match semantics of any large group of regex implementations, so +/// there's no direct analogy that can be made here. Standard match semantics +/// are generally useful for overlapping matches, or if you just want to see +/// matches as they are detected. +/// +/// The main conclusion to draw from this section is that the match semantics +/// can be tweaked to precisely match either Perl-like regex alternations or +/// POSIX regex alternations. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Use standard match semantics, which support overlapping matches. When + /// used with non-overlapping matches, matches are reported as they are + /// seen. + Standard, + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will fail. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. 
+ /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will fail. + LeftmostLongest, +} + +/// The default match kind is `MatchKind::Standard`. +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::Standard + } +} + +impl MatchKind { + #[inline] + pub(crate) fn is_standard(&self) -> bool { + matches!(*self, MatchKind::Standard) + } + + #[inline] + pub(crate) fn is_leftmost(&self) -> bool { + matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest) + } + + #[inline] + pub(crate) fn is_leftmost_first(&self) -> bool { + matches!(*self, MatchKind::LeftmostFirst) + } + + /// Convert this match kind into a packed match kind. If this match kind + /// corresponds to standard semantics, then this returns None, since + /// packed searching does not support standard semantics. + #[inline] + pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> { + match *self { + MatchKind::Standard => None, + MatchKind::LeftmostFirst => { + Some(crate::packed::MatchKind::LeftmostFirst) + } + MatchKind::LeftmostLongest => { + Some(crate::packed::MatchKind::LeftmostLongest) + } + } + } +} + +/// The kind of anchored starting configurations to support in an Aho-Corasick +/// searcher. +/// +/// Depending on which searcher is used internally by +/// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored +/// and anchored searches can be quite costly. For this reason, +/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) +/// can be used to configure whether your searcher supports unanchored, +/// anchored or both kinds of searches. +/// +/// This searcher configuration knob works in concert with the search time +/// configuration [`Input::anchored`]. 
Namely, if one requests an unsupported +/// anchored mode, then the search will either panic or return an error, +/// depending on whether you're using infallible or fallibe APIs, respectively. +/// +/// `AhoCorasick` by default only supports unanchored searches. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum StartKind { + /// Support both anchored and unanchored searches. + Both, + /// Support only unanchored searches. Requesting an anchored search will + /// return an error in fallible APIs and panic in infallible APIs. + Unanchored, + /// Support only anchored searches. Requesting an unanchored search will + /// return an error in fallible APIs and panic in infallible APIs. + Anchored, +} + +impl Default for StartKind { + fn default() -> StartKind { + StartKind::Unanchored + } +} diff --git a/vendor/aho-corasick/src/util/special.rs b/vendor/aho-corasick/src/util/special.rs new file mode 100644 index 0000000..beeba40 --- /dev/null +++ b/vendor/aho-corasick/src/util/special.rs @@ -0,0 +1,42 @@ +use crate::util::primitives::StateID; + +/// A collection of sentinel state IDs for Aho-Corasick automata. +/// +/// This specifically enables the technique by which we determine which states +/// are dead, matches or start states. Namely, by arranging states in a +/// particular order, we can determine the type of a state simply by looking at +/// its ID. +#[derive(Clone, Debug)] +pub(crate) struct Special { + /// The maximum ID of all the "special" states. This corresponds either to + /// start_anchored_id when a prefilter is active and max_match_id when a + /// prefilter is not active. The idea here is that if there is no prefilter, + /// then there is no point in treating start states as special. + pub(crate) max_special_id: StateID, + /// The maximum ID of all the match states. Any state ID bigger than this + /// is guaranteed to be a non-match ID. 
+ /// + /// It is possible and legal for max_match_id to be equal to + /// start_anchored_id, which occurs precisely in the case where the empty + /// string is a pattern that was added to the underlying automaton. + pub(crate) max_match_id: StateID, + /// The state ID of the start state used for unanchored searches. + pub(crate) start_unanchored_id: StateID, + /// The state ID of the start state used for anchored searches. This is + /// always start_unanchored_id+1. + pub(crate) start_anchored_id: StateID, +} + +impl Special { + /// Create a new set of "special" state IDs with all IDs initialized to + /// zero. The general idea here is that they will be updated and set to + /// correct values later. + pub(crate) fn zero() -> Special { + Special { + max_special_id: StateID::ZERO, + max_match_id: StateID::ZERO, + start_unanchored_id: StateID::ZERO, + start_anchored_id: StateID::ZERO, + } + } +} |