diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
commit | c23a457e72abe608715ac76f076f47dc42af07a5 (patch) | |
tree | 2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/dfa/mod.rs | |
parent | Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/dfa/mod.rs')
-rw-r--r-- | vendor/regex-automata/src/dfa/mod.rs | 135 |
1 files changed, 66 insertions, 69 deletions
diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs index 6f9fe605e..4bb870435 100644 --- a/vendor/regex-automata/src/dfa/mod.rs +++ b/vendor/regex-automata/src/dfa/mod.rs @@ -1,5 +1,5 @@ /*! -A module for building and searching with determinstic finite automata (DFAs). +A module for building and searching with deterministic finite automata (DFAs). Like other modules in this crate, DFAs support a rich regex syntax with Unicode features. DFAs also have extensive options for configuring the best space vs @@ -26,20 +26,25 @@ DFAs implement. (A `regex::Regex` is generic over this trait.) [`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g., [`dense::DFA::from_bytes`]). +There is also a [`onepass`] module that provides a [one-pass +DFA](onepass::DFA). The unique advantage of this DFA is that, for the class +of regexes it can be built with, it supports reporting the spans of matching +capturing groups. It is the only DFA in this crate capable of such a thing. + # Example: basic regex searching This example shows how to compile a regex using the default configuration and then use it to find matches in a byte string: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +use regex_automata::{Match, dfa::regex::Regex}; let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -51,36 +56,15 @@ simultaneously. You can use this support with standard leftmost-first style searching to find non-overlapping matches: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +# if cfg!(miri) { return Ok(()); } // miri takes too long +use regex_automata::{Match, dfa::regex::Regex}; let re = Regex::new_many(&[r"\w+", r"\S+"])?; let text = b"@foo bar"; -let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(1, 0, 4), - MultiMatch::must(0, 5, 8), -]); -# Ok::<(), Box<dyn std::error::Error>>(()) -``` - -Or use overlapping style searches to find all possible occurrences: - -``` -use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}}; - -// N.B. For overlapping searches, we need the underlying DFA to report all -// possible matches. -let re = Regex::builder() - .dense(dense::Config::new().match_kind(MatchKind::All)) - .build_many(&[r"\w{3}", r"\S{3}"])?; -let text = b"@foo bar"; -let matches: Vec<MultiMatch> = re.find_overlapping_iter(text).collect(); -assert_eq!(matches, vec![ - MultiMatch::must(1, 0, 3), - MultiMatch::must(0, 1, 4), - MultiMatch::must(1, 1, 4), - MultiMatch::must(0, 5, 8), - MultiMatch::must(1, 5, 8), + Match::must(1, 0..4), + Match::must(0, 5..8), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -96,14 +80,14 @@ Using sparse DFAs is as easy as using `Regex::new_sparse` instead of `Regex::new`: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +use regex_automata::{Match, dfa::regex::Regex}; let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -112,7 +96,7 @@ If you already have dense DFAs for some reason, they can be converted to sparse DFAs and used to build a new `Regex`. For example: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +use regex_automata::{Match, dfa::regex::Regex}; let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let sparse_re = Regex::builder().build_from_dfas( @@ -120,10 +104,10 @@ let sparse_re = Regex::builder().build_from_dfas( dense_re.reverse().to_sparse()?, ); let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = sparse_re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = sparse_re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -136,7 +120,7 @@ bit contrived, this same technique can be used in your program to deserialize a DFA at start up time or by memory mapping a file. ``` -use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}}; +use regex_automata::{Match, dfa::{dense, regex::Regex}}; let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); // serialize both the forward and reverse DFAs, see note below @@ -150,10 +134,10 @@ let re2 = Regex::builder().build_from_dfas(fwd, rev); // we can use it like normal let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re2.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -183,7 +167,7 @@ valid DFA. The same process can be achieved with sparse DFAs as well: ``` -use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}}; +use regex_automata::{Match, dfa::{sparse, regex::Regex}}; let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); // serialize both @@ -197,17 +181,17 @@ let re2 = Regex::builder().build_from_dfas(fwd, rev); // we can use it like normal let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re2.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` Note that unlike dense DFAs, sparse DFAs have no alignment requirements. Conversely, dense DFAs must be be aligned to the same alignment as a -[`StateID`](crate::util::id::StateID). +[`StateID`](crate::util::primitives::StateID). # Support for `no_std` and `alloc`-only @@ -232,8 +216,8 @@ you would any regex. Deserialization can happen anywhere. For example, with bytes embedded into a binary or with a file memory mapped at runtime. -TODO: Include link to `regex-cli` here pointing out how to generate Rust code -for deserializing DFAs. +The `regex-cli` command (found in the same repository as this crate) can be +used to serialize DFAs to files and generate Rust code to read them. # Syntax @@ -283,7 +267,7 @@ the regexes in this module are almost universally slow to compile, especially when they contain large Unicode character classes. For example, on my system, compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling a sparse regex takes about the same time but only uses about 1.2MB of -memory.) Conversly, compiling the same regex without Unicode support, e.g., +memory.) Conversely, compiling the same regex without Unicode support, e.g., `(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this reason, you should only use Unicode character classes if you absolutely need them! (They are enabled by default though.) @@ -299,10 +283,10 @@ optimizations means that searches may run much slower than what you're accustomed to, although, it does provide more predictable and consistent performance. * There is no `&str` API like in the regex crate. In this module, all APIs -operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8 -boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8), -[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or -[`regex::Config::utf8`] are disabled. +operate on `&[u8]`. By default, match indices are +guaranteed to fall on UTF-8 boundaries, unless either of +[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or +[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled. With some of the downsides out of the way, here are some positive differences: @@ -334,9 +318,11 @@ via [`dense::Config::minimize`], but it can increase compilation times dramatically. */ -pub use crate::dfa::automaton::{Automaton, OverlappingState}; -#[cfg(feature = "alloc")] -pub use crate::dfa::error::Error; +#[cfg(feature = "dfa-search")] +pub use crate::dfa::{ + automaton::{Automaton, OverlappingState}, + start::StartKind, +}; /// This is an alias for a state ID of zero. It has special significance /// because it always corresponds to the first state in a DFA, and the first @@ -344,20 +330,31 @@ pub use crate::dfa::error::Error; /// of its transitions set to itself. Moreover, the dead state is used as a /// sentinel for various things. e.g., In search, reaching a dead state means /// that the search must stop. -const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO; +const DEAD: crate::util::primitives::StateID = + crate::util::primitives::StateID::ZERO; -mod accel; -mod automaton; +#[cfg(feature = "dfa-search")] pub mod dense; -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-onepass")] +pub mod onepass; +#[cfg(feature = "dfa-search")] +pub mod regex; +#[cfg(feature = "dfa-search")] +pub mod sparse; + +#[cfg(feature = "dfa-search")] +pub(crate) mod accel; +#[cfg(feature = "dfa-search")] +mod automaton; +#[cfg(feature = "dfa-build")] mod determinize; -#[cfg(feature = "alloc")] -pub(crate) mod error; -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] mod minimize; -pub mod regex; +#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))] +mod remapper; +#[cfg(feature = "dfa-search")] mod search; -pub mod sparse; +#[cfg(feature = "dfa-search")] mod special; -#[cfg(feature = "transducer")] -mod transducer; +#[cfg(feature = "dfa-search")] +mod start; |