diff options
Diffstat (limited to 'vendor/matchers/src/lib.rs')
-rw-r--r-- | vendor/matchers/src/lib.rs | 600 |
1 files changed, 600 insertions, 0 deletions
diff --git a/vendor/matchers/src/lib.rs b/vendor/matchers/src/lib.rs new file mode 100644 index 000000000..2720a1acf --- /dev/null +++ b/vendor/matchers/src/lib.rs @@ -0,0 +1,600 @@ +//! Regex matchers on character and byte streams. +//! +//! ## Overview +//! +//! The [`regex`] crate implements regular expression matching on strings and byte +//! arrays. However, in order to match the output of implementations of `fmt::Debug` +//! and `fmt::Display`, or by any code which writes to an instance of `fmt::Write` +//! or `io::Write`, it is necessary to first allocate a buffer, write to that +//! buffer, and then match the buffer against a regex. +//! +//! In cases where it is not necessary to extract substrings, but only to test whether +//! or not output matches a regex, it is not strictly necessary to allocate and +//! write this output to a buffer. This crate provides a simple interface on top of +//! the lower-level [`regex-automata`] library that implements `fmt::Write` and +//! `io::Write` for regex patterns. This may be used to test whether streaming +//! output matches a pattern without buffering that output. +//! +//! Users who need to extract substrings based on a pattern or who already have +//! buffered data should probably use the [`regex`] crate instead. +//! +//! ## Syntax +//! +//! This crate uses the same [regex syntax][syntax] of the `regex-automata` crate. +//! +//! [`regex`]: https://crates.io/crates/regex +//! [`regex-automata`]: https://crates.io/crates/regex-automata +//! [syntax]: https://docs.rs/regex-automata/0.1.7/regex_automata/#syntax + +use regex_automata::{dense, DenseDFA, SparseDFA, StateID, DFA}; +use std::{fmt, io, marker::PhantomData, str::FromStr}; + +pub use regex_automata::Error; + +/// A compiled match pattern that can match multipe inputs, or return a +/// [`Matcher`] that matches a single input. +/// +/// [`Matcher`]: ../struct.Matcher.html +#[derive(Debug, Clone)] +pub struct Pattern<S = usize, A = DenseDFA<Vec<S>, S>> +where + S: StateID, + A: DFA<ID = S>, +{ + automaton: A, +} + +/// A reference to a [`Pattern`] that matches a single input. +/// +/// [`Pattern`]: ../struct.Pattern.html +#[derive(Debug, Clone)] +pub struct Matcher<'a, S = usize, A = DenseDFA<&'a [S], S>> +where + S: StateID, + A: DFA<ID = S>, +{ + automaton: A, + state: S, + _lt: PhantomData<&'a ()>, +} + +// === impl Pattern === + +impl Pattern { + /// Returns a new `Pattern` for the given regex, or an error if the regex + /// was invalid. + /// + /// The returned `Pattern` will match occurances of the pattern which start + /// at *any* in a byte or character stream — the pattern may be preceded by + /// any number of non-matching characters. Essentially, it will behave as + /// though the regular expression started with a `.*?`, which enables a + /// match to appear anywhere. If this is not the desired behavior, use + /// [`Pattern::new_anchored`] instead. + /// + /// For example: + /// ``` + /// use matchers::Pattern; + /// + /// // This pattern matches any number of `a`s followed by a `b`. + /// let pattern = Pattern::new("a+b").expect("regex is not invalid"); + /// + /// // Of course, the pattern matches an input where the entire sequence of + /// // characters matches the pattern: + /// assert!(pattern.display_matches(&"aaaaab")); + /// + /// // And, since the pattern is unanchored, it will also match the + /// // sequence when it's followed by non-matching characters: + /// assert!(pattern.display_matches(&"hello world! aaaaab")); + /// ``` + pub fn new(pattern: &str) -> Result<Self, Error> { + let automaton = DenseDFA::new(pattern)?; + Ok(Pattern { automaton }) + } + + /// Returns a new `Pattern` anchored at the beginning of the input stream, + /// or an error if the regex was invalid. + /// + /// The returned `Pattern` will *only* match an occurence of the pattern in + /// an input sequence if the first character or byte in the input matches + /// the pattern. If this is not the desired behavior, use [`Pattern::new`] + /// instead. + /// + /// For example: + /// ``` + /// use matchers::Pattern; + /// + /// // This pattern matches any number of `a`s followed by a `b`. + /// let pattern = Pattern::new_anchored("a+b") + /// .expect("regex is not invalid"); + /// + /// // The pattern matches an input where the entire sequence of + /// // characters matches the pattern: + /// assert!(pattern.display_matches(&"aaaaab")); + /// + /// // Since the pattern is anchored, it will *not* match an input that + /// // begins with non-matching characters: + /// assert!(!pattern.display_matches(&"hello world! aaaaab")); + /// + /// // ...however, if we create a pattern beginning with `.*?`, it will: + /// let pattern2 = Pattern::new_anchored(".*?a+b") + /// .expect("regex is not invalid"); + /// assert!(pattern2.display_matches(&"hello world! aaaaab")); + /// ``` + pub fn new_anchored(pattern: &str) -> Result<Self, Error> { + let automaton = dense::Builder::new().anchored(true).build(pattern)?; + Ok(Pattern { automaton }) + } +} + +impl FromStr for Pattern { + type Err = Error; + fn from_str(s: &str) -> Result<Self, Self::Err> { + Self::new(s) + } +} + +impl<S, A> Pattern<S, A> +where + S: StateID, + A: DFA<ID = S>, + Self: for<'a> ToMatcher<'a, S>, +{ + /// Returns `true` if this pattern matches the given string. + #[inline] + pub fn matches(&self, s: &impl AsRef<str>) -> bool { + self.matcher().matches(s) + } + + /// Returns `true` if this pattern matches the formatted output of the given + /// type implementing `fmt::Debug`. + /// + /// For example: + /// ```rust + /// use matchers::Pattern; + /// + /// #[derive(Debug)] + /// pub struct Hello { + /// to: &'static str, + /// } + /// + /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap(); + /// + /// let hello_world = Hello { to: "World" }; + /// assert!(pattern.debug_matches(&hello_world)); + /// + /// let hello_sf = Hello { to: "San Francisco" }; + /// assert_eq!(pattern.debug_matches(&hello_sf), false); + /// + /// let hello_washington = Hello { to: "Washington" }; + /// assert!(pattern.debug_matches(&hello_washington)); + /// ``` + #[inline] + pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool { + self.matcher().debug_matches(d) + } + + /// Returns `true` if this pattern matches the formatted output of the given + /// type implementing `fmt::Display`. + /// + /// For example: + /// ```rust + /// # use std::fmt; + /// use matchers::Pattern; + /// + /// #[derive(Debug)] + /// pub struct Hello { + /// to: &'static str, + /// } + /// + /// impl fmt::Display for Hello { + /// fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + /// write!(f, "Hello {}", self.to) + /// } + /// } + /// + /// let pattern = Pattern::new("Hello [Ww].+").unwrap(); + /// + /// let hello_world = Hello { to: "world" }; + /// assert!(pattern.display_matches(&hello_world)); + /// assert_eq!(pattern.debug_matches(&hello_world), false); + /// + /// let hello_sf = Hello { to: "San Francisco" }; + /// assert_eq!(pattern.display_matches(&hello_sf), false); + /// + /// let hello_washington = Hello { to: "Washington" }; + /// assert!(pattern.display_matches(&hello_washington)); + /// ``` + #[inline] + pub fn display_matches(&self, d: &impl fmt::Display) -> bool { + self.matcher().display_matches(d) + } + + /// Returns either a `bool` indicating whether or not this pattern matches the + /// data read from the provided `io::Read` stream, or an `io::Error` if an + /// error occurred reading from the stream. + #[inline] + pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> { + self.matcher().read_matches(io) + } +} + +// === impl Matcher === + +impl<'a, S, A> Matcher<'a, S, A> +where + S: StateID, + A: DFA<ID = S>, +{ + fn new(automaton: A) -> Self { + let state = automaton.start_state(); + Self { + automaton, + state, + _lt: PhantomData, + } + } + + #[inline] + fn advance(&mut self, input: u8) { + self.state = unsafe { + // It's safe to call `next_state_unchecked` since the matcher may + // only be constructed by a `Pattern`, which, in turn,can only be + // constructed with a valid DFA. + self.automaton.next_state_unchecked(self.state, input) + }; + } + + /// Returns `true` if this `Matcher` has matched any input that has been + /// provided. + #[inline] + pub fn is_matched(&self) -> bool { + self.automaton.is_match_state(self.state) + } + + /// Returns `true` if this pattern matches the formatted output of the given + /// type implementing `fmt::Debug`. + pub fn matches(mut self, s: &impl AsRef<str>) -> bool { + for &byte in s.as_ref().as_bytes() { + self.advance(byte); + if self.automaton.is_dead_state(self.state) { + return false; + } + } + self.is_matched() + } + + /// Returns `true` if this pattern matches the formatted output of the given + /// type implementing `fmt::Debug`. + pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool { + use std::fmt::Write; + write!(&mut self, "{:?}", d).expect("matcher write impl should not fail"); + self.is_matched() + } + + /// Returns `true` if this pattern matches the formatted output of the given + /// type implementing `fmt::Display`. + pub fn display_matches(mut self, d: &impl fmt::Display) -> bool { + use std::fmt::Write; + write!(&mut self, "{}", d).expect("matcher write impl should not fail"); + self.is_matched() + } + + /// Returns either a `bool` indicating whether or not this pattern matches the + /// data read from the provided `io::Read` stream, or an `io::Error` if an + /// error occurred reading from the stream. + pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> { + for r in io.bytes() { + self.advance(r?); + if self.automaton.is_dead_state(self.state) { + return Ok(false); + } + } + Ok(self.is_matched()) + } +} + +impl<'a, S, A> fmt::Write for Matcher<'a, S, A> +where + S: StateID, + A: DFA<ID = S>, +{ + fn write_str(&mut self, s: &str) -> fmt::Result { + for &byte in s.as_bytes() { + self.advance(byte); + if self.automaton.is_dead_state(self.state) { + break; + } + } + Ok(()) + } +} + +impl<'a, S, A> io::Write for Matcher<'a, S, A> +where + S: StateID, + A: DFA<ID = S>, +{ + fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> { + let mut i = 0; + for &byte in bytes { + self.advance(byte); + i += 1; + if self.automaton.is_dead_state(self.state) { + break; + } + } + Ok(i) + } + + fn flush(&mut self) -> Result<(), io::Error> { + Ok(()) + } +} + +pub trait ToMatcher<'a, S> +where + Self: crate::sealed::Sealed, + S: StateID + 'a, +{ + type Automaton: DFA<ID = S>; + fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton>; +} + +impl<S> crate::sealed::Sealed for Pattern<S, DenseDFA<Vec<S>, S>> where S: StateID {} + +impl<'a, S> ToMatcher<'a, S> for Pattern<S, DenseDFA<Vec<S>, S>> +where + S: StateID + 'a, +{ + type Automaton = DenseDFA<&'a [S], S>; + fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> { + Matcher::new(self.automaton.as_ref()) + } +} + +impl<'a, S> ToMatcher<'a, S> for Pattern<S, SparseDFA<Vec<u8>, S>> +where + S: StateID + 'a, +{ + type Automaton = SparseDFA<&'a [u8], S>; + fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> { + Matcher::new(self.automaton.as_ref()) + } +} + +impl<S> crate::sealed::Sealed for Pattern<S, SparseDFA<Vec<u8>, S>> where S: StateID {} + +mod sealed { + pub trait Sealed {} +} + +#[cfg(test)] +mod test { + use super::*; + + struct Str<'a>(&'a str); + struct ReadStr<'a>(io::Cursor<&'a [u8]>); + + impl<'a> fmt::Debug for Str<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } + } + + impl<'a> fmt::Display for Str<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } + } + + impl<'a> io::Read for ReadStr<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + self.0.read(buf) + } + } + + impl Str<'static> { + fn hello_world() -> Self { + Self::new("hello world") + } + } + + impl<'a> Str<'a> { + fn new(s: &'a str) -> Self { + Str(s) + } + + fn to_reader(self) -> ReadStr<'a> { + ReadStr(io::Cursor::new(self.0.as_bytes())) + } + } + + fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { + let pat = new_pattern("hello world").unwrap(); + assert!(pat.debug_matches(&Str::hello_world())); + + let pat = new_pattern("hel+o w[orl]{3}d").unwrap(); + assert!(pat.debug_matches(&Str::hello_world())); + + let pat = new_pattern("goodbye world").unwrap(); + assert_eq!(pat.debug_matches(&Str::hello_world()), false); + } + + fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { + let pat = new_pattern("hello world").unwrap(); + assert!(pat.display_matches(&Str::hello_world())); + + let pat = new_pattern("hel+o w[orl]{3}d").unwrap(); + assert!(pat.display_matches(&Str::hello_world())); + + let pat = new_pattern("goodbye world").unwrap(); + assert_eq!(pat.display_matches(&Str::hello_world()), false); + } + + fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { + let pat = new_pattern("hello world").unwrap(); + assert!(pat + .read_matches(Str::hello_world().to_reader()) + .expect("no io error should occur")); + + let pat = new_pattern("hel+o w[orl]{3}d").unwrap(); + assert!(pat + .read_matches(Str::hello_world().to_reader()) + .expect("no io error should occur")); + + let pat = new_pattern("goodbye world").unwrap(); + assert_eq!( + pat.read_matches(Str::hello_world().to_reader()) + .expect("no io error should occur"), + false + ); + } + + fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { + let pat = new_pattern("a+b").unwrap(); + assert!(pat.debug_matches(&Str::new("ab"))); + assert!(pat.debug_matches(&Str::new("aaaab"))); + assert!(pat.debug_matches(&Str::new("aaaaaaaaaab"))); + assert_eq!(pat.debug_matches(&Str::new("b")), false); + assert_eq!(pat.debug_matches(&Str::new("abb")), false); + assert_eq!(pat.debug_matches(&Str::new("aaaaabb")), false); + } + + mod anchored { + use super::*; + #[test] + fn debug_matches() { + test_debug_matches(Pattern::new_anchored) + } + + #[test] + fn display_matches() { + test_display_matches(Pattern::new_anchored) + } + + #[test] + fn reader_matches() { + test_reader_matches(Pattern::new_anchored) + } + + #[test] + fn debug_rep_patterns() { + test_debug_rep_patterns(Pattern::new_anchored) + } + + // === anchored behavior ============================================= + // Tests that anchored patterns match each input type only beginning at + // the first character. + fn test_is_anchored(f: impl Fn(&Pattern, Str) -> bool) { + let pat = Pattern::new_anchored("a+b").unwrap(); + assert!(f(&pat, Str::new("ab"))); + assert!(f(&pat, Str::new("aaaab"))); + assert!(f(&pat, Str::new("aaaaaaaaaab"))); + assert!(!f(&pat, Str::new("bab"))); + assert!(!f(&pat, Str::new("ffab"))); + assert!(!f(&pat, Str::new("qqqqqqqaaaaab"))); + } + + #[test] + fn debug_is_anchored() { + test_is_anchored(|pat, input| pat.debug_matches(&input)) + } + + #[test] + fn display_is_anchored() { + test_is_anchored(|pat, input| pat.display_matches(&input)); + } + + #[test] + fn reader_is_anchored() { + test_is_anchored(|pat, input| { + pat.read_matches(input.to_reader()) + .expect("no io error occurs") + }); + } + + // === explicitly unanchored ========================================= + // Tests that if an "anchored" pattern begins with `.*?`, it matches as + // though it was unanchored. + fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool) { + let pat = Pattern::new_anchored(".*?a+b").unwrap(); + assert!(f(&pat, Str::new("ab"))); + assert!(f(&pat, Str::new("aaaab"))); + assert!(f(&pat, Str::new("aaaaaaaaaab"))); + assert!(f(&pat, Str::new("bab"))); + assert!(f(&pat, Str::new("ffab"))); + assert!(f(&pat, Str::new("qqqqqqqaaaaab"))); + } + + #[test] + fn debug_explicitly_unanchored() { + test_explicitly_unanchored(|pat, input| pat.debug_matches(&input)) + } + + #[test] + fn display_explicitly_unanchored() { + test_explicitly_unanchored(|pat, input| pat.display_matches(&input)); + } + + #[test] + fn reader_explicitly_unanchored() { + test_explicitly_unanchored(|pat, input| { + pat.read_matches(input.to_reader()) + .expect("no io error occurs") + }); + } + } + + mod unanchored { + use super::*; + #[test] + fn debug_matches() { + test_debug_matches(Pattern::new) + } + + #[test] + fn display_matches() { + test_display_matches(Pattern::new) + } + + #[test] + fn reader_matches() { + test_reader_matches(Pattern::new) + } + + #[test] + fn debug_rep_patterns() { + test_debug_rep_patterns(Pattern::new) + } + + // === anchored behavior ============================================= + // Tests that unanchored patterns match anywhere in the input stream. + fn test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool) { + let pat = Pattern::new("a+b").unwrap(); + assert!(f(&pat, Str::new("ab"))); + assert!(f(&pat, Str::new("aaaab"))); + assert!(f(&pat, Str::new("aaaaaaaaaab"))); + assert!(f(&pat, Str::new("bab"))); + assert!(f(&pat, Str::new("ffab"))); + assert!(f(&pat, Str::new("qqqfqqqqaaaaab"))); + } + + #[test] + fn debug_is_unanchored() { + test_is_unanchored(|pat, input| pat.debug_matches(&input)) + } + + #[test] + fn display_is_unanchored() { + test_is_unanchored(|pat, input| pat.display_matches(&input)); + } + + #[test] + fn reader_is_unanchored() { + test_is_unanchored(|pat, input| { + pat.read_matches(input.to_reader()) + .expect("no io error occurs") + }); + } + } +} |