From 4547b622d8d29df964fa2914213088b148c498fc Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:18:32 +0200 Subject: Merging upstream version 1.67.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/tests/hybrid/api.rs | 195 ++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 vendor/regex-automata/tests/hybrid/api.rs (limited to 'vendor/regex-automata/tests/hybrid/api.rs') diff --git a/vendor/regex-automata/tests/hybrid/api.rs b/vendor/regex-automata/tests/hybrid/api.rs new file mode 100644 index 000000000..9a834dbb8 --- /dev/null +++ b/vendor/regex-automata/tests/hybrid/api.rs @@ -0,0 +1,195 @@ +use std::error::Error; + +use regex_automata::{ + hybrid::{ + dfa::{self, DFA}, + regex::Regex, + OverlappingState, + }, + nfa::thompson, + HalfMatch, MatchError, MatchKind, MultiMatch, +}; + +use crate::util::{BunkPrefilter, SubstringPrefilter}; + +// Tests that too many cache resets cause the lazy DFA to quit. +// +// We only test this on 64-bit because the test is gingerly crafted based on +// implementation details of cache sizes. It's not a great test because of +// that, but it does check some interesting properties around how positions are +// reported when a search "gives up." +#[test] +#[cfg(target_pointer_width = "64")] +fn too_many_cache_resets_cause_quit() -> Result<(), Box> { + // This is a carefully chosen regex. The idea is to pick one that requires + // some decent number of states (hence the bounded repetition). But we + // specifically choose to create a class with an ASCII letter and a + // non-ASCII letter so that we can check that no new states are created + // once the cache is full. Namely, if we fill up the cache on a haystack + // of 'a's, then in order to match one 'β', a new state will need to be + // created since a 'β' is encoded with multiple bytes. Since there's no + // room for this state, the search should quit at the very first position. + let pattern = r"[aβ]{100}"; + let dfa = DFA::builder() + .configure( + // Configure it so that we have the minimum cache capacity + // possible. And that if any resets occur, the search quits. + DFA::config() + .skip_cache_capacity_check(true) + .cache_capacity(0) + .minimum_cache_clear_count(Some(0)), + ) + .build(pattern)?; + let mut cache = dfa.create_cache(); + + let haystack = "a".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 25 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); + assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); + assert_eq!( + dfa.find_overlapping_fwd( + &mut cache, + &haystack, + &mut OverlappingState::start() + ), + Err(err.clone()) + ); + + let haystack = "β".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 0 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + // no need to test that other find routines quit, since we did that above + + // OK, if we reset the cache, then we should be able to create more states + // and make more progress with searching for betas. + cache.reset(&dfa); + let err = MatchError::GaveUp { offset: 26 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + + // ... switching back to ASCII still makes progress since it just needs to + // set transitions on existing states! + let haystack = "a".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 13 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + + Ok(()) +} + +// Tests that quit bytes in the forward direction work correctly. +#[test] +fn quit_fwd() -> Result<(), Box> { + let dfa = DFA::builder() + .configure(DFA::config().quit(b'x', true)) + .build("[[:word:]]+$")?; + let mut cache = dfa.create_cache(); + + assert_eq!( + dfa.find_earliest_fwd(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_overlapping_fwd( + &mut cache, + b"abcxyz", + &mut OverlappingState::start() + ), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that quit bytes in the reverse direction work correctly. +#[test] +fn quit_rev() -> Result<(), Box> { + let dfa = DFA::builder() + .configure(DFA::config().quit(b'x', true)) + .thompson(thompson::Config::new().reverse(true)) + .build("^[[:word:]]+")?; + let mut cache = dfa.create_cache(); + + assert_eq!( + dfa.find_earliest_rev(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_rev(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that if we heuristically enable Unicode word boundaries but then +// instruct that a non-ASCII byte should NOT be a quit byte, then the builder +// will panic. +#[test] +#[should_panic] +fn quit_panics() { + DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); +} + +// This tests an intesting case where even if the Unicode word boundary option +// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode +// word boundaries to be enabled. +#[test] +fn unicode_word_implicitly_works() -> Result<(), Box> { + let mut config = DFA::config(); + for b in 0x80..=0xFF { + config = config.quit(b, true); + } + let dfa = DFA::builder().configure(config).build(r"\b")?; + let mut cache = dfa.create_cache(); + let expected = HalfMatch::must(0, 1); + assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); + Ok(()) +} + +// Tests that we can provide a prefilter to a Regex, and the search reports +// correct results. +#[test] +fn prefilter_works() -> Result<(), Box> { + let mut re = Regex::new(r"a[0-9]+").unwrap(); + re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); + let mut cache = re.create_cache(); + + let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; + let matches: Vec<(usize, usize)> = re + .find_leftmost_iter(&mut cache, text) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!( + matches, + vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] + ); + Ok(()) +} + +// This test confirms that a prefilter is active by using a prefilter that +// reports false negatives. +#[test] +fn prefilter_is_active() -> Result<(), Box> { + let text = b"za123"; + let mut re = Regex::new(r"a[0-9]+").unwrap(); + let mut cache = re.create_cache(); + + re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); + assert_eq!( + re.find_leftmost(&mut cache, b"za123"), + Some(MultiMatch::must(0, 1, 5)) + ); + assert_eq!( + re.find_leftmost(&mut cache, b"a123"), + Some(MultiMatch::must(0, 0, 4)) + ); + re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); + assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); + // This checks that the prefilter is used when first starting the search, + // instead of waiting until at least one transition has occurred. + assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); + Ok(()) +} -- cgit v1.2.3