use std::error::Error; use regex_automata::{ hybrid::{ dfa::{self, DFA}, regex::Regex, OverlappingState, }, nfa::thompson, HalfMatch, MatchError, MatchKind, MultiMatch, }; use crate::util::{BunkPrefilter, SubstringPrefilter}; // Tests that too many cache resets cause the lazy DFA to quit. // // We only test this on 64-bit because the test is gingerly crafted based on // implementation details of cache sizes. It's not a great test because of // that, but it does check some interesting properties around how positions are // reported when a search "gives up." #[test] #[cfg(target_pointer_width = "64")] fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // This is a carefully chosen regex. The idea is to pick one that requires // some decent number of states (hence the bounded repetition). But we // specifically choose to create a class with an ASCII letter and a // non-ASCII letter so that we can check that no new states are created // once the cache is full. Namely, if we fill up the cache on a haystack // of 'a's, then in order to match one 'β', a new state will need to be // created since a 'β' is encoded with multiple bytes. Since there's no // room for this state, the search should quit at the very first position. let pattern = r"[aβ]{100}"; let dfa = DFA::builder() .configure( // Configure it so that we have the minimum cache capacity // possible. And that if any resets occur, the search quits. DFA::config() .skip_cache_capacity_check(true) .cache_capacity(0) .minimum_cache_clear_count(Some(0)), ) .build(pattern)?; let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); let err = MatchError::GaveUp { offset: 25 }; assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); assert_eq!( dfa.find_overlapping_fwd( &mut cache, &haystack, &mut OverlappingState::start() ), Err(err.clone()) ); let haystack = "β".repeat(101).into_bytes(); let err = MatchError::GaveUp { offset: 0 }; assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); // no need to test that other find routines quit, since we did that above // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. cache.reset(&dfa); let err = MatchError::GaveUp { offset: 26 }; assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); // ... switching back to ASCII still makes progress since it just needs to // set transitions on existing states! let haystack = "a".repeat(101).into_bytes(); let err = MatchError::GaveUp { offset: 13 }; assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); Ok(()) } // Tests that quit bytes in the forward direction work correctly. #[test] fn quit_fwd() -> Result<(), Box> { let dfa = DFA::builder() .configure(DFA::config().quit(b'x', true)) .build("[[:word:]]+$")?; let mut cache = dfa.create_cache(); assert_eq!( dfa.find_earliest_fwd(&mut cache, b"abcxyz"), Err(MatchError::Quit { byte: b'x', offset: 3 }) ); assert_eq!( dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), Err(MatchError::Quit { byte: b'x', offset: 3 }) ); assert_eq!( dfa.find_overlapping_fwd( &mut cache, b"abcxyz", &mut OverlappingState::start() ), Err(MatchError::Quit { byte: b'x', offset: 3 }) ); Ok(()) } // Tests that quit bytes in the reverse direction work correctly. #[test] fn quit_rev() -> Result<(), Box> { let dfa = DFA::builder() .configure(DFA::config().quit(b'x', true)) .thompson(thompson::Config::new().reverse(true)) .build("^[[:word:]]+")?; let mut cache = dfa.create_cache(); assert_eq!( dfa.find_earliest_rev(&mut cache, b"abcxyz"), Err(MatchError::Quit { byte: b'x', offset: 3 }) ); assert_eq!( dfa.find_leftmost_rev(&mut cache, b"abcxyz"), Err(MatchError::Quit { byte: b'x', offset: 3 }) ); Ok(()) } // Tests that if we heuristically enable Unicode word boundaries but then // instruct that a non-ASCII byte should NOT be a quit byte, then the builder // will panic. #[test] #[should_panic] fn quit_panics() { DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); } // This tests an intesting case where even if the Unicode word boundary option // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode // word boundaries to be enabled. #[test] fn unicode_word_implicitly_works() -> Result<(), Box> { let mut config = DFA::config(); for b in 0x80..=0xFF { config = config.quit(b, true); } let dfa = DFA::builder().configure(config).build(r"\b")?; let mut cache = dfa.create_cache(); let expected = HalfMatch::must(0, 1); assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); Ok(()) } // Tests that we can provide a prefilter to a Regex, and the search reports // correct results. #[test] fn prefilter_works() -> Result<(), Box> { let mut re = Regex::new(r"a[0-9]+").unwrap(); re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); let mut cache = re.create_cache(); let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; let matches: Vec<(usize, usize)> = re .find_leftmost_iter(&mut cache, text) .map(|m| (m.start(), m.end())) .collect(); assert_eq!( matches, vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] ); Ok(()) } // This test confirms that a prefilter is active by using a prefilter that // reports false negatives. #[test] fn prefilter_is_active() -> Result<(), Box> { let text = b"za123"; let mut re = Regex::new(r"a[0-9]+").unwrap(); let mut cache = re.create_cache(); re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); assert_eq!( re.find_leftmost(&mut cache, b"za123"), Some(MultiMatch::must(0, 1, 5)) ); assert_eq!( re.find_leftmost(&mut cache, b"a123"), Some(MultiMatch::must(0, 0, 4)) ); re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); // This checks that the prefilter is used when first starting the search, // instead of waiting until at least one transition has occurred. assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); Ok(()) }