use regex_automata::{ hybrid::{ dfa::DFA, regex::{self, Regex}, }, nfa::thompson, MatchKind, SyntaxConfig, }; use regex_syntax as syntax; use regex_test::{ bstr::{BString, ByteSlice}, CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, SearchKind as TestSearchKind, TestResult, TestRunner, }; use crate::{suite, Result}; /// Tests the default configuration of the hybrid NFA/DFA. #[test] fn default() -> Result<()> { let builder = Regex::builder(); TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } /// Tests the hybrid NFA/DFA with NFA shrinking disabled. /// /// This is actually the typical configuration one wants for a lazy DFA. NFA /// shrinking is mostly only advantageous when building a full DFA since it /// can sharply decrease the amount of time determinization takes. But NFA /// shrinking is itself otherwise fairly expensive. Since a lazy DFA has /// no compilation time (other than for building the NFA of course) before /// executing a search, it's usually worth it to forgo NFA shrinking. #[test] fn no_nfa_shrink() -> Result<()> { let mut builder = Regex::builder(); builder.thompson(thompson::Config::new().shrink(false)); TestRunner::new()? // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled. #[test] fn starts_for_each_pattern() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().starts_for_each_pattern(true)); TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } /// Tests the hybrid NFA/DFA when byte classes are disabled. /// /// N.B. Disabling byte classes doesn't avoid any indirection at search time. /// All it does is cause every byte value to be its own distinct equivalence /// class. #[test] fn no_byte_classes() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().byte_classes(false)); TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } /// Tests that hybrid NFA/DFA never clears its cache for any test with the /// default capacity. /// /// N.B. If a regex suite test is added that causes the cache to be cleared, /// then this should just skip that test. (Which can be done by calling the /// 'blacklist' method on 'TestRunner'.) #[test] fn no_cache_clearing() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().minimum_cache_clear_count(Some(0))); TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } /// Tests the hybrid NFA/DFA when the minimum cache capacity is set. #[test] fn min_cache_capacity() -> Result<()> { let mut builder = Regex::builder(); builder .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true)); TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } fn compiler( mut builder: regex::Builder, ) -> impl FnMut(&RegexTest, &[BString]) -> Result { move |test, regexes| { let regexes = regexes .iter() .map(|r| r.to_str().map(|s| s.to_string())) .collect::, _>>()?; // Check if our regex contains things that aren't supported by DFAs. // That is, Unicode word boundaries when searching non-ASCII text. let mut thompson = thompson::Builder::new(); thompson.syntax(config_syntax(test)).configure(config_thompson(test)); if let Ok(nfa) = thompson.build_many(®exes) { let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); if nfa.has_word_boundary_unicode() && non_ascii { return Ok(CompiledRegex::skip()); } } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> Vec { run_test(&re, &mut cache, test) })) } } fn run_test( re: &Regex, cache: &mut regex::Cache, test: &RegexTest, ) -> Vec { let is_match = if re.is_match(cache, test.input()) { TestResult::matched() } else { TestResult::no_match() }; let is_match = is_match.name("is_match"); let find_matches = match test.search_kind() { TestSearchKind::Earliest => { let it = re .find_earliest_iter(cache, test.input()) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), start: m.start(), end: m.end(), }); TestResult::matches(it).name("find_earliest_iter") } TestSearchKind::Leftmost => { let it = re .find_leftmost_iter(cache, test.input()) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), start: m.start(), end: m.end(), }); TestResult::matches(it).name("find_leftmost_iter") } TestSearchKind::Overlapping => { let it = re .find_overlapping_iter(cache, test.input()) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), start: m.start(), end: m.end(), }); TestResult::matches(it).name("find_overlapping_iter") } }; vec![is_match, find_matches] } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). fn configure_regex_builder( test: &RegexTest, builder: &mut regex::Builder, ) -> bool { let match_kind = match test.match_kind() { TestMatchKind::All => MatchKind::All, TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, TestMatchKind::LeftmostLongest => return false, }; let dense_config = DFA::config() .anchored(test.anchored()) .match_kind(match_kind) .unicode_word_boundary(true); let regex_config = Regex::config().utf8(test.utf8()); builder .configure(regex_config) .syntax(config_syntax(test)) .thompson(config_thompson(test)) .dfa(dense_config); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { thompson::Config::new().utf8(test.utf8()) } /// Configuration of the regex parser from a regex test. fn config_syntax(test: &RegexTest) -> SyntaxConfig { SyntaxConfig::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) }