use regex_automata::{ dfa::{self, dense, regex::Regex, sparse, Automaton}, nfa::thompson, MatchKind, SyntaxConfig, }; use regex_syntax as syntax; use regex_test::{ bstr::{BString, ByteSlice}, CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, SearchKind as TestSearchKind, TestResult, TestRunner, }; use crate::{suite, Result}; /// Runs the test suite with the default configuration. #[test] fn unminimized_default() -> Result<()> { let builder = Regex::builder(); TestRunner::new()? .test_iter(suite()?.iter(), dense_compiler(builder)) .assert(); Ok(()) } /// Runs the test suite with byte classes disabled. #[test] fn unminimized_no_byte_class() -> Result<()> { let mut builder = Regex::builder(); builder.dense(dense::Config::new().byte_classes(false)); TestRunner::new()? .test_iter(suite()?.iter(), dense_compiler(builder)) .assert(); Ok(()) } /// Runs the test suite with NFA shrinking disabled. #[test] fn unminimized_no_nfa_shrink() -> Result<()> { let mut builder = Regex::builder(); builder.thompson(thompson::Config::new().shrink(false)); TestRunner::new()? .test_iter(suite()?.iter(), dense_compiler(builder)) .assert(); Ok(()) } /// Runs the test suite on a minimized DFA with an otherwise default /// configuration. #[test] fn minimized_default() -> Result<()> { let mut builder = Regex::builder(); builder.dense(dense::Config::new().minimize(true)); TestRunner::new()? // These regexes tend to be too big. Minimization takes... forever. .blacklist("expensive") .test_iter(suite()?.iter(), dense_compiler(builder)) .assert(); Ok(()) } /// Runs the test suite on a minimized DFA with byte classes disabled. #[test] fn minimized_no_byte_class() -> Result<()> { let mut builder = Regex::builder(); builder.dense(dense::Config::new().minimize(true).byte_classes(false)); TestRunner::new()? // These regexes tend to be too big. Minimization takes... forever. .blacklist("expensive") .test_iter(suite()?.iter(), dense_compiler(builder)) .assert(); Ok(()) } /// Runs the test suite on a sparse unminimized DFA. #[test] fn sparse_unminimized_default() -> Result<()> { let builder = Regex::builder(); TestRunner::new()? .test_iter(suite()?.iter(), sparse_compiler(builder)) .assert(); Ok(()) } /// Another basic sanity test that checks we can serialize and then deserialize /// a regex, and that the resulting regex can be used for searching correctly. #[test] fn serialization_unminimized_default() -> Result<()> { let builder = Regex::builder(); let my_compiler = |builder| { compiler(builder, |builder, re| { let builder = builder.clone(); let (fwd_bytes, _) = re.forward().to_bytes_native_endian(); let (rev_bytes, _) = re.reverse().to_bytes_native_endian(); Ok(CompiledRegex::compiled(move |test| -> Vec { let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes).unwrap().0; let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes).unwrap().0; let re = builder.build_from_dfas(fwd, rev); run_test(&re, test) })) }) }; TestRunner::new()? .test_iter(suite()?.iter(), my_compiler(builder)) .assert(); Ok(()) } /// A basic sanity test that checks we can serialize and then deserialize a /// regex using sparse DFAs, and that the resulting regex can be used for /// searching correctly. #[test] fn sparse_serialization_unminimized_default() -> Result<()> { let builder = Regex::builder(); let my_compiler = |builder| { compiler(builder, |builder, re| { let builder = builder.clone(); let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian(); let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian(); Ok(CompiledRegex::compiled(move |test| -> Vec { let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes).unwrap().0; let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes).unwrap().0; let re = builder.build_from_dfas(fwd, rev); run_test(&re, test) })) }) }; TestRunner::new()? .test_iter(suite()?.iter(), my_compiler(builder)) .assert(); Ok(()) } fn dense_compiler( builder: dfa::regex::Builder, ) -> impl FnMut(&RegexTest, &[BString]) -> Result { compiler(builder, |_, re| { Ok(CompiledRegex::compiled(move |test| -> Vec { run_test(&re, test) })) }) } fn sparse_compiler( builder: dfa::regex::Builder, ) -> impl FnMut(&RegexTest, &[BString]) -> Result { compiler(builder, |builder, re| { let fwd = re.forward().to_sparse()?; let rev = re.reverse().to_sparse()?; let re = builder.build_from_dfas(fwd, rev); Ok(CompiledRegex::compiled(move |test| -> Vec { run_test(&re, test) })) }) } fn compiler( mut builder: dfa::regex::Builder, mut create_matcher: impl FnMut( &dfa::regex::Builder, Regex, ) -> Result, ) -> impl FnMut(&RegexTest, &[BString]) -> Result { move |test, regexes| { let regexes = regexes .iter() .map(|r| r.to_str().map(|s| s.to_string())) .collect::, _>>()?; // Check if our regex contains things that aren't supported by DFAs. // That is, Unicode word boundaries when searching non-ASCII text. let mut thompson = thompson::Builder::new(); thompson.configure(config_thompson(test)); // TODO: Modify Hir to report facts like this, instead of needing to // build an NFA to do it. if let Ok(nfa) = thompson.build_many(®exes) { let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); if nfa.has_word_boundary_unicode() && non_ascii { return Ok(CompiledRegex::skip()); } } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } create_matcher(&builder, builder.build_many(®exes)?) } } fn run_test(re: &Regex, test: &RegexTest) -> Vec { let is_match = if re.is_match(test.input()) { TestResult::matched() } else { TestResult::no_match() }; let is_match = is_match.name("is_match"); let find_matches = match test.search_kind() { TestSearchKind::Earliest => { let it = re .find_earliest_iter(test.input()) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), start: m.start(), end: m.end(), }); TestResult::matches(it).name("find_earliest_iter") } TestSearchKind::Leftmost => { let it = re .find_leftmost_iter(test.input()) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), start: m.start(), end: m.end(), }); TestResult::matches(it).name("find_leftmost_iter") } TestSearchKind::Overlapping => { let it = re .find_overlapping_iter(test.input()) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), start: m.start(), end: m.end(), }); TestResult::matches(it).name("find_overlapping_iter") } }; vec![is_match, find_matches] } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). fn configure_regex_builder( test: &RegexTest, builder: &mut dfa::regex::Builder, ) -> bool { let match_kind = match test.match_kind() { TestMatchKind::All => MatchKind::All, TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, TestMatchKind::LeftmostLongest => return false, }; let syntax_config = SyntaxConfig::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()); let dense_config = dense::Config::new() .anchored(test.anchored()) .match_kind(match_kind) .unicode_word_boundary(true); let regex_config = Regex::config().utf8(test.utf8()); builder .configure(regex_config) .syntax(syntax_config) .thompson(config_thompson(test)) .dense(dense_config); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { thompson::Config::new().utf8(test.utf8()) }