summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/tests/hybrid
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
commit4547b622d8d29df964fa2914213088b148c498fc (patch)
tree9fc6b25f3c3add6b745be9a2400a6e96140046e9 /vendor/regex-automata/tests/hybrid
parentReleasing progress-linux version 1.66.0+dfsg1-1~progress7.99u1. (diff)
downloadrustc-4547b622d8d29df964fa2914213088b148c498fc.tar.xz
rustc-4547b622d8d29df964fa2914213088b148c498fc.zip
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/tests/hybrid')
-rw-r--r--vendor/regex-automata/tests/hybrid/api.rs195
-rw-r--r--vendor/regex-automata/tests/hybrid/mod.rs2
-rw-r--r--vendor/regex-automata/tests/hybrid/suite.rs212
3 files changed, 409 insertions, 0 deletions
diff --git a/vendor/regex-automata/tests/hybrid/api.rs b/vendor/regex-automata/tests/hybrid/api.rs
new file mode 100644
index 000000000..9a834dbb8
--- /dev/null
+++ b/vendor/regex-automata/tests/hybrid/api.rs
@@ -0,0 +1,195 @@
+use std::error::Error;
+
+use regex_automata::{
+ hybrid::{
+ dfa::{self, DFA},
+ regex::Regex,
+ OverlappingState,
+ },
+ nfa::thompson,
+ HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that too many cache resets cause the lazy DFA to quit.
+//
+// We only test this on 64-bit because the test is gingerly crafted based on
+// implementation details of cache sizes. It's not a great test because of
+// that, but it does check some interesting properties around how positions are
+// reported when a search "gives up."
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
+ // This is a carefully chosen regex. The idea is to pick one that requires
+ // some decent number of states (hence the bounded repetition). But we
+ // specifically choose to create a class with an ASCII letter and a
+ // non-ASCII letter so that we can check that no new states are created
+ // once the cache is full. Namely, if we fill up the cache on a haystack
+ // of 'a's, then in order to match one 'β', a new state will need to be
+ // created since a 'β' is encoded with multiple bytes. Since there's no
+ // room for this state, the search should quit at the very first position.
+ let pattern = r"[aβ]{100}";
+ let dfa = DFA::builder()
+ .configure(
+ // Configure it so that we have the minimum cache capacity
+ // possible. And that if any resets occur, the search quits.
+ DFA::config()
+ .skip_cache_capacity_check(true)
+ .cache_capacity(0)
+ .minimum_cache_clear_count(Some(0)),
+ )
+ .build(pattern)?;
+ let mut cache = dfa.create_cache();
+
+ let haystack = "a".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 25 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
+ assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
+ assert_eq!(
+ dfa.find_overlapping_fwd(
+ &mut cache,
+ &haystack,
+ &mut OverlappingState::start()
+ ),
+ Err(err.clone())
+ );
+
+ let haystack = "β".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 0 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+ // no need to test that other find routines quit, since we did that above
+
+ // OK, if we reset the cache, then we should be able to create more states
+ // and make more progress with searching for betas.
+ cache.reset(&dfa);
+ let err = MatchError::GaveUp { offset: 26 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+ // ... switching back to ASCII still makes progress since it just needs to
+ // set transitions on existing states!
+ let haystack = "a".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 13 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+ Ok(())
+}
+
+// Tests that quit bytes in the forward direction work correctly.
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+ let dfa = DFA::builder()
+ .configure(DFA::config().quit(b'x', true))
+ .build("[[:word:]]+$")?;
+ let mut cache = dfa.create_cache();
+
+ assert_eq!(
+ dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_overlapping_fwd(
+ &mut cache,
+ b"abcxyz",
+ &mut OverlappingState::start()
+ ),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+ let dfa = DFA::builder()
+ .configure(DFA::config().quit(b'x', true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build("^[[:word:]]+")?;
+ let mut cache = dfa.create_cache();
+
+ assert_eq!(
+ dfa.find_earliest_rev(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
+// will panic.
+#[test]
+#[should_panic]
+fn quit_panics() {
+ DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// This tests an intesting case where even if the Unicode word boundary option
+// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
+// word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+ let mut config = DFA::config();
+ for b in 0x80..=0xFF {
+ config = config.quit(b, true);
+ }
+ let dfa = DFA::builder().configure(config).build(r"\b")?;
+ let mut cache = dfa.create_cache();
+ let expected = HalfMatch::must(0, 1);
+ assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
+ Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results.
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+ let mut re = Regex::new(r"a[0-9]+").unwrap();
+ re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+ let mut cache = re.create_cache();
+
+ let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+ let matches: Vec<(usize, usize)> = re
+ .find_leftmost_iter(&mut cache, text)
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ assert_eq!(
+ matches,
+ vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
+ );
+ Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+ let text = b"za123";
+ let mut re = Regex::new(r"a[0-9]+").unwrap();
+ let mut cache = re.create_cache();
+
+ re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+ assert_eq!(
+ re.find_leftmost(&mut cache, b"za123"),
+ Some(MultiMatch::must(0, 1, 5))
+ );
+ assert_eq!(
+ re.find_leftmost(&mut cache, b"a123"),
+ Some(MultiMatch::must(0, 0, 4))
+ );
+ re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
+ assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
+ // This checks that the prefilter is used when first starting the search,
+ // instead of waiting until at least one transition has occurred.
+ assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
+ Ok(())
+}
diff --git a/vendor/regex-automata/tests/hybrid/mod.rs b/vendor/regex-automata/tests/hybrid/mod.rs
new file mode 100644
index 000000000..f4299510c
--- /dev/null
+++ b/vendor/regex-automata/tests/hybrid/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/vendor/regex-automata/tests/hybrid/suite.rs b/vendor/regex-automata/tests/hybrid/suite.rs
new file mode 100644
index 000000000..d60570d84
--- /dev/null
+++ b/vendor/regex-automata/tests/hybrid/suite.rs
@@ -0,0 +1,212 @@
+use regex_automata::{
+ hybrid::{
+ dfa::DFA,
+ regex::{self, Regex},
+ },
+ nfa::thompson,
+ MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+ bstr::{BString, ByteSlice},
+ CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+ SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Tests the default configuration of the hybrid NFA/DFA.
+#[test]
+fn default() -> Result<()> {
+ let builder = Regex::builder();
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA with NFA shrinking disabled.
+///
+/// This is actually the typical configuration one wants for a lazy DFA. NFA
+/// shrinking is mostly only advantageous when building a full DFA since it
+/// can sharply decrease the amount of time determinization takes. But NFA
+/// shrinking is itself otherwise fairly expensive. Since a lazy DFA has
+/// no compilation time (other than for building the NFA of course) before
+/// executing a search, it's usually worth it to forgo NFA shrinking.
+#[test]
+fn no_nfa_shrink() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.thompson(thompson::Config::new().shrink(false));
+ TestRunner::new()?
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled.
+#[test]
+fn starts_for_each_pattern() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().starts_for_each_pattern(true));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when byte classes are disabled.
+///
+/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
+/// All it does is cause every byte value to be its own distinct equivalence
+/// class.
+#[test]
+fn no_byte_classes() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().byte_classes(false));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests that hybrid NFA/DFA never clears its cache for any test with the
+/// default capacity.
+///
+/// N.B. If a regex suite test is added that causes the cache to be cleared,
+/// then this should just skip that test. (Which can be done by calling the
+/// 'blacklist' method on 'TestRunner'.)
+#[test]
+fn no_cache_clearing() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().minimum_cache_clear_count(Some(0)));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when the minimum cache capacity is set.
+#[test]
+fn min_cache_capacity() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder
+ .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+fn compiler(
+ mut builder: regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ let regexes = regexes
+ .iter()
+ .map(|r| r.to_str().map(|s| s.to_string()))
+ .collect::<std::result::Result<Vec<String>, _>>()?;
+
+ // Check if our regex contains things that aren't supported by DFAs.
+ // That is, Unicode word boundaries when searching non-ASCII text.
+ let mut thompson = thompson::Builder::new();
+ thompson.syntax(config_syntax(test)).configure(config_thompson(test));
+ if let Ok(nfa) = thompson.build_many(&regexes) {
+ let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
+ if nfa.has_word_boundary_unicode() && non_ascii {
+ return Ok(CompiledRegex::skip());
+ }
+ }
+ if !configure_regex_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ let re = builder.build_many(&regexes)?;
+ let mut cache = re.create_cache();
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ run_test(&re, &mut cache, test)
+ }))
+ }
+}
+
+fn run_test(
+ re: &Regex,
+ cache: &mut regex::Cache,
+ test: &RegexTest,
+) -> Vec<TestResult> {
+ let is_match = if re.is_match(cache, test.input()) {
+ TestResult::matched()
+ } else {
+ TestResult::no_match()
+ };
+ let is_match = is_match.name("is_match");
+
+ let find_matches = match test.search_kind() {
+ TestSearchKind::Earliest => {
+ let it = re
+ .find_earliest_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_earliest_iter")
+ }
+ TestSearchKind::Leftmost => {
+ let it = re
+ .find_leftmost_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_leftmost_iter")
+ }
+ TestSearchKind::Overlapping => {
+ let it = re
+ .find_overlapping_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_overlapping_iter")
+ }
+ };
+ vec![is_match, find_matches]
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_regex_builder(
+ test: &RegexTest,
+ builder: &mut regex::Builder,
+) -> bool {
+ let match_kind = match test.match_kind() {
+ TestMatchKind::All => MatchKind::All,
+ TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
+ TestMatchKind::LeftmostLongest => return false,
+ };
+
+ let dense_config = DFA::config()
+ .anchored(test.anchored())
+ .match_kind(match_kind)
+ .unicode_word_boundary(true);
+ let regex_config = Regex::config().utf8(test.utf8());
+ builder
+ .configure(regex_config)
+ .syntax(config_syntax(test))
+ .thompson(config_thompson(test))
+ .dfa(dense_config);
+ true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+ thompson::Config::new().utf8(test.utf8())
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> SyntaxConfig {
+ SyntaxConfig::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+}