# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-100"
regex = '(*)'
haystack = ""
matches = []
compiles = false

# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-200"
regex = '(?:?)'
haystack = ""
matches = []
compiles = false

# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-300"
regex = '(?)'
haystack = ""
matches = []
compiles = false

# See: https://github.com/rust-lang/regex/issues/48
[[test]]
name = "invalid-regex-no-crash-400"
regex = '*'
haystack = ""
matches = []
compiles = false

# See: https://github.com/rust-lang/regex/issues/75
[[test]]
name = "unsorted-binary-search-100"
regex = '(?i-u)[a_]+'
haystack = "A_"
matches = [[0, 2]]

# See: https://github.com/rust-lang/regex/issues/75
[[test]]
name = "unsorted-binary-search-200"
regex = '(?i-u)[A_]+'
haystack = "a_"
matches = [[0, 2]]

# See: https://github.com/rust-lang/regex/issues/76
[[test]]
name = "unicode-case-lower-nocase-flag"
regex = '(?i)\p{Ll}+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]

# See: https://github.com/rust-lang/regex/issues/99
[[test]]
name = "negated-char-class-100"
regex = '(?i)[^x]'
haystack = "x"
matches = []

# See: https://github.com/rust-lang/regex/issues/99
[[test]]
name = "negated-char-class-200"
regex = '(?i)[^x]'
haystack = "X"
matches = []

# See: https://github.com/rust-lang/regex/issues/101
[[test]]
name = "ascii-word-underscore"
regex = '[[:word:]]'
haystack = "_"
matches = [[0, 1]]

# See: https://github.com/rust-lang/regex/issues/129
#
# NOTE(review): the capture group name had been stripped from this pattern
# (it read '(?P[x-z])', which is not valid syntax). The name 'foo' is a
# reconstruction; spans are checked by index, so any valid name works. Confirm.
[[test]]
name = "captures-repeat"
regex = '([a-f]){2}(?P<foo>[x-z])'
haystack = "abx"
matches = [
  [[0, 3], [1, 2], [2, 3]],
]

# See: https://github.com/rust-lang/regex/issues/153
[[test]]
name = "alt-in-alt-100"
regex = 'ab?|$'
haystack = "az"
matches = [[0, 1], [2, 2]]

# See: https://github.com/rust-lang/regex/issues/153
[[test]]
name = "alt-in-alt-200"
regex = '^(?:.*?)(?:\n|\r\n?|$)'
haystack = "ab\rcd"
matches = [[0, 3]]

# See: https://github.com/rust-lang/regex/issues/169
[[test]]
name = "leftmost-first-prefix"
regex = 'z*azb'
haystack = "azb"
matches = [[0, 3]]

# See: https://github.com/rust-lang/regex/issues/191
[[test]]
name = "many-alternates"
regex = '1|2|3|4|5|6|7|8|9|10|int'
haystack = "int"
matches = [[0, 3]]

# See: https://github.com/rust-lang/regex/issues/204
[[test]]
name = "word-boundary-alone-100"
regex = '\b'
haystack = "Should this (work?)"
matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]]

# See: https://github.com/rust-lang/regex/issues/204
[[test]]
name = "word-boundary-alone-200"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]

# See: https://github.com/rust-lang/regex/issues/264
[[test]]
name = "word-boundary-ascii-no-capture"
regex = '\B'
haystack = "\U00028F3E"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false

# See: https://github.com/rust-lang/regex/issues/264
[[test]]
name = "word-boundary-ascii-capture"
regex = '(?:\B)'
haystack = "\U00028F3E"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
unicode = false
utf8 = false

# See: https://github.com/rust-lang/regex/issues/268
[[test]]
name = "partial-anchor"
regex = '^a|b'
haystack = "ba"
matches = [[0, 1]]

# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "endl-or-word-boundary"
regex = '(?m:$)|(?-u:\b)'
haystack = "\U0006084E"
matches = [[4, 4]]

# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "zero-or-end"
regex = '(?i-u:\x00)|$'
haystack = "\U000E682F"
matches = [[4, 4]]

# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "y-or-endl"
regex = '(?i-u:y)|(?m:$)'
haystack = "\U000B4331"
matches = [[4, 4]]

# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "word-boundary-start-x"
regex = '(?u:\b)^(?-u:X)'
haystack = "X"
matches = [[0, 1]]

# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "word-boundary-ascii-start-x"
regex = '(?-u:\b)^(?-u:X)'
haystack = "X"
matches = [[0, 1]]

# See: https://github.com/rust-lang/regex/issues/271
[[test]]
name = "end-not-word-boundary"
regex = '$\B'
haystack = "\U0005C124\U000B576C"
matches = [[8, 8]]
unicode = false
utf8 = false

# See: https://github.com/rust-lang/regex/issues/280
[[test]]
name = "partial-anchor-alternate-begin"
regex = '^a|z'
haystack = "yyyyya"
matches = []

# See: https://github.com/rust-lang/regex/issues/280
[[test]]
name = "partial-anchor-alternate-end"
regex = 'a$|z'
haystack = "ayyyyy"
matches = []

# See: https://github.com/rust-lang/regex/issues/289
[[test]]
name = "lits-unambiguous-100"
regex = '(?:ABC|CDA|BC)X'
haystack = "CDAX"
matches = [[0, 4]]

# See: https://github.com/rust-lang/regex/issues/291
#
# NOTE(review): the capture group name had been stripped from this pattern
# (it read '(?P[0-9]+)', which is not valid syntax). The name 'n' is a
# reconstruction; spans are checked by index, so any valid name works. Confirm.
[[test]]
name = "lits-unambiguous-200"
regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$'
haystack = "CIMG2341"
matches = [
  [[0, 8], [0, 4], [], [0, 4], [4, 8]],
]

# See: https://github.com/rust-lang/regex/issues/303
#
# 2022-09-19: This has now been "properly" fixed in that empty character
# classes are fully supported as something that can never match. This test
# used to be marked as 'compiles = false', but now it works.
[[test]]
name = "negated-full-byte-range"
regex = '[^\x00-\xFF]'
haystack = ""
matches = []
compiles = true
unicode = false
utf8 = false

# See: https://github.com/rust-lang/regex/issues/321
[[test]]
name = "strange-anchor-non-complete-prefix"
regex = 'a^{2}'
haystack = ""
matches = []

# See: https://github.com/rust-lang/regex/issues/321
[[test]]
name = "strange-anchor-non-complete-suffix"
regex = '${2}a'
haystack = ""
matches = []

# See: https://github.com/rust-lang/regex/issues/334
# See: https://github.com/rust-lang/regex/issues/557
[[test]]
name = "captures-after-dfa-premature-end-100"
regex = 'a(b*(X|$))?'
haystack = "abcbX"
matches = [
  [[0, 1], [], []],
]

# See: https://github.com/rust-lang/regex/issues/334
# See: https://github.com/rust-lang/regex/issues/557
[[test]]
name = "captures-after-dfa-premature-end-200"
regex = 'a(bc*(X|$))?'
haystack = "abcbX"
matches = [
  [[0, 1], [], []],
]

# See: https://github.com/rust-lang/regex/issues/334
# See: https://github.com/rust-lang/regex/issues/557
[[test]]
name = "captures-after-dfa-premature-end-300"
regex = '(aa$)?'
haystack = "aaz"
matches = [
  [[0, 0], []],
  [[1, 1], []],
  [[2, 2], []],
  [[3, 3], []],
]

# Plucked from "Why aren’t regular expressions a lingua franca? an empirical
# study on the re-use and portability of regular expressions", The ACM Joint
# European Software Engineering Conference and Symposium on the Foundations of
# Software Engineering (ESEC/FSE), 2019.
#
# Link: https://dl.acm.org/doi/pdf/10.1145/3338906.3338909
[[test]]
name = "captures-after-dfa-premature-end-400"
regex = '(a)\d*\.?\d+\b'
haystack = "a0.0c"
matches = [
  [[0, 2], [0, 1]],
]

# See: https://github.com/rust-lang/regex/issues/437
[[test]]
name = "literal-panic"
regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+'
haystack = "test"
matches = []

# See: https://github.com/rust-lang/regex/issues/527
[[test]]
name = "empty-flag-expr"
regex = '(?:(?:(?x)))'
haystack = ""
matches = [[0, 0]]

# See: https://github.com/rust-lang/regex/issues/533
#[[tests]]
#name = "blank-matches-nothing-between-space-and-tab"
#regex = '[[:blank:]]'
#input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
#match = false
#unescape = true

# See: https://github.com/rust-lang/regex/issues/533
#[[tests]]
#name = "blank-matches-nothing-between-space-and-tab-inverted"
#regex = '^[[:^blank:]]+$'
#input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
#match = true
#unescape = true

# See: https://github.com/rust-lang/regex/issues/555
[[test]]
name = "invalid-repetition"
regex = '(?m){1,1}'
haystack = ""
matches = []
compiles = false

# See: https://github.com/rust-lang/regex/issues/640
[[test]]
name = "flags-are-unset"
regex = '(?:(?i)foo)|Bar'
haystack = "foo Foo bar Bar"
matches = [[0, 3], [4, 7], [12, 15]]

# Note that 'Ј' is not 'j', but cyrillic Je
# https://en.wikipedia.org/wiki/Je_(Cyrillic)
#
# See: https://github.com/rust-lang/regex/issues/659
[[test]]
name = "empty-group-with-unicode"
regex = '(?:)Ј01'
haystack = 'zЈ01'
matches = [[1, 5]]

# See: https://github.com/rust-lang/regex/issues/579
[[test]]
name = "word-boundary-weird"
regex = '\b..\b'
haystack = "I have 12, he has 2!"
matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]

# See: https://github.com/rust-lang/regex/issues/579
[[test]]
name = "word-boundary-weird-ascii"
regex = '\b..\b'
haystack = "I have 12, he has 2!"
matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
unicode = false
utf8 = false

# See: https://github.com/rust-lang/regex/issues/579
[[test]]
name = "word-boundary-weird-minimal-ascii"
regex = '\b..\b'
haystack = "az,,b"
matches = [[0, 2], [2, 4]]
unicode = false
utf8 = false

# See: https://github.com/BurntSushi/ripgrep/issues/1203
[[test]]
name = "reverse-suffix-100"
regex = '[0-4][0-4][0-4]000'
haystack = "153.230000"
matches = [[4, 10]]

# See: https://github.com/BurntSushi/ripgrep/issues/1203
[[test]]
name = "reverse-suffix-200"
regex = '[0-9][0-9][0-9]000'
haystack = "153.230000\n"
matches = [[4, 10]]

# This is a tricky case for the reverse suffix optimization, because it
# finds the 'foobar' match but the reverse scan must fail to find a match by
# correctly dealing with the word boundary following the 'foobar' literal when
# computing the start state.
#
# This test exists because I tried to break the following assumption that
# is currently in the code: that if a suffix is found and the reverse scan
# succeeds, then it's guaranteed that there is an overall match. Namely, the
# 'is_match' routine does *not* do another forward scan in this case because of
# this assumption.
[[test]]
name = "reverse-suffix-300"
regex = '\w+foobar\b'
haystack = "xyzfoobarZ"
matches = []
unicode = false
utf8 = false

# See: https://github.com/BurntSushi/ripgrep/issues/1247
[[test]]
name = "stops"
regex = '\bs(?:[ab])'
haystack = 's\xE4'
matches = []
unescape = true
utf8 = false

# See: https://github.com/BurntSushi/ripgrep/issues/1247
[[test]]
name = "stops-ascii"
regex = '(?-u:\b)s(?:[ab])'
haystack = 's\xE4'
matches = []
unescape = true
utf8 = false

# See: https://github.com/rust-lang/regex/issues/850
[[test]]
name = "adjacent-line-boundary-100"
regex = '(?m)^(?:[^ ]+?)$'
haystack = "line1\nline2"
matches = [[0, 5], [6, 11]]

# Continued.
[[test]]
name = "adjacent-line-boundary-200"
regex = '(?m)^(?:[^ ]+?)$'
haystack = "A\nB"
matches = [[0, 1], [2, 3]]

# There is no issue for this bug.
[[test]]
name = "anchored-prefix-100"
regex = '^a[[:^space:]]'
haystack = "a "
matches = []

# There is no issue for this bug.
[[test]]
name = "anchored-prefix-200"
regex = '^a[[:^space:]]'
haystack = "foo boo a"
matches = []

# There is no issue for this bug.
[[test]]
name = "anchored-prefix-300"
regex = '^-[a-z]'
haystack = "r-f"
matches = []

# Tests that a possible Aho-Corasick optimization works correctly. It only
# kicks in when we have a lot of literals. By "works correctly," we mean that
# leftmost-first match semantics are properly respected. That is, samwise
# should match, not sam.
#
# There is no issue for this bug.
[[test]]
name = "aho-corasick-100"
regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z'
haystack = "samwise"
matches = [[0, 7]]

# See: https://github.com/rust-lang/regex/issues/921
[[test]]
name = "interior-anchor-capture"
regex = '(a$)b$'
haystack = 'ab'
matches = []

# I found this bug in the course of adding some of the regexes that Ruff uses
# to rebar. It turns out that the lazy DFA was finding a match that was being
# rejected by the one-pass DFA. Yikes. I then minimized the regex and haystack.
#
# Source: https://github.com/charliermarsh/ruff/blob/a919041ddaa64cdf6f216f90dd0480dab69fd3ba/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs#L52
[[test]]
name = "ruff-whitespace-around-keywords"
regex = '^(a|ab)$'
haystack = "ab"
anchored = true
unicode = false
utf8 = true
matches = [[[0, 2], [0, 2]]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-0"
regex = '(?:(?-u:\b)|(?u:h))+'
haystack = "h"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-1"
regex = '(?u:\B)'
haystack = "鋸"
unicode = true
utf8 = false
matches = []

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-2"
regex = '(?:(?u:\b)|(?s-u:.))+'
haystack = "oB"
unicode = true
utf8 = false
matches = [[0, 0], [1, 2]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-3"
regex = '(?:(?-u:\B)|(?su:.))+'
haystack = "\U000FEF80"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-3-utf8"
regex = '(?:(?-u:\B)|(?su:.))+'
haystack = "\U000FEF80"
unicode = true
utf8 = true
matches = [[0, 0], [4, 4]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-4"
regex = '(?m:$)(?m:^)(?su:.)'
haystack = "\n‣"
unicode = true
utf8 = false
matches = [[0, 1]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-5"
regex = '(?m:$)^(?m:^)'
haystack = "\n"
unicode = true
utf8 = false
matches = [[0, 0]]

# From: https://github.com/rust-lang/regex/issues/429
#
# NOTE(review): the capture group name had been stripped from this pattern
# (it read '(?P(?iu:do)(?m:$))*', which is not valid syntax). The name 'kp' is
# a reconstruction; spans are checked by index, so any valid name works.
# Confirm.
[[test]]
name = "i429-6"
regex = '(?P<kp>(?iu:do)(?m:$))*'
haystack = "dodo"
unicode = true
utf8 = false
matches = [
  [[0, 0], []],
  [[1, 1], []],
  [[2, 4], [2, 4]],
]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-7"
regex = '(?u:\B)'
haystack = "䡁"
unicode = true
utf8 = false
matches = []

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-8"
regex = '(?:(?-u:\b)|(?u:[\u{0}-W]))+'
haystack = "0"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-9"
regex = '((?m:$)(?-u:\B)(?s-u:.)(?-u:\B)$)'
haystack = "\n\n"
unicode = true
utf8 = false
matches = [
  [[1, 2], [1, 2]],
]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-10"
regex = '(?m:$)(?m:$)^(?su:.)'
haystack = "\n\u0081¨\u200a"
unicode = true
utf8 = false
matches = [[0, 1]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-11"
regex = '(?-u:\B)(?m:^)'
haystack = "0\n"
unicode = true
utf8 = false
matches = [[2, 2]]

# From: https://github.com/rust-lang/regex/issues/429
[[test]]
name = "i429-12"
regex = '(?:(?u:\b)|(?-u:.))+'
haystack = "0"
unicode = true
utf8 = false
matches = [[0, 0], [1, 1]]

# From: https://github.com/rust-lang/regex/issues/969
[[test]]
name = "i969"
regex = 'c.*d\z'
haystack = "ababcd"
bounds = [4, 6]
search-kind = "earliest"
matches = [[4, 6]]

# I found this during the regex-automata migration. This is the fowler basic
# 154 test, but without anchored = true and without a match limit.
#
# This test caught a subtle bug in the hybrid reverse DFA search, where it
# would skip over the termination condition if it entered a start state. This
# was a double bug. Firstly, the reverse DFA shouldn't have had start states
# specialized in the first place, and thus it shouldn't have been possible to
# detect that the DFA had entered a start state. The second bug was that the
# start state handling was incorrect by jumping over the termination condition.
[[test]]
name = "fowler-basic154-unanchored"
regex = '''a([bc]*)c*'''
haystack = '''abc'''
matches = [[[0, 3], [1, 3]]]

# From: https://github.com/rust-lang/regex/issues/981
#
# This was never really a problem in the new architecture because the
# regex-automata engines are far more principled about how they deal with
# look-around. (This was one of the many reasons I wanted to re-work the
# original regex crate engines.)
[[test]]
name = "word-boundary-interact-poorly-with-literal-optimizations"
regex = '(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))'
haystack = 'ubi-Darwin-x86_64.tar.gz'
matches = []

# This was found during fuzz testing of regex. It provoked a panic in the meta
# engine as a result of the reverse suffix optimization. Namely, it hit a case
# where a suffix match was found, a corresponding reverse match was found, but
# the forward search turned up no match. The forward search should always match
# if the suffix and reverse search match.
#
# This in turn uncovered an inconsistency between the PikeVM and the DFA (lazy
# and fully compiled) engines. It was caused by a mishandling of the collection
# of NFA state IDs in the generic determinization code (which is why both types
# of DFA were impacted). Namely, when a fail state was encountered (that's the
# `[^\s\S]` in the pattern below), then it would just stop collecting states.
# But that's not correct since a later state could lead to a match.
[[test]]
name = "impossible-branch"
regex = '.*[^\s\S]A|B'
haystack = "B"
matches = [[0, 1]]

# This was found during fuzz testing in regex-lite. The regex crate never
# suffered from this bug, but it causes regex-lite to incorrectly compile
# captures.
[[test]]
name = "captures-wrong-order"
regex = '(a){0}(a)'
haystack = 'a'
matches = [[[0, 1], [], [0, 1]]]

# This tests a bug in how quit states are handled in the DFA. At some point
# during development, the DFAs were tweaked slightly such that if they hit
# a quit state (which means, they hit a byte that the caller configured should
# stop the search), then it might not return an error necessarily. Namely, if a
# match had already been found, then it would be returned instead of an error.
#
# But this is actually wrong! Why? Because even though a match had been found,
# it wouldn't be fully correct to return it once a quit state has been seen
# because you can't determine whether the match offset returned is the correct
# greedy/leftmost-first match. Since you can't complete the search as requested
# by the caller, the DFA should just stop and return an error.
#
# Interestingly, this does seem to produce an unavoidable difference between
# 'try_is_match().unwrap()' and 'try_find().unwrap().is_some()' for the DFAs.
# The former will stop immediately once a match is known to occur and return
# 'Ok(true)', whereas the latter could find the match but quit with an
# 'Err(..)' first.
#
# Thankfully, I believe this inconsistency between 'is_match()' and 'find()'
# cannot be observed in the higher level meta regex API because it specifically
# will try another engine that won't fail in the case of a DFA failing.
#
# This regression happened in the regex crate rewrite, but before anything got
# released.
[[test]]
name = "negated-unicode-word-boundary-dfa-fail"
regex = '\B.*'
haystack = "!\u02D7"
matches = [[0, 3]]

# This failure was found in the *old* regex crate (prior to regex 1.9), but
# I didn't investigate why. My best guess is that it's a literal optimization
# bug. It didn't occur in the rewrite.
[[test]]
name = "missed-match"
regex = 'e..+e.ee>'
haystack = 'Zeee.eZZZZZZZZeee>eeeeeee>'
matches = [[1, 26]]

# This test came from the 'ignore' crate and tripped a bug in how accelerated
# DFA states were handled in an overlapping search.
[[test]]
name = "regex-to-glob"
regex = ['(?-u)^path1/[^/]*$']
haystack = "path1/foo"
matches = [[0, 9]]
utf8 = false
match-kind = "all"
search-kind = "overlapping"

# See: https://github.com/rust-lang/regex/issues/1060
[[test]]
name = "reverse-inner-plus-shorter-than-expected"
regex = '(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})'
haystack = '102:12:39'
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]

# Like reverse-inner-plus-shorter-than-expected, but using a far simpler regex
# to demonstrate the extent of the rot. Sigh.
#
# See: https://github.com/rust-lang/regex/issues/1060
[[test]]
name = "reverse-inner-short"
regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
haystack = '102:12:39'
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]

# This regression test was found via the RegexSet APIs. It triggered a
# particular code path where a regex was compiled with 'All' match semantics
# (to support overlapping search), but got funneled down into a standard
# leftmost search when calling 'is_match'. This is fine on its own, but the
# leftmost search will use a prefilter and that's where this went awry.
#
# Namely, since 'All' semantics were used, the aho-corasick prefilter was
# incorrectly compiled with 'Standard' semantics. This was wrong because
# 'Standard' immediately attempts to report a match at every position, even if
# that would mean reporting a match past the leftmost match before reporting
# the leftmost match. This breaks the prefilter contract of never having false
# negatives and leads overall to the engine not finding a match.
#
# See: https://github.com/rust-lang/regex/issues/1070
[[test]]
name = "prefilter-with-aho-corasick-standard-semantics"
regex = '(?m)^ *v [0-9]'
haystack = 'v 0'
matches = [
  { id = 0, spans = [[0, 3]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = true
utf8 = true