# A pattern and haystack that both consist of the escape sequence \xFF.
[[test]]
name = "invalid-utf8-literal1"
regex = '\xFF'
haystack = '\xFF'
matches = [[0, 1]]
unicode = false
utf8 = false
unescape = true

# Unicode mode is switched off part-way through the pattern via (?-u).
[[test]]
name = "mixed"
regex = '(?:.+)(?-u)(?:.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
matches = [[0, 5]]
utf8 = false
unescape = true

# Case-insensitive matching. case3 and case4 share a haystack containing
# U+212A and differ only in whether Unicode mode is on.
[[test]]
name = "case1"
regex = "a"
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false

[[test]]
name = "case2"
regex = "[a-z]+"
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false

[[test]]
name = "case3"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true

[[test]]
name = "case4"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false

# A negated class against a two-byte codepoint: one match spanning both bytes
# by default, one match per byte with unicode and utf8 both off.
[[test]]
name = "negate1"
regex = "[^a]"
haystack = "δ"
matches = [[0, 2]]

[[test]]
name = "negate2"
regex = "[^a]"
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false

# The match is expected at offset 1, after a leading \xFF in the haystack.
[[test]]
name = "dotstar-prefix1"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
unicode = false
utf8 = false
unescape = true

[[test]]
name = "dotstar-prefix2"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
utf8 = false
unescape = true

[[test]]
name = "null-bytes1"
regex = '[^\x00]+\x00'
haystack = 'foo\x00'
matches = [[0, 4]]
unicode = false
utf8 = false
unescape = true

# Perl character classes (\w, \d, \s), each exercised once with
# unicode = false and once with the default setting.
[[test]]
name = "word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false

[[test]]
name = "word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]

[[test]]
name = "decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false

[[test]]
name = "decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]

[[test]]
name = "space-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false

[[test]]
name = "space-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]

# Background: https://github.com/rust-lang/regex/issues/484
[[test]]
name = "iter1-bytes"
regex = ''
haystack = "☃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
utf8 = false

# Background: https://github.com/rust-lang/regex/issues/484
[[test]]
name = "iter1-utf8"
regex = ''
haystack = "☃"
matches = [[0, 0], [3, 3]]

# Background: https://github.com/rust-lang/regex/issues/484
# There is no iter2-utf8 counterpart: it would not make sense, because this
# input isn't UTF-8.
[[test]]
name = "iter2-bytes"
regex = ''
haystack = 'b\xFFr'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
utf8 = false
unescape = true

# The tests below check that an unanchored prefix is able to munch through
# invalid UTF-8, even when utf8 is enabled.
#
# This reflects an interesting simplification in the way the Thompson NFA gets
# built. Previously, the NFA could be constructed with an unanchored prefix
# that matched either any byte or _only_ valid UTF-8. The second choice proved
# precarious once prefilters are involved: when the haystack contains invalid
# UTF-8 but the unanchored prefix demands UTF-8, prefilters stop being a valid
# optimization, since you'd actually have to check that everything is valid
# UTF-8.
#
# The original belief was that a UTF-8-only unanchored prefix was required in
# order to guarantee matches at valid UTF-8 boundaries only. That turns out
# not to be true. Just two questions matter:
#
# 1) Can a regex match split an encoded codepoint? No. By construction, a
#    MATCH state is reachable only by following valid UTF-8 (assuming all of
#    the UTF-8 modes are enabled).
#
# 2) Can a regex match arbitrary bytes that aren't valid UTF-8? Again, no,
#    assuming all of the UTF-8 modes are enabled.
#
# NOTE(review): the commentary above speaks of utf8 being enabled, yet the two
# tests that follow set utf8 = false -- confirm which is intended.
[[test]]
name = "unanchored-invalid-utf8-match-100"
regex = '[a-z]'
haystack = '\xFFa\xFF'
matches = [[1, 2]]
utf8 = false
unescape = true

# A match can still be prevented by writing our own unanchored prefix into the
# pattern, one that requires valid UTF-8. So when the behavior of never
# munching through invalid UTF-8 is needed, it can be had this way.
[[test]]
name = "unanchored-invalid-utf8-nomatch"
regex = '^(?s:.)*?[a-z]'
haystack = '\xFFa\xFF'
matches = []
utf8 = false
unescape = true

# A tricky one: it makes sure we don't accidentally perform a kind of
# unanchored search after asking a regex engine not to report empty matches
# that split a codepoint. It caught a regression during development in which
# the code that skips over bad empty matches did so even when the search was
# supposed to be anchored. That is ultimately what led to 'anchored' becoming
# an 'Input' option, so the kind of search being run is always clear. (Before
# that, anchored-or-not was a config knob on the regex engine.) The change made
# DFAs a little more complex to configure (with their 'StartKind' knob), but
# it generally smoothed out everything else.
#
# A good example of a test whose failure motivated a sweeping API refactoring.
[[test]]
name = "anchored-iter-empty-utf8"
regex = ''
haystack = 'a☃z'
matches = [[0, 0], [1, 1]]
utf8 = true
unescape = false
anchored = true