diff options
Diffstat (limited to '')
-rw-r--r-- | vendor/regex/testdata/no-unicode.toml | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/vendor/regex/testdata/no-unicode.toml b/vendor/regex/testdata/no-unicode.toml new file mode 100644 index 0000000..0ddac4c --- /dev/null +++ b/vendor/regex/testdata/no-unicode.toml @@ -0,0 +1,222 @@ +[[test]] +name = "invalid-utf8-literal1" +regex = '\xFF' +haystack = '\xFF' +matches = [[0, 1]] +unicode = false +utf8 = false +unescape = true + + +[[test]] +name = "mixed" +regex = '(?:.+)(?-u)(?:.+)' +haystack = '\xCE\x93\xCE\x94\xFF' +matches = [[0, 5]] +utf8 = false +unescape = true + + +[[test]] +name = "case1" +regex = "a" +haystack = "A" +matches = [[0, 1]] +case-insensitive = true +unicode = false + +[[test]] +name = "case2" +regex = "[a-z]+" +haystack = "AaAaA" +matches = [[0, 5]] +case-insensitive = true +unicode = false + +[[test]] +name = "case3" +regex = "[a-z]+" +haystack = "aA\u212AaA" +matches = [[0, 7]] +case-insensitive = true + +[[test]] +name = "case4" +regex = "[a-z]+" +haystack = "aA\u212AaA" +matches = [[0, 2], [5, 7]] +case-insensitive = true +unicode = false + + +[[test]] +name = "negate1" +regex = "[^a]" +haystack = "δ" +matches = [[0, 2]] + +[[test]] +name = "negate2" +regex = "[^a]" +haystack = "δ" +matches = [[0, 1], [1, 2]] +unicode = false +utf8 = false + + +[[test]] +name = "dotstar-prefix1" +regex = "a" +haystack = '\xFFa' +matches = [[1, 2]] +unicode = false +utf8 = false +unescape = true + +[[test]] +name = "dotstar-prefix2" +regex = "a" +haystack = '\xFFa' +matches = [[1, 2]] +utf8 = false +unescape = true + + +[[test]] +name = "null-bytes1" +regex = '[^\x00]+\x00' +haystack = 'foo\x00' +matches = [[0, 4]] +unicode = false +utf8 = false +unescape = true + + +[[test]] +name = "word-ascii" +regex = '\w+' +haystack = "aδ" +matches = [[0, 1]] +unicode = false + +[[test]] +name = "word-unicode" +regex = '\w+' +haystack = "aδ" +matches = [[0, 3]] + +[[test]] +name = "decimal-ascii" +regex = '\d+' +haystack = "1२३9" +matches = [[0, 1], [7, 8]] +unicode = false + +[[test]] +name = "decimal-unicode" +regex = '\d+' +haystack = "1२३9" +matches = [[0, 8]] + +[[test]] +name = "space-ascii" +regex = '\s+' +haystack = " \u1680" +matches = [[0, 1]] +unicode = false + +[[test]] +name = "space-unicode" +regex = '\s+' +haystack = " \u1680" +matches = [[0, 4]] + + +[[test]] +# See: https://github.com/rust-lang/regex/issues/484 +name = "iter1-bytes" +regex = '' +haystack = "☃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +utf8 = false + +[[test]] +# See: https://github.com/rust-lang/regex/issues/484 +name = "iter1-utf8" +regex = '' +haystack = "☃" +matches = [[0, 0], [3, 3]] + +[[test]] +# See: https://github.com/rust-lang/regex/issues/484 +# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8. +name = "iter2-bytes" +regex = '' +haystack = 'b\xFFr' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +unescape = true +utf8 = false + + +# These test that unanchored prefixes can munch through invalid UTF-8 even when +# utf8 is enabled. +# +# This test actually reflects an interesting simplification in how the Thompson +# NFA is constructed. It used to be that the NFA could be built with an +# unanchored prefix that either matched any byte or _only_ matched valid UTF-8. +# But the latter turns out to be pretty precarious when it comes to prefilters, +# because if you search a haystack that contains invalid UTF-8 but have an +# unanchored prefix that requires UTF-8, then prefilters are no longer a valid +# optimization because you actually have to check that everything is valid +# UTF-8. +# +# Originally, I had thought that we needed a valid UTF-8 unanchored prefix in +# order to guarantee that we only match at valid UTF-8 boundaries. But this +# isn't actually true! There are really only two things to consider here: +# +# 1) Will a regex match split an encoded codepoint? No. Because by construction, +# we ensure that a MATCH state can only be reached by following valid UTF-8 (assuming +# all of the UTF-8 modes are enabled). +# +# 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no, +# assuming all of the UTF-8 modes are enabled. +[[test]] +name = "unanchored-invalid-utf8-match-100" +regex = '[a-z]' +haystack = '\xFFa\xFF' +matches = [[1, 2]] +unescape = true +utf8 = false + +# This test shows that we can still prevent a match from occurring by requiring +# that valid UTF-8 match by inserting our own unanchored prefix. Thus, if the +# behavior of not munching through invalid UTF-8 anywhere is needed, then it +# can be achieved thusly. +[[test]] +name = "unanchored-invalid-utf8-nomatch" +regex = '^(?s:.)*?[a-z]' +haystack = '\xFFa\xFF' +matches = [] +unescape = true +utf8 = false + +# This is a tricky test that makes sure we don't accidentally do a kind of +# unanchored search when we've requested that a regex engine not report +# empty matches that split a codepoint. This test caught a regression during +# development where the code for skipping over bad empty matches would do so +# even if the search should have been anchored. This is ultimately what led to +# making 'anchored' an 'Input' option, so that it was always clear what kind +# of search was being performed. (Before that, whether a search was anchored +# or not was a config knob on the regex engine.) This did wind up making DFAs +# a little more complex to configure (with their 'StartKind' knob), but it +# generally smoothed out everything else. +# +# Great example of a test whose failure motivated a sweeping API refactoring. +[[test]] +name = "anchored-iter-empty-utf8" +regex = '' +haystack = 'a☃z' +matches = [[0, 0], [1, 1]] +unescape = false +utf8 = true +anchored = true |