[[test]]
name = "invalid-utf8-literal1"
regex = '\xFF'
haystack = '\xFF'
matches = [[0, 1]]
unicode = false
utf8 = false
unescape = true

[[test]]
name = "mixed"
regex = '(?:.+)(?-u)(?:.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
matches = [[0, 5]]
utf8 = false
unescape = true
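
# The "mixed" test above toggles Unicode mode mid-pattern: the first '.+'
# must match whole codepoints, while the '(?-u)' half may match arbitrary
# bytes. A minimal sketch of the same behavior, assuming the regex crate's
# bytes API (which this test format does not itself mandate):
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"(?:.+)(?-u)(?:.+)").unwrap();
#     // "ΓΔ" encoded as UTF-8, followed by the invalid byte \xFF.
#     let m = re.find(b"\xCE\x93\xCE\x94\xFF").unwrap();
#     assert_eq!((m.start(), m.end()), (0, 5));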

[[test]]
name = "case1"
regex = "a"
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false

[[test]]
name = "case2"
regex = "[a-z]+"
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false

[[test]]
name = "case3"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true

[[test]]
name = "case4"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false
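
# The case tests above pivot on U+212A (KELVIN SIGN), which case-folds to 'k'
# only when Unicode mode is enabled; with 'unicode = false' it falls out of
# the class and splits the match in two. A sketch of the Unicode side,
# assuming the regex crate (its builder is not part of this test format):
#
#     use regex::RegexBuilder;
#
#     let re = RegexBuilder::new("[a-z]+")
#         .case_insensitive(true)
#         .build()
#         .unwrap();
#     // KELVIN SIGN folds to 'k', so the 7-byte haystack is a single match.
#     let m = re.find("aA\u{212A}aA").unwrap();
#     assert_eq!((m.start(), m.end()), (0, 7));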

[[test]]
name = "negate1"
regex = "[^a]"
haystack = "δ"
matches = [[0, 2]]

[[test]]
name = "negate2"
regex = "[^a]"
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false
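
# The two negate tests contrast codepoint matching with byte matching: with
# Unicode enabled, '[^a]' consumes all of 'δ' (two bytes), while with unicode
# and utf8 disabled each byte matches on its own. A sketch of the byte-level
# case, assuming the regex crate's bytes API:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"(?-u)[^a]").unwrap();
#     let ms: Vec<(usize, usize)> = re
#         .find_iter("δ".as_bytes())
#         .map(|m| (m.start(), m.end()))
#         .collect();
#     // Each byte of the two-byte encoding of 'δ' is its own match.
#     assert_eq!(ms, vec![(0, 1), (1, 2)]);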

[[test]]
name = "dotstar-prefix1"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
unicode = false
utf8 = false
unescape = true

[[test]]
name = "dotstar-prefix2"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
utf8 = false
unescape = true

[[test]]
name = "null-bytes1"
regex = '[^\x00]+\x00'
haystack = 'foo\x00'
matches = [[0, 4]]
unicode = false
utf8 = false
unescape = true
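
# Binary-data patterns like the one above rely on '\x00' matching a literal
# NUL byte, which requires a byte-oriented regex. A hedged sketch, assuming
# the regex crate's bytes API:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"(?-u)[^\x00]+\x00").unwrap();
#     let m = re.find(b"foo\x00").unwrap();
#     // Matches "foo" plus the trailing NUL terminator.
#     assert_eq!((m.start(), m.end()), (0, 4));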

[[test]]
name = "word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false

[[test]]
name = "word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]

[[test]]
name = "decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false

[[test]]
name = "decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]

[[test]]
name = "space-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false

[[test]]
name = "space-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]
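
# The \w, \d and \s pairs above show how 'unicode = false' shrinks the Perl
# character classes to their ASCII definitions. A sketch of the decimal case,
# assuming the regex crate, where '(?-u)' expresses the same thing inline:
#
#     use regex::Regex;
#
#     let ascii = Regex::new(r"(?-u)\d+").unwrap();
#     let unicode = Regex::new(r"\d+").unwrap();
#     // "1२३9" mixes ASCII digits with Devanagari digits (3 bytes each).
#     let hay = "1२३9";
#     let a: Vec<_> = ascii.find_iter(hay).map(|m| (m.start(), m.end())).collect();
#     assert_eq!(a, vec![(0, 1), (7, 8)]);
#     let u: Vec<_> = unicode.find_iter(hay).map(|m| (m.start(), m.end())).collect();
#     assert_eq!(u, vec![(0, 8)]);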

# See: https://github.com/rust-lang/regex/issues/484
[[test]]
name = "iter1-bytes"
regex = ''
haystack = "☃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
utf8 = false

# See: https://github.com/rust-lang/regex/issues/484
[[test]]
name = "iter1-utf8"
regex = ''
haystack = "☃"
matches = [[0, 0], [3, 3]]

# See: https://github.com/rust-lang/regex/issues/484
# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
[[test]]
name = "iter2-bytes"
regex = ''
haystack = 'b\xFFr'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
unescape = true
utf8 = false
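
# The iter tests above pin down where an empty regex may match: at every char
# boundary when matches must be valid UTF-8, and at every byte offset
# otherwise. A rough sketch, assuming the regex crate (whose str and bytes
# regexes correspond to the utf8-on and utf8-off cases here):
#
#     let utf8: Vec<usize> = regex::Regex::new("").unwrap()
#         .find_iter("☃").map(|m| m.start()).collect();
#     assert_eq!(utf8, vec![0, 3]);
#
#     let bytes: Vec<usize> = regex::bytes::Regex::new("").unwrap()
#         .find_iter("☃".as_bytes()).map(|m| m.start()).collect();
#     assert_eq!(bytes, vec![0, 1, 2, 3]);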

# These tests check that unanchored prefixes can munch through invalid UTF-8
# even when utf8 mode is enabled.
#
# This test actually reflects an interesting simplification in how the
# Thompson NFA is constructed. It used to be that the NFA could be built with
# an unanchored prefix that either matched any byte or _only_ matched valid
# UTF-8. But the latter turns out to be pretty precarious when it comes to
# prefilters, because if you search a haystack that contains invalid UTF-8
# with an unanchored prefix that requires valid UTF-8, then prefilters are no
# longer a valid optimization: you actually have to check that everything is
# valid UTF-8.
#
# Originally, I had thought that we needed a valid UTF-8 unanchored prefix in
# order to guarantee that we only match at valid UTF-8 boundaries. But this
# isn't actually true! There are really only two things to consider here:
#
# 1) Will a regex match split an encoded codepoint? No, because by
# construction, a MATCH state can only be reached by following valid UTF-8
# (assuming all of the UTF-8 modes are enabled).
#
# 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no,
# assuming all of the UTF-8 modes are enabled.
[[test]]
name = "unanchored-invalid-utf8-match-100"
regex = '[a-z]'
haystack = '\xFFa\xFF'
matches = [[1, 2]]
unescape = true
utf8 = false
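
# As a concrete illustration of the comments above (a sketch assuming the
# regex crate's bytes API): the implicit unanchored prefix happily skips over
# the invalid \xFF bytes, while the match itself still spans only valid
# UTF-8.
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new("[a-z]").unwrap();
#     let m = re.find(b"\xFFa\xFF").unwrap();
#     assert_eq!((m.start(), m.end()), (1, 2));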

# This test shows that we can still prevent a match from occurring by
# inserting our own unanchored prefix that requires valid UTF-8. So if the
# behavior of never munching through invalid UTF-8 is needed, it can be
# achieved this way.
[[test]]
name = "unanchored-invalid-utf8-nomatch"
regex = '^(?s:.)*?[a-z]'
haystack = '\xFFa\xFF'
matches = []
unescape = true
utf8 = false
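
# And the corresponding sketch for the test above (same assumption about the
# bytes API): because '(?s:.)' is Unicode-aware, the explicit '^(?s:.)*?'
# prefix can only advance through valid UTF-8, so the leading \xFF byte
# prevents any match at all.
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"^(?s:.)*?[a-z]").unwrap();
#     assert!(re.find(b"\xFFa\xFF").is_none());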

# This is a tricky test that makes sure we don't accidentally do a kind of
# unanchored search when we've requested that a regex engine not report
# empty matches that split a codepoint. This test caught a regression during
# development where the code for skipping over bad empty matches would do so
# even if the search should have been anchored. This is ultimately what led to
# making 'anchored' an 'Input' option, so that it was always clear what kind
# of search was being performed. (Before that, whether a search was anchored
# or not was a config knob on the regex engine.) This did wind up making DFAs
# a little more complex to configure (with their 'StartKind' knob), but it
# generally smoothed out everything else.
#
# Great example of a test whose failure motivated a sweeping API refactoring.
[[test]]
name = "anchored-iter-empty-utf8"
regex = ''
haystack = 'a☃z'
matches = [[0, 0], [1, 1]]
unescape = false
utf8 = true
anchored = true
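
# A sketch of how that anchored requirement is expressed through the 'Input'
# option mentioned above, assuming regex-automata's meta::Regex (the exact
# crate and version are an assumption; this test file does not specify them):
#
#     use regex_automata::{meta::Regex, Anchored, Input};
#
#     let re = Regex::new("").unwrap();
#     let input = Input::new("a☃z").anchored(Anchored::Yes);
#     let starts: Vec<usize> = re.find_iter(input).map(|m| m.start()).collect();
#     // Anchored iteration stops before the empty match that would split '☃'.
#     assert_eq!(starts, vec![0, 1]);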