# These are tests specifically crafted for regexes that can match arbitrary
# bytes. In some cases, we also test the Unicode variant as well, just because
# it's good sense to do so. But also, these tests aren't really about Unicode,
# but whether matches are only reported at valid UTF-8 boundaries. For most
# tests in this entire collection, utf8 = true. But for these tests, we use
# utf8 = false.

# ASCII `\b` after a space: "δ" encodes as two non-ASCII bytes, neither of
# which is an ASCII word character, so there is no word boundary after the
# space and nothing matches.
[[test]]
name = "word-boundary-ascii"
regex = ' \b'
haystack = " δ"
matches = []
unicode = false
utf8 = false

# Unicode `\b` after a space: "δ" is a Unicode word character, so the
# boundary holds and the space is matched at [0, 1].
[[test]]
name = "word-boundary-unicode"
regex = ' \b'
haystack = " δ"
matches = [[0, 1]]
unicode = true
utf8 = false

# ASCII `\B` after a space: the space and the first byte of "δ" are both
# non-word bytes in ASCII mode, so the negated boundary holds and the space
# is matched at [0, 1].
[[test]]
name = "word-boundary-ascii-not"
regex = ' \B'
haystack = " δ"
matches = [[0, 1]]
unicode = false
utf8 = false

# Unicode `\B` after a space: "δ" is a Unicode word character, so the
# position after the space is a word boundary and `\B` fails everywhere.
[[test]]
name = "word-boundary-unicode-not"
regex = ' \B'
haystack = " δ"
matches = []
unicode = true
utf8 = false

# ASCII `\w` matches only the leading "a"; the two bytes of "δ" are not
# ASCII word characters, so the match stops at offset 1.
[[test]]
name = "perl-word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false
utf8 = false

# Unicode `\w` also matches "δ" (two bytes in UTF-8), so the match covers
# the whole three-byte haystack.
[[test]]
name = "perl-word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]
unicode = true
utf8 = false

# ASCII `\d` matches only "1" and "9". The two Devanagari digits between
# them take three UTF-8 bytes each, which puts "9" at byte offset 7.
[[test]]
name = "perl-decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false
utf8 = false

# Unicode `\d` matches the Devanagari digits as well, so a single match
# covers all eight bytes of the haystack.
[[test]]
name = "perl-decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]
unicode = true
utf8 = false

# ASCII `\s` matches only the leading space. U+1680 (OGHAM SPACE MARK,
# written here with a TOML `\u` escape) is three non-ASCII bytes in UTF-8.
[[test]]
name = "perl-whitespace-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false
utf8 = false

# Unicode `\s` also matches U+1680 (OGHAM SPACE MARK, three bytes in UTF-8),
# so the match covers the whole four-byte haystack.
[[test]]
name = "perl-whitespace-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]
unicode = true
utf8 = false

# The first `(.+)` matches two Unicode codepoints, but can't match the 5th
# byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
# matches.
#
# The single expected match lists the overall span first, followed by one span
# per capture group.
# NOTE(review): `unescape = true` presumably makes the harness decode the
# `\xHH` escapes in the literal haystack into raw bytes - confirm.
[[test]]
name = "mixed-dot"
regex = '(.+)(?-u)(.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
matches = [
  [[0, 5], [0, 4], [4, 5]],
]
unescape = true
unicode = true
utf8 = false

# Case-insensitive literal in ASCII mode: pattern "a" matches haystack "A".
[[test]]
name = "case-one-ascii"
regex = 'a'
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false
utf8 = false

# Case-insensitive literal in Unicode mode: pattern "a" matches haystack "A",
# same result as the ASCII variant of this test.
[[test]]
name = "case-one-unicode"
regex = 'a'
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = true
utf8 = false

# Case-insensitive ASCII class: `[a-z]+` matches the full mixed-case run of
# ASCII letters.
[[test]]
name = "case-class-simple-ascii"
regex = '[a-z]+'
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false
utf8 = false

# Case-insensitive ASCII class: U+212A (KELVIN SIGN, three bytes in UTF-8)
# is not matched in ASCII mode, so it splits the haystack into two matches
# at [0, 2] and [5, 7].
[[test]]
name = "case-class-ascii"
regex = '[a-z]+'
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false
utf8 = false

# Case-insensitive Unicode class: U+212A (KELVIN SIGN) case-folds to "k",
# so it is matched too and a single match covers all seven bytes.
[[test]]
name = "case-class-unicode"
regex = '[a-z]+'
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true
unicode = true
utf8 = false

# Negated class in ASCII (byte) mode: `[^a]` matches one byte at a time, so
# the two bytes of "δ" are reported as two separate matches.
[[test]]
name = "negate-ascii"
regex = '[^a]'
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false

# Negated class in Unicode mode: `[^a]` matches a whole codepoint, so "δ"
# is reported as one two-byte match.
[[test]]
name = "negate-unicode"
regex = '[^a]'
haystack = "δ"
matches = [[0, 2]]
unicode = true
utf8 = false

# When utf8=true, this won't match, because the implicit '.*?' prefix is
# Unicode aware and will refuse to match through invalid UTF-8 bytes.
#
# Here utf8 = false, so the search proceeds past the invalid leading byte and
# finds "a" at [1, 2].
# NOTE(review): `unescape = true` presumably makes the harness decode the
# `\xFF` escape in the literal haystack into a raw byte - confirm.
[[test]]
name = "dotstar-prefix-ascii"
regex = 'a'
haystack = '\xFFa'
matches = [[1, 2]]
unescape = true
unicode = false
utf8 = false

# Unicode-mode variant of the unanchored search through an invalid leading
# byte: with utf8 = false, "a" is still found at [1, 2] after the \xFF byte.
# NOTE(review): `unescape = true` presumably makes the harness decode the
# `\xFF` escape in the literal haystack into a raw byte - confirm.
[[test]]
name = "dotstar-prefix-unicode"
regex = 'a'
haystack = '\xFFa'
matches = [[1, 2]]
unescape = true
unicode = true
utf8 = false

# NUL-terminated string pattern: the named group `cstr` captures the run of
# non-NUL bytes ("foo", [0, 3]) and the overall match includes the trailing
# NUL ([0, 4]).
# NOTE(review): `unescape = true` presumably makes the harness decode the
# `\x00` escape in the literal haystack into a raw NUL byte - confirm.
[[test]]
name = "null-bytes"
regex = '(?P<cstr>[^\x00]+)\x00'
haystack = 'foo\x00'
matches = [
  [[0, 4], [0, 3]],
]
unescape = true
unicode = false
utf8 = false

# Start anchor over a haystack containing invalid UTF-8: the optional `\xCC`
# matches nothing (the haystack begins with \x8d) and `^` holds only at
# offset 0, so the sole match is the empty span [0, 0].
# NOTE(review): `unescape = true` presumably makes the harness decode the
# escapes in the literal haystack into raw bytes - confirm.
[[test]]
name = "invalid-utf8-anchor-100"
regex = '\xCC?^'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[0, 0]]
unescape = true
unicode = false
utf8 = false

# End anchor over a haystack containing invalid UTF-8. The haystack does not
# start with \xf7 and contains no "4", so neither the `^\xf7` branch nor the
# long middle branch can match; only `$` does, as an empty match at the end
# of the haystack (offset 22).
# NOTE(review): `unescape = true` presumably makes the harness decode the
# escapes in the literal haystack into raw bytes (22 bytes total) - confirm.
[[test]]
name = "invalid-utf8-anchor-200"
regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[22, 22]]
unescape = true
unicode = false
utf8 = false

# Start anchor as the first alternative: `^` yields an empty match at
# [0, 0]. The second branch begins with "ddp", which does not occur in the
# haystack, so there are no further matches.
# NOTE(review): `unescape = true` presumably makes the harness decode the
# escapes in the literal haystack into raw bytes - confirm.
[[test]]
name = "invalid-utf8-anchor-300"
regex = '^|ddp\xff\xffdddddlQd@\x80'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[0, 0]]
unescape = true
unicode = false
utf8 = false

# ASCII `\B` around "x": "á" and "β" are multi-byte non-ASCII characters, so
# in ASCII mode "x" has non-word bytes on both sides. Both positions are word
# boundaries, `\B` fails, and nothing matches.
[[test]]
name = "word-boundary-ascii-100"
regex = '\Bx\B'
haystack = "áxβ"
matches = []
unicode = false
utf8 = false

# ASCII `\B` over "0" followed by U+7EF5E (four bytes in UTF-8). Offsets 0
# and 1 are word boundaries around "0". Offsets 2 through 5 sit between or
# after non-word bytes, so `\B` matches empty there - including offsets
# inside the codepoint, which is permitted because utf8 = false.
[[test]]
name = "word-boundary-ascii-200"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false