# These are tests specifically crafted for regexes that can match arbitrary
# bytes. In some cases, we also test the Unicode variant as well, just because
# it's good sense to do so. But also, these tests aren't really about Unicode,
# but whether matches are only reported at valid UTF-8 boundaries. For most
# tests in this entire collection, utf8 = true. But for these tests, we use
# utf8 = false.
#
# Expected match spans are [start, end] pairs of byte offsets into the
# haystack, with the end offset exclusive.

# `δ` counts as a word character only under Unicode rules, so a word boundary
# follows the space only when unicode = true.
[[test]]
name = "word-boundary-ascii"
regex = ' \b'
haystack = " δ"
matches = []
unicode = false
utf8 = false

[[test]]
name = "word-boundary-unicode"
regex = ' \b'
haystack = " δ"
matches = [[0, 1]]
unicode = true
utf8 = false

[[test]]
name = "word-boundary-ascii-not"
regex = ' \B'
haystack = " δ"
matches = [[0, 1]]
unicode = false
utf8 = false

[[test]]
name = "word-boundary-unicode-not"
regex = ' \B'
haystack = " δ"
matches = []
unicode = true
utf8 = false

# ASCII `\w` stops after `a`; Unicode `\w` also consumes the two-byte `δ`.
[[test]]
name = "perl-word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false
utf8 = false

[[test]]
name = "perl-word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]
unicode = true
utf8 = false

# `२` and `३` are three-byte non-ASCII digits, so ASCII `\d` matches only the
# `1` and the `9` on either side of them.
[[test]]
name = "perl-decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false
utf8 = false

[[test]]
name = "perl-decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]
unicode = true
utf8 = false

# U+1680 (three bytes in UTF-8) is matched by `\s` only under Unicode rules.
[[test]]
name = "perl-whitespace-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false
utf8 = false

[[test]]
name = "perl-whitespace-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]
unicode = true
utf8 = false

# The first `(.+)` matches two Unicode codepoints, but can't match the 5th
# byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
# matches.
[[test]]
name = "mixed-dot"
regex = '(.+)(?-u)(.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
# Spans are listed as: overall match, first group, second group.
matches = [
    [[0, 5], [0, 4], [4, 5]],
]
unescape = true
unicode = true
utf8 = false

[[test]]
name = "case-one-ascii"
regex = 'a'
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false
utf8 = false

[[test]]
name = "case-one-unicode"
regex = 'a'
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = true
utf8 = false

[[test]]
name = "case-class-simple-ascii"
regex = '[a-z]+'
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false
utf8 = false

# U+212A (KELVIN SIGN, three bytes in UTF-8) is a case variant of `k` only
# under Unicode rules, so with unicode = false it splits the match in two.
[[test]]
name = "case-class-ascii"
regex = '[a-z]+'
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false
utf8 = false

[[test]]
name = "case-class-unicode"
regex = '[a-z]+'
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true
unicode = true
utf8 = false

# With unicode = false, `[^a]` matches a single byte at a time, so the
# two-byte `δ` produces two matches instead of one.
[[test]]
name = "negate-ascii"
regex = '[^a]'
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false

[[test]]
name = "negate-unicode"
regex = '[^a]'
haystack = "δ"
matches = [[0, 2]]
unicode = true
utf8 = false

# When utf8=true, this won't match, because the implicit '.*?' prefix is
# Unicode aware and will refuse to match through invalid UTF-8 bytes.
[[test]]
name = "dotstar-prefix-ascii"
regex = 'a'
haystack = '\xFFa'
matches = [[1, 2]]
unescape = true
unicode = false
utf8 = false

[[test]]
name = "dotstar-prefix-unicode"
regex = 'a'
haystack = '\xFFa'
matches = [[1, 2]]
unescape = true
unicode = true
utf8 = false

# A `(?P<name>...)` group requires a name between the angle brackets.
# NOTE(review): the group name `cstr` is an assumption; confirm against
# upstream. The expected spans do not depend on the name.
[[test]]
name = "null-bytes"
regex = '(?P<cstr>[^\x00]+)\x00'
haystack = 'foo\x00'
# Spans are listed as: overall match, then the capture group.
matches = [
    [[0, 4], [0, 3]],
]
unescape = true
unicode = false
utf8 = false

[[test]]
name = "invalid-utf8-anchor-100"
regex = '\xCC?^'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[0, 0]]
unescape = true
unicode = false
utf8 = false

# Only the trailing `$` alternative matches, as an empty match at the end of
# the 22-byte haystack.
[[test]]
name = "invalid-utf8-anchor-200"
regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[22, 22]]
unescape = true
unicode = false
utf8 = false

[[test]]
name = "invalid-utf8-anchor-300"
regex = '^|ddp\xff\xffdddddlQd@\x80'
haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
matches = [[0, 0]]
unescape = true
unicode = false
utf8 = false

# `x` is flanked by non-ASCII bytes, so under ASCII rules both of its edges
# are word boundaries and `\B` cannot hold on either side.
[[test]]
name = "word-boundary-ascii-100"
regex = '\Bx\B'
haystack = "áxβ"
matches = []
unicode = false
utf8 = false

# U+7EF5E encodes as four UTF-8 bytes, none of them an ASCII word byte, so
# `\B` matches between those bytes and at the end of the haystack, but not
# next to the leading `0`.
[[test]]
name = "word-boundary-ascii-200"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false