# Some of these are cribbed from RE2's test suite.

# These test \b. Below are tests for \B.
[[test]]
name = "wb1"
regex = '\b'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb2"
regex = '\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[test]]
name = "wb3"
regex = '\b'
haystack = "ab"
matches = [[0, 0], [2, 2]]
unicode = false

[[test]]
name = "wb4"
regex = '^\b'
haystack = "ab"
matches = [[0, 0]]
unicode = false

[[test]]
name = "wb5"
regex = '\b$'
haystack = "ab"
matches = [[2, 2]]
unicode = false

[[test]]
name = "wb6"
regex = '^\b$'
haystack = "ab"
matches = []
unicode = false

[[test]]
name = "wb7"
regex = '\bbar\b'
haystack = "nobar bar foo bar"
matches = [[6, 9], [14, 17]]
unicode = false

[[test]]
name = "wb8"
regex = 'a\b'
haystack = "faoa x"
matches = [[3, 4]]
unicode = false

[[test]]
name = "wb9"
regex = '\bbar'
haystack = "bar x"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb10"
regex = '\bbar'
haystack = "foo\nbar x"
matches = [[4, 7]]
unicode = false

[[test]]
name = "wb11"
regex = 'bar\b'
haystack = "foobar"
matches = [[3, 6]]
unicode = false

[[test]]
name = "wb12"
regex = 'bar\b'
haystack = "foobar\nxxx"
matches = [[3, 6]]
unicode = false

[[test]]
name = "wb13"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb14"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb15"
regex = '\b(?:foo|bar|[A-Z])'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb16"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "X"
matches = [[0, 1]]
unicode = false

[[test]]
name = "wb17"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "XY"
matches = []
unicode = false

[[test]]
name = "wb18"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "bar"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb19"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb20"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb21"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "ffoo bbar N x"
matches = [[10, 11]]
unicode = false

[[test]]
name = "wb22"
regex = '\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false

[[test]]
name = "wb23"
regex = '\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb24"
regex = '\b\b'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb25"
regex = '\b\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[test]]
name = "wb26"
regex = '\b$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb27"
regex = '\b$'
haystack = "x"
matches = [[1, 1]]
unicode = false

[[test]]
name = "wb28"
regex = '\b$'
haystack = "y x"
matches = [[3, 3]]
unicode = false

[[test]]
name = "wb29"
regex = '(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb30"
regex = '^\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false

[[test]]
name = "wb31"
regex = '^\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb32"
regex = '^\b$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb33"
regex = '^\b$'
haystack = "x"
matches = []
unicode = false

[[test]]
name = "wb34"
regex = '^(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb35"
regex = '^(?-u:\b).(?-u:\b)$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb36"
regex = '^^^^^\b$$$$$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb37"
regex = '^^^^^(?-u:\b).$$$$$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb38"
regex = '^^^^^\b$$$$$'
haystack = "x"
matches = []
unicode = false

[[test]]
name = "wb39"
regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb40"
regex = '(?-u:\b).+(?-u:\b)'
haystack = "$$abc$$"
matches = [[2, 5]]

[[test]]
name = "wb41"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false

[[test]]
name = "wb42"
regex = '\bfoo\b'
haystack = "zzz foo zzz"
matches = [[4, 7]]
unicode = false

[[test]]
name = "wb43"
regex = '\b^'
haystack = "ab"
matches = [[0, 0]]
unicode = false

[[test]]
name = "wb44"
regex = '$\b'
haystack = "ab"
matches = [[2, 2]]
unicode = false

# Tests for \B. Note that ASCII \B (unicode = false) is not allowed if UTF-8
# mode is enabled, so we have to disable UTF-8 mode (utf8 = false) for these
# tests. This is because ASCII \B can match at non-UTF-8 boundaries.
[[test]]
name = "nb1"
regex = '\Bfoo\B'
haystack = "n foo xfoox that"
matches = [[7, 10]]
unicode = false
utf8 = false

[[test]]
name = "nb2"
regex = 'a\B'
haystack = "faoa x"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb3"
regex = '\Bbar'
haystack = "bar x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb4"
regex = '\Bbar'
haystack = "foo\nbar x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb5"
regex = 'bar\B'
haystack = "foobar"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb6"
regex = 'bar\B'
haystack = "foobar\nxxx"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb7"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foox"
matches = [[0, 3]]
unicode = false
utf8 = false

[[test]]
name = "nb8"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foo\n"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb9"
regex = '\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb10"
regex = '\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb11"
regex = '\B(?:foo|bar|[A-Z])'
haystack = "foo"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb12"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xXy"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb13"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XY"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb14"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XYZ"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb15"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "abara"
matches = [[1, 4]]
unicode = false
utf8 = false

[[test]]
name = "nb16"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo_"
matches = [[1, 4]]
unicode = false
utf8 = false

[[test]]
name = "nb17"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo\n"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb18"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "foo bar vNX"
matches = [[9, 10]]
unicode = false
utf8 = false

[[test]]
name = "nb19"
regex = '\B(?:fo|foo)\B'
haystack = "xfoo"
matches = [[1, 3]]
unicode = false
utf8 = false

[[test]]
name = "nb20"
regex = '\B(?:foo|fo)\B'
haystack = "xfooo"
matches = [[1, 4]]
unicode = false
utf8 = false

[[test]]
name = "nb21"
regex = '\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb22"
regex = '\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb23"
regex = '\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb24"
regex = '\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb25"
regex = '\B$'
haystack = "y x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb26"
regex = '\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb27"
regex = '^\B(?:fo|foo)\B'
haystack = "fo"
matches = []
unicode = false
utf8 = false

# Same regex as nb27 but against "foo", mirroring the wb30/wb31 pair. There is
# still no match: \B cannot hold at offset 0 when the haystack starts with a
# word character.
[[test]]
name = "nb28"
regex = '^\B(?:fo|foo)\B'
haystack = "foo"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb29"
regex = '^\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb30"
regex = '^\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb31"
regex = '^\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb32"
regex = '^\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb33"
regex = '^\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb34"
regex = '^\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb35"
regex = '^\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb36"
regex = '^\B.\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb37"
regex = '^^^^^\B$$$$$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb38"
regex = '^^^^^\B.$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb39"
regex = '^^^^^\B$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false

# unicode1* and unicode2* work for both Unicode and ASCII because all matches
# are reported as byte offsets, and « and » are not word characters at either
# the character or byte level.
[[test]]
name = "unicode1"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]

[[test]]
name = "unicode1-only-ascii"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]
unicode = false

[[test]]
name = "unicode2"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]

[[test]]
name = "unicode2-only-ascii"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]
unicode = false

# ASCII word boundaries are completely oblivious to Unicode characters, so
# even though β is a word character, an ASCII \b still reports a word boundary
# between it and an adjacent ASCII word character. (The ASCII \b only looks
# at the leading byte of β.) For Unicode \b, the tests are precisely inverted.
[[test]]
name = "unicode3"
regex = '\bx\b'
haystack = 'áxβ'
matches = []

[[test]]
name = "unicode3-only-ascii"
regex = '\bx\b'
haystack = 'áxβ'
matches = [[2, 3]]
unicode = false

[[test]]
name = "unicode4"
regex = '\Bx\B'
haystack = 'áxβ'
matches = [[2, 3]]

[[test]]
name = "unicode4-only-ascii"
regex = '\Bx\B'
haystack = 'áxβ'
matches = []
unicode = false
utf8 = false

# The same as the unicode5-not* tests further below, but with \b instead of \B
# as a sanity check.
[[test]]
name = "unicode5"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]

[[test]]
name = "unicode5-only-ascii"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]
unicode = false
utf8 = false

[[test]]
name = "unicode5-noutf8"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
utf8 = false

[[test]]
name = "unicode5-noutf8-only-ascii"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
unicode = false
utf8 = false

# An odd special case checking that ASCII \B sees every individual code unit
# as a non-word byte. (Which codepoint is used does not matter: it is an
# arbitrary one whose UTF-8 encoding is 4 bytes long and which is not in the
# \w character class.)
[[test]]
name = "unicode5-not"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[5, 5]]

[[test]]
name = "unicode5-not-only-ascii"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false

# No matches here: with Unicode enabled, \B only matches in the presence of
# valid UTF-8, and that holds even when UTF-8 mode is disabled.
[[test]]
name = "unicode5-not-noutf8"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = []
unescape = true
utf8 = false

# This one DOES match, though, because \B in ASCII mode only ever looks at
# individual bytes.
[[test]]
name = "unicode5-not-noutf8-only-ascii"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unescape = true
unicode = false
utf8 = false

# Some tests of no particular significance.
[[test]]
name = "unicode6"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456 quux 789"
matches = [[4, 7], [12, 15], [21, 24]]

[[test]]
name = "unicode7"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar a456 quux 789"
matches = [[4, 7], [22, 25]]

[[test]]
name = "unicode8"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456a quux 789"
matches = [[4, 7], [22, 25]]

# A variant of the problem described here:
# https://github.com/google/re2/blob/89567f5de5b23bb5ad0c26cbafc10bdc7389d1fa/re2/dfa.cc#L658-L667
[[test]]
name = "alt-with-assertion-repetition"
regex = '(?:\b|%)+'
haystack = "z%"
bounds = [1, 2]
anchored = true
matches = [[1, 1]]