# A number of these cases were borrowed from RE2's test suite.

# This first group exercises \b. The \B cases come later in the file.
[[tests]]
name = "wb1"
regex = '\b'
input = ""
matches = []
unicode = false

[[tests]]
name = "wb2"
regex = '\b'
input = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[tests]]
name = "wb3"
regex = '\b'
input = "ab"
matches = [[0, 0], [2, 2]]
unicode = false

[[tests]]
name = "wb4"
regex = '^\b'
input = "ab"
matches = [[0, 0]]
unicode = false

[[tests]]
name = "wb5"
regex = '\b$'
input = "ab"
matches = [[2, 2]]
unicode = false

[[tests]]
name = "wb6"
regex = '^\b$'
input = "ab"
matches = []
unicode = false

[[tests]]
name = "wb7"
regex = '\bbar\b'
input = "nobar bar foo bar"
matches = [[6, 9], [14, 17]]
unicode = false

[[tests]]
name = "wb8"
regex = 'a\b'
input = "faoa x"
matches = [[3, 4]]
unicode = false

[[tests]]
name = "wb9"
regex = '\bbar'
input = "bar x"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb10"
regex = '\bbar'
input = "foo\nbar x"
matches = [[4, 7]]
unicode = false

[[tests]]
name = "wb11"
regex = 'bar\b'
input = "foobar"
matches = [[3, 6]]
unicode = false

[[tests]]
name = "wb12"
regex = 'bar\b'
input = "foobar\nxxx"
matches = [[3, 6]]
unicode = false

[[tests]]
name = "wb13"
regex = '(foo|bar|[A-Z])\b'
input = "foo"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb14"
regex = '(foo|bar|[A-Z])\b'
input = "foo\n"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb15"
regex = '\b(foo|bar|[A-Z])'
input = "foo"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb16"
regex = '\b(foo|bar|[A-Z])\b'
input = "X"
matches = [[0, 1]]
unicode = false

[[tests]]
name = "wb17"
regex = '\b(foo|bar|[A-Z])\b'
input = "XY"
matches = []
unicode = false

[[tests]]
name = "wb18"
regex = '\b(foo|bar|[A-Z])\b'
input = "bar"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb19"
regex = '\b(foo|bar|[A-Z])\b'
input = "foo"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb20"
regex = '\b(foo|bar|[A-Z])\b'
input = "foo\n"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb21"
regex = '\b(foo|bar|[A-Z])\b'
input = "ffoo bbar N x"
matches = [[10, 11]]
unicode = false

[[tests]]
name = "wb22"
regex = '\b(fo|foo)\b'
input = "fo"
matches = [[0, 2]]
unicode = false

[[tests]]
name = "wb23"
regex = '\b(fo|foo)\b'
input = "foo"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb24"
regex = '\b\b'
input = ""
matches = []
unicode = false

[[tests]]
name = "wb25"
regex = '\b\b'
input = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[tests]]
name = "wb26"
regex = '\b$'
input = ""
matches = []
unicode = false

[[tests]]
name = "wb27"
regex = '\b$'
input = "x"
matches = [[1, 1]]
unicode = false

[[tests]]
name = "wb28"
regex = '\b$'
input = "y x"
matches = [[3, 3]]
unicode = false

# wb29 and a few tests after it (wb34, wb35, wb37, wb39, wb40) turn off
# Unicode for \b alone via the inline (?-u:...) group rather than setting
# `unicode = false` on the whole test.
[[tests]]
name = "wb29"
regex = '(?-u:\b).$'
input = "x"
matches = [[0, 1]]

[[tests]]
name = "wb30"
regex = '^\b(fo|foo)\b'
input = "fo"
matches = [[0, 2]]
unicode = false

[[tests]]
name = "wb31"
regex = '^\b(fo|foo)\b'
input = "foo"
matches = [[0, 3]]
unicode = false

[[tests]]
name = "wb32"
regex = '^\b$'
input = ""
matches = []
unicode = false

[[tests]]
name = "wb33"
regex = '^\b$'
input = "x"
matches = []
unicode = false

[[tests]]
name = "wb34"
regex = '^(?-u:\b).$'
input = "x"
matches = [[0, 1]]

[[tests]]
name = "wb35"
regex = '^(?-u:\b).(?-u:\b)$'
input = "x"
matches = [[0, 1]]

[[tests]]
name = "wb36"
regex = '^^^^^\b$$$$$'
input = ""
matches = []
unicode = false

[[tests]]
name = "wb37"
regex = '^^^^^(?-u:\b).$$$$$'
input = "x"
matches = [[0, 1]]

[[tests]]
name = "wb38"
regex = '^^^^^\b$$$$$'
input = "x"
matches = []
unicode = false

[[tests]]
name = "wb39"
regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
input = "x"
matches = [[0, 1]]

[[tests]]
name = "wb40"
regex = '(?-u:\b).+(?-u:\b)'
input = "$$abc$$"
matches = [[2, 5]]

[[tests]]
name = "wb41"
regex = '\b'
input = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false

[[tests]]
name = "wb42"
regex = '\bfoo\b'
input = "zzz foo zzz"
matches = [[4, 7]]
unicode = false

[[tests]]
name = "wb43"
regex = '\b^'
input = "ab"
matches = [[0, 0]]
unicode = false

[[tests]]
name = "wb44"
regex = '$\b'
input = "ab"
matches = [[2, 2]]
unicode = false

# Tests for \B. Note that the ASCII \B (i.e., \B with `unicode = false`) is
# not allowed if UTF-8 mode is enabled, so the ASCII \B tests below also set
# `utf8 = false`. This is because an ASCII \B can match at non-UTF-8
# boundaries. (A Unicode \B has no such restriction; see unicode4.)
[[tests]]
name = "nb1"
regex = '\Bfoo\B'
input = "n foo xfoox that"
matches = [[7, 10]]
unicode = false
utf8 = false

[[tests]]
name = "nb2"
regex = 'a\B'
input = "faoa x"
matches = [[1, 2]]
unicode = false
utf8 = false

[[tests]]
name = "nb3"
regex = '\Bbar'
input = "bar x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb4"
regex = '\Bbar'
input = "foo\nbar x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb5"
regex = 'bar\B'
input = "foobar"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb6"
regex = 'bar\B'
input = "foobar\nxxx"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb7"
regex = '(foo|bar|[A-Z])\B'
input = "foox"
matches = [[0, 3]]
unicode = false
utf8 = false

[[tests]]
name = "nb8"
regex = '(foo|bar|[A-Z])\B'
input = "foo\n"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb9"
regex = '\B'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb10"
regex = '\B'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb11"
regex = '\B(foo|bar|[A-Z])'
input = "foo"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb12"
regex = '\B(foo|bar|[A-Z])\B'
input = "xXy"
matches = [[1, 2]]
unicode = false
utf8 = false

[[tests]]
name = "nb13"
regex = '\B(foo|bar|[A-Z])\B'
input = "XY"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb14"
regex = '\B(foo|bar|[A-Z])\B'
input = "XYZ"
matches = [[1, 2]]
unicode = false
utf8 = false

[[tests]]
name = "nb15"
regex = '\B(foo|bar|[A-Z])\B'
input = "abara"
matches = [[1, 4]]
unicode = false
utf8 = false

[[tests]]
name = "nb16"
regex = '\B(foo|bar|[A-Z])\B'
input = "xfoo_"
matches = [[1, 4]]
unicode = false
utf8 = false

[[tests]]
name = "nb17"
regex = '\B(foo|bar|[A-Z])\B'
input = "xfoo\n"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb18"
regex = '\B(foo|bar|[A-Z])\B'
input = "foo bar vNX"
matches = [[9, 10]]
unicode = false
utf8 = false

[[tests]]
name = "nb19"
regex = '\B(fo|foo)\B'
input = "xfoo"
matches = [[1, 3]]
unicode = false
utf8 = false

[[tests]]
name = "nb20"
regex = '\B(foo|fo)\B'
input = "xfooo"
matches = [[1, 4]]
unicode = false
utf8 = false

[[tests]]
name = "nb21"
regex = '\B\B'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb22"
regex = '\B\B'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb23"
regex = '\B$'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb24"
regex = '\B$'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb25"
regex = '\B$'
input = "y x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb26"
regex = '\B.$'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb27"
regex = '^\B(fo|foo)\B'
input = "fo"
matches = []
unicode = false
utf8 = false

# Counterpart of nb27 on the longer alternative, mirroring the wb30/wb31
# pair ("fo" then "foo"). There is still no match: offset 0 sits directly
# before the word byte 'f', so it is a word boundary and \B fails there.
[[tests]]
name = "nb28"
regex = '^\B(fo|foo)\B'
input = "foo"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb29"
regex = '^\B'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb30"
regex = '^\B'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb31"
regex = '^\B\B'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb32"
regex = '^\B\B'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb33"
regex = '^\B$'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb34"
regex = '^\B$'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb35"
regex = '^\B.$'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb36"
regex = '^\B.\B$'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb37"
regex = '^^^^^\B$$$$$'
input = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[tests]]
name = "nb38"
regex = '^^^^^\B.$$$$$'
input = "x"
matches = []
unicode = false
utf8 = false

[[tests]]
name = "nb39"
regex = '^^^^^\B$$$$$'
input = "x"
matches = []
unicode = false
utf8 = false

# unicode1* and unicode2* work for both Unicode and ASCII because all matches
# are reported as byte offsets, and « and » are not word characters at either
# the character or byte level.
[[tests]]
name = "unicode1"
regex = '\bx\b'
input = "«x"
matches = [[2, 3]]

[[tests]]
name = "unicode1-only-ascii"
regex = '\bx\b'
input = "«x"
matches = [[2, 3]]
unicode = false

[[tests]]
name = "unicode2"
regex = '\bx\b'
input = "x»"
matches = [[0, 1]]

[[tests]]
name = "unicode2-only-ascii"
regex = '\bx\b'
input = "x»"
matches = [[0, 1]]
unicode = false

# ASCII word boundaries are completely oblivious to Unicode characters, so
# even though β is a word character, an ASCII \b reports a word boundary
# between it and an adjacent ASCII word character. (The ASCII \b only looks
# at the leading byte of β.) For Unicode \b, the tests are precisely inverted.
[[tests]]
name = "unicode3"
regex = '\bx\b'
input = 'áxβ'
matches = []

[[tests]]
name = "unicode3-only-ascii"
regex = '\bx\b'
input = 'áxβ'
matches = [[2, 3]]
unicode = false

[[tests]]
name = "unicode4"
regex = '\Bx\B'
input = 'áxβ'
matches = [[2, 3]]

[[tests]]
name = "unicode4-only-ascii"
regex = '\Bx\B'
input = 'áxβ'
matches = []
unicode = false
utf8 = false

# The same as the unicode5-not* tests further down, but with \b instead of \B
# as a sanity check.
[[tests]]
name = "unicode5"
regex = '\b'
input = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]

[[tests]]
name = "unicode5-only-ascii"
regex = '\b'
input = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]
unicode = false
utf8 = false

[[tests]]
name = "unicode5-noutf8"
regex = '\b'
input = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
utf8 = false

[[tests]]
name = "unicode5-noutf8-only-ascii"
regex = '\b'
input = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
unicode = false
utf8 = false

# An odd special case: it checks that the ASCII \B sees every individual code
# unit as a non-word byte. (Which codepoint is used does not matter. It is an
# arbitrary one whose UTF-8 encoding is 4 bytes long and which is not in the
# \w character class.)
[[tests]]
name = "unicode5-not"
regex = '\B'
input = "0\U0007EF5E"
matches = [[5, 5]]

[[tests]]
name = "unicode5-not-only-ascii"
regex = '\B'
input = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false

# No matches here: with Unicode enabled, \B only matches where the input is
# valid UTF-8, and that holds even though UTF-8 mode is turned off.
[[tests]]
name = "unicode5-not-noutf8"
regex = '\B'
input = '0\xFF\xFF\xFF\xFF'
matches = []
unescape = true
utf8 = false

# This one DOES match, however, because \B in ASCII mode inspects nothing but
# individual bytes.
[[tests]]
name = "unicode5-not-noutf8-only-ascii"
regex = '\B'
input = '0\xFF\xFF\xFF\xFF'
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unescape = true
unicode = false
utf8 = false

# A handful of tests with no special significance.
[[tests]]
name = "unicode6"
regex = '\b[0-9]+\b'
input = "foo 123 bar 456 quux 789"
matches = [[4, 7], [12, 15], [21, 24]]

[[tests]]
name = "unicode7"
regex = '\b[0-9]+\b'
input = "foo 123 bar a456 quux 789"
matches = [[4, 7], [22, 25]]

[[tests]]
name = "unicode8"
regex = '\b[0-9]+\b'
input = "foo 123 bar 456a quux 789"
matches = [[4, 7], [22, 25]]