# These tests are specifically written to test the regex-lite crate. While it
# largely has the same semantics as the regex crate, there are some differences
# around Unicode support and UTF-8.
#
# To be clear, regex-lite supports far fewer patterns because of its lack of
# Unicode support, nested character classes and character class set operations.
# What we're talking about here are the patterns that both crates support but
# where the semantics might differ.
#
# Each span in 'matches' is a [start, end] pair of byte offsets into the UTF-8
# encoding of the haystack, which is why a single codepoint such as '💩' spans
# [0, 4] below.

# regex-lite uses ASCII definitions for Perl character classes, so a non-ASCII
# decimal digit is not matched by \d.
[[test]]
name = "perl-class-decimal"
regex = '\d'
haystack = '᠕'  # U+1815 MONGOLIAN DIGIT FIVE
matches = []
unicode = true

# regex-lite uses ASCII definitions for Perl character classes, so non-ASCII
# whitespace is not matched by \s.
[[test]]
name = "perl-class-space"
regex = '\s'
haystack = "\u2000"  # EN QUAD
matches = []
unicode = true

# regex-lite uses ASCII definitions for Perl character classes, so a non-ASCII
# letter is not matched by \w.
[[test]]
name = "perl-class-word"
regex = '\w'
haystack = 'δ'  # U+03B4 GREEK SMALL LETTER DELTA
matches = []
unicode = true

# regex-lite uses the ASCII definition of word for word boundary assertions.
# Since 'δ' is not an ASCII word character, there is no boundary on either
# side of it.
[[test]]
name = "word-boundary"
regex = '\b'
haystack = 'δ'
matches = []
unicode = true

# regex-lite uses the ASCII definition of word for negated word boundary
# assertions. But note that it should still not split codepoints! 'δ' is two
# bytes in UTF-8, so offset 1 falls inside the codepoint and must not be
# reported.
[[test]]
name = "word-boundary-negated"
regex = '\B'
haystack = 'δ'
matches = [[0, 0], [2, 2]]
unicode = true

# While we're here, the empty regex---which matches at every
# position---shouldn't split a codepoint either. '💩' is four bytes in UTF-8,
# so only offsets 0 and 4 are valid match positions.
[[test]]
name = "empty-no-split-codepoint"
regex = ''
haystack = '💩'
matches = [[0, 0], [4, 4]]
unicode = true

# A dot always matches a full codepoint, even with 'unicode = false'.
[[test]]
name = "dot-always-matches-codepoint"
regex = '.'
haystack = '💩'
matches = [[0, 4]]
unicode = false

# A negated character class also always matches a full codepoint, even with
# 'unicode = false'.
[[test]]
name = "negated-class-always-matches-codepoint"
regex = '[^a]'
haystack = '💩'
matches = [[0, 4]]
unicode = false

# regex-lite only supports ASCII-aware case insensitive matching.
[[test]]
name = "case-insensitive-is-ascii-only"
regex = 's'
# U+017F LATIN SMALL LETTER LONG S: equivalent to 's' under Unicode case
# folding, but it is not an ASCII letter, so no match is expected.
haystack = 'ſ'
matches = []
unicode = true
case-insensitive = true

# Negated word boundaries shouldn't split a codepoint, but they will match
# between invalid UTF-8.
#
# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
# regex-lite. This can't happen in the main API because &str can't contain
# invalid UTF-8.
#
# The test is kept commented out so that it can be enabled as-is if such an
# API is added.
# [[test]]
# name = "word-boundary-invalid-utf8"
# regex = '\B'
# haystack = '\xFF\xFF\xFF\xFF'
# unescape = true
# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# unicode = true
# utf8 = false