# Unicode regex test cases: literals, word boundaries, character classes,
# Perl classes, and Unicode property classes.
# Basic Unicode literal support.
#
# Each `matches` entry is a [start, end] pair. The end offsets in these cases
# line up with UTF-8 byte lengths rather than code point counts: "☃" is a
# single code point but its match ends at offset 3.
[[test]]
name = "literal1"
regex = '☃'
haystack = "☃"
matches = [[0, 3]]

[[test]]
name = "literal2"
regex = '☃+'
haystack = "☃"
matches = [[0, 3]]

# Same as literal2, but with case-insensitive matching enabled.
[[test]]
name = "literal3"
regex = '☃+'
haystack = "☃"
matches = [[0, 3]]
case-insensitive = true

# An uppercase pattern (Δ) must match its lowercase form (δ) when
# case-insensitive matching is enabled.
[[test]]
name = "literal4"
regex = 'Δ'
haystack = "δ"
matches = [[0, 2]]
case-insensitive = true

# Unicode word boundaries.
#
# These cases expect "δ" to be treated as a word character: `\b` finds no
# boundary between "6" and "δ" (wb-100), while `\B` matches there (wb-300).
# A trailing ASCII space gives the opposite results (wb-200, wb-400).
[[test]]
name = "wb-100"
regex = '\d\b'
haystack = "6δ"
matches = []

[[test]]
name = "wb-200"
regex = '\d\b'
haystack = "6 "
matches = [[0, 1]]

[[test]]
name = "wb-300"
regex = '\d\B'
haystack = "6δ"
matches = [[0, 1]]

[[test]]
name = "wb-400"
regex = '\d\B'
haystack = "6 "
matches = []

# Unicode character class support.
#
# Covers bracketed classes with non-ASCII members, the one-letter (`\pN`) and
# braced (`\p{Lu}`) property forms, and negation via `\P` and `[^...]`.
[[test]]
name = "class1"
regex = '[☃Ⅰ]+'
haystack = "☃"
matches = [[0, 3]]

[[test]]
name = "class2"
regex = '\pN'
haystack = "Ⅰ"
matches = [[0, 3]]

# Roman numerals (3 bytes each) and ASCII digits (1 byte each) are all
# expected to match `\pN`, giving a single 8-byte match.
[[test]]
name = "class3"
regex = '\pN+'
haystack = "Ⅰ1Ⅱ2"
matches = [[0, 8]]

[[test]]
name = "class4"
regex = '\PN+'
haystack = "abⅠ"
matches = [[0, 2]]

# `\PN` inside brackets behaves the same as the bare form in class4.
[[test]]
name = "class5"
regex = '[\PN]+'
haystack = "abⅠ"
matches = [[0, 2]]

# Double negation: `[^\PN]` matches only the numeral at the end.
[[test]]
name = "class6"
regex = '[^\PN]+'
haystack = "abⅠ"
matches = [[2, 5]]

# The four uppercase Greek letters match; the trailing lowercase "α" does not.
[[test]]
name = "class7"
regex = '\p{Lu}+'
haystack = "ΛΘΓΔα"
matches = [[0, 8]]

# With case-insensitive matching, `\p{Lu}` is expected to match "α" as well.
[[test]]
name = "class8"
regex = '\p{Lu}+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]
case-insensitive = true

[[test]]
name = "class9"
regex = '\pL+'
haystack = "ΛΘΓΔα"
matches = [[0, 10]]

[[test]]
name = "class10"
regex = '\p{Ll}+'
haystack = "ΛΘΓΔα"
matches = [[8, 10]]

# Unicode aware "Perl" character classes.
#
# `\w`, `\d` and `\s` (and their negations) are exercised against non-ASCII
# haystacks.
[[test]]
name = "perl1"
regex = '\w+'
haystack = "dδd"
matches = [[0, 4]]

[[test]]
name = "perl2"
regex = '\w+'
haystack = "⥡"
matches = []

[[test]]
name = "perl3"
regex = '\W+'
haystack = "⥡"
matches = [[0, 3]]

# Non-ASCII decimal digits (3 bytes each) are expected to match `\d` alongside
# the ASCII digits.
[[test]]
name = "perl4"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]

# A Roman numeral is not expected to match `\d` (perl5), so it matches `\D`
# instead (perl6).
[[test]]
name = "perl5"
regex = '\d+'
haystack = "Ⅱ"
matches = []

[[test]]
name = "perl6"
regex = '\D+'
haystack = "Ⅱ"
matches = [[0, 3]]

# The expected span is 3 bytes, so the haystack must be a 3-byte whitespace
# character rather than an ASCII space. It is written as an escape (U+1680,
# OGHAM SPACE MARK) because the literal is easily mistaken for, or silently
# replaced by, a plain space.
[[test]]
name = "perl7"
regex = '\s+'
haystack = "\u1680"
matches = [[0, 3]]

[[test]]
name = "perl8"
regex = '\s+'
haystack = "☃"
matches = []

[[test]]
name = "perl9"
regex = '\S+'
haystack = "☃"
matches = [[0, 3]]

# Specific tests for Unicode general category classes.
#
# Each case pairs the long-form name of a general category with a haystack
# holding a single code point, and expects the match to cover that whole code
# point (2, 3 or 4 bytes of UTF-8).

# The expected span is 3 bytes, so the haystack is FULLWIDTH LATIN CAPITAL
# LETTER A (U+FF21), not ASCII "A". It is escaped because the two are easy to
# confuse.
[[test]]
name = "class-gencat1"
regex = '\p{Cased_Letter}'
haystack = "\uFF21"
matches = [[0, 3]]

[[test]]
name = "class-gencat2"
regex = '\p{Close_Punctuation}'
haystack = "❯"
matches = [[0, 3]]

[[test]]
name = "class-gencat3"
regex = '\p{Connector_Punctuation}'
haystack = "⁀"
matches = [[0, 3]]

[[test]]
name = "class-gencat4"
regex = '\p{Control}'
haystack = "\u009F"
matches = [[0, 2]]

# The expected span is 3 bytes, so the haystack is FULLWIDTH POUND SIGN
# (U+FFE1), not the 2-byte "£" (U+00A3). It is escaped because the two are
# easy to confuse.
[[test]]
name = "class-gencat5"
regex = '\p{Currency_Symbol}'
haystack = "\uFFE1"
matches = [[0, 3]]

[[test]]
name = "class-gencat6"
regex = '\p{Dash_Punctuation}'
haystack = "〰"
matches = [[0, 3]]

[[test]]
name = "class-gencat7"
regex = '\p{Decimal_Number}'
haystack = "𑓙"
matches = [[0, 4]]

[[test]]
name = "class-gencat8"
regex = '\p{Enclosing_Mark}'
haystack = "\uA672"
matches = [[0, 3]]

[[test]]
name = "class-gencat9"
regex = '\p{Final_Punctuation}'
haystack = "⸡"
matches = [[0, 3]]

[[test]]
name = "class-gencat10"
regex = '\p{Format}'
haystack = "\U000E007F"
matches = [[0, 4]]

[[test]]
name = "class-gencat11"
regex = '\p{Initial_Punctuation}'
haystack = "⸜"
matches = [[0, 3]]

[[test]]
name = "class-gencat12"
regex = '\p{Letter}'
haystack = "Έ"
matches = [[0, 2]]

[[test]]
name = "class-gencat13"
regex = '\p{Letter_Number}'
haystack = "ↂ"
matches = [[0, 3]]

[[test]]
name = "class-gencat14"
regex = '\p{Line_Separator}'
haystack = "\u2028"
matches = [[0, 3]]

[[test]]
name = "class-gencat15"
regex = '\p{Lowercase_Letter}'
haystack = "ϛ"
matches = [[0, 2]]

[[test]]
name = "class-gencat16"
regex = '\p{Mark}'
haystack = "\U000E01EF"
matches = [[0, 4]]

# `Math_Symbol` is the general category (Sm). A bare `Math` would name the
# separate binary property, which does not belong in this section.
[[test]]
name = "class-gencat17"
regex = '\p{Math_Symbol}'
haystack = "⋿"
matches = [[0, 3]]

[[test]]
name = "class-gencat18"
regex = '\p{Modifier_Letter}'
haystack = "𖭃"
matches = [[0, 4]]

[[test]]
name = "class-gencat19"
regex = '\p{Modifier_Symbol}'
haystack = "🏿"
matches = [[0, 4]]

[[test]]
name = "class-gencat20"
regex = '\p{Nonspacing_Mark}'
haystack = "\U0001E94A"
matches = [[0, 4]]

[[test]]
name = "class-gencat21"
regex = '\p{Number}'
haystack = "⓿"
matches = [[0, 3]]

[[test]]
name = "class-gencat22"
regex = '\p{Open_Punctuation}'
haystack = "⦅"
matches = [[0, 3]]

[[test]]
name = "class-gencat23"
regex = '\p{Other}'
haystack = "\u0BC9"
matches = [[0, 3]]

[[test]]
name = "class-gencat24"
regex = '\p{Other_Letter}'
haystack = "ꓷ"
matches = [[0, 3]]

[[test]]
name = "class-gencat25"
regex = '\p{Other_Number}'
haystack = "㉏"
matches = [[0, 3]]

[[test]]
name = "class-gencat26"
regex = '\p{Other_Punctuation}'
haystack = "𞥞"
matches = [[0, 4]]

[[test]]
name = "class-gencat27"
regex = '\p{Other_Symbol}'
haystack = "⅌"
matches = [[0, 3]]

[[test]]
name = "class-gencat28"
regex = '\p{Paragraph_Separator}'
haystack = "\u2029"
matches = [[0, 3]]

[[test]]
name = "class-gencat29"
regex = '\p{Private_Use}'
haystack = "\U0010FFFD"
matches = [[0, 4]]

[[test]]
name = "class-gencat30"
regex = '\p{Punctuation}'
haystack = "𑁍"
matches = [[0, 4]]

[[test]]
name = "class-gencat31"
regex = '\p{Separator}'
haystack = "\u3000"
matches = [[0, 3]]

[[test]]
name = "class-gencat32"
regex = '\p{Space_Separator}'
haystack = "\u205F"
matches = [[0, 3]]

[[test]]
name = "class-gencat33"
regex = '\p{Spacing_Mark}'
haystack = "\U00016F7E"
matches = [[0, 4]]

[[test]]
name = "class-gencat34"
regex = '\p{Symbol}'
haystack = "⯈"
matches = [[0, 3]]

[[test]]
name = "class-gencat35"
regex = '\p{Titlecase_Letter}'
haystack = "ῼ"
matches = [[0, 3]]

[[test]]
name = "class-gencat36"
regex = '\p{Unassigned}'
haystack = "\U0010FFFF"
matches = [[0, 4]]

[[test]]
name = "class-gencat37"
regex = '\p{Uppercase_Letter}'
haystack = "Ꝋ"
matches = [[0, 3]]

# Tests for Unicode emoji properties.
#
# class-emoji1 and class-emoji2 differ in the case of the property name
# (`Emoji` vs `emoji`); both spellings are expected to be accepted.
[[test]]
name = "class-emoji1"
regex = '\p{Emoji}'
haystack = "\u23E9"
matches = [[0, 3]]

[[test]]
name = "class-emoji2"
regex = '\p{emoji}'
haystack = "\U0001F21A"
matches = [[0, 4]]

[[test]]
name = "class-emoji3"
regex = '\p{extendedpictographic}'
haystack = "\U0001FA6E"
matches = [[0, 4]]

[[test]]
name = "class-emoji4"
regex = '\p{extendedpictographic}'
haystack = "\U0001FFFD"
matches = [[0, 4]]

# Tests for Unicode grapheme cluster properties.
#
# The property is written both in long form (`grapheme_cluster_break=`) and
# abbreviated (`gcb=`). class-gcb2 through class-gcb4 spell the regional
# indicator value three ways: long, abbreviated (`ri`), and as a bare value
# name with no `gcb=` prefix.
[[test]]
name = "class-gcb1"
regex = '\p{grapheme_cluster_break=prepend}'
haystack = "\U00011D46"
matches = [[0, 4]]

[[test]]
name = "class-gcb2"
regex = '\p{gcb=regional_indicator}'
haystack = "\U0001F1E6"
matches = [[0, 4]]

[[test]]
name = "class-gcb3"
regex = '\p{gcb=ri}'
haystack = "\U0001F1E7"
matches = [[0, 4]]

[[test]]
name = "class-gcb4"
regex = '\p{regionalindicator}'
haystack = "\U0001F1FF"
matches = [[0, 4]]

[[test]]
name = "class-gcb5"
regex = '\p{gcb=lvt}'
haystack = "\uC989"
matches = [[0, 3]]

[[test]]
name = "class-gcb6"
regex = '\p{gcb=zwj}'
haystack = "\u200D"
matches = [[0, 3]]

# Tests for Unicode word break (Word_Break) properties.
#
# These exercise `\p{word_break=...}` / `\p{wb=...}` as character classes;
# they are unrelated to the `\b` / `\B` assertions. class-word-break1 and
# class-word-break2 use the same haystack with the long and abbreviated
# spellings of the property and value.
[[test]]
name = "class-word-break1"
regex = '\p{word_break=Hebrew_Letter}'
haystack = "\uFB46"
matches = [[0, 3]]

[[test]]
name = "class-word-break2"
regex = '\p{wb=hebrewletter}'
haystack = "\uFB46"
matches = [[0, 3]]

[[test]]
name = "class-word-break3"
regex = '\p{wb=ExtendNumLet}'
haystack = "\uFF3F"
matches = [[0, 3]]

[[test]]
name = "class-word-break4"
regex = '\p{wb=WSegSpace}'
haystack = "\u3000"
matches = [[0, 3]]

[[test]]
name = "class-word-break5"
regex = '\p{wb=numeric}'
haystack = "\U0001E950"
matches = [[0, 4]]

# Tests for Unicode sentence break (Sentence_Break) properties.
#
# class-sentence-break1 and class-sentence-break2 use the same haystack with
# the long (`sentence_break=Lower`) and abbreviated (`sb=lower`) spellings.
[[test]]
name = "class-sentence-break1"
regex = '\p{sentence_break=Lower}'
haystack = "\u0469"
matches = [[0, 2]]

[[test]]
name = "class-sentence-break2"
regex = '\p{sb=lower}'
haystack = "\u0469"
matches = [[0, 2]]

# `sb=Close` is checked with both a 3-byte and a 4-byte code point.
[[test]]
name = "class-sentence-break3"
regex = '\p{sb=Close}'
haystack = "\uFF60"
matches = [[0, 3]]

[[test]]
name = "class-sentence-break4"
regex = '\p{sb=Close}'
haystack = "\U0001F677"
matches = [[0, 4]]

[[test]]
name = "class-sentence-break5"
regex = '\p{sb=SContinue}'
haystack = "\uFF64"
matches = [[0, 3]]