summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/test/testdata/break_rules/grapheme.txt
blob: d5776f33c206ad07e6041d5d6c8b8b0160191c45 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.

# file: grapheme.txt
#
# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
#
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
#       They are expected to change with review and the addition of support for rule tailoring.

# Test settings: the kind of boundary these reference rules describe, and a locale.
# NOTE(review): how RBBIMonkeyTest consumes `locale` is not visible in this file — confirm there.
type = grapheme;      # one of grapheme | word | line | sentence
locale = en;

# Character classes. Each name is bound to the set of code points that carry the
# like-named value of the Unicode Grapheme_Cluster_Break property. The names are
# referenced by the GBn rules in this file.
CR                 = [\p{Grapheme_Cluster_Break = CR}];
LF                 = [\p{Grapheme_Cluster_Break = LF}];

# NOTE(review): Control and Extend are wrapped in one more pair of brackets than the
#       other classes. The extra nesting looks redundant (a set holding a single
#       nested set); presumably a leftover from an earlier set expression — confirm
#       before simplifying.
Control            = [[\p{Grapheme_Cluster_Break = Control}]];
Extend             = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ                = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend            = [\p{Grapheme_Cluster_Break = Prepend}];
SpacingMark        = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
# Korean Syllable Definitions
#
# One class per Hangul-related Grapheme_Cluster_Break value (L, V, T, LV, LVT).
# These classes are combined by rules GB6, GB7 and GB8.
#
L                  = [\p{Grapheme_Cluster_Break = L}];
V                  = [\p{Grapheme_Cluster_Break = V}];
T                  = [\p{Grapheme_Cluster_Break = T}];
LV                 = [\p{Grapheme_Cluster_Break = LV}];
LVT                = [\p{Grapheme_Cluster_Break = LVT}];

# Emoji definitions
#
# Extended_Pict is written with the [:name:] property form rather than \p{...};
# ExtPict is the short alias of the Extended_Pictographic property. Used by GB11.

Extended_Pict      = [:ExtPict:];

# Indic Sequences
#
# Both sets below are the intersection (&) of six scripts — Gujarati, Telugu,
# Malayalam, Oriya, Bengali, Devanagari — with one Indic_Syllabic_Category value.
# NOTE(review): the first script is written as bare \p{Gujr} while the others use
#       \p{sc=...}; presumably both spellings select the Script property — confirm
#       before normalizing.

# Virama characters of the six scripts. (The name carries a trailing underscore.)
Virama_            = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];

# Consonants of the same six scripts.
LinkingConsonant   = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];

# Extend characters with the ccc=0 (Canonical_Combining_Class zero) ones removed,
# together with ZWJ. Used as optional filler around the virama in GB9c.
ExtCccZwj          = [[Extend-[\p{ccc=0}]] ZWJ];

# Break rules. Labels follow the GBn rule numbers of UAX #29 (Grapheme Cluster
# Boundaries). In a rule body, '÷' marks a position where a boundary occurs and
# '.' stands for any character; a rule without '÷' describes a sequence with no
# boundary inside it.
# NOTE(review): GB11 and GB9c are listed ahead of GB9 rather than in numeric order.
#       Presumably rules are tried in file order and GB9 would otherwise match the
#       Extend/ZWJ first — confirm against the monkey test's rule engine before
#       reordering anything here.

# CR followed by LF stays together; otherwise break after and before any
# Control, CR or LF.
GB3:     CR LF;
GB4:     (Control | CR | LF) ÷;
GB5:     . ÷ (Control | CR | LF);

# Hangul syllable sequences: no boundary inside L+, LV/V + V/T, LVT/T + T.
GB6:     L (L | V | LV | LVT);
GB7:     (LV | V) (V | T);
GB8:     (LVT | T) T;

# GB11:  pictographic, optional Extend run, ZWJ, pictographic — kept together.
# GB9c:  consonant, virama (optionally surrounded by ExtCccZwj), consonant — kept together.
# GB9:   no boundary before Extend or ZWJ.
GB11:    Extended_Pict Extend* ZWJ Extended_Pict;
GB9c:    LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
GB9:     . (Extend | ZWJ);

# No boundary before a SpacingMark, nor after a Prepend.
GB9a:    . SpacingMark;
GB9b:    Prepend .;

# Regional Indicators, split into pairs.
#      Note that a pair of RIs that is not followed by a third RI will fall into
#      the normal rules for Extend, etc.
#
GB12:  Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
GB13:  Regional_Indicator Regional_Indicator;

# Fallback: a boundary after any character not covered by a rule above.
GB999:     . ÷;