1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: dict_word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on Version 4.0.0, dated 2003-04-17
#
####################################################################################
#
# Character class definitions from TR 29
#
####################################################################################
# $Katakana: the Katakana script plus the fullwidth and halfwidth prolonged
# sound marks and the halfwidth voiced / semi-voiced sound marks.
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
$Ideographic = [:Ideographic:];
$Hangul = [:Script = HANGUL:];
# $ALetter: alphabetic characters plus COMMERCIAL AT and HEBREW PUNCTUATION GERESH,
# minus the classes handled by their own rules further down (ideographs, Katakana,
# Hangul, Hiragana) and minus Thai and Lao.
# Note that COMMERCIAL AT is also a member of $MidNum.
$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
- $Ideographic
- $Katakana
- $Hangul
- [:Script = Thai:]
- [:Script = Lao:]
- [:Script = Hiragana:]];
# $MidLetter: characters that may appear between two letters without ending the
# word (see $LetterSequence). FULL STOP and HYPHEN-MINUS are included here, so
# sequences such as "a.b" and "a-b" stay together.
# NOTE(review): \u0084 is a C1 control code point - confirm that it is intended.
$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
[:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:]
[:name = HYPHEN-MINUS:] ];
# $SufixLetter: a character allowed once at the very end of a word (FULL STOP only).
# It is also a member of $MidLetter.
$SufixLetter = [:name= FULL STOP:];
# $MidNum: characters that may appear between two digits without ending the
# number (see $NumberSequence).
# NOTE(review): \u0084 appears here as well - confirm that it is intended.
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
[:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
[:name = PRIME:]];
$Numeric = [:LineBreak = Numeric:];
# ZERO WIDTH SPACE is given its own variable so that it can be subtracted from
# $Control and $Format below.
$TheZWSP = \u200b;
#
# Character Class Definitions.
# The names are those from TR29.
#
$CR = \u000d;
$LF = \u000a;
# $Control: line separators (Zl), paragraph separators (Zp), control characters (Cc)
# and format characters (Cf), with the zero width space removed.
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
# $Extend: combining characters that attach to the preceding base character.
$Extend = [[:Grapheme_Extend = TRUE:]];
####################################################################################
#
# Word Break Rules. Definitions and Rules specific to word break begin Here.
#
####################################################################################
# $Format: format characters (Cf), with the zero width space removed.
# These characters are also members of $Control.
$Format = [[:Cf:] - $TheZWSP];
# Rule 3: Treat a grapheme cluster as if it were a single character.
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
# because we don't need to find the boundaries between adjacent syllables -
# they won't be word boundaries.
#
#
# "Extended" definitions. Each one is a base character followed by any number of
# $Extend (combining) characters, so the whole cluster is treated like the base char.
# Format characters are not folded into these definitions; the rules below skip
# over them explicitly with $FormatEx*.
#
$ALetterEx = $ALetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$SufixLetterEx= $SufixLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
$IdeographicEx= $Ideographic $Extend*;
$HangulEx = $Hangul $Extend*;
$FormatEx = $Format $Extend*;
#
# Numbers. Rules 8, 11, 12 from the TR.
# A number is one or more numeric characters, with at most one $MidNum character
# (and any number of format characters) between each adjacent pair.
# A stand-alone number is tagged 100.
#
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
$NumberSequence {100};
#
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
# - must include at least one letter.
# - may include both letters and numbers.
# - may include MidLetter, MidNumber punctuation.
# - may end with a single $SufixLetter (FULL STOP).
# Words are tagged 200.
#
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
# Runs of punctuation (P) and symbol (S) characters are kept together, with no tag.
# NOTE(review): because of the '*', this rule can also match the empty string - confirm that it is intended.
[[:P:][:S:]]*;
#
# Do not break between Katakana. Rule #13.
# Runs of Katakana (format characters allowed in between) are tagged 300.
# A Hiragana character is matched one at a time, together with its combining
# characters, and gets the same tag.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};
#
# Ideographic and Hangul characters.
# A run of consecutive ideographs (format characters allowed in between) is matched
# as a single unit, and likewise a run of consecutive Hangul. Both are tagged 400.
# Separated from the "Everything Else" rule, below, only so that they
# can be tagged with a return value. TODO: is this what we want?
#
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
$HangulEx ($FormatEx* $HangulEx)* {400};
#
# Everything Else, with no tag.
# Non-Control chars combine with $Extend (combining) chars.
# Controls do not.
# Ideographic characters are excluded here as well; they are matched by the
# tagged rule above.
#
[^$Control [:Ideographic:]] $Extend*;
# A CR LF pair is kept together.
$CR $LF;
#
# Reverse Rules. Back up over any of the chars that can group together.
# (Reverse rules do not need to be exact; they can back up too far,
# but must back up at least enough, and must stop on a boundary.)
#
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
# a word. (They may also be the first.) The reverse rule skips over these, until it
# reaches something that can only be the start (and probably only) char in a "word".
# A space or punctuation meets the test.
#
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
# Disabled alternative reverse rule, kept for reference:
#!.*;
# The leading '!' marks a reverse rule. Back up over either a run of NonStarters
# or an LF followed by a CR (the forward "$CR $LF" pair as seen when moving
# backwards), and then over one more character of any kind.
! ($NonStarters* | \n \r) .;
|