summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/IrishCasing.cpp
blob: 5de8a529aeee7e4f03e18ea4557c4a53d8b4e9d1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/******************************************************************************

This file provides a finite state machine to support Irish Gaelic uppercasing
rules.

The caller will need to iterate through a string, passing a State variable
along with the current character to each UpperCase call and checking the flags
that are returned:

  If aMarkPos is true, caller must remember the current index in the string as
  a possible target for a future action.

  If aAction is non-zero, then one or more characters from the marked index are
  to be modified:
    1  lowercase the marked letter
    2  lowercase the marked letter and its successor
    3  lowercase the marked letter, and delete its successor


### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
### comments 1 and 4:

v = [a,á,e,é,i,í,o,ó,u,ú]
V = [A,Á,E,É,I,Í,O,Ó,U,Ú]

bhf -> bhF
bhF -> bhF
bp  -> bP
bP  -> bP
dt  -> dT
dT  -> dT
gc  -> gC
gC  -> gC
h{V}  -> h{V}
mb  -> mB
mB  -> mB
n-{v} -> n{V}
n{V} -> n{V}
nd  -> nD
nD  -> nD
ng  -> nG
nG  -> nG
t-{v} -> t{V}
t{V} -> t{V}
ts{v} -> tS{V}
tS{v} -> tS{V}
tS{V} -> tS{V}
tsl  -> tSL
tSl  -> tSL
tSL  -> tSL
tsn  -> tSN
tSn  -> tSN
tSN  -> tSN
tsr  -> tSR
tSr  -> tSR
tSR  -> tSR

### Create table of states and actions for each input class.

Start (non-word) state is #; generic in-word state is _, once we know there's
no special action to do in this word.

         #   _   b   bh  d   g   h   m   n   n-  t   t-  ts
input\state
b        b'  _   _   _   _   _   _   1   _   _   _   _   _
B        _   _   _   _   _   _   _   1   _   _   _   _   _
c        _   _   _   _   _   1   _   _   _   _   _   _   _
C        _   _   _   _   _   1   _   _   _   _   _   _   _
d        d'  _   _   _   _   _   _   _   1   _   _   _   _
D        _   _   _   _   _   _   _   _   1   _   _   _   _
f        _   _   _   2   _   _   _   _   _   _   _   _   _
F        _   _   _   2   _   _   _   _   _   _   _   _   _
g        g'  _   _   _   _   _   _   _   1   _   _   _   _
G        _   _   _   _   _   _   _   _   1   _   _   _   _
h        h'  _   bh  _   _   _   _   _   _   _   _   _   _
l        _   _   _   _   _   _   _   _   _   _   _   _   1
L        _   _   _   _   _   _   _   _   _   _   _   _   1
m        m'  _   _   _   _   _   _   _   _   _   _   _   _
n        n'  _   _   _   _   _   _   _   _   _   _   _   1
N        _   _   _   _   _   _   _   _   _   _   _   _   1
p        _   _   1   _   _   _   _   _   _   _   _   _   _
P        _   _   1   _   _   _   _   _   _   _   _   _   _
r        _   _   _   _   _   _   _   _   _   _   _   _   1
R        _   _   _   _   _   _   _   _   _   _   _   _   1
s        _   _   _   _   _   _   _   _   _   _   ts  _   _
S        _   _   _   _   _   _   _   _   _   _   ts  _   _
t        t'  _   _   _   1   _   _   _   _   _   _   _   _
T        _   _   _   _   1   _   _   _   _   _   _   _   _
vowel    _   _   _   _   _   _   _   _   _   1d  _   1d  1
Vowel    _   _   _   _   _   _   1   _   1   _   1   _   1
hyph     _   _   _   _   _   _   _   _   n-  _   t-  _   _
letter   _   _   _   _   _   _   _   _   _   _   _   _   _
other    #   #   #   #   #   #   #   #   #   #   #   #   #

Actions:
  1            lowercase one letter at start of word
  2            lowercase two letters at start of word
  1d           lowercase one letter at start of word, and delete next
               (and then go to state _, nothing further to do in this word)

else just go to the given state; suffix ' indicates mark start-of-word.

### Consolidate identical states and classes:

         0   1   2   3   4   5   6   7   8   9   A   B
         #   _   b   bh  d   g   h   m   n [nt]- t   ts
input\state
b        b'  _   _   _   _   _   _   1   _   _   _   _
B        _   _   _   _   _   _   _   1   _   _   _   _
[cC]     _   _   _   _   _   1   _   _   _   _   _   _
d        d'  _   _   _   _   _   _   _   1   _   _   _
[DG]     _   _   _   _   _   _   _   _   1   _   _   _
[fF]     _   _   _   2   _   _   _   _   _   _   _   _
g        g'  _   _   _   _   _   _   _   1   _   _   _
h        h'  _   bh  _   _   _   _   _   _   _   _   _
[lLNrR]  _   _   _   _   _   _   _   _   _   _   _   1
m        m'  _   _   _   _   _   _   _   _   _   _   _
n        n'  _   _   _   _   _   _   _   _   _   _   1
[pP]     _   _   1   _   _   _   _   _   _   _   _   _
[sS]     _   _   _   _   _   _   _   _   _   _   ts  _
t        t'  _   _   _   1   _   _   _   _   _   _   _
T        _   _   _   _   1   _   _   _   _   _   _   _
vowel    _   _   _   _   _   _   _   _   _   1d  _   1
Vowel    _   _   _   _   _   _   1   _   1   _   1   1
hyph     _   _   _   _   _   _   _   _ [nt]- _ [nt]- _
letter   _   _   _   _   _   _   _   _   _   _   _   _
other    #   #   #   #   #   #   #   #   #   #   #   #

So we have 20 input classes, and 12 states.

State table array will contain bytes that encode action and new state:

  0x80  -  bit flag: mark start-of-word position
  0x40  -  currently unused
  0x30  -  action mask: 4 values
           0x00  -  do nothing
           0x10  -  lowercase one letter
           0x20  -  lowercase two letters
           0x30  -  lowercase one, delete one
  0x0F  -  next-state mask
******************************************************************************/

#include "IrishCasing.h"

#include "nsUnicodeProperties.h"
#include "nsUnicharUtils.h"

namespace mozilla {

// Transition table for the Irish uppercasing state machine, indexed as
// sUppercaseStateTable[input class][current state]. Each byte packs:
//   0x80  mark the current position as a possible start-of-word target
//   0x30  action: 0x10 lowercase one letter, 0x20 lowercase two letters,
//         0x30 lowercase one letter and delete the next
//   0x0F  next state
// Every entry that carries an action moves on to the generic in-word state
// (_ == 0x01): once the prefix has been handled there is nothing further to
// do in that word. Only the "other" (non-letter) class returns to the start
// state (# == 0x00).
const uint8_t IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
    //  #     _     b     bh    d     g     h     m     n     [nt]- t     ts
    {0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01,
     0x01},  // b
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01,
     0x01},  // B
    // After "g" + c/C: lowercase the g and continue in state _ (0x11). The
    // next state must not be # here, or a later letter in the same word
    // would be treated as a new start-of-word.
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // [cC]
    {0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
     0x01},  // d
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
     0x01},  // [DG]
    {0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // [fF]
    {0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
     0x01},  // g
    {0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // h
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x11},  // [lLNrR]
    {0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // m
    {0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x11},  // n
    {0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // [pP]
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B,
     0x01},  // [sS]
    {0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // t
    {0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // T
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01,
     0x11},  // vowel
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11,
     0x11},  // Vowel
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09,
     0x01},  // hyph
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     0x01},  // letter
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00}  // other
};

// Non-ASCII code points that GetClass() checks for explicitly.

// Hyphens other than ASCII '-'.
#define HYPHEN 0x2010           // U+2010
#define NO_BREAK_HYPHEN 0x2011  // U+2011

// Acute-accented vowels, listed as lowercase/uppercase pairs.
#define a_ACUTE 0x00E1  // á
#define A_ACUTE 0x00C1  // Á
#define e_ACUTE 0x00E9  // é
#define E_ACUTE 0x00C9  // É
#define i_ACUTE 0x00ED  // í
#define I_ACUTE 0x00CD  // Í
#define o_ACUTE 0x00F3  // ó
#define O_ACUTE 0x00D3  // Ó
#define u_ACUTE 0x00FA  // ú
#define U_ACUTE 0x00DA  // Ú

// Input class for each ASCII lowercase letter, indexed by (ch - 'a').
const uint8_t IrishCasing::sLcClasses[26] = {
    kClass_vowel,   // a
    kClass_b,       // b
    kClass_cC,      // c
    kClass_d,       // d
    kClass_vowel,   // e
    kClass_fF,      // f
    kClass_g,       // g
    kClass_h,       // h
    kClass_vowel,   // i
    kClass_letter,  // j
    kClass_letter,  // k
    kClass_lLNrR,   // l
    kClass_m,       // m
    kClass_n,       // n
    kClass_vowel,   // o
    kClass_pP,      // p
    kClass_letter,  // q
    kClass_lLNrR,   // r
    kClass_sS,      // s
    kClass_t,       // t
    kClass_vowel,   // u
    kClass_letter,  // v
    kClass_letter,  // w
    kClass_letter,  // x
    kClass_letter,  // y
    kClass_letter   // z
};

// Input class for each ASCII uppercase letter, indexed by (ch - 'A').
const uint8_t IrishCasing::sUcClasses[26] = {
    kClass_Vowel,   // A
    kClass_B,       // B
    kClass_cC,      // C
    kClass_DG,      // D
    kClass_Vowel,   // E
    kClass_fF,      // F
    kClass_DG,      // G
    kClass_letter,  // H
    kClass_Vowel,   // I
    kClass_letter,  // J
    kClass_letter,  // K
    kClass_lLNrR,   // L
    kClass_letter,  // M
    kClass_lLNrR,   // N
    kClass_Vowel,   // O
    kClass_pP,      // P
    kClass_letter,  // Q
    kClass_lLNrR,   // R
    kClass_sS,      // S
    kClass_T,       // T
    kClass_Vowel,   // U
    kClass_letter,  // V
    kClass_letter,  // W
    kClass_letter,  // X
    kClass_letter,  // Y
    kClass_letter   // Z
};

// Map a character to its input class (a row index into sUppercaseStateTable).
//
// ASCII letters are looked up in sLcClasses / sUcClasses. Any other character
// whose general category is kLetter is classified as a lowercase vowel, an
// uppercase vowel (the acute-accented vowels only) or a generic letter.
// Non-letters are either one of the recognized hyphens or "other".
uint8_t IrishCasing::GetClass(uint32_t aCh) {
  using mozilla::unicode::GetGenCategory;

  if (aCh >= 'a' && aCh <= 'z') {
    return sLcClasses[aCh - 'a'];
  }
  if (aCh >= 'A' && aCh <= 'Z') {
    return sUcClasses[aCh - 'A'];
  }

  // The letter test comes before the hyphen test, as in the state-machine
  // design: only non-letters can end up as hyph or other.
  if (GetGenCategory(aCh) == nsUGenCategory::kLetter) {
    switch (aCh) {
      case a_ACUTE:
      case e_ACUTE:
      case i_ACUTE:
      case o_ACUTE:
      case u_ACUTE:
        return kClass_vowel;
      case A_ACUTE:
      case E_ACUTE:
      case I_ACUTE:
      case O_ACUTE:
      case U_ACUTE:
        return kClass_Vowel;
      default:
        return kClass_letter;
    }
  }

  switch (aCh) {
    case '-':
    case HYPHEN:
    case NO_BREAK_HYPHEN:
      return kClass_hyph;
    default:
      return kClass_other;
  }
}

// Advance the state machine by one character.
//
// aCh      - the current character.
// aState   - in: current state; out: state to pass to the next call.
// aMarkPos - out: true if the table entry has the mark-position flag set, in
//            which case the caller must remember the current index.
// aAction  - out: the action code decoded from the table entry (0 means
//            nothing to do; see the action list in the file header).
//
// Returns ToUpperCase(aCh); the prefix adjustments themselves are left to the
// caller, driven by aMarkPos and aAction.
uint32_t IrishCasing::UpperCase(uint32_t aCh, State& aState, bool& aMarkPos,
                                uint8_t& aAction) {
  // Look up the packed (mark | action | next-state) byte for this
  // class/state pair.
  const uint8_t entry = sUppercaseStateTable[GetClass(aCh)][aState];

  aMarkPos = (entry & kMarkPositionFlag) != 0;
  aAction = (entry & kActionMask) >> kActionShift;
  aState = State(entry & kNextStateMask);

  return ToUpperCase(aCh);
}

}  // namespace mozilla