summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/GreekCasing.cpp
blob: 5c7e7d506ed5a540599bb1799e27e4a1caae4639 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "GreekCasing.h"
#include "nsUnicharUtils.h"
#include "nsUnicodeProperties.h"

// Custom uppercase mapping for Greek; see bug 307039 for details
//
// The macros below name the individual code points that
// GreekCasing::UpperCase treats specially. Precomposed accented letters appear
// twice: the "TONOS" variants have values in the U+03xx range and the "OXIA"
// variants in the U+1Fxx range; UpperCase handles both of each pair the same
// way, except that only eta-with-tonos gets the special kStart-state
// treatment.

// Lowercase vowels: plain, with tonos, and with oxia. Iota and upsilon also
// have dialytika (diaeresis) forms, with and without an accent.
#define GREEK_LOWER_ALPHA 0x03B1
#define GREEK_LOWER_ALPHA_TONOS 0x03AC
#define GREEK_LOWER_ALPHA_OXIA 0x1F71
#define GREEK_LOWER_EPSILON 0x03B5
#define GREEK_LOWER_EPSILON_TONOS 0x03AD
#define GREEK_LOWER_EPSILON_OXIA 0x1F73
#define GREEK_LOWER_ETA 0x03B7
#define GREEK_LOWER_ETA_TONOS 0x03AE
#define GREEK_LOWER_ETA_OXIA 0x1F75
#define GREEK_LOWER_IOTA 0x03B9
#define GREEK_LOWER_IOTA_TONOS 0x03AF
#define GREEK_LOWER_IOTA_OXIA 0x1F77
#define GREEK_LOWER_IOTA_DIALYTIKA 0x03CA
#define GREEK_LOWER_IOTA_DIALYTIKA_TONOS 0x0390
#define GREEK_LOWER_IOTA_DIALYTIKA_OXIA 0x1FD3
#define GREEK_LOWER_OMICRON 0x03BF
#define GREEK_LOWER_OMICRON_TONOS 0x03CC
#define GREEK_LOWER_OMICRON_OXIA 0x1F79
#define GREEK_LOWER_UPSILON 0x03C5
#define GREEK_LOWER_UPSILON_TONOS 0x03CD
#define GREEK_LOWER_UPSILON_OXIA 0x1F7B
#define GREEK_LOWER_UPSILON_DIALYTIKA 0x03CB
#define GREEK_LOWER_UPSILON_DIALYTIKA_TONOS 0x03B0
#define GREEK_LOWER_UPSILON_DIALYTIKA_OXIA 0x1FE3
#define GREEK_LOWER_OMEGA 0x03C9
#define GREEK_LOWER_OMEGA_TONOS 0x03CE
#define GREEK_LOWER_OMEGA_OXIA 0x1F7D
// Unaccented uppercase vowels, plus iota and upsilon with dialytika. These are
// both accepted as input and used as the values UpperCase returns for the
// corresponding lowercase or accented letters.
#define GREEK_UPPER_ALPHA 0x0391
#define GREEK_UPPER_EPSILON 0x0395
#define GREEK_UPPER_ETA 0x0397
#define GREEK_UPPER_IOTA 0x0399
#define GREEK_UPPER_IOTA_DIALYTIKA 0x03AA
#define GREEK_UPPER_OMICRON 0x039F
#define GREEK_UPPER_UPSILON 0x03A5
#define GREEK_UPPER_UPSILON_DIALYTIKA 0x03AB
#define GREEK_UPPER_OMEGA 0x03A9
// Uppercase vowels that already carry a tonos or oxia. UpperCase maps them to
// the unaccented capital; the one exception is eta with tonos seen in the
// kStart state, which is returned as GREEK_UPPER_ETA_TONOS.
#define GREEK_UPPER_ALPHA_TONOS 0x0386
#define GREEK_UPPER_ALPHA_OXIA 0x1FBB
#define GREEK_UPPER_EPSILON_TONOS 0x0388
#define GREEK_UPPER_EPSILON_OXIA 0x1FC9
#define GREEK_UPPER_ETA_TONOS 0x0389
#define GREEK_UPPER_ETA_OXIA 0x1FCB
#define GREEK_UPPER_IOTA_TONOS 0x038A
#define GREEK_UPPER_IOTA_OXIA 0x1FDB
#define GREEK_UPPER_OMICRON_TONOS 0x038C
#define GREEK_UPPER_OMICRON_OXIA 0x1FF9
#define GREEK_UPPER_UPSILON_TONOS 0x038E
#define GREEK_UPPER_UPSILON_OXIA 0x1FEB
#define GREEK_UPPER_OMEGA_TONOS 0x038F
#define GREEK_UPPER_OMEGA_OXIA 0x1FFB
// Combining marks with dedicated handling: the two acute marks are dropped
// when they follow a vowel or diaeresis, the diaeresis is kept, and the
// combined dialytika-tonos mark is replaced by a bare COMBINING_DIAERESIS.
#define COMBINING_ACUTE_ACCENT 0x0301
#define COMBINING_DIAERESIS 0x0308
#define COMBINING_ACUTE_TONE_MARK 0x0341
#define COMBINING_GREEK_DIALYTIKA_TONOS 0x0344

namespace mozilla {

uint32_t GreekCasing::UpperCase(uint32_t aCh, GreekCasing::State& aState,
                                bool& aMarkEtaPos, bool& aUpdateMarkedEta) {
  aMarkEtaPos = false;
  aUpdateMarkedEta = false;

  uint8_t category = unicode::GetGeneralCategory(aCh);

  if (aState == kEtaAccMarked) {
    switch (category) {
      case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
      case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
      case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
      case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
      case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
      case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
      case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
      case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
        aUpdateMarkedEta = true;
        break;
      default:
        break;
    }
    aState = kEtaAcc;
  }

  switch (aCh) {
    case GREEK_UPPER_ALPHA:
    case GREEK_LOWER_ALPHA:
      aState = kAlpha;
      return GREEK_UPPER_ALPHA;

    case GREEK_UPPER_EPSILON:
    case GREEK_LOWER_EPSILON:
      aState = kEpsilon;
      return GREEK_UPPER_EPSILON;

    case GREEK_UPPER_ETA:
    case GREEK_LOWER_ETA:
      aState = kEta;
      return GREEK_UPPER_ETA;

    case GREEK_UPPER_IOTA:
      aState = kIota;
      return GREEK_UPPER_IOTA;

    case GREEK_UPPER_OMICRON:
    case GREEK_LOWER_OMICRON:
      aState = kOmicron;
      return GREEK_UPPER_OMICRON;

    case GREEK_UPPER_UPSILON:
      switch (aState) {
        case kOmicron:
          aState = kOmicronUpsilon;
          break;
        default:
          aState = kUpsilon;
          break;
      }
      return GREEK_UPPER_UPSILON;

    case GREEK_UPPER_OMEGA:
    case GREEK_LOWER_OMEGA:
      aState = kOmega;
      return GREEK_UPPER_OMEGA;

    // iota and upsilon may be the second vowel of a diphthong
    case GREEK_LOWER_IOTA:
      switch (aState) {
        case kAlphaAcc:
        case kEpsilonAcc:
        case kOmicronAcc:
        case kUpsilonAcc:
          aState = kInWord;
          return GREEK_UPPER_IOTA_DIALYTIKA;
        default:
          break;
      }
      aState = kIota;
      return GREEK_UPPER_IOTA;

    case GREEK_LOWER_UPSILON:
      switch (aState) {
        case kAlphaAcc:
        case kEpsilonAcc:
        case kEtaAcc:
        case kOmicronAcc:
          aState = kInWord;
          return GREEK_UPPER_UPSILON_DIALYTIKA;
        case kOmicron:
          aState = kOmicronUpsilon;
          break;
        default:
          aState = kUpsilon;
          break;
      }
      return GREEK_UPPER_UPSILON;

    case GREEK_UPPER_IOTA_DIALYTIKA:
    case GREEK_LOWER_IOTA_DIALYTIKA:
    case GREEK_UPPER_UPSILON_DIALYTIKA:
    case GREEK_LOWER_UPSILON_DIALYTIKA:
    case COMBINING_DIAERESIS:
      aState = kDiaeresis;
      return ToUpperCase(aCh);

    // remove accent if it follows a vowel or diaeresis,
    // and set appropriate state for diphthong detection
    case COMBINING_ACUTE_ACCENT:
    case COMBINING_ACUTE_TONE_MARK:
      switch (aState) {
        case kAlpha:
          aState = kAlphaAcc;
          return uint32_t(-1);  // omit this char from result string
        case kEpsilon:
          aState = kEpsilonAcc;
          return uint32_t(-1);
        case kEta:
          aState = kEtaAcc;
          return uint32_t(-1);
        case kIota:
          aState = kIotaAcc;
          return uint32_t(-1);
        case kOmicron:
          aState = kOmicronAcc;
          return uint32_t(-1);
        case kUpsilon:
          aState = kUpsilonAcc;
          return uint32_t(-1);
        case kOmicronUpsilon:
          aState = kInWord;  // this completed a diphthong
          return uint32_t(-1);
        case kOmega:
          aState = kOmegaAcc;
          return uint32_t(-1);
        case kDiaeresis:
          aState = kInWord;
          return uint32_t(-1);
        default:
          break;
      }
      break;

    // combinations with dieresis+accent just strip the accent,
    // and reset to start state (don't form diphthong with following vowel)
    case GREEK_LOWER_IOTA_DIALYTIKA_TONOS:
    case GREEK_LOWER_IOTA_DIALYTIKA_OXIA:
      aState = kInWord;
      return GREEK_UPPER_IOTA_DIALYTIKA;

    case GREEK_LOWER_UPSILON_DIALYTIKA_TONOS:
    case GREEK_LOWER_UPSILON_DIALYTIKA_OXIA:
      aState = kInWord;
      return GREEK_UPPER_UPSILON_DIALYTIKA;

    case COMBINING_GREEK_DIALYTIKA_TONOS:
      aState = kInWord;
      return COMBINING_DIAERESIS;

    // strip accents from vowels, and note the vowel seen so that we can detect
    // diphthongs where diaeresis needs to be added
    case GREEK_LOWER_ALPHA_TONOS:
    case GREEK_LOWER_ALPHA_OXIA:
    case GREEK_UPPER_ALPHA_TONOS:
    case GREEK_UPPER_ALPHA_OXIA:
      aState = kAlphaAcc;
      return GREEK_UPPER_ALPHA;

    case GREEK_LOWER_EPSILON_TONOS:
    case GREEK_LOWER_EPSILON_OXIA:
    case GREEK_UPPER_EPSILON_TONOS:
    case GREEK_UPPER_EPSILON_OXIA:
      aState = kEpsilonAcc;
      return GREEK_UPPER_EPSILON;

    case GREEK_LOWER_ETA_TONOS:
    case GREEK_UPPER_ETA_TONOS:
      if (aState == kStart) {
        aState = kEtaAccMarked;
        aMarkEtaPos = true;  // mark in case we need to remove the tonos later
        return GREEK_UPPER_ETA_TONOS;  // treat as disjunctive eta for now
      }
      // if not in initial state, fall through to strip the accent
      [[fallthrough]];

    case GREEK_LOWER_ETA_OXIA:
    case GREEK_UPPER_ETA_OXIA:
      aState = kEtaAcc;
      return GREEK_UPPER_ETA;

    case GREEK_LOWER_IOTA_TONOS:
    case GREEK_LOWER_IOTA_OXIA:
    case GREEK_UPPER_IOTA_TONOS:
    case GREEK_UPPER_IOTA_OXIA:
      aState = kIotaAcc;
      return GREEK_UPPER_IOTA;

    case GREEK_LOWER_OMICRON_TONOS:
    case GREEK_LOWER_OMICRON_OXIA:
    case GREEK_UPPER_OMICRON_TONOS:
    case GREEK_UPPER_OMICRON_OXIA:
      aState = kOmicronAcc;
      return GREEK_UPPER_OMICRON;

    case GREEK_LOWER_UPSILON_TONOS:
    case GREEK_LOWER_UPSILON_OXIA:
    case GREEK_UPPER_UPSILON_TONOS:
    case GREEK_UPPER_UPSILON_OXIA:
      switch (aState) {
        case kOmicron:
          aState = kInWord;  // this completed a diphthong
          break;
        default:
          aState = kUpsilonAcc;
          break;
      }
      return GREEK_UPPER_UPSILON;

    case GREEK_LOWER_OMEGA_TONOS:
    case GREEK_LOWER_OMEGA_OXIA:
    case GREEK_UPPER_OMEGA_TONOS:
    case GREEK_UPPER_OMEGA_OXIA:
      aState = kOmegaAcc;
      return GREEK_UPPER_OMEGA;
  }

  // all other characters just reset the state to either kStart or kInWord,
  // and use standard mappings
  switch (category) {
    case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
    case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
    case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
      aState = kInWord;
      break;
    default:
      aState = kStart;
      break;
  }

  return ToUpperCase(aCh);
}

}  // namespace mozilla