summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/IrishCasing.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/util/IrishCasing.cpp')
-rw-r--r--intl/unicharutil/util/IrishCasing.cpp262
1 files changed, 262 insertions, 0 deletions
diff --git a/intl/unicharutil/util/IrishCasing.cpp b/intl/unicharutil/util/IrishCasing.cpp
new file mode 100644
index 0000000000..5de8a529ae
--- /dev/null
+++ b/intl/unicharutil/util/IrishCasing.cpp
@@ -0,0 +1,262 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/******************************************************************************
+
+This file provides a finite state machine to support Irish Gaelic uppercasing
+rules.
+
+The caller will need to iterate through a string, passing a State variable
+along with the current character to each UpperCase call and checking the flags
+that are returned:
+
+ If aMarkPos is true, caller must remember the current index in the string as
+ a possible target for a future action.
+
+ If aAction is non-zero, then one or more characters from the marked index are
+ to be modified:
+ 1 lowercase the marked letter
+ 2 lowercase the marked letter and its successor
+ 3 lowercase the marked letter, and delete its successor
+
+
+### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
+### comments 1 and 4:
+
+v = [a,á,e,é,i,í,o,ó,u,ú]
+V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
+
+bhf -> bhF
+bhF -> bhF
+bp -> bP
+bP -> bP
+dt -> dT
+dT -> dT
+gc -> gC
+gC -> gC
+h{V} -> h{V}
+mb -> mB
+mB -> mB
+n-{v} -> n{V}
+n{V} -> n{V}
+nd -> nD
+nD -> nD
+ng -> nG
+nG -> nG
+t-{v} -> t{V}
+t{V} -> t{V}
+ts{v} -> tS{V}
+tS{v} -> tS{V}
+tS{V} -> tS{V}
+tsl -> tSL
+tSl -> tSL
+tSL -> tSL
+tsn -> tSN
+tSn -> tSN
+tSN -> tSN
+tsr -> tSR
+tSr -> tSR
+tSR -> tSR
+
+### Create table of states and actions for each input class.
+
+Start (non-word) state is #; generic in-word state is _, once we know there's
+no special action to do in this word.
+
+ # _ b bh d g h m n n- t t- ts
+input\state
+b b' _ _ _ _ _ _ 1 _ _ _ _ _
+B _ _ _ _ _ _ _ 1 _ _ _ _ _
+c _ _ _ _ _ 1 _ _ _ _ _ _ _
+C _ _ _ _ _ 1 _ _ _ _ _ _ _
+d d' _ _ _ _ _ _ _ 1 _ _ _ _
+D _ _ _ _ _ _ _ _ 1 _ _ _ _
+f _ _ _ 2 _ _ _ _ _ _ _ _ _
+F _ _ _ 2 _ _ _ _ _ _ _ _ _
+g g' _ _ _ _ _ _ _ 1 _ _ _ _
+G _ _ _ _ _ _ _ _ 1 _ _ _ _
+h h' _ bh _ _ _ _ _ _ _ _ _ _
+l _ _ _ _ _ _ _ _ _ _ _ _ 1
+L _ _ _ _ _ _ _ _ _ _ _ _ 1
+m m' _ _ _ _ _ _ _ _ _ _ _ _
+n n' _ _ _ _ _ _ _ _ _ _ _ 1
+N _ _ _ _ _ _ _ _ _ _ _ _ 1
+p _ _ 1 _ _ _ _ _ _ _ _ _ _
+P _ _ 1 _ _ _ _ _ _ _ _ _ _
+r _ _ _ _ _ _ _ _ _ _ _ _ 1
+R _ _ _ _ _ _ _ _ _ _ _ _ 1
+s _ _ _ _ _ _ _ _ _ _ ts _ _
+S _ _ _ _ _ _ _ _ _ _ ts _ _
+t t' _ _ _ 1 _ _ _ _ _ _ _ _
+T _ _ _ _ 1 _ _ _ _ _ _ _ _
+vowel _ _ _ _ _ _ _ _ _ 1d _ 1d 1
+Vowel _ _ _ _ _ _ 1 _ 1 _ 1 _ 1
+hyph _ _ _ _ _ _ _ _ n- _ t- _ _
+letter _ _ _ _ _ _ _ _ _ _ _ _ _
+other # # # # # # # # # # # # #
+
+Actions:
+ 1 lowercase one letter at start of word
+ 2 lowercase two letters at start of word
+ 1d lowercase one letter at start of word, and delete next
+ (and then go to state _, nothing further to do in this word)
+
+else just go to the given state; suffix ' indicates mark start-of-word.
+
+### Consolidate identical states and classes:
+
+ 0 1 2 3 4 5 6 7 8 9 A B
+ # _ b bh d g h m n [nt]- t ts
+input\state
+b b' _ _ _ _ _ _ 1 _ _ _ _
+B _ _ _ _ _ _ _ 1 _ _ _ _
+[cC] _ _ _ _ _ 1 _ _ _ _ _ _
+d d' _ _ _ _ _ _ _ 1 _ _ _
+[DG] _ _ _ _ _ _ _ _ 1 _ _ _
+[fF] _ _ _ 2 _ _ _ _ _ _ _ _
+g g' _ _ _ _ _ _ _ 1 _ _ _
+h h' _ bh _ _ _ _ _ _ _ _ _
+[lLNrR] _ _ _ _ _ _ _ _ _ _ _ 1
+m m' _ _ _ _ _ _ _ _ _ _ _
+n n' _ _ _ _ _ _ _ _ _ _ 1
+[pP] _ _ 1 _ _ _ _ _ _ _ _ _
+[sS] _ _ _ _ _ _ _ _ _ _ ts _
+t t' _ _ _ 1 _ _ _ _ _ _ _
+T _ _ _ _ 1 _ _ _ _ _ _ _
+vowel _ _ _ _ _ _ _ _ _ 1d _ 1
+Vowel _ _ _ _ _ _ 1 _ 1 _ 1 1
+hyph _ _ _ _ _ _ _ _ [nt-] _ [nt-] _
+letter _ _ _ _ _ _ _ _ _ _ _ _
+other # # # # # # # # # # # #
+
+So we have 20 input classes, and 12 states.
+
+State table array will contain bytes that encode action and new state:
+
+ 0x80 - bit flag: mark start-of-word position
+ 0x40 - currently unused
+ 0x30 - action mask: 4 values
+ 0x00 - do nothing
+ 0x10 - lowercase one letter
+ 0x20 - lowercase two letters
+ 0x30 - lowercase one, delete one
+ 0x0F - next-state mask
+******************************************************************************/
+
+#include "IrishCasing.h"
+
+#include "nsUnicodeProperties.h"
+#include "nsUnicharUtils.h"
+
+namespace mozilla {
+
+const uint8_t IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
+ // # _ b bh d g h m n [nt]- t ts
+ {0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01,
+ 0x01}, // b
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01,
+ 0x01}, // B
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // [cC]
+ {0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
+ 0x01}, // d
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
+ 0x01}, // [DG]
+ {0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // [fF]
+ {0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
+ 0x01}, // g
+ {0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // h
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x11}, // [lLNrR]
+ {0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // m
+ {0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x11}, // n
+ {0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // [pP]
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B,
+ 0x01}, // [sS]
+ {0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // t
+ {0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // T
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01,
+ 0x11}, // vowel
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11,
+ 0x11}, // Vowel
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09,
+ 0x01}, // hyph
+ {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01}, // letter
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00} // other
+};
+
+#define HYPHEN 0x2010
+#define NO_BREAK_HYPHEN 0x2011
+#define a_ACUTE 0x00e1
+#define e_ACUTE 0x00e9
+#define i_ACUTE 0x00ed
+#define o_ACUTE 0x00f3
+#define u_ACUTE 0x00fa
+#define A_ACUTE 0x00c1
+#define E_ACUTE 0x00c9
+#define I_ACUTE 0x00cd
+#define O_ACUTE 0x00d3
+#define U_ACUTE 0x00da
+
+const uint8_t IrishCasing::sLcClasses[26] = {
+ kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel,
+ kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter,
+ kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel,
+ kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t,
+ kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
+ kClass_letter};
+
+const uint8_t IrishCasing::sUcClasses[26] = {
+ kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel,
+ kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter,
+ kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel,
+ kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T,
+ kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
+ kClass_letter};
+
+uint8_t IrishCasing::GetClass(uint32_t aCh) {
+ using mozilla::unicode::GetGenCategory;
+ if (aCh >= 'a' && aCh <= 'z') {
+ return sLcClasses[aCh - 'a'];
+ } else if (aCh >= 'A' && aCh <= 'Z') {
+ return sUcClasses[aCh - 'A'];
+ } else if (GetGenCategory(aCh) == nsUGenCategory::kLetter) {
+ if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE || aCh == o_ACUTE ||
+ aCh == u_ACUTE) {
+ return kClass_vowel;
+ } else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE ||
+ aCh == O_ACUTE || aCh == U_ACUTE) {
+ return kClass_Vowel;
+ } else {
+ return kClass_letter;
+ }
+ } else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
+ return kClass_hyph;
+ } else {
+ return kClass_other;
+ }
+}
+
+uint32_t IrishCasing::UpperCase(uint32_t aCh, State& aState, bool& aMarkPos,
+ uint8_t& aAction) {
+ uint8_t cls = GetClass(aCh);
+ uint8_t stateEntry = sUppercaseStateTable[cls][aState];
+ aMarkPos = !!(stateEntry & kMarkPositionFlag);
+ aAction = (stateEntry & kActionMask) >> kActionShift;
+ aState = State(stateEntry & kNextStateMask);
+
+ return ToUpperCase(aCh);
+}
+
+} // namespace mozilla