diff options
Diffstat (limited to 'intl/unicharutil/util/IrishCasing.cpp')
-rw-r--r-- | intl/unicharutil/util/IrishCasing.cpp | 270 |
1 files changed, 270 insertions, 0 deletions
diff --git a/intl/unicharutil/util/IrishCasing.cpp b/intl/unicharutil/util/IrishCasing.cpp new file mode 100644 index 0000000000..566d9c38ae --- /dev/null +++ b/intl/unicharutil/util/IrishCasing.cpp @@ -0,0 +1,270 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/****************************************************************************** + +This file provides a finite state machine to support Irish Gaelic uppercasing +rules. + +The caller will need to iterate through a string, passing a State variable +along with the current character to each UpperCase call and checking the flags +that are returned: + + If aMarkPos is true, caller must remember the current index in the string as + a possible target for a future action. + + If aAction is non-zero, then one or more characters from the marked index are + to be modified: + 1 lowercase the marked letter + 2 lowercase the marked letter and its successor + 3 lowercase the marked letter, and delete its successor + + +### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639, +### comments 1 and 4: + +v = [a,á,e,é,i,í,o,ó,u,ú] +V = [A,Á,E,É,I,Í,O,Ó,U,Ú] + +bhf -> bhF +bhF -> bhF +bp -> bP +bP -> bP +dt -> dT +dT -> dT +gc -> gC +gC -> gC +h{V} -> h{V} +mb -> mB +mB -> mB +n-{v} -> n{V} +n{V} -> n{V} +nd -> nD +nD -> nD +ng -> nG +nG -> nG +t-{v} -> t{V} +t{V} -> t{V} +ts{v} -> tS{V} +tS{v} -> tS{V} +tS{V} -> tS{V} +tsl -> tSL +tSl -> tSL +tSL -> tSL +tsn -> tSN +tSn -> tSN +tSN -> tSN +tsr -> tSR +tSr -> tSR +tSR -> tSR + +### Create table of states and actions for each input class. + +Start (non-word) state is #; generic in-word state is _, once we know there's +no special action to do in this word. 

            #    _    b    bh   d    g    h    m    n    n-   t    t-   ts
input\state
b           b'   _    _    _    _    _    _    1    _    _    _    _    _
B           _    _    _    _    _    _    _    1    _    _    _    _    _
c           _    _    _    _    _    1    _    _    _    _    _    _    _
C           _    _    _    _    _    1    _    _    _    _    _    _    _
d           d'   _    _    _    _    _    _    _    1    _    _    _    _
D           _    _    _    _    _    _    _    _    1    _    _    _    _
f           _    _    _    2    _    _    _    _    _    _    _    _    _
F           _    _    _    2    _    _    _    _    _    _    _    _    _
g           g'   _    _    _    _    _    _    _    1    _    _    _    _
G           _    _    _    _    _    _    _    _    1    _    _    _    _
h           h'   _    bh   _    _    _    _    _    _    _    _    _    _
l           _    _    _    _    _    _    _    _    _    _    _    _    1
L           _    _    _    _    _    _    _    _    _    _    _    _    1
m           m'   _    _    _    _    _    _    _    _    _    _    _    _
n           n'   _    _    _    _    _    _    _    _    _    _    _    1
N           _    _    _    _    _    _    _    _    _    _    _    _    1
p           _    _    1    _    _    _    _    _    _    _    _    _    _
P           _    _    1    _    _    _    _    _    _    _    _    _    _
r           _    _    _    _    _    _    _    _    _    _    _    _    1
R           _    _    _    _    _    _    _    _    _    _    _    _    1
s           _    _    _    _    _    _    _    _    _    _    ts   _    _
S           _    _    _    _    _    _    _    _    _    _    ts   _    _
t           t'   _    _    _    1    _    _    _    _    _    _    _    _
T           _    _    _    _    1    _    _    _    _    _    _    _    _
vowel       _    _    _    _    _    _    _    _    _    1d   _    1d   1
Vowel       _    _    _    _    _    _    1    _    1    _    1    _    1
hyph        _    _    _    _    _    _    _    _    n-   _    t-   _    _
letter      _    _    _    _    _    _    _    _    _    _    _    _    _
other       #    #    #    #    #    #    #    #    #    #    #    #    #

Actions:
  1            lowercase one letter at start of word
  2            lowercase two letters at start of word
  1d           lowercase one letter at start of word, and delete next
               (and then go to state _, nothing further to do in this word)

else just go to the given state; suffix ' indicates mark start-of-word.

### Consolidate identical states and classes:

            0      1      2      3      4      5      6      7      8      9      A      B
            #      _      b      bh     d      g      h      m      n      [nt]-  t      ts
input\state
b           b'     _      _      _      _      _      _      1      _      _      _      _
B           _      _      _      _      _      _      _      1      _      _      _      _
[cC]        _      _      _      _      _      1      _      _      _      _      _      _
d           d'     _      _      _      _      _      _      _      1      _      _      _
[DG]        _      _      _      _      _      _      _      _      1      _      _      _
[fF]        _      _      _      2      _      _      _      _      _      _      _      _
g           g'     _      _      _      _      _      _      _      1      _      _      _
h           h'     _      bh     _      _      _      _      _      _      _      _      _
[lLNrR]     _      _      _      _      _      _      _      _      _      _      _      1
m           m'     _      _      _      _      _      _      _      _      _      _      _
n           n'     _      _      _      _      _      _      _      _      _      _      1
[pP]        _      _      1      _      _      _      _      _      _      _      _      _
[sS]        _      _      _      _      _      _      _      _      _      _      ts     _
t           t'     _      _      _      1      _      _      _      _      _      _      _
T           _      _      _      _      1      _      _      _      _      _      _      _
vowel       _      _      _      _      _      _      _      _      _      1d     _      1
Vowel       _      _      _      _      _      _      1      _      1      _      1      1
hyph        _      _      _      _      _      _      _      _      [nt]-  _      [nt]-  _
letter      _      _      _      _      _      _      _      _      _      _      _      _
other       #      #      #      #      #      #      #      #      #      #      #      #

So we have 20 input classes, and 12 states.
+ +State table array will contain bytes that encode action and new state: + + 0x80 - bit flag: mark start-of-word position + 0x40 - currently unused + 0x30 - action mask: 4 values + 0x00 - do nothing + 0x10 - lowercase one letter + 0x20 - lowercase two letters + 0x30 - lowercase one, delete one + 0x0F - next-state mask +******************************************************************************/ + +#include "IrishCasing.h" + +#include "nsUnicodeProperties.h" +#include "nsUnicharUtils.h" + +namespace mozilla { + +const uint8_t IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = { + // # _ b bh d g h m n [nt]- t ts + {0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, + 0x01}, // b + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, + 0x01}, // B + {0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // [cC] + {0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, + 0x01}, // d + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, + 0x01}, // [DG] + {0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // [fF] + {0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, + 0x01}, // g + {0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // h + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x11}, // [lLNrR] + {0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // m + {0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x11}, // n + {0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // [pP] + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, + 0x01}, // [sS] + {0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // t + {0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // T + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, + 0x11}, // vowel + {0x01, 
0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, + 0x11}, // Vowel + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, + 0x01}, // hyph + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01}, // letter + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00} // other +}; + +#define HYPHEN 0x2010 +#define NO_BREAK_HYPHEN 0x2011 +#define a_ACUTE 0x00e1 +#define e_ACUTE 0x00e9 +#define i_ACUTE 0x00ed +#define o_ACUTE 0x00f3 +#define u_ACUTE 0x00fa +#define A_ACUTE 0x00c1 +#define E_ACUTE 0x00c9 +#define I_ACUTE 0x00cd +#define O_ACUTE 0x00d3 +#define U_ACUTE 0x00da + +const uint8_t IrishCasing::sLcClasses[26] = { + kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel, + kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter, + kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel, + kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t, + kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, + kClass_letter}; + +const uint8_t IrishCasing::sUcClasses[26] = { + kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel, + kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter, + kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel, + kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T, + kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, + kClass_letter}; + +uint8_t IrishCasing::GetClass(uint32_t aCh) { + using mozilla::unicode::GetGenCategory; + if (aCh >= 'a' && aCh <= 'z') { + return sLcClasses[aCh - 'a']; + } + + if (aCh >= 'A' && aCh <= 'Z') { + return sUcClasses[aCh - 'A']; + } + + if (GetGenCategory(aCh) == nsUGenCategory::kLetter) { + if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE || aCh == o_ACUTE || + aCh == u_ACUTE) { + return kClass_vowel; + } + + if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE || aCh == O_ACUTE || + aCh == U_ACUTE) { + return kClass_Vowel; + } + + return 
kClass_letter; + } + + if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) { + return kClass_hyph; + } + + return kClass_other; +} + +uint32_t IrishCasing::UpperCase(uint32_t aCh, State& aState, bool& aMarkPos, + uint8_t& aAction) { + uint8_t cls = GetClass(aCh); + uint8_t stateEntry = sUppercaseStateTable[cls][aState]; + aMarkPos = !!(stateEntry & kMarkPositionFlag); + aAction = (stateEntry & kActionMask) >> kActionShift; + aState = State(stateEntry & kNextStateMask); + + return ToUpperCase(aCh); +} + +} // namespace mozilla |