summary refs log tree commit diff stats
path: root/intl/unicharutil/util/nsUnicodeProperties.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- intl/unicharutil/util/nsUnicodeProperties.h 283
1 files changed, 283 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
new file mode 100644
index 0000000000..11566cbffc
--- /dev/null
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -0,0 +1,283 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=4 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef NS_UNICODEPROPERTIES_H
+#define NS_UNICODEPROPERTIES_H
+
+#include "nsBidiUtils.h"
+#include "nsUGenCategory.h"
+#include "nsUnicodeScriptCodes.h"
+#include "harfbuzz/hb.h"
+
+#include "unicode/uchar.h"
+#include "unicode/uscript.h"
+
+const nsCharProps2& GetCharProps2(uint32_t aCh);  // Per-code-point property record (its mVertOrient and mIdType fields are read further down); declared at global scope, defined outside this header.
+
+namespace mozilla {
+
+namespace unicode {
+
+extern const nsUGenCategory sDetailedToGeneralCategory[];  // Lookup table indexed by the value GetGeneralCategory() returns (see GetGenCategory); defined outside this header.
+
+/* Result type of GetVerticalOrientation(), which casts nsCharProps2::mVertOrient to it. This MUST match the values assigned by genUnicodePropertyData.pl! */
+enum VerticalOrientation {  // NOTE(review): U/R/Tu/Tr presumably mirror the UAX #50 Vertical_Orientation values -- confirm against the generator script.
+ VERTICAL_ORIENTATION_U = 0,
+ VERTICAL_ORIENTATION_R = 1,
+ VERTICAL_ORIENTATION_Tu = 2,
+ VERTICAL_ORIENTATION_Tr = 3
+};
+
+/* This MUST match the values assigned by genUnicodePropertyData.pl! GetPairedBracketType() also casts ICU's UCHAR_BIDI_PAIRED_BRACKET_TYPE property value straight to this enum, so the numbering must line up with ICU's as well. */
+enum PairedBracketType {
+ PAIRED_BRACKET_TYPE_NONE = 0,
+ PAIRED_BRACKET_TYPE_OPEN = 1,
+ PAIRED_BRACKET_TYPE_CLOSE = 2
+};
+
+/* Flags for Unicode security IdentifierType.txt attributes. Only a subset
+ of these are currently checked by Gecko, so we only define flags for the
+ ones we need. GetIdentifierType() produces these by casting nsCharProps2::mIdType. */
+enum IdentifierType {
+ IDTYPE_RESTRICTED = 0,
+ IDTYPE_ALLOWED = 1,
+};
+
+enum EmojiPresentation { TextOnly = 0, TextDefault = 1, EmojiDefault = 2 };  // Result of GetEmojiPresentation(): no Emoji property / Emoji without Emoji_Presentation / Emoji_Presentation set.
+
+const uint32_t kVariationSelector15 = 0xFE0E; // U+FE0E: variation selector requesting text presentation
+const uint32_t kVariationSelector16 = 0xFE0F; // U+FE0F: variation selector requesting emoji presentation
+
+// First and last Unicode values of the EMOJI MODIFIER FITZPATRICK TYPE-* (skin tone) characters
+const uint32_t kEmojiSkinToneFirst = 0x1f3fb;  // U+1F3FB
+const uint32_t kEmojiSkinToneLast = 0x1f3ff;  // U+1F3FF
+
+extern const hb_unicode_general_category_t sICUtoHBcategory[];  // Maps an ICU u_charType() result to a HarfBuzz general category (indexed in GetGeneralCategory); defined outside this header.
+
+// Returns whatever ICU's u_charMirror() reports for aCh, converted to an
+// unsigned code point value.
+inline uint32_t GetMirroredChar(uint32_t aCh) {
+  return static_cast<uint32_t>(u_charMirror(aCh));
+}
+
+// Reports ICU's u_isMirrored() result for aCh as a bool.
+inline bool HasMirroredChar(uint32_t aCh) {
+  const bool mirrored = u_isMirrored(aCh);
+  return mirrored;
+}
+
+// Returns the combining class ICU's u_getCombiningClass() reports for aCh.
+inline uint8_t GetCombiningClass(uint32_t aCh) {
+  const uint8_t combiningClass = u_getCombiningClass(aCh);
+  return combiningClass;
+}
+
+// Returns the general category of aCh. The ICU u_charType() value is
+// translated through sICUtoHBcategory, so the result is a HarfBuzz
+// hb_unicode_general_category_t value narrowed to uint8_t.
+inline uint8_t GetGeneralCategory(uint32_t aCh) {
+  const auto icuCategory = u_charType(aCh);
+  return static_cast<uint8_t>(sICUtoHBcategory[icuCategory]);
+}
+
+// Returns ICU's u_charDirection() value for aCh, cast to nsCharType.
+inline nsCharType GetBidiCat(uint32_t aCh) {
+  const auto direction = u_charDirection(aCh);
+  return nsCharType(direction);
+}
+
+// Returns the numeric value of aCh when ICU classifies its numeric type as
+// decimal or digit, and -1 for every other character.
+inline int8_t GetNumericValue(uint32_t aCh) {
+  const auto numericType =
+      UNumericType(u_getIntPropertyValue(aCh, UCHAR_NUMERIC_TYPE));
+  if (numericType != U_NT_DECIMAL && numericType != U_NT_DIGIT) {
+    return -1;
+  }
+  return int8_t(u_getNumericValue(aCh));
+}
+
+// Returns ICU's UCHAR_LINE_BREAK property value for aCh, narrowed to uint8_t.
+inline uint8_t GetLineBreakClass(uint32_t aCh) {
+  const auto lineBreak = u_getIntPropertyValue(aCh, UCHAR_LINE_BREAK);
+  return static_cast<uint8_t>(lineBreak);
+}
+
+// Returns the script ICU's uscript_getScript() assigns to aCh, cast to
+// Script. The ICU error status is not inspected.
+inline Script GetScriptCode(uint32_t aCh) {
+  UErrorCode status = U_ZERO_ERROR;
+  const auto icuScript = uscript_getScript(aCh, &status);
+  return Script(icuScript);
+}
+
+// Returns ICU's uscript_hasScript() answer for aCh and aScript.
+inline bool HasScript(uint32_t aCh, Script aScript) {
+  const auto icuScript = UScriptCode(aScript);
+  return uscript_hasScript(aCh, icuScript);
+}
+
+// Builds a HarfBuzz tag from the first four characters of the short name
+// ICU reports for aScriptCode.
+inline uint32_t GetScriptTagForCode(Script aScriptCode) {
+  const char* shortName = uscript_getShortName(UScriptCode(aScriptCode));
+  if (!shortName) {
+    // return UNKNOWN script tag (running with older ICU?)
+    return HB_SCRIPT_UNKNOWN;
+  }
+  return HB_TAG(shortName[0], shortName[1], shortName[2], shortName[3]);
+}
+
+// Returns ICU's UCHAR_BIDI_PAIRED_BRACKET_TYPE property value for aCh, cast
+// directly to PairedBracketType.
+inline PairedBracketType GetPairedBracketType(uint32_t aCh) {
+  const auto bracketType =
+      u_getIntPropertyValue(aCh, UCHAR_BIDI_PAIRED_BRACKET_TYPE);
+  return PairedBracketType(bracketType);
+}
+
+// Returns whatever ICU's u_getBidiPairedBracket() reports for aCh, converted
+// to an unsigned code point value.
+inline uint32_t GetPairedBracket(uint32_t aCh) {
+  return static_cast<uint32_t>(u_getBidiPairedBracket(aCh));
+}
+
+// Returns ICU's u_toupper() mapping of aCh.
+inline uint32_t GetUppercase(uint32_t aCh) {
+  return static_cast<uint32_t>(u_toupper(aCh));
+}
+
+// Returns ICU's u_tolower() mapping of aCh.
+inline uint32_t GetLowercase(uint32_t aCh) {
+  return static_cast<uint32_t>(u_tolower(aCh));
+}
+
+// Maps a lowercase character to its titlecase form; anything ICU does not
+// consider lowercase (uppercase included) is returned unchanged.
+inline uint32_t GetTitlecaseForLower(uint32_t aCh) {
+  if (!u_isULowercase(aCh)) {
+    return aCh;
+  }
+  return u_totitle(aCh);
+}
+
+// Maps both uppercase and lowercase characters to titlecase by applying
+// ICU's u_totitle() unconditionally.
+inline uint32_t GetTitlecaseForAll(uint32_t aCh) {
+  return static_cast<uint32_t>(u_totitle(aCh));
+}
+
+// Case-folds aCh with ICU's default folding, except that dotted capital I
+// (U+0130) and dotless small i (U+0131) both fold to plain 'i'. Those two
+// are special-cased because we want a combination of the ordinary and the
+// Turkish case-folding rules.
+inline uint32_t GetFoldedcase(uint32_t aCh) {
+  switch (aCh) {
+    case 0x0130:
+    case 0x0131:
+      return 'i';
+    default:
+      return u_foldCase(aCh, U_FOLD_CASE_DEFAULT);
+  }
+}
+
+// True when the East Asian Width of aCh is Fullwidth or Halfwidth, or is
+// Wide and aCh does not have the Emoji property. Every other width value
+// yields false.
+inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) {
+  switch (u_getIntPropertyValue(aCh, UCHAR_EAST_ASIAN_WIDTH)) {
+    case U_EA_WIDE:
+      // Wide characters count only when they are not emoji.
+      return !u_hasBinaryProperty(aCh, UCHAR_EMOJI);
+    case U_EA_FULLWIDTH:
+    case U_EA_HALFWIDTH:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True when the East Asian Width of aCh is Ambiguous, Fullwidth or Wide;
+// false for every other width value.
+inline bool IsEastAsianWidthAFW(uint32_t aCh) {
+  bool matches;
+  switch (u_getIntPropertyValue(aCh, UCHAR_EAST_ASIAN_WIDTH)) {
+    case U_EA_AMBIGUOUS:
+    case U_EA_FULLWIDTH:
+    case U_EA_WIDE:
+      matches = true;
+      break;
+    default:
+      matches = false;
+      break;
+  }
+  return matches;
+}
+
+// True when ICU reports the Default_Ignorable_Code_Point property for aCh.
+inline bool IsDefaultIgnorable(uint32_t aCh) {
+  const bool ignorable =
+      u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
+  return ignorable;
+}
+
+// Classifies aCh by its emoji properties:
+//   no Emoji property                 -> TextOnly
+//   Emoji and Emoji_Presentation      -> EmojiDefault
+//   Emoji without Emoji_Presentation  -> TextDefault
+inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) {
+  if (u_hasBinaryProperty(aCh, UCHAR_EMOJI)) {
+    return u_hasBinaryProperty(aCh, UCHAR_EMOJI_PRESENTATION) ? EmojiDefault
+                                                              : TextDefault;
+  }
+  return TextOnly;
+}
+
+// Returns the simplified general category (as defined in nsUGenCategory)
+// for aCh, looked up in sDetailedToGeneralCategory by the detailed category
+// that GetGeneralCategory() reports.
+inline nsUGenCategory GetGenCategory(uint32_t aCh) {
+  const uint8_t detailedCategory = GetGeneralCategory(aCh);
+  return sDetailedToGeneralCategory[detailedCategory];
+}
+
+// Returns the mVertOrient field of aCh's nsCharProps2 record, cast to
+// VerticalOrientation.
+inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
+  const nsCharProps2& props = GetCharProps2(aCh);
+  return VerticalOrientation(props.mVertOrient);
+}
+
+// Returns the mIdType field of aCh's nsCharProps2 record, cast to
+// IdentifierType.
+inline IdentifierType GetIdentifierType(uint32_t aCh) {
+  const nsCharProps2& props = GetCharProps2(aCh);
+  return IdentifierType(props.mIdType);
+}
+
+uint32_t GetFullWidth(uint32_t aCh);  // Declared only; defined outside this header.
+// This is the reverse function of GetFullWidth, which guarantees that
+// for every codepoint c, GetFullWidthInverse(GetFullWidth(c)) == c.
+// Note that this function is not guaranteed to convert every wide-form
+// character to a possible narrow form.
+uint32_t GetFullWidthInverse(uint32_t aCh);
+
+bool IsClusterExtender(uint32_t aCh, uint8_t aCategory);  // aCategory: general category of aCh as returned by GetGeneralCategory() (that is what the one-argument overload passes); defined outside this header.
+
+// Convenience overload: looks up the general category of aCh itself and
+// forwards to the two-argument IsClusterExtender().
+inline bool IsClusterExtender(uint32_t aCh) {
+  const uint8_t category = GetGeneralCategory(aCh);
+  return IsClusterExtender(aCh, category);
+}
+
+// A simple iterator for a string of char16_t codepoints that advances
+// by Unicode grapheme clusters. It stores pointers into aText rather than copying it.
+class ClusterIterator {
+ public:
+ ClusterIterator(const char16_t* aText, uint32_t aLength)  // aLength counts char16_t elements (it is added to aText as a pointer offset)
+ : mPos(aText),
+ mLimit(aText + aLength)
+#ifdef DEBUG
+ ,
+ mText(aText)
+#endif
+ {
+ }
+
+ operator const char16_t*() const { return mPos; }  // yields the current position
+
+ bool AtEnd() const { return mPos >= mLimit; }  // true once the position has reached or passed mLimit
+
+ void Next();  // declared only; defined outside this header
+
+ private:
+ const char16_t* mPos;  // current position; starts at aText
+ const char16_t* mLimit;  // aText + aLength, i.e. one past the last element
+#ifdef DEBUG
+ const char16_t* mText;  // start of the string; exists only in DEBUG builds
+#endif
+};
+
+// Count the number of grapheme clusters in the given string (declared only; defined outside this header)
+uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
+
+// Decides whether a character counts as a "combining diacritic" for
+// diacritic-insensitive text search. European accents and Hebrew niqqud are
+// examples of such characters; Hangul components and Thaana vowels are not,
+// even though Thaana vowels are combining nonspacing marks that could be
+// considered diacritics.
+// We deliberately deviate from the Unicode properties for the Japanese kana
+// voicing marks
+// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
+// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
+// because users report that these should not be ignored (bug 1624244).
+inline bool IsCombiningDiacritic(uint32_t aCh) {
+  const uint8_t combiningClass = u_getCombiningClass(aCh);
+  // Two combining classes are excluded: "not reordered" and kana voicing.
+  const bool excluded =
+      combiningClass == HB_UNICODE_COMBINING_CLASS_NOT_REORDERED ||
+      combiningClass == HB_UNICODE_COMBINING_CLASS_KANA_VOICING;
+  return !excluded;
+}
+
+// Remove diacritics from a character (declared only; defined outside this header)
+uint32_t GetNaked(uint32_t aCh);
+
+// A simple reverse iterator for a string of char16_t codepoints that
+// advances by Unicode grapheme clusters. It stores pointers into aText rather than copying it.
+class ClusterReverseIterator {
+ public:
+ ClusterReverseIterator(const char16_t* aText, uint32_t aLength)  // aLength counts char16_t elements (it is added to aText as a pointer offset)
+ : mPos(aText + aLength), mLimit(aText) {}
+
+ operator const char16_t*() const { return mPos; }  // yields the current position
+
+ bool AtEnd() const { return mPos <= mLimit; }  // true once the position has reached or passed the start of the string
+
+ void Next();  // declared only; defined outside this header
+
+ private:
+ const char16_t* mPos;  // current position; starts at aText + aLength (the end)
+ const char16_t* mLimit;  // aText, the start of the string
+};
+
+} // end namespace unicode
+
+} // end namespace mozilla
+
+#endif /* NS_UNICODEPROPERTIES_H */