summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/nsUnicodeProperties.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/util/nsUnicodeProperties.h')
-rw-r--r--intl/unicharutil/util/nsUnicodeProperties.h203
1 files changed, 203 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
new file mode 100644
index 0000000000..9e71ecec94
--- /dev/null
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -0,0 +1,203 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=4 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef NS_UNICODEPROPERTIES_H
+#define NS_UNICODEPROPERTIES_H
+
+#include "mozilla/intl/UnicodeProperties.h"
+
+#include "mozilla/Span.h"
+#include "nsBidiUtils.h"
+#include "nsUGenCategory.h"
+#include "harfbuzz/hb.h"
+
+struct nsCharProps2 {  // per-character record handed out by GetCharProps2()
+ // Currently only 4 bits are defined here, so 4 more could be added without
+ // affecting the storage requirements for this struct. Or we could pack two
+ // records per byte, at the cost of a slightly more complex accessor.
+ unsigned char mVertOrient : 2;  // cast to VerticalOrientation by GetVerticalOrientation()
+ unsigned char mIdType : 2;  // cast to IdentifierType by GetIdentifierType()
+};
+
+const nsCharProps2& GetCharProps2(uint32_t aCh);
+
+namespace mozilla {
+
+namespace unicode {
+
+extern const nsUGenCategory sDetailedToGeneralCategory[];
+
+/* This MUST match the values assigned by genUnicodePropertyData.pl!
+ * GetVerticalOrientation() produces these values by casting the 2-bit
+ * nsCharProps2::mVertOrient field, so every enumerator has to stay within
+ * 0..3. The suffixes presumably mirror the Vertical_Orientation property
+ * values U, R, Tu and Tr from UAX #50 -- confirm against the generator. */
+enum VerticalOrientation {
+ VERTICAL_ORIENTATION_U = 0,
+ VERTICAL_ORIENTATION_R = 1,
+ VERTICAL_ORIENTATION_Tu = 2,
+ VERTICAL_ORIENTATION_Tr = 3
+};
+
+/* This MUST match the values assigned by genUnicodePropertyData.pl!
+ * GetPairedBracketType() casts the BidiPairedBracketType integer property
+ * returned by intl::UnicodeProperties::GetIntPropertyValue straight to this
+ * enum, so the numeric values must also agree with what that call returns.
+ * NOTE(review): as the value is read from intl::UnicodeProperties, confirm
+ * whether the genUnicodePropertyData.pl requirement still applies here. */
+enum PairedBracketType {
+ PAIRED_BRACKET_TYPE_NONE = 0,
+ PAIRED_BRACKET_TYPE_OPEN = 1,
+ PAIRED_BRACKET_TYPE_CLOSE = 2
+};
+
+/* Flags for Unicode security IdentifierType.txt attributes. Only a subset
+ of these are currently checked by Gecko, so we only define flags for the
+ ones we need.
+ GetIdentifierType() produces these values by casting the 2-bit
+ nsCharProps2::mIdType field. */
+enum IdentifierType {
+ IDTYPE_RESTRICTED = 0,
+ IDTYPE_ALLOWED = 1,
+};
+
+enum EmojiPresentation { TextOnly = 0, TextDefault = 1, EmojiDefault = 2 };  // result of GetEmojiPresentation()
+
+const uint32_t kVariationSelector15 = 0xFE0E; // U+FE0E: text presentation
+const uint32_t kVariationSelector16 = 0xFE0F; // U+FE0F: emoji presentation
+
+// First and last Unicode values for EMOJI MODIFIER FITZPATRICK TYPE-*
+const uint32_t kEmojiSkinToneFirst = 0x1f3fb;
+const uint32_t kEmojiSkinToneLast = 0x1f3ff;
+
+// Table of harfbuzz general-category values, indexed by the value that
+// intl::UnicodeProperties::CharType() reports (see GetGeneralCategory).
+extern const hb_unicode_general_category_t sICUtoHBcategory[];
+
+// Returns the general category of aCh expressed as one of harfbuzz's
+// HB_UNICODE_GENERAL_CATEGORY_* constants.
+// NOTE: the result is NOT a mozilla::intl::GeneralCategory value; callers
+// that want that enum should call intl::UnicodeProperties::CharType itself.
+inline uint8_t GetGeneralCategory(uint32_t aCh) {
+  const auto charType = intl::UnicodeProperties::CharType(aCh);
+  return sICUtoHBcategory[unsigned(charType)];
+}
+
+// Thin wrapper: forwards aCh to intl::UnicodeProperties::GetNumericValue and
+// returns the result as an int8_t.
+inline int8_t GetNumericValue(uint32_t aCh) {
+  using intl::UnicodeProperties;
+  return UnicodeProperties::GetNumericValue(aCh);
+}
+
+// Returns the LineBreak integer property of aCh, as reported by
+// intl::UnicodeProperties::GetIntPropertyValue, converted to uint8_t.
+inline uint8_t GetLineBreakClass(uint32_t aCh) {
+  using intl::UnicodeProperties;
+  const auto lineBreak = UnicodeProperties::IntProperty::LineBreak;
+  return UnicodeProperties::GetIntPropertyValue(aCh, lineBreak);
+}
+
+// Builds the harfbuzz tag for aScriptCode from the script's short name, as
+// returned by intl::UnicodeProperties::GetScriptShortName.
+inline uint32_t GetScriptTagForCode(intl::Script aScriptCode) {
+  const char* shortName =
+      intl::UnicodeProperties::GetScriptShortName(aScriptCode);
+  if (!shortName) {
+    // No short name available (running with older ICU?): report the UNKNOWN
+    // script tag instead.
+    return HB_SCRIPT_UNKNOWN;
+  }
+  // The first four characters of the short name make up the tag.
+  return HB_TAG(shortName[0], shortName[1], shortName[2], shortName[3]);
+}
+
+// Returns the BidiPairedBracketType integer property of aCh, cast to the
+// PairedBracketType enum.
+inline PairedBracketType GetPairedBracketType(uint32_t aCh) {
+  using intl::UnicodeProperties;
+  const auto bracketType = UnicodeProperties::GetIntPropertyValue(
+      aCh, UnicodeProperties::IntProperty::BidiPairedBracketType);
+  return PairedBracketType(bracketType);
+}
+
+// Maps a lowercase character to its titlecase form. Anything that is not
+// lowercase (uppercase included) is returned unchanged.
+inline uint32_t GetTitlecaseForLower(uint32_t aCh) {
+  if (!intl::UnicodeProperties::IsLowercase(aCh)) {
+    return aCh;
+  }
+  return intl::UnicodeProperties::ToTitle(aCh);
+}
+
+// Maps both uppercase and lowercase characters to titlecase by applying
+// intl::UnicodeProperties::ToTitle unconditionally.
+inline uint32_t GetTitlecaseForAll(uint32_t aCh) {
+  const uint32_t titlecase = intl::UnicodeProperties::ToTitle(aCh);
+  return titlecase;
+}
+
+// Case-folds aCh via intl::UnicodeProperties::FoldCase, except that dotted
+// capital I (U+0130) and dotless small i (U+0131) both fold to plain 'i'.
+// Those two are special-cased because we want a combination of ordinary
+// case-folding rules and Turkish case-folding rules.
+inline uint32_t GetFoldedcase(uint32_t aCh) {
+  switch (aCh) {
+    case 0x0130:
+    case 0x0131:
+      return 'i';
+    default:
+      return intl::UnicodeProperties::FoldCase(aCh);
+  }
+}
+
+// True when aCh has the DefaultIgnorableCodePoint binary property.
+inline bool IsDefaultIgnorable(uint32_t aCh) {
+  using intl::UnicodeProperties;
+  const auto defaultIgnorable =
+      UnicodeProperties::BinaryProperty::DefaultIgnorableCodePoint;
+  return UnicodeProperties::HasBinaryProperty(aCh, defaultIgnorable);
+}
+
+// Classifies aCh from its emoji-related binary properties:
+//   Emoji not set                      -> TextOnly
+//   Emoji and EmojiPresentation set    -> EmojiDefault
+//   Emoji set, EmojiPresentation unset -> TextDefault
+inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) {
+  using intl::UnicodeProperties;
+  using Property = UnicodeProperties::BinaryProperty;
+
+  const bool isEmoji =
+      UnicodeProperties::HasBinaryProperty(aCh, Property::Emoji);
+  if (!isEmoji) {
+    return TextOnly;
+  }
+
+  // EmojiPresentation is only queried once aCh is known to be an emoji.
+  const bool emojiByDefault =
+      UnicodeProperties::HasBinaryProperty(aCh, Property::EmojiPresentation);
+  return emojiByDefault ? EmojiDefault : TextDefault;
+}
+
+// Returns the simplified Gen Category (as defined in nsUGenCategory) for
+// aCh, obtained by looking up the GetGeneralCategory() result in
+// sDetailedToGeneralCategory.
+inline nsUGenCategory GetGenCategory(uint32_t aCh) {
+  const uint8_t detailedCategory = GetGeneralCategory(aCh);
+  return sDetailedToGeneralCategory[detailedCategory];
+}
+
+// Reads the mVertOrient bitfield from the nsCharProps2 record of aCh and
+// returns it as a VerticalOrientation.
+inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
+  const nsCharProps2& charProps = GetCharProps2(aCh);
+  return VerticalOrientation(charProps.mVertOrient);
+}
+
+// Reads the mIdType bitfield from the nsCharProps2 record of aCh and returns
+// it as an IdentifierType.
+inline IdentifierType GetIdentifierType(uint32_t aCh) {
+  const nsCharProps2& charProps = GetCharProps2(aCh);
+  return IdentifierType(charProps.mIdType);
+}
+
+uint32_t GetFullWidth(uint32_t aCh);
+// This is the inverse of GetFullWidth: it guarantees that for every
+// codepoint c, GetFullWidthInverse(GetFullWidth(c)) == c.
+// Note that this function is not guaranteed to convert every wide-form
+// character to a possible narrow form.
+uint32_t GetFullWidthInverse(uint32_t aCh);
+
+// Overload for callers that already have the character's general category.
+bool IsClusterExtender(uint32_t aCh, uint8_t aCategory);
+
+// Convenience overload that looks up the general category itself.
+inline bool IsClusterExtender(uint32_t aCh) {
+  // There are no cluster-extender characters before the first combining-
+  // character block at U+03xx, so bail out early to avoid the cost of
+  // calling GetGeneralCategory for Latin-1 letters etc.
+  if (aCh < 0x0300) {
+    return false;
+  }
+  return IsClusterExtender(aCh, GetGeneralCategory(aCh));
+}
+
+// Overload for callers that already have the character's general category.
+bool IsClusterExtenderExcludingJoiners(uint32_t aCh, uint8_t aCategory);
+
+// Convenience overload that looks up the general category itself. Characters
+// below U+0300 are rejected without calling GetGeneralCategory.
+inline bool IsClusterExtenderExcludingJoiners(uint32_t aCh) {
+  if (aCh < 0x0300) {
+    return false;
+  }
+  return IsClusterExtenderExcludingJoiners(aCh, GetGeneralCategory(aCh));
+}
+
+// Count the number of grapheme clusters in the given string
+uint32_t CountGraphemeClusters(Span<const char16_t> aText);
+
+// Determine whether a character is a "combining diacritic" for the purpose
+// of diacritic-insensitive text search. Examples of such characters include
+// European accents and Hebrew niqqud, but not Hangul components or Thaana
+// vowels, even though Thaana vowels are combining nonspacing marks that could
+// be considered diacritics.
+// As an exception to strictly following Unicode properties, we exclude the
+// Japanese kana voicing marks
+// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
+// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
+// which users report should not be ignored (bug 1624244).
+// See is_combining_diacritic in base_chars.py and is_combining_diacritic.py.
+//
+// TODO: once ICU4X is integrated (replacing ICU4C) as the source of Unicode
+// properties, re-evaluate whether building the static bitset is worthwhile
+// or if we can revert to simply getting the combining class and comparing
+// to the values we care about at runtime.
+bool IsCombiningDiacritic(uint32_t aCh);
+
+// Remove diacritics from a character
+uint32_t GetNaked(uint32_t aCh);
+
+} // end namespace unicode
+
+} // end namespace mozilla
+
+#endif /* NS_UNICODEPROPERTIES_H */