summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/nsUnicodeProperties.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/util/nsUnicodeProperties.cpp')
-rw-r--r--intl/unicharutil/util/nsUnicodeProperties.cpp387
1 files changed, 387 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp
new file mode 100644
index 0000000000..244b7818ca
--- /dev/null
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -0,0 +1,387 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=4 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsUnicodeProperties.h"
+#include "nsUnicodePropertyData.cpp"
+
+#include "mozilla/ArrayUtils.h"
+#include "mozilla/HashTable.h"
+#include "nsCharTraits.h"
+
+#include "unicode/uchar.h"
+#include "unicode/unorm2.h"
+
+#define UNICODE_BMP_LIMIT 0x10000
+#define UNICODE_LIMIT 0x110000
+
+const nsCharProps2& GetCharProps2(uint32_t aCh) {
+ if (aCh < UNICODE_BMP_LIMIT) {
+ return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
+ [aCh & ((1 << kCharProp2CharBits) - 1)];
+ }
+ if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
+ return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
+ [(aCh & 0xffff) >>
+ kCharProp2CharBits]]
+ [aCh & ((1 << kCharProp2CharBits) - 1)];
+ }
+
+ MOZ_ASSERT_UNREACHABLE(
+ "Getting CharProps for codepoint outside Unicode "
+ "range");
+
+ // Default values for unassigned
+ using namespace mozilla::unicode;
+ static const nsCharProps2 undefined = {
+ VERTICAL_ORIENTATION_R,
+ 0 // IdentifierType
+ };
+ return undefined;
+}
+
+namespace mozilla {
+
+namespace unicode {
+
+/*
+To store properties for a million Unicode codepoints compactly, we use
+a three-level array structure, with the Unicode values considered as
+three elements: Plane, Page, and Char.
+
+Space optimization happens because multiple Planes can refer to the same
+Page array, and multiple Pages can refer to the same Char array holding
+the actual values. In practice, most of the higher planes are empty and
+thus share the same data; and within the BMP, there are also many pages
+that repeat the same data for any given property.
+
+Plane is usually zero, so we skip a lookup in this case, and require
+that the Plane 0 pages are always the first set of entries in the Page
+array.
+
+The division of the remaining 16 bits into Page and Char fields is
+adjusted for each property (by experiment using the generation tool)
+to provide the most compact storage, depending on the distribution
+of values.
+*/
+
+const nsUGenCategory sDetailedToGeneralCategory[] = {
+ // clang-format off
+ /*
+ * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
+ * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
+ */
+ /* CONTROL */ nsUGenCategory::kOther,
+ /* FORMAT */ nsUGenCategory::kOther,
+ /* UNASSIGNED */ nsUGenCategory::kOther,
+ /* PRIVATE_USE */ nsUGenCategory::kOther,
+ /* SURROGATE */ nsUGenCategory::kOther,
+ /* LOWERCASE_LETTER */ nsUGenCategory::kLetter,
+ /* MODIFIER_LETTER */ nsUGenCategory::kLetter,
+ /* OTHER_LETTER */ nsUGenCategory::kLetter,
+ /* TITLECASE_LETTER */ nsUGenCategory::kLetter,
+ /* UPPERCASE_LETTER */ nsUGenCategory::kLetter,
+ /* COMBINING_MARK */ nsUGenCategory::kMark,
+ /* ENCLOSING_MARK */ nsUGenCategory::kMark,
+ /* NON_SPACING_MARK */ nsUGenCategory::kMark,
+ /* DECIMAL_NUMBER */ nsUGenCategory::kNumber,
+ /* LETTER_NUMBER */ nsUGenCategory::kNumber,
+ /* OTHER_NUMBER */ nsUGenCategory::kNumber,
+ /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol,
+ /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol,
+ /* MATH_SYMBOL */ nsUGenCategory::kSymbol,
+ /* OTHER_SYMBOL */ nsUGenCategory::kSymbol,
+ /* LINE_SEPARATOR */ nsUGenCategory::kSeparator,
+ /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
+ /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator
+ // clang-format on
+};
+
+const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
+ // clang-format off
+ HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
+ HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
+ HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
+ HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
+ HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
+ HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
+ HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
+ HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
+ HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
+ HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
+ HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
+ HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
+ HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
+ HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
+ HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
+ HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
+ HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
+ HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
+ HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
+ HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
+ HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
+ HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
+ HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
+ HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
+ HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
+ HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
+ // clang-format on
+};
+
+#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \
+ uint32_t Get##prefix_(uint32_t aCh) { \
+ if (aCh >= UNICODE_BMP_LIMIT) { \
+ return aCh; \
+ } \
+ auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
+ auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \
+ uint32_t v = s##prefix_##Values[page][index]; \
+ return v ? v : aCh; \
+ }
+
+// full-width mappings only exist for BMP characters; all others are
+// returned unchanged
+DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
+DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
+
+bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
+ return (
+ (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
+ aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
+ (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
+ (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks
+ (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers
+ (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters
+}
+
+enum HSType {
+ HST_NONE = U_HST_NOT_APPLICABLE,
+ HST_L = U_HST_LEADING_JAMO,
+ HST_V = U_HST_VOWEL_JAMO,
+ HST_T = U_HST_TRAILING_JAMO,
+ HST_LV = U_HST_LV_SYLLABLE,
+ HST_LVT = U_HST_LVT_SYLLABLE
+};
+
+static HSType GetHangulSyllableType(uint32_t aCh) {
+ return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
+}
+
+void ClusterIterator::Next() {
+ if (AtEnd()) {
+ NS_WARNING("ClusterIterator has already reached the end");
+ return;
+ }
+
+ uint32_t ch = *mPos++;
+
+ if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
+ ch = SURROGATE_TO_UCS4(ch, *mPos++);
+ } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
+ (ch >= 0xac00 && ch <= 0xd7ff)) {
+ // Handle conjoining Jamo that make Hangul syllables
+ HSType hangulState = GetHangulSyllableType(ch);
+ while (mPos < mLimit) {
+ ch = *mPos;
+ HSType hangulType = GetHangulSyllableType(ch);
+ switch (hangulType) {
+ case HST_L:
+ case HST_LV:
+ case HST_LVT:
+ if (hangulState == HST_L) {
+ hangulState = hangulType;
+ mPos++;
+ continue;
+ }
+ break;
+ case HST_V:
+ if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
+ (hangulState != HST_LVT)) {
+ hangulState = hangulType;
+ mPos++;
+ continue;
+ }
+ break;
+ case HST_T:
+ if (hangulState != HST_NONE && hangulState != HST_L) {
+ hangulState = hangulType;
+ mPos++;
+ continue;
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ }
+
+ const uint32_t kVS16 = 0xfe0f;
+ const uint32_t kZWJ = 0x200d;
+ // UTF-16 surrogate values for Fitzpatrick type modifiers
+ const uint32_t kFitzpatrickHigh = 0xD83C;
+ const uint32_t kFitzpatrickLowFirst = 0xDFFB;
+ const uint32_t kFitzpatrickLowLast = 0xDFFF;
+
+ bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
+ (GetEmojiPresentation(ch) == TextDefault &&
+ ((mPos < mLimit && *mPos == kVS16) ||
+ (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
+ *(mPos + 1) >= kFitzpatrickLowFirst &&
+ *(mPos + 1) <= kFitzpatrickLowLast)));
+ bool prevWasZwj = false;
+
+ while (mPos < mLimit) {
+ ch = *mPos;
+ size_t chLen = 1;
+
+ // Check for surrogate pairs; note that isolated surrogates will just
+ // be treated as generic (non-cluster-extending) characters here,
+ // which is fine for cluster-iterating purposes
+ if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
+ ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
+ chLen = 2;
+ }
+
+ bool extendCluster =
+ IsClusterExtender(ch) ||
+ (baseIsEmoji && prevWasZwj &&
+ ((GetEmojiPresentation(ch) == EmojiDefault) ||
+ (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
+ *(mPos + chLen) == kVS16)));
+ if (!extendCluster) {
+ break;
+ }
+
+ prevWasZwj = (ch == kZWJ);
+ mPos += chLen;
+ }
+
+ NS_ASSERTION(mText < mPos && mPos <= mLimit,
+ "ClusterIterator::Next has overshot the string!");
+}
+
+void ClusterReverseIterator::Next() {
+ if (AtEnd()) {
+ NS_WARNING("ClusterReverseIterator has already reached the end");
+ return;
+ }
+
+ uint32_t ch;
+ do {
+ ch = *--mPos;
+
+ if (mPos > mLimit && NS_IS_SURROGATE_PAIR(*(mPos - 1), ch)) {
+ ch = SURROGATE_TO_UCS4(*--mPos, ch);
+ }
+
+ if (!IsClusterExtender(ch)) {
+ break;
+ }
+ } while (mPos > mLimit);
+
+ // XXX May need to handle conjoining Jamo
+
+ NS_ASSERTION(mPos >= mLimit,
+ "ClusterReverseIterator::Next has overshot the string!");
+}
+
+uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
+ ClusterIterator iter(aText, aLength);
+ uint32_t result = 0;
+ while (!iter.AtEnd()) {
+ ++result;
+ iter.Next();
+ }
+ return result;
+}
+
+uint32_t GetNaked(uint32_t aCh) {
+ using namespace mozilla;
+
+ static const UNormalizer2* normalizer;
+ static HashMap<uint32_t, uint32_t> nakedCharCache;
+
+ NS_ASSERTION(!IsCombiningDiacritic(aCh),
+ "This character needs to be skipped");
+
+ HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh);
+ if (entry.found()) {
+ return entry->value();
+ }
+
+ UErrorCode error = U_ZERO_ERROR;
+ if (!normalizer) {
+ normalizer = unorm2_getNFDInstance(&error);
+ if (U_FAILURE(error)) {
+ return aCh;
+ }
+ }
+
+ static const size_t MAX_DECOMPOSITION_SIZE = 16;
+ UChar decomposition[MAX_DECOMPOSITION_SIZE];
+ UChar* combiners;
+ int32_t decompositionLen;
+ uint32_t baseChar, nextChar;
+ decompositionLen = unorm2_getDecomposition(normalizer, aCh, decomposition,
+ MAX_DECOMPOSITION_SIZE, &error);
+ if (decompositionLen < 1) {
+ // The character does not decompose.
+ return aCh;
+ }
+
+ if (NS_IS_HIGH_SURROGATE(decomposition[0])) {
+ baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]);
+ combiners = decomposition + 2;
+ } else {
+ baseChar = decomposition[0];
+ combiners = decomposition + 1;
+ }
+
+ if (IS_IN_BMP(baseChar) != IS_IN_BMP(aCh)) {
+ // Mappings that would change the length of a UTF-16 string are not
+ // currently supported.
+ baseChar = aCh;
+ goto cache;
+ }
+
+ if (decompositionLen > 1) {
+ if (NS_IS_HIGH_SURROGATE(combiners[0])) {
+ nextChar = SURROGATE_TO_UCS4(combiners[0], combiners[1]);
+ } else {
+ nextChar = combiners[0];
+ }
+ if (!IsCombiningDiacritic(nextChar)) {
+ // Hangul syllables decompose but do not actually have diacritics.
+ // This also excludes decompositions with the Japanese marks U+3099 and
+ // U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND MARK), which
+ // we should not ignore for searching (bug 1624244).
+ baseChar = aCh;
+ }
+ }
+
+cache:
+ if (!nakedCharCache.putNew(aCh, baseChar)) {
+ // We're out of memory, so delete the cache to free some up.
+ nakedCharCache.clearAndCompact();
+ }
+
+ return baseChar;
+}
+
+} // end namespace unicode
+
+} // end namespace mozilla