diff options
Diffstat (limited to '')
-rw-r--r-- | intl/unicharutil/util/nsUnicodeProperties.cpp | 387 |
1 files changed, 387 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp new file mode 100644 index 0000000000..244b7818ca --- /dev/null +++ b/intl/unicharutil/util/nsUnicodeProperties.cpp @@ -0,0 +1,387 @@ +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=4 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsUnicodeProperties.h" +#include "nsUnicodePropertyData.cpp" + +#include "mozilla/ArrayUtils.h" +#include "mozilla/HashTable.h" +#include "nsCharTraits.h" + +#include "unicode/uchar.h" +#include "unicode/unorm2.h" + +#define UNICODE_BMP_LIMIT 0x10000 +#define UNICODE_LIMIT 0x110000 + +const nsCharProps2& GetCharProps2(uint32_t aCh) { + if (aCh < UNICODE_BMP_LIMIT) { + return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]] + [aCh & ((1 << kCharProp2CharBits) - 1)]; + } + if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) { + return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]] + [(aCh & 0xffff) >> + kCharProp2CharBits]] + [aCh & ((1 << kCharProp2CharBits) - 1)]; + } + + MOZ_ASSERT_UNREACHABLE( + "Getting CharProps for codepoint outside Unicode " + "range"); + + // Default values for unassigned + using namespace mozilla::unicode; + static const nsCharProps2 undefined = { + VERTICAL_ORIENTATION_R, + 0 // IdentifierType + }; + return undefined; +} + +namespace mozilla { + +namespace unicode { + +/* +To store properties for a million Unicode codepoints compactly, we use +a three-level array structure, with the Unicode values considered as +three elements: Plane, Page, and Char. + +Space optimization happens because multiple Planes can refer to the same +Page array, and multiple Pages can refer to the same Char array holding +the actual values. In practice, most of the higher planes are empty and +thus share the same data; and within the BMP, there are also many pages +that repeat the same data for any given property. + +Plane is usually zero, so we skip a lookup in this case, and require +that the Plane 0 pages are always the first set of entries in the Page +array. + +The division of the remaining 16 bits into Page and Char fields is +adjusted for each property (by experiment using the generation tool) +to provide the most compact storage, depending on the distribution +of values. +*/ + +const nsUGenCategory sDetailedToGeneralCategory[] = { + // clang-format off + /* + * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants + * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h. + */ + /* CONTROL */ nsUGenCategory::kOther, + /* FORMAT */ nsUGenCategory::kOther, + /* UNASSIGNED */ nsUGenCategory::kOther, + /* PRIVATE_USE */ nsUGenCategory::kOther, + /* SURROGATE */ nsUGenCategory::kOther, + /* LOWERCASE_LETTER */ nsUGenCategory::kLetter, + /* MODIFIER_LETTER */ nsUGenCategory::kLetter, + /* OTHER_LETTER */ nsUGenCategory::kLetter, + /* TITLECASE_LETTER */ nsUGenCategory::kLetter, + /* UPPERCASE_LETTER */ nsUGenCategory::kLetter, + /* COMBINING_MARK */ nsUGenCategory::kMark, + /* ENCLOSING_MARK */ nsUGenCategory::kMark, + /* NON_SPACING_MARK */ nsUGenCategory::kMark, + /* DECIMAL_NUMBER */ nsUGenCategory::kNumber, + /* LETTER_NUMBER */ nsUGenCategory::kNumber, + /* OTHER_NUMBER */ nsUGenCategory::kNumber, + /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation, + /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol, + /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol, + /* MATH_SYMBOL */ nsUGenCategory::kSymbol, + /* OTHER_SYMBOL */ nsUGenCategory::kSymbol, + /* LINE_SEPARATOR */ nsUGenCategory::kSeparator, + /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator, + /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator + // clang-format on +}; + +const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = { + // clang-format off + HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0, + HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1, + HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2, + HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3, + HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4, + HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5, + HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6, + HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7, + HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8, + HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9, + HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10, + HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11, + HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12, + HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13, + HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14, + HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15, + HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16, + HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17, + HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18, + HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19, + HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20, + HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21, + HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22, + HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23, + HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24, + HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25, + HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26, + HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27, + HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28, + HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29, + // clang-format on +}; + +#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \ + uint32_t Get##prefix_(uint32_t aCh) { \ + if (aCh >= UNICODE_BMP_LIMIT) { \ + return aCh; \ + } \ + auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \ + auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \ + uint32_t v = s##prefix_##Values[page][index]; \ + return v ? v : aCh; \ + } + +// full-width mappings only exist for BMP characters; all others are +// returned unchanged +DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth) +DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse) + +bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) { + return ( + (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && + aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || + (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ + (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks + (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers + (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters +} + +enum HSType { + HST_NONE = U_HST_NOT_APPLICABLE, + HST_L = U_HST_LEADING_JAMO, + HST_V = U_HST_VOWEL_JAMO, + HST_T = U_HST_TRAILING_JAMO, + HST_LV = U_HST_LV_SYLLABLE, + HST_LVT = U_HST_LVT_SYLLABLE +}; + +static HSType GetHangulSyllableType(uint32_t aCh) { + return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE)); +} + +void ClusterIterator::Next() { + if (AtEnd()) { + NS_WARNING("ClusterIterator has already reached the end"); + return; + } + + uint32_t ch = *mPos++; + + if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) { + ch = SURROGATE_TO_UCS4(ch, *mPos++); + } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || + (ch >= 0xac00 && ch <= 0xd7ff)) { + // Handle conjoining Jamo that make Hangul syllables + HSType hangulState = GetHangulSyllableType(ch); + while (mPos < mLimit) { + ch = *mPos; + HSType hangulType = GetHangulSyllableType(ch); + switch (hangulType) { + case HST_L: + case HST_LV: + case HST_LVT: + if (hangulState == HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_V: + if ((hangulState != HST_NONE) && (hangulState != HST_T) && + (hangulState != HST_LVT)) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_T: + if (hangulState != HST_NONE && hangulState != HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + default: + break; + } + break; + } + } + + const uint32_t kVS16 = 0xfe0f; + const uint32_t kZWJ = 0x200d; + // UTF-16 surrogate values for Fitzpatrick type modifiers + const uint32_t kFitzpatrickHigh = 0xD83C; + const uint32_t kFitzpatrickLowFirst = 0xDFFB; + const uint32_t kFitzpatrickLowLast = 0xDFFF; + + bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) || + (GetEmojiPresentation(ch) == TextDefault && + ((mPos < mLimit && *mPos == kVS16) || + (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh && + *(mPos + 1) >= kFitzpatrickLowFirst && + *(mPos + 1) <= kFitzpatrickLowLast))); + bool prevWasZwj = false; + + while (mPos < mLimit) { + ch = *mPos; + size_t chLen = 1; + + // Check for surrogate pairs; note that isolated surrogates will just + // be treated as generic (non-cluster-extending) characters here, + // which is fine for cluster-iterating purposes + if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) { + ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); + chLen = 2; + } + + bool extendCluster = + IsClusterExtender(ch) || + (baseIsEmoji && prevWasZwj && + ((GetEmojiPresentation(ch) == EmojiDefault) || + (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit && + *(mPos + chLen) == kVS16))); + if (!extendCluster) { + break; + } + + prevWasZwj = (ch == kZWJ); + mPos += chLen; + } + + NS_ASSERTION(mText < mPos && mPos <= mLimit, + "ClusterIterator::Next has overshot the string!"); +} + +void ClusterReverseIterator::Next() { + if (AtEnd()) { + NS_WARNING("ClusterReverseIterator has already reached the end"); + return; + } + + uint32_t ch; + do { + ch = *--mPos; + + if (mPos > mLimit && NS_IS_SURROGATE_PAIR(*(mPos - 1), ch)) { + ch = SURROGATE_TO_UCS4(*--mPos, ch); + } + + if (!IsClusterExtender(ch)) { + break; + } + } while (mPos > mLimit); + + // XXX May need to handle conjoining Jamo + + NS_ASSERTION(mPos >= mLimit, + "ClusterReverseIterator::Next has overshot the string!"); +} + +uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) { + ClusterIterator iter(aText, aLength); + uint32_t result = 0; + while (!iter.AtEnd()) { + ++result; + iter.Next(); + } + return result; +} + +uint32_t GetNaked(uint32_t aCh) { + using namespace mozilla; + + static const UNormalizer2* normalizer; + static HashMap<uint32_t, uint32_t> nakedCharCache; + + NS_ASSERTION(!IsCombiningDiacritic(aCh), + "This character needs to be skipped"); + + HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh); + if (entry.found()) { + return entry->value(); + } + + UErrorCode error = U_ZERO_ERROR; + if (!normalizer) { + normalizer = unorm2_getNFDInstance(&error); + if (U_FAILURE(error)) { + return aCh; + } + } + + static const size_t MAX_DECOMPOSITION_SIZE = 16; + UChar decomposition[MAX_DECOMPOSITION_SIZE]; + UChar* combiners; + int32_t decompositionLen; + uint32_t baseChar, nextChar; + decompositionLen = unorm2_getDecomposition(normalizer, aCh, decomposition, + MAX_DECOMPOSITION_SIZE, &error); + if (decompositionLen < 1) { + // The character does not decompose. + return aCh; + } + + if (NS_IS_HIGH_SURROGATE(decomposition[0])) { + baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]); + combiners = decomposition + 2; + } else { + baseChar = decomposition[0]; + combiners = decomposition + 1; + } + + if (IS_IN_BMP(baseChar) != IS_IN_BMP(aCh)) { + // Mappings that would change the length of a UTF-16 string are not + // currently supported. + baseChar = aCh; + goto cache; + } + + if (decompositionLen > 1) { + if (NS_IS_HIGH_SURROGATE(combiners[0])) { + nextChar = SURROGATE_TO_UCS4(combiners[0], combiners[1]); + } else { + nextChar = combiners[0]; + } + if (!IsCombiningDiacritic(nextChar)) { + // Hangul syllables decompose but do not actually have diacritics. + // This also excludes decompositions with the Japanese marks U+3099 and + // U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND MARK), which + // we should not ignore for searching (bug 1624244). + baseChar = aCh; + } + } + +cache: + if (!nakedCharCache.putNew(aCh, baseChar)) { + // We're out of memory, so delete the cache to free some up. + nakedCharCache.clearAndCompact(); + } + + return baseChar; +} + +} // end namespace unicode + +} // end namespace mozilla |