summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/util/nsUnicodeProperties.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/unicharutil/util/nsUnicodeProperties.cpp')
-rw-r--r--intl/unicharutil/util/nsUnicodeProperties.cpp216
1 files changed, 216 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp
new file mode 100644
index 0000000000..748a2d32c2
--- /dev/null
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -0,0 +1,216 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=4 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsUnicodeProperties.h"
+#include "nsUnicodePropertyData.cpp"
+
+#include "mozilla/ArrayUtils.h"
+#include "mozilla/HashTable.h"
+#include "mozilla/intl/Segmenter.h"
+
+#include "BaseChars.h"
+#include "IsCombiningDiacritic.h"
+
+#define UNICODE_BMP_LIMIT 0x10000
+#define UNICODE_LIMIT 0x110000
+
+const nsCharProps2& GetCharProps2(uint32_t aCh) {
+ if (aCh < UNICODE_BMP_LIMIT) {
+ return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
+ [aCh & ((1 << kCharProp2CharBits) - 1)];
+ }
+ if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
+ return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
+ [(aCh & 0xffff) >>
+ kCharProp2CharBits]]
+ [aCh & ((1 << kCharProp2CharBits) - 1)];
+ }
+
+ MOZ_ASSERT_UNREACHABLE(
+ "Getting CharProps for codepoint outside Unicode "
+ "range");
+
+ // Default values for unassigned
+ using namespace mozilla::unicode;
+ static const nsCharProps2 undefined = {
+ VERTICAL_ORIENTATION_R,
+ 0 // IdentifierType
+ };
+ return undefined;
+}
+
+namespace mozilla {
+
+namespace unicode {
+
+/*
+To store properties for a million Unicode codepoints compactly, we use
+a three-level array structure, with the Unicode values considered as
+three elements: Plane, Page, and Char.
+
+Space optimization happens because multiple Planes can refer to the same
+Page array, and multiple Pages can refer to the same Char array holding
+the actual values. In practice, most of the higher planes are empty and
+thus share the same data; and within the BMP, there are also many pages
+that repeat the same data for any given property.
+
+Plane is usually zero, so we skip a lookup in this case, and require
+that the Plane 0 pages are always the first set of entries in the Page
+array.
+
+The division of the remaining 16 bits into Page and Char fields is
+adjusted for each property (by experiment using the generation tool)
+to provide the most compact storage, depending on the distribution
+of values.
+*/
+
+const nsUGenCategory sDetailedToGeneralCategory[] = {
+ // clang-format off
+ /*
+ * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
+ * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
+ */
+ /* CONTROL */ nsUGenCategory::kOther,
+ /* FORMAT */ nsUGenCategory::kOther,
+ /* UNASSIGNED */ nsUGenCategory::kOther,
+ /* PRIVATE_USE */ nsUGenCategory::kOther,
+ /* SURROGATE */ nsUGenCategory::kOther,
+ /* LOWERCASE_LETTER */ nsUGenCategory::kLetter,
+ /* MODIFIER_LETTER */ nsUGenCategory::kLetter,
+ /* OTHER_LETTER */ nsUGenCategory::kLetter,
+ /* TITLECASE_LETTER */ nsUGenCategory::kLetter,
+ /* UPPERCASE_LETTER */ nsUGenCategory::kLetter,
+ /* COMBINING_MARK */ nsUGenCategory::kMark,
+ /* ENCLOSING_MARK */ nsUGenCategory::kMark,
+ /* NON_SPACING_MARK */ nsUGenCategory::kMark,
+ /* DECIMAL_NUMBER */ nsUGenCategory::kNumber,
+ /* LETTER_NUMBER */ nsUGenCategory::kNumber,
+ /* OTHER_NUMBER */ nsUGenCategory::kNumber,
+ /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation,
+ /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol,
+ /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol,
+ /* MATH_SYMBOL */ nsUGenCategory::kSymbol,
+ /* OTHER_SYMBOL */ nsUGenCategory::kSymbol,
+ /* LINE_SEPARATOR */ nsUGenCategory::kSeparator,
+ /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
+ /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator
+ // clang-format on
+};
+
+const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
+ // clang-format off
+ HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
+ HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
+ HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
+ HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
+ HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
+ HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
+ HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
+ HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
+ HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
+ HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
+ HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
+ HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
+ HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
+ HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
+ HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
+ HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
+ HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
+ HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
+ HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
+ HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
+ HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
+ HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
+ HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
+ HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
+ HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
+ HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
+ HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
+ // clang-format on
+};
+
+#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \
+ uint32_t Get##prefix_(uint32_t aCh) { \
+ if (aCh >= UNICODE_BMP_LIMIT) { \
+ return aCh; \
+ } \
+ auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
+ auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \
+ uint32_t v = s##prefix_##Values[page][index]; \
+ return v ? v : aCh; \
+ }
+
+// full-width mappings only exist for BMP characters; all others are
+// returned unchanged
+DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
+DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
+
+bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
+ return (
+ (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
+ aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
+ (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
+ (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks
+ (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers
+ (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters
+}
+
+bool IsClusterExtenderExcludingJoiners(uint32_t aCh, uint8_t aCategory) {
+ return (
+ (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
+ aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
+ (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks
+ (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers
+ (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters
+}
+
+uint32_t CountGraphemeClusters(Span<const char16_t> aText) {
+ if (aText.IsEmpty()) {
+ // Fast path for empty text.
+ return 0;
+ }
+ intl::GraphemeClusterBreakIteratorUtf16 iter(aText);
+ uint32_t result = 0;
+ while (iter.Next()) {
+ ++result;
+ }
+ return result;
+}
+
+uint32_t GetNaked(uint32_t aCh) {
+ uint32_t index = aCh >> 8;
+ if (index >= MOZ_ARRAY_LENGTH(BASE_CHAR_MAPPING_BLOCK_INDEX)) {
+ return aCh;
+ }
+ index = BASE_CHAR_MAPPING_BLOCK_INDEX[index];
+ if (index == 0xff) {
+ return aCh;
+ }
+ const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
+ uint8_t lo = aCh & 0xff;
+ if (lo < block.mFirst || lo > block.mLast) {
+ return aCh;
+ }
+ return (aCh & 0xffff0000) |
+ BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
+}
+
+bool IsCombiningDiacritic(uint32_t aCh) {
+ return sCombiningDiacriticsSet->test(aCh);
+}
+
+} // end namespace unicode
+
+} // end namespace mozilla