diff options
Diffstat (limited to 'intl/unicharutil/util/nsUnicodeProperties.h')
-rw-r--r-- | intl/unicharutil/util/nsUnicodeProperties.h | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h new file mode 100644 index 0000000000..11566cbffc --- /dev/null +++ b/intl/unicharutil/util/nsUnicodeProperties.h @@ -0,0 +1,283 @@ +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=4 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef NS_UNICODEPROPERTIES_H +#define NS_UNICODEPROPERTIES_H + +#include "nsBidiUtils.h" +#include "nsUGenCategory.h" +#include "nsUnicodeScriptCodes.h" +#include "harfbuzz/hb.h" + +#include "unicode/uchar.h" +#include "unicode/uscript.h" + +const nsCharProps2& GetCharProps2(uint32_t aCh); + +namespace mozilla { + +namespace unicode { + +extern const nsUGenCategory sDetailedToGeneralCategory[]; + +/* This MUST match the values assigned by genUnicodePropertyData.pl! */ +enum VerticalOrientation { + VERTICAL_ORIENTATION_U = 0, + VERTICAL_ORIENTATION_R = 1, + VERTICAL_ORIENTATION_Tu = 2, + VERTICAL_ORIENTATION_Tr = 3 +}; + +/* This MUST match the values assigned by genUnicodePropertyData.pl! */ +enum PairedBracketType { + PAIRED_BRACKET_TYPE_NONE = 0, + PAIRED_BRACKET_TYPE_OPEN = 1, + PAIRED_BRACKET_TYPE_CLOSE = 2 +}; + +/* Flags for Unicode security IdentifierType.txt attributes. Only a subset + of these are currently checked by Gecko, so we only define flags for the + ones we need. */ +enum IdentifierType { + IDTYPE_RESTRICTED = 0, + IDTYPE_ALLOWED = 1, +}; + +enum EmojiPresentation { TextOnly = 0, TextDefault = 1, EmojiDefault = 2 }; + +const uint32_t kVariationSelector15 = 0xFE0E; // text presentation +const uint32_t kVariationSelector16 = 0xFE0F; // emoji presentation + +// Unicode values for EMOJI MODIFIER FITZPATRICK TYPE-* +const uint32_t kEmojiSkinToneFirst = 0x1f3fb; +const uint32_t kEmojiSkinToneLast = 0x1f3ff; + +extern const hb_unicode_general_category_t sICUtoHBcategory[]; + +inline uint32_t GetMirroredChar(uint32_t aCh) { return u_charMirror(aCh); } + +inline bool HasMirroredChar(uint32_t aCh) { return u_isMirrored(aCh); } + +inline uint8_t GetCombiningClass(uint32_t aCh) { + return u_getCombiningClass(aCh); +} + +inline uint8_t GetGeneralCategory(uint32_t aCh) { + return sICUtoHBcategory[u_charType(aCh)]; +} + +inline nsCharType GetBidiCat(uint32_t aCh) { + return nsCharType(u_charDirection(aCh)); +} + +inline int8_t GetNumericValue(uint32_t aCh) { + UNumericType type = + UNumericType(u_getIntPropertyValue(aCh, UCHAR_NUMERIC_TYPE)); + return type == U_NT_DECIMAL || type == U_NT_DIGIT + ? int8_t(u_getNumericValue(aCh)) + : -1; +} + +inline uint8_t GetLineBreakClass(uint32_t aCh) { + return u_getIntPropertyValue(aCh, UCHAR_LINE_BREAK); +} + +inline Script GetScriptCode(uint32_t aCh) { + UErrorCode err = U_ZERO_ERROR; + return Script(uscript_getScript(aCh, &err)); +} + +inline bool HasScript(uint32_t aCh, Script aScript) { + return uscript_hasScript(aCh, UScriptCode(aScript)); +} + +inline uint32_t GetScriptTagForCode(Script aScriptCode) { + const char* tag = uscript_getShortName(UScriptCode(aScriptCode)); + if (tag) { + return HB_TAG(tag[0], tag[1], tag[2], tag[3]); + } + // return UNKNOWN script tag (running with older ICU?) + return HB_SCRIPT_UNKNOWN; +} + +inline PairedBracketType GetPairedBracketType(uint32_t aCh) { + return PairedBracketType( + u_getIntPropertyValue(aCh, UCHAR_BIDI_PAIRED_BRACKET_TYPE)); +} + +inline uint32_t GetPairedBracket(uint32_t aCh) { + return u_getBidiPairedBracket(aCh); +} + +inline uint32_t GetUppercase(uint32_t aCh) { return u_toupper(aCh); } + +inline uint32_t GetLowercase(uint32_t aCh) { return u_tolower(aCh); } + +inline uint32_t GetTitlecaseForLower( + uint32_t aCh) // maps LC to titlecase, UC unchanged +{ + return u_isULowercase(aCh) ? u_totitle(aCh) : aCh; +} + +inline uint32_t GetTitlecaseForAll( + uint32_t aCh) // maps both UC and LC to titlecase +{ + return u_totitle(aCh); +} + +inline uint32_t GetFoldedcase(uint32_t aCh) { + // Handle dotted capital I and dotless small i specially because we want to + // use a combination of ordinary case-folding rules and Turkish case-folding + // rules. + if (aCh == 0x0130 || aCh == 0x0131) { + return 'i'; + } + return u_foldCase(aCh, U_FOLD_CASE_DEFAULT); +} + +inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) { + switch (u_getIntPropertyValue(aCh, UCHAR_EAST_ASIAN_WIDTH)) { + case U_EA_FULLWIDTH: + case U_EA_HALFWIDTH: + return true; + case U_EA_WIDE: + return u_hasBinaryProperty(aCh, UCHAR_EMOJI) ? false : true; + case U_EA_AMBIGUOUS: + case U_EA_NARROW: + case U_EA_NEUTRAL: + return false; + } + return false; +} + +inline bool IsEastAsianWidthAFW(uint32_t aCh) { + switch (u_getIntPropertyValue(aCh, UCHAR_EAST_ASIAN_WIDTH)) { + case U_EA_AMBIGUOUS: + case U_EA_FULLWIDTH: + case U_EA_WIDE: + return true; + case U_EA_HALFWIDTH: + case U_EA_NARROW: + case U_EA_NEUTRAL: + return false; + } + return false; +} + +inline bool IsDefaultIgnorable(uint32_t aCh) { + return u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT); +} + +inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) { + if (!u_hasBinaryProperty(aCh, UCHAR_EMOJI)) { + return TextOnly; + } + + if (u_hasBinaryProperty(aCh, UCHAR_EMOJI_PRESENTATION)) { + return EmojiDefault; + } + return TextDefault; +} + +// returns the simplified Gen Category as defined in nsUGenCategory +inline nsUGenCategory GetGenCategory(uint32_t aCh) { + return sDetailedToGeneralCategory[GetGeneralCategory(aCh)]; +} + +inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) { + return VerticalOrientation(GetCharProps2(aCh).mVertOrient); +} + +inline IdentifierType GetIdentifierType(uint32_t aCh) { + return IdentifierType(GetCharProps2(aCh).mIdType); +} + +uint32_t GetFullWidth(uint32_t aCh); +// This is the reverse function of GetFullWidth which guarantees that +// for every codepoint c, GetFullWidthInverse(GetFullWidth(c)) == c. +// Note that, this function does not guarantee to convert all wide +// form characters to their possible narrow form. +uint32_t GetFullWidthInverse(uint32_t aCh); + +bool IsClusterExtender(uint32_t aCh, uint8_t aCategory); + +inline bool IsClusterExtender(uint32_t aCh) { + return IsClusterExtender(aCh, GetGeneralCategory(aCh)); +} + +// A simple iterator for a string of char16_t codepoints that advances +// by Unicode grapheme clusters +class ClusterIterator { + public: + ClusterIterator(const char16_t* aText, uint32_t aLength) + : mPos(aText), + mLimit(aText + aLength) +#ifdef DEBUG + , + mText(aText) +#endif + { + } + + operator const char16_t*() const { return mPos; } + + bool AtEnd() const { return mPos >= mLimit; } + + void Next(); + + private: + const char16_t* mPos; + const char16_t* mLimit; +#ifdef DEBUG + const char16_t* mText; +#endif +}; + +// Count the number of grapheme clusters in the given string +uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength); + +// Determine whether a character is a "combining diacritic" for the purpose +// of diacritic-insensitive text search. Examples of such characters include +// European accents and Hebrew niqqud, but not Hangul components or Thaana +// vowels, even though Thaana vowels are combining nonspacing marks that could +// be considered diacritics. +// As an exception to strictly following Unicode properties, we exclude the +// Japanese kana voicing marks +// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM +// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM +// which users report should not be ignored (bug 1624244). +inline bool IsCombiningDiacritic(uint32_t aCh) { + uint8_t cc = u_getCombiningClass(aCh); + return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED && + cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING; +} + +// Remove diacritics from a character +uint32_t GetNaked(uint32_t aCh); + +// A simple reverse iterator for a string of char16_t codepoints that +// advances by Unicode grapheme clusters +class ClusterReverseIterator { + public: + ClusterReverseIterator(const char16_t* aText, uint32_t aLength) + : mPos(aText + aLength), mLimit(aText) {} + + operator const char16_t*() const { return mPos; } + + bool AtEnd() const { return mPos <= mLimit; } + + void Next(); + + private: + const char16_t* mPos; + const char16_t* mLimit; +}; + +} // end namespace unicode + +} // end namespace mozilla + +#endif /* NS_UNICODEPROPERTIES_H */ |