diff options
Diffstat (limited to 'intl/unicharutil/util/nsUnicharUtils.cpp')
-rw-r--r-- | intl/unicharutil/util/nsUnicharUtils.cpp | 522 |
1 files changed, 522 insertions, 0 deletions
diff --git a/intl/unicharutil/util/nsUnicharUtils.cpp b/intl/unicharutil/util/nsUnicharUtils.cpp new file mode 100644 index 0000000000..db1627460c --- /dev/null +++ b/intl/unicharutil/util/nsUnicharUtils.cpp @@ -0,0 +1,522 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsUnicharUtils.h" +#include "nsUnicodeProperties.h" +#include "nsUTF8Utils.h" +#include "mozilla/Likely.h" +#include "mozilla/HashFunctions.h" +#include "mozilla/intl/UnicodeProperties.h" + +// We map x -> x, except for upper-case letters, +// which we map to their lower-case equivalents. +static const uint8_t gASCIIToLower[128] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, + 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, + 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, + 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, +}; + +// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast +// when they're called from within the case-insensitive comparators, so we +// define inlined versions. +static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) { + if (IS_ASCII(aChar)) { + return gASCIIToLower[aChar]; + } + + return mozilla::intl::UnicodeProperties::ToLower(aChar); +} + +static MOZ_ALWAYS_INLINE uint32_t +ToLowerCaseASCII_inline(const uint32_t aChar) { + if (IS_ASCII(aChar)) { + return gASCIIToLower[aChar]; + } + + return aChar; +} + +void ToLowerCase(nsAString& aString) { + char16_t* buf = aString.BeginWriting(); + ToLowerCase(buf, buf, aString.Length()); +} + +void ToLowerCaseASCII(nsAString& aString) { + char16_t* buf = aString.BeginWriting(); + ToLowerCaseASCII(buf, buf, aString.Length()); +} + +char ToLowerCaseASCII(char aChar) { + if (aChar >= 'A' && aChar <= 'Z') { + return aChar + 0x20; + } + return aChar; +} + +char16_t ToLowerCaseASCII(char16_t aChar) { + if (aChar >= 'A' && aChar <= 'Z') { + return aChar + 0x20; + } + return aChar; +} + +char32_t ToLowerCaseASCII(char32_t aChar) { + if (aChar >= 'A' && aChar <= 'Z') { + return aChar + 0x20; + } + return aChar; +} + +char ToUpperCaseASCII(char aChar) { + if (aChar >= 'a' && aChar <= 'z') { + return aChar - 0x20; + } + return aChar; +} + +char16_t ToUpperCaseASCII(char16_t aChar) { + if (aChar >= 'a' && aChar <= 'z') { + return aChar - 0x20; + } + return aChar; +} + +char32_t ToUpperCaseASCII(char32_t aChar) { + if (aChar >= 'a' && aChar <= 'z') { + return aChar - 0x20; + } + return aChar; +} + +void ToLowerCase(const nsAString& aSource, nsAString& aDest) { + const char16_t* in = aSource.BeginReading(); + size_t len = aSource.Length(); + + aDest.SetLength(len); + char16_t* out = aDest.BeginWriting(); + + ToLowerCase(in, out, len); +} + +void ToLowerCaseASCII(const nsAString& aSource, nsAString& aDest) { + const char16_t* in = aSource.BeginReading(); + size_t len = aSource.Length(); + + aDest.SetLength(len); + char16_t* out = aDest.BeginWriting(); + + ToLowerCaseASCII(in, out, len); +} + +uint32_t ToLowerCaseASCII(const uint32_t aChar) { + return ToLowerCaseASCII_inline(aChar); +} + +void ToUpperCase(nsAString& aString) { + char16_t* buf = aString.BeginWriting(); + ToUpperCase(buf, buf, aString.Length()); +} + +void ToUpperCase(const nsAString& aSource, nsAString& aDest) { + const char16_t* in = aSource.BeginReading(); + size_t len = aSource.Length(); + + aDest.SetLength(len); + char16_t* out = aDest.BeginWriting(); + + ToUpperCase(in, out, len); +} + +#ifdef MOZILLA_INTERNAL_API + +uint32_t ToFoldedCase(uint32_t aChar) { + if (IS_ASCII(aChar)) return gASCIIToLower[aChar]; + return mozilla::unicode::GetFoldedcase(aChar); +} + +void ToFoldedCase(nsAString& aString) { + char16_t* buf = aString.BeginWriting(); + ToFoldedCase(buf, buf, aString.Length()); +} + +void ToFoldedCase(const char16_t* aIn, char16_t* aOut, size_t aLen) { + for (uint32_t i = 0; i < aLen; i++) { + uint32_t ch = aIn[i]; + if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { + ch = mozilla::unicode::GetFoldedcase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); + NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); + aOut[i++] = H_SURROGATE(ch); + aOut[i] = L_SURROGATE(ch); + continue; + } + aOut[i] = ToFoldedCase(ch); + } +} + +uint32_t ToNaked(uint32_t aChar) { + if (IS_ASCII(aChar)) { + return aChar; + } + return mozilla::unicode::GetNaked(aChar); +} + +void ToNaked(nsAString& aString) { + uint32_t i = 0; + while (i < aString.Length()) { + uint32_t ch = aString[i]; + if (i < aString.Length() - 1 && NS_IS_SURROGATE_PAIR(ch, aString[i + 1])) { + ch = SURROGATE_TO_UCS4(ch, aString[i + 1]); + if (mozilla::unicode::IsCombiningDiacritic(ch)) { + aString.Cut(i, 2); + } else { + ch = mozilla::unicode::GetNaked(ch); + NS_ASSERTION(!IS_IN_BMP(ch), "stripping crossed BMP/SMP boundary!"); + aString.Replace(i++, 1, H_SURROGATE(ch)); + aString.Replace(i++, 1, L_SURROGATE(ch)); + } + continue; + } + if (mozilla::unicode::IsCombiningDiacritic(ch)) { + aString.Cut(i, 1); + } else { + aString.Replace(i++, 1, ToNaked(ch)); + } + } +} + +int32_t nsCaseInsensitiveStringComparator(const char16_t* lhs, + const char16_t* rhs, size_t lLength, + size_t rLength) { + return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) + : (lLength > rLength) ? 1 + : -1; +} + +int32_t nsCaseInsensitiveUTF8StringComparator(const char* lhs, const char* rhs, + size_t lLength, size_t rLength) { + return CaseInsensitiveCompare(lhs, rhs, lLength, rLength); +} + +int32_t nsASCIICaseInsensitiveStringComparator(const char16_t* lhs, + const char16_t* rhs, + size_t lLength, size_t rLength) { + if (lLength != rLength) { + if (lLength > rLength) return 1; + return -1; + } + + while (rLength) { + // we don't care about surrogates here, because we're only + // lowercasing the ASCII range + char16_t l = *lhs++; + char16_t r = *rhs++; + if (l != r) { + l = ToLowerCaseASCII_inline(l); + r = ToLowerCaseASCII_inline(r); + + if (l > r) + return 1; + else if (r > l) + return -1; + } + rLength--; + } + + return 0; +} + +#endif // MOZILLA_INTERNAL_API + +uint32_t ToLowerCase(uint32_t aChar) { return ToLowerCase_inline(aChar); } + +void ToLowerCase(const char16_t* aIn, char16_t* aOut, size_t aLen) { + for (size_t i = 0; i < aLen; i++) { + uint32_t ch = aIn[i]; + if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { + ch = mozilla::intl::UnicodeProperties::ToLower( + SURROGATE_TO_UCS4(ch, aIn[i + 1])); + NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); + aOut[i++] = H_SURROGATE(ch); + aOut[i] = L_SURROGATE(ch); + continue; + } + aOut[i] = ToLowerCase(ch); + } +} + +void ToLowerCaseASCII(const char16_t* aIn, char16_t* aOut, size_t aLen) { + for (size_t i = 0; i < aLen; i++) { + char16_t ch = aIn[i]; + aOut[i] = IS_ASCII_UPPER(ch) ? (ch + 0x20) : ch; + } +} + +uint32_t ToUpperCase(uint32_t aChar) { + if (IS_ASCII(aChar)) { + if (IS_ASCII_LOWER(aChar)) { + return aChar - 0x20; + } + return aChar; + } + + return mozilla::intl::UnicodeProperties::ToUpper(aChar); +} + +void ToUpperCase(const char16_t* aIn, char16_t* aOut, size_t aLen) { + for (size_t i = 0; i < aLen; i++) { + uint32_t ch = aIn[i]; + if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { + ch = mozilla::intl::UnicodeProperties::ToUpper( + SURROGATE_TO_UCS4(ch, aIn[i + 1])); + NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); + aOut[i++] = H_SURROGATE(ch); + aOut[i] = L_SURROGATE(ch); + continue; + } + aOut[i] = ToUpperCase(ch); + } +} + +uint32_t ToTitleCase(uint32_t aChar) { + if (IS_ASCII(aChar)) { + return ToUpperCase(aChar); + } + + return mozilla::unicode::GetTitlecaseForLower(aChar); +} + +int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b, + size_t len) { + NS_ASSERTION(a && b, "Do not pass in invalid pointers!"); + + if (len) { + do { + uint32_t c1 = *a++; + uint32_t c2 = *b++; + + // Unfortunately, we need to check for surrogates BEFORE we check + // for equality, because we could have identical high surrogates + // but non-identical characters, so we can't just skip them + + // If c1 isn't a surrogate, we don't bother to check c2; + // in the case where it _is_ a surrogate, we're definitely going to get + // a mismatch, and don't need to interpret and lowercase it + + if (len > 1 && NS_IS_SURROGATE_PAIR(c1, *a)) { + c1 = SURROGATE_TO_UCS4(c1, *a++); + if (NS_IS_SURROGATE_PAIR(c2, *b)) { + c2 = SURROGATE_TO_UCS4(c2, *b++); + } + // If c2 wasn't a surrogate, decrementing len means we'd stop + // short of the end of string b, but that doesn't actually matter + // because we're going to find a mismatch and return early + --len; + } + + if (c1 != c2) { + c1 = ToLowerCase_inline(c1); + c2 = ToLowerCase_inline(c2); + if (c1 != c2) { + if (c1 < c2) { + return -1; + } + return 1; + } + } + } while (--len != 0); + } + return 0; +} + +// Inlined definition of GetLowerUTF8Codepoint, which we use because we want +// to be fast when called from the case-insensitive comparators. +static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline( + const char* aStr, const char* aEnd, const char** aNext) { + // Convert to unsigned char so that stuffing chars into PRUint32s doesn't + // sign extend. + const unsigned char* str = (unsigned char*)aStr; + + if (UTF8traits::isASCII(str[0])) { + // It's ASCII; just convert to lower-case and return it. + *aNext = aStr + 1; + return gASCIIToLower[*str]; + } + if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) { + // It's a two-byte sequence, so it looks like + // 110XXXXX 10XXXXXX. + // This is definitely in the BMP, so we can store straightaway into a + // uint16_t. + + uint16_t c; + c = (str[0] & 0x1F) << 6; + c += (str[1] & 0x3F); + + // we don't go through ToLowerCase here, because we know this isn't + // an ASCII character so the ASCII fast-path there is useless + c = mozilla::intl::UnicodeProperties::ToLower(c); + + *aNext = aStr + 2; + return c; + } + if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) { + // It's a three-byte sequence, so it looks like + // 1110XXXX 10XXXXXX 10XXXXXX. + // This will just barely fit into 16-bits, so store into a uint16_t. + + uint16_t c; + c = (str[0] & 0x0F) << 12; + c += (str[1] & 0x3F) << 6; + c += (str[2] & 0x3F); + + c = mozilla::intl::UnicodeProperties::ToLower(c); + + *aNext = aStr + 3; + return c; + } + if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) { + // It's a four-byte sequence, so it looks like + // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX. + + uint32_t c; + c = (str[0] & 0x07) << 18; + c += (str[1] & 0x3F) << 12; + c += (str[2] & 0x3F) << 6; + c += (str[3] & 0x3F); + + c = mozilla::intl::UnicodeProperties::ToLower(c); + + *aNext = aStr + 4; + return c; + } + + // Hm, we don't understand this sequence. + return -1; +} + +uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, + const char** aNext) { + return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext); +} + +int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight, + size_t aLeftBytes, size_t aRightBytes) { + const char* leftEnd = aLeft + aLeftBytes; + const char* rightEnd = aRight + aRightBytes; + + while (aLeft < leftEnd && aRight < rightEnd) { + uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft); + if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) return -1; + + uint32_t rightChar = + GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight); + if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) return -1; + + // Now leftChar and rightChar are lower-case, so we can compare them. + if (leftChar != rightChar) { + if (leftChar > rightChar) return 1; + return -1; + } + } + + // Make sure that if one string is longer than the other we return the + // correct result. + if (aLeft < leftEnd) return 1; + if (aRight < rightEnd) return -1; + + return 0; +} + +static MOZ_ALWAYS_INLINE uint32_t +GetLowerUTF8Codepoint_inline(const char* aStr, const char* aEnd, + const char** aNext, bool aMatchDiacritics) { + uint32_t c; + for (;;) { + c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext); + if (aMatchDiacritics) { + break; + } + if (!mozilla::unicode::IsCombiningDiacritic(c)) { + break; + } + aStr = *aNext; + } + return c; +} + +bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight, + const char* aLeftEnd, const char* aRightEnd, + const char** aLeftNext, + const char** aRightNext, bool* aErr, + bool aMatchDiacritics) { + NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null."); + NS_ASSERTION(aRightNext, "Out pointer shouldn't be null."); + NS_ASSERTION(aErr, "Out pointer shouldn't be null."); + NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd."); + NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd."); + + uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext, + aMatchDiacritics); + if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) { + *aErr = true; + return false; + } + + uint32_t rightChar = GetLowerUTF8Codepoint_inline( + aRight, aRightEnd, aRightNext, aMatchDiacritics); + if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) { + *aErr = true; + return false; + } + + // Can't have an error past this point. + *aErr = false; + + if (!aMatchDiacritics) { + leftChar = ToNaked(leftChar); + rightChar = ToNaked(rightChar); + } + + return leftChar == rightChar; +} + +namespace mozilla { + +uint32_t HashUTF8AsUTF16(const char* aUTF8, size_t aLength, bool* aErr) { + uint32_t hash = 0; + const char* s = aUTF8; + const char* end = aUTF8 + aLength; + + *aErr = false; + + while (s < end) { + uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr); + if (*aErr) { + return 0; + } + + if (ucs4 < PLANE1_BASE) { + hash = AddToHash(hash, ucs4); + } else { + hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4)); + } + } + + return hash; +} + +bool IsSegmentBreakSkipChar(uint32_t u) { + return intl::UnicodeProperties::IsEastAsianWidthFHWexcludingEmoji(u) && + intl::UnicodeProperties::GetScriptCode(u) != intl::Script::HANGUL; +} + +} // namespace mozilla |