diff options
Diffstat (limited to 'intl/components/src/String.h')
-rw-r--r-- | intl/components/src/String.h | 256 |
1 files changed, 256 insertions, 0 deletions
diff --git a/intl/components/src/String.h b/intl/components/src/String.h new file mode 100644 index 0000000000..f07acd6578 --- /dev/null +++ b/intl/components/src/String.h @@ -0,0 +1,256 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_String_h_ +#define intl_components_String_h_ + +#include "mozilla/Assertions.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/PodOperations.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" + +#include "unicode/uchar.h" +#include "unicode/unorm2.h" +#include "unicode/ustring.h" +#include "unicode/utext.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with strings in + * internationalization code. + */ +class String final { + public: + String() = delete; + + /** + * Return the locale-sensitive lower case string of the input. + */ + template <typename B> + static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale, + Span<const char16_t> aString, + B& aBuffer) { + if (!aBuffer.reserve(aString.size())) { + return Err(ICUError::OutOfMemory); + } + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return u_strToLower(target, length, aString.data(), aString.size(), + aLocale, status); + }); + } + + /** + * Return the locale-sensitive upper case string of the input. + */ + template <typename B> + static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale, + Span<const char16_t> aString, + B& aBuffer) { + if (!aBuffer.reserve(aString.size())) { + return Err(ICUError::OutOfMemory); + } + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return u_strToUpper(target, length, aString.data(), aString.size(), + aLocale, status); + }); + } + + /** + * Normalization form constants to describe which normalization algorithm + * should be performed. + * + * Also see: + * - Unicode Standard, §2.12 Equivalent Sequences + * - Unicode Standard, §3.11 Normalization Forms + * - https://unicode.org/reports/tr15/ + */ + enum class NormalizationForm { + /** + * Normalization Form C + */ + NFC, + + /** + * Normalization Form D + */ + NFD, + + /** + * Normalization Form KC + */ + NFKC, + + /** + * Normalization Form KD + */ + NFKD, + }; + + enum class AlreadyNormalized : bool { No, Yes }; + + /** + * Normalize the input string according to requested normalization form. + * + * Returns `AlreadyNormalized::Yes` when the string is already in normalized + * form. The output buffer is unchanged in this case. Otherwise returns + * `AlreadyNormalized::No` and places the normalized string into the output + * buffer. + */ + template <typename B> + static Result<AlreadyNormalized, ICUError> Normalize( + NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) { + // The unorm2_getXXXInstance() methods return a shared instance which must + // not be deleted. + UErrorCode status = U_ZERO_ERROR; + const UNormalizer2* normalizer; + switch (aForm) { + case NormalizationForm::NFC: + normalizer = unorm2_getNFCInstance(&status); + break; + case NormalizationForm::NFD: + normalizer = unorm2_getNFDInstance(&status); + break; + case NormalizationForm::NFKC: + normalizer = unorm2_getNFKCInstance(&status); + break; + case NormalizationForm::NFKD: + normalizer = unorm2_getNFKDInstance(&status); + break; + } + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(), + aString.size(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + size_t spanLength = AssertedCast<size_t>(spanLengthInt); + MOZ_ASSERT(spanLength <= aString.size()); + + // Return if the input string is already normalized. + if (spanLength == aString.size()) { + return AlreadyNormalized::Yes; + } + + if (!aBuffer.reserve(aString.size())) { + return Err(ICUError::OutOfMemory); + } + + // Copy the already normalized prefix. + if (spanLength > 0) { + PodCopy(aBuffer.data(), aString.data(), spanLength); + + aBuffer.written(spanLength); + } + + MOZ_TRY(FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + Span<const char16_t> remaining = aString.From(spanLength); + return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength, + length, remaining.data(), + remaining.size(), status); + })); + + return AlreadyNormalized::No; + } + + /** + * Return true if the code point has the binary property "Cased". + */ + static bool IsCased(char32_t codePoint) { + return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED); + } + + /** + * Return true if the code point has the binary property "Case_Ignorable". + */ + static bool IsCaseIgnorable(char32_t codePoint) { + return u_hasBinaryProperty(static_cast<UChar32>(codePoint), + UCHAR_CASE_IGNORABLE); + } + + /** + * Return the NFC pairwise composition of the two input characters, if any; + * returns 0 (which we know is not a composed char!) if none exists. + */ + static char32_t ComposePairNFC(char32_t a, char32_t b) { + // unorm2_getNFCInstance returns a static instance that does not have to be + // released here. If it fails, we just return 0 (no composition) always. + static UErrorCode status = U_ZERO_ERROR; + static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); + if (U_FAILURE(status)) { + return 0; + } + UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a), + static_cast<UChar32>(b)); + return ch < 0 ? 0 : static_cast<char32_t>(ch); + } + + /** + * Put the "raw" (single-level) canonical decomposition of the input char, if + * any, into the provided buffer. Canonical decomps are never more than two + * chars in length (although full normalization may result in longer output + * due to recursion). + * Returns the length of the decomposition (0 if none, else 1 or 2). + */ + static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) { + // unorm2_getNFCInstance returns a static instance that does not have to be + // released here. If it fails, we just return 0 (no decomposition) always. + // Although we are using it to query for a decomposition, the mode of the + // Normalizer2 is irrelevant here, so we may as well use the same singleton + // instance as ComposePairNFC. + static UErrorCode status = U_ZERO_ERROR; + static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); + if (U_FAILURE(status)) { + return 0; + } + + // Canonical decompositions are never more than two Unicode characters, + // or a maximum of 4 utf-16 code units. + const unsigned MAX_DECOMP_LENGTH = 4; + UErrorCode error = U_ZERO_ERROR; + UChar decompUtf16[MAX_DECOMP_LENGTH]; + int32_t len = + unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab), + decompUtf16, MAX_DECOMP_LENGTH, &error); + if (U_FAILURE(error) || len < 0) { + return 0; + } + UText text = UTEXT_INITIALIZER; + utext_openUChars(&text, decompUtf16, len, &error); + MOZ_ASSERT(U_SUCCESS(error)); + UChar32 ch = UTEXT_NEXT32(&text); + len = 0; + if (ch != U_SENTINEL) { + decomp[0] = static_cast<char32_t>(ch); + ++len; + ch = UTEXT_NEXT32(&text); + if (ch != U_SENTINEL) { + decomp[1] = static_cast<char32_t>(ch); + ++len; + } + } + utext_close(&text); + return len; + } + + /** + * Return the Unicode version, for example "13.0". + */ + static Span<const char> GetUnicodeVersion(); +}; + +} // namespace mozilla::intl + +#endif |