1 files changed, 256 insertions, 0 deletions
diff --git a/intl/components/src/String.h b/intl/components/src/String.h
new file mode 100644
index 0000000000..c44c6b3443
--- /dev/null
+++ b/intl/components/src/String.h
@@ -0,0 +1,256 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef intl_components_String_h_
+#define intl_components_String_h_
+
+#include "mozilla/Assertions.h"
+#include "mozilla/Casting.h"
+#include "mozilla/intl/ICU4CGlue.h"
+#include "mozilla/intl/ICUError.h"
+#include "mozilla/PodOperations.h"
+#include "mozilla/Span.h"
+#include "mozilla/Try.h"
+
+#include "unicode/uchar.h"
+#include "unicode/unorm2.h"
+#include "unicode/ustring.h"
+#include "unicode/utext.h"
+#include "unicode/utypes.h"
+
+namespace mozilla::intl {
+
+/**
+ * This component is a Mozilla-focused API for working with strings in
+ * internationalization code.
+ */
+class String final {
+ public:
+  // Not instantiable: every member function declared in this class is static,
+  // so the class only serves as a named scope for the helpers below.
+  String() = delete;
+
+  /**
+   * Return the locale-sensitive lower case string of the input.
+   *
+   * - aLocale: locale identifier, passed unchanged to ICU's u_strToLower().
+   * - aString: UTF-16 input text. It is only read.
+   * - aBuffer: receives the result. The buffer type B must provide reserve()
+   *   and otherwise satisfy what FillBufferWithICUCall requires.
+   *
+   * Returns Err(ICUError::OutOfMemory) when the initial reservation fails,
+   * otherwise whatever FillBufferWithICUCall returns for the ICU call.
+   */
+  template <typename B>
+  static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
+                                                Span<const char16_t> aString,
+                                                B& aBuffer) {
+    // Start with room for as many code units as the input has.
+    if (!aBuffer.reserve(aString.size())) {
+      return Err(ICUError::OutOfMemory);
+    }
+
+    // ICU takes the source length as int32_t, where -1 means "NUL-terminated".
+    // Convert explicitly so that an oversized span trips an assertion instead
+    // of being narrowed silently.
+    const int32_t sourceLength = AssertedCast<int32_t>(aString.size());
+
+    return FillBufferWithICUCall(
+        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
+          return u_strToLower(target, length, aString.data(), sourceLength,
+                              aLocale, status);
+        });
+  }
+
+  /**
+   * Return the locale-sensitive upper case string of the input.
+   *
+   * - aLocale: locale identifier, passed unchanged to ICU's u_strToUpper().
+   * - aString: UTF-16 input text. It is only read.
+   * - aBuffer: receives the result. The buffer type B must provide reserve()
+   *   and otherwise satisfy what FillBufferWithICUCall requires.
+   *
+   * Returns Err(ICUError::OutOfMemory) when the initial reservation fails,
+   * otherwise whatever FillBufferWithICUCall returns for the ICU call.
+   */
+  template <typename B>
+  static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
+                                                Span<const char16_t> aString,
+                                                B& aBuffer) {
+    // Start with room for as many code units as the input has.
+    if (!aBuffer.reserve(aString.size())) {
+      return Err(ICUError::OutOfMemory);
+    }
+
+    // ICU takes the source length as int32_t, where -1 means "NUL-terminated".
+    // Convert explicitly so that an oversized span trips an assertion instead
+    // of being narrowed silently.
+    const int32_t sourceLength = AssertedCast<int32_t>(aString.size());
+
+    return FillBufferWithICUCall(
+        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
+          return u_strToUpper(target, length, aString.data(), sourceLength,
+                              aLocale, status);
+        });
+  }
+
+  /**
+   * Normalization form constants to describe which normalization algorithm
+   * should be performed.
+   *
+   * Normalize() maps each constant to the ICU normalizer instance of the same
+   * name (unorm2_getNFCInstance() for NFC, and so on).
+   *
+   * Also see:
+   * - Unicode Standard, §2.12 Equivalent Sequences
+   * - Unicode Standard, §3.11 Normalization Forms
+   * - https://unicode.org/reports/tr15/
+   */
+  enum class NormalizationForm {
+    /**
+     * Normalization Form C: canonical decomposition, followed by canonical
+     * composition.
+     */
+    NFC,
+
+    /**
+     * Normalization Form D: canonical decomposition.
+     */
+    NFD,
+
+    /**
+     * Normalization Form KC: compatibility decomposition, followed by
+     * canonical composition.
+     */
+    NFKC,
+
+    /**
+     * Normalization Form KD: compatibility decomposition.
+     */
+    NFKD,
+  };
+
+  /**
+   * Success value of Normalize(): `Yes` when the input was already in the
+   * requested normalization form (the output buffer is then left untouched),
+   * `No` when the normalized text was written to the output buffer.
+   */
+  enum class AlreadyNormalized : bool { No, Yes };
+
+  /**
+   * Normalize the input string according to requested normalization form.
+   *
+   * Returns `AlreadyNormalized::Yes` when the string is already in normalized
+   * form. The output buffer is unchanged in this case. Otherwise returns
+   * `AlreadyNormalized::No` and places the normalized string into the output
+   * buffer.
+   *
+   * Errors: an ICU failure is converted with ToICUError(), a failed buffer
+   * reservation is reported as ICUError::OutOfMemory, and an aForm value that
+   * is not one of the NormalizationForm constants is rejected as an illegal
+   * argument.
+   *
+   * The buffer type B must provide reserve(), data() and written(), and is
+   * also handed to FillBufferWithICUCall.
+   */
+  template <typename B>
+  static Result<AlreadyNormalized, ICUError> Normalize(
+      NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
+    // The unorm2_getXXXInstance() methods return a shared instance which must
+    // not be deleted.
+    UErrorCode status = U_ZERO_ERROR;
+    // Start from nullptr so that an out-of-range aForm can never leave the
+    // pointer indeterminate. There is deliberately no `default:` label, which
+    // keeps compiler warnings about unhandled enumerators available.
+    const UNormalizer2* normalizer = nullptr;
+    switch (aForm) {
+      case NormalizationForm::NFC:
+        normalizer = unorm2_getNFCInstance(&status);
+        break;
+      case NormalizationForm::NFD:
+        normalizer = unorm2_getNFDInstance(&status);
+        break;
+      case NormalizationForm::NFKC:
+        normalizer = unorm2_getNFKCInstance(&status);
+        break;
+      case NormalizationForm::NFKD:
+        normalizer = unorm2_getNFKDInstance(&status);
+        break;
+    }
+    if (U_FAILURE(status)) {
+      return Err(ToICUError(status));
+    }
+    if (!normalizer) {
+      // Only reachable when aForm holds a value outside of the enumeration.
+      return Err(ToICUError(U_ILLEGAL_ARGUMENT_ERROR));
+    }
+
+    // ICU lengths are int32_t; convert explicitly rather than narrowing
+    // silently at the call sites.
+    const int32_t sourceLength = AssertedCast<int32_t>(aString.size());
+
+    // Length of the leading part of the input that is already normalized.
+    const int32_t prefixLength = unorm2_spanQuickCheckYes(
+        normalizer, aString.data(), sourceLength, &status);
+    if (U_FAILURE(status)) {
+      return Err(ToICUError(status));
+    }
+
+    const size_t spanLength = AssertedCast<size_t>(prefixLength);
+    MOZ_ASSERT(spanLength <= aString.size());
+
+    // Return if the input string is already normalized.
+    if (spanLength == aString.size()) {
+      return AlreadyNormalized::Yes;
+    }
+
+    if (!aBuffer.reserve(aString.size())) {
+      return Err(ICUError::OutOfMemory);
+    }
+
+    // Copy the already normalized prefix.
+    if (spanLength > 0) {
+      PodCopy(aBuffer.data(), aString.data(), spanLength);
+
+      aBuffer.written(spanLength);
+    }
+
+    // Everything after the normalized prefix still has to be processed.
+    const Span<const char16_t> remaining = aString.From(spanLength);
+    const int32_t remainingLength = AssertedCast<int32_t>(remaining.size());
+
+    // The first prefixLength code units of `target` are treated as the
+    // already normalized "first" string and ICU appends the normalized
+    // remainder to them. NOTE(review): this assumes FillBufferWithICUCall
+    // passes aBuffer's storage as `target` -- confirm in ICU4CGlue.h.
+    MOZ_TRY(FillBufferWithICUCall(
+        aBuffer, [&](UChar* target, int32_t length, UErrorCode* icuStatus) {
+          return unorm2_normalizeSecondAndAppend(
+              normalizer, target, prefixLength, length, remaining.data(),
+              remainingLength, icuStatus);
+        }));
+
+    return AlreadyNormalized::No;
+  }
+
+  /**
+   * Test a code point for the Unicode binary property "Cased".
+   *
+   * The answer is the one ICU's u_hasBinaryProperty() gives for UCHAR_CASED.
+   */
+  static bool IsCased(char32_t codePoint) {
+    const UChar32 icuCodePoint = static_cast<UChar32>(codePoint);
+    return u_hasBinaryProperty(icuCodePoint, UCHAR_CASED);
+  }
+
+  /**
+   * Test a code point for the Unicode binary property "Case_Ignorable".
+   *
+   * The answer is the one ICU's u_hasBinaryProperty() gives for
+   * UCHAR_CASE_IGNORABLE.
+   */
+  static bool IsCaseIgnorable(char32_t codePoint) {
+    const UChar32 icuCodePoint = static_cast<UChar32>(codePoint);
+    return u_hasBinaryProperty(icuCodePoint, UCHAR_CASE_IGNORABLE);
+  }
+
+  /**
+   * Look up the NFC pairwise composition of `a` followed by `b`.
+   *
+   * Yields the composed code point, or 0 when the pair does not compose. 0 can
+   * serve as the "none" marker because it is never a composed character.
+   */
+  static char32_t ComposePairNFC(char32_t a, char32_t b) {
+    // The instance handed out by unorm2_getNFCInstance is static, so nothing
+    // has to be released. Should obtaining it fail, every call simply reports
+    // "no composition".
+    static UErrorCode sInitStatus = U_ZERO_ERROR;
+    static const UNormalizer2* sNfc = unorm2_getNFCInstance(&sInitStatus);
+    if (U_FAILURE(sInitStatus)) {
+      return 0;
+    }
+
+    const UChar32 first = static_cast<UChar32>(a);
+    const UChar32 second = static_cast<UChar32>(b);
+    const UChar32 composed = unorm2_composePair(sNfc, first, second);
+
+    // A negative result signals that the pair has no composition.
+    if (composed < 0) {
+      return 0;
+    }
+    return static_cast<char32_t>(composed);
+  }
+
+  /**
+   * Put the "raw" (single-level) canonical decomposition of the input char, if
+   * any, into the provided buffer. Canonical decomps are never more than two
+   * chars in length (although full normalization may result in longer output
+   * due to recursion).
+   * Returns the length of the decomposition (0 if none, else 1 or 2).
+   *
+   * `decomp` must have room for two code points; only as many entries as the
+   * returned length are written. 0 is also returned when ICU reports an
+   * error, in which case `decomp` is left untouched.
+   */
+  static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
+    // unorm2_getNFCInstance returns a static instance that does not have to be
+    // released here. If it fails, we just return 0 (no decomposition) always.
+    // Although we are using it to query for a decomposition, the mode of the
+    // Normalizer2 is irrelevant here, so we may as well use the same singleton
+    // instance as ComposePairNFC.
+    static UErrorCode status = U_ZERO_ERROR;
+    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
+    if (U_FAILURE(status)) {
+      return 0;
+    }
+
+    // Canonical decompositions are never more than two Unicode characters,
+    // or a maximum of 4 utf-16 code units. The constant is an int32_t because
+    // that is the type ICU expects for the capacity.
+    constexpr int32_t MAX_DECOMP_LENGTH = 4;
+    UErrorCode error = U_ZERO_ERROR;
+    UChar decompUtf16[MAX_DECOMP_LENGTH];
+    const int32_t utf16Length =
+        unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
+                                   decompUtf16, MAX_DECOMP_LENGTH, &error);
+    // A negative length means that the code point has no decomposition.
+    if (U_FAILURE(error) || utf16Length < 0) {
+      return 0;
+    }
+
+    // Iterate over the UTF-16 result by code point.
+    UText text = UTEXT_INITIALIZER;
+    utext_openUChars(&text, decompUtf16, utf16Length, &error);
+    MOZ_ASSERT(U_SUCCESS(error));
+    if (U_FAILURE(error)) {
+      // Release builds compile the assertion out; never iterate a UText that
+      // was not opened successfully.
+      return 0;
+    }
+
+    // Store at most two code points, stopping at the end of the text.
+    int count = 0;
+    while (count < 2) {
+      const UChar32 ch = UTEXT_NEXT32(&text);
+      if (ch == U_SENTINEL) {
+        break;
+      }
+      decomp[count] = static_cast<char32_t>(ch);
+      ++count;
+    }
+
+    utext_close(&text);
+    return count;
+  }
+
+  /**
+   * Return the Unicode version, for example "13.0".
+   *
+   * Only the declaration lives in this header; the definition is elsewhere.
+   * NOTE(review): the lifetime of the returned characters and whether the span
+   * covers a terminating NUL are not visible from here -- confirm against the
+   * definition before relying on either.
+   */
+  static Span<const char> GetUnicodeVersion();
+};
+
+}  // namespace mozilla::intl
+
+#endif