1 files changed, 247 insertions, 0 deletions
diff --git a/xpcom/string/nsUTF8Utils.h b/xpcom/string/nsUTF8Utils.h
new file mode 100644
index 0000000000..0145011ec1
--- /dev/null
+++ b/xpcom/string/nsUTF8Utils.h
@@ -0,0 +1,247 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsUTF8Utils_h_
+#define nsUTF8Utils_h_
+
+// NB: This code may be used from non-XPCOM code, in particular, the
+// standalone updater executable.  That is, this file may be used in two
+// ways: if MOZILLA_INTERNAL_API is defined, it will provide signatures for
+// the Mozilla abstract string types and use XPCOM assertion/debugging
+// macros, etc.; otherwise UTF8UTILS_WARNING expands to nothing.
+
+#include <type_traits>
+
+#include "mozilla/Assertions.h"
+#include "mozilla/EndianUtils.h"
+
+#include "nsCharTraits.h"
+
+#ifdef MOZILLA_INTERNAL_API
+#  define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
+#else  // !MOZILLA_INTERNAL_API: warnings compile to nothing
+#  define UTF8UTILS_WARNING(msg)
+#endif  // MOZILLA_INTERNAL_API
+
+// Classifies a single UTF-8 code unit by the pattern of its high bits.
+class UTF8traits {
+ public:
+  static bool isASCII(char aChar) { return hasBits(aChar, 0x80, 0x00); }
+  static bool isInSeq(char aChar) { return hasBits(aChar, 0xC0, 0x80); }
+  static bool is2byte(char aChar) { return hasBits(aChar, 0xE0, 0xC0); }
+  static bool is3byte(char aChar) { return hasBits(aChar, 0xF0, 0xE0); }
+  static bool is4byte(char aChar) { return hasBits(aChar, 0xF8, 0xF0); }
+  static bool is5byte(char aChar) { return hasBits(aChar, 0xFC, 0xF8); }
+  static bool is6byte(char aChar) { return hasBits(aChar, 0xFE, 0xFC); }
+  // Byte length (1-4) of the sequence that starts with lead byte aChar. Any
+  // other byte hits MOZ_ASSERT_UNREACHABLE and is reported as length 1.
+  static int bytes(char aChar) {
+    const bool isLead = isASCII(aChar) || is2byte(aChar) || is3byte(aChar) ||
+                        is4byte(aChar);
+    if (!isLead) {
+      MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
+      return 1;
+    }
+    return is4byte(aChar) ? 4 : is3byte(aChar) ? 3 : is2byte(aChar) ? 2 : 1;
+  }
+ private:
+  // True when the bits of aChar selected by aMask are exactly aBits.
+  static bool hasBits(char aChar, int aMask, int aBits) {
+    return (aChar & aMask) == aBits;
+  }
+};
+
+/**
+ * Decode the next Unicode scalar value from the UTF-8 buffer and return it.
+ * *aBuffer is advanced to the start of the next character; no byte at or
+ * past aEnd is read. Upon error (malformed or truncated sequence), the
+ * return value is 0xFFFD, *aBuffer is advanced over the maximal valid
+ * prefix (at least one byte) and *aErr is set to true (if aErr is not null).
+ *
+ * Note: This method never sets *aErr to false to allow error accumulation
+ * across multiple calls.
+ *
+ * Precondition: *aBuffer < aEnd
+ */
+class UTF8CharEnumerator {
+ public:
+  static inline char32_t NextChar(const char** aBuffer, const char* aEnd,
+                                  bool* aErr = nullptr) {
+    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+    MOZ_ASSERT(aEnd, "null end pointer");
+
+    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
+    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
+
+    MOZ_ASSERT(p, "null buffer");
+    MOZ_ASSERT(p < end, "Bogus range");
+
+    unsigned char first = *p;  // lead byte; readable per the precondition
+    ++p;
+
+    if (MOZ_LIKELY(first < 0x80U)) {  // ASCII: one byte, value is the byte
+      *aBuffer = reinterpret_cast<const char*>(p);
+      return first;
+    }
+
+    // Lead must be in 0xC2..0xF4; smaller values wrap (defined for unsigned).
+    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
+      *aBuffer = reinterpret_cast<const char*>(p);
+      if (aErr) {
+        *aErr = true;
+      }
+      return 0xFFFDU;
+    }
+
+    unsigned char second = *p;  // in bounds: p != end was checked above
+
+    if (first < 0xE0U) {
+      // Two-byte: the second byte must be a continuation byte (10xxxxxx)
+      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
+        ++p;
+        *aBuffer = reinterpret_cast<const char*>(p);
+        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
+      }
+      *aBuffer = reinterpret_cast<const char*>(p);  // consume the lead only
+      if (aErr) {
+        *aErr = true;
+      }
+      return 0xFFFDU;
+    }
+
+    if (MOZ_LIKELY(first < 0xF0U)) {
+      // Three-byte: the allowed range of the second byte depends on the lead
+      unsigned char lower = 0x80U;
+      unsigned char upper = 0xBFU;
+      if (first == 0xE0U) {
+        lower = 0xA0U;  // rules out overlong forms (below U+0800)
+      } else if (first == 0xEDU) {
+        upper = 0x9FU;  // rules out surrogates (U+D800..U+DFFF)
+      }
+      if (MOZ_LIKELY(second >= lower && second <= upper)) {
+        ++p;
+        if (MOZ_LIKELY(p != end)) {
+          unsigned char third = *p;
+          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+            ++p;
+            *aBuffer = reinterpret_cast<const char*>(p);
+            return ((uint32_t(first) & 0xFU) << 12) |
+                   ((uint32_t(second) & 0x3FU) << 6) |
+                   (uint32_t(third) & 0x3FU);
+          }
+        }
+      }
+      *aBuffer = reinterpret_cast<const char*>(p);  // past the valid prefix
+      if (aErr) {
+        *aErr = true;
+      }
+      return 0xFFFDU;
+    }
+
+    // Four-byte: the allowed range of the second byte depends on the lead
+    unsigned char lower = 0x80U;
+    unsigned char upper = 0xBFU;
+    if (first == 0xF0U) {
+      lower = 0x90U;  // rules out overlong forms (below U+10000)
+    } else if (first == 0xF4U) {
+      upper = 0x8FU;  // rules out values above U+10FFFF
+    }
+    if (MOZ_LIKELY(second >= lower && second <= upper)) {
+      ++p;
+      if (MOZ_LIKELY(p != end)) {
+        unsigned char third = *p;
+        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+          ++p;
+          if (MOZ_LIKELY(p != end)) {
+            unsigned char fourth = *p;
+            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
+              ++p;
+              *aBuffer = reinterpret_cast<const char*>(p);
+              return ((uint32_t(first) & 0x7U) << 18) |
+                     ((uint32_t(second) & 0x3FU) << 12) |
+                     ((uint32_t(third) & 0x3FU) << 6) |
+                     (uint32_t(fourth) & 0x3FU);
+            }
+          }
+        }
+      }
+    }
+    *aBuffer = reinterpret_cast<const char*>(p);  // past the valid prefix
+    if (aErr) {
+      *aErr = true;
+    }
+    return 0xFFFDU;
+  }
+};
+
+/**
+ * Decode the next Unicode scalar value from the UTF-16 buffer and return it.
+ * *aBuffer is advanced to the start of the next character; no code unit at
+ * or past aEnd is read. Upon error (a surrogate that is not part of a valid
+ * pair), the return value is 0xFFFD, *aBuffer is advanced over the unpaired
+ * surrogate and *aErr is set to true (if aErr is not null).
+ *
+ * Note: *aErr is never reset to false here, so a caller can accumulate
+ * errors across multiple calls.
+ *
+ * Precondition: *aBuffer < aEnd
+ */
+class UTF16CharEnumerator {
+ public:
+  static inline char32_t NextChar(const char16_t** aBuffer,
+                                  const char16_t* aEnd, bool* aErr = nullptr) {
+    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+    MOZ_ASSERT(aEnd, "null end pointer");
+
+    const char16_t* cursor = *aBuffer;
+
+    MOZ_ASSERT(cursor, "null buffer");
+    MOZ_ASSERT(cursor < aEnd, "Bogus range");
+
+    const char16_t lead = *cursor;
+    ++cursor;
+    // Offset of lead from U+D800, truncated to 16 bits: anything below the
+    // surrogate block wraps around (defined for unsigned arithmetic).
+    const char16_t surrogateOffset = lead - 0xD800U;
+    if (MOZ_LIKELY(surrogateOffset > (0xDFFFU - 0xD800U))) {
+      // Not a surrogate: the code unit is the scalar value.
+      *aBuffer = cursor;
+      return lead;
+    }
+    // A high surrogate must be followed, before aEnd, by a low surrogate.
+    const bool isHighSurrogate = surrogateOffset <= (0xDBFFU - 0xD800U);
+    if (MOZ_LIKELY(isHighSurrogate && cursor != aEnd)) {
+      const char16_t trail = *cursor;
+      const uint32_t trailOffset = trail - 0xDC00U;
+      if (MOZ_LIKELY(trailOffset <= (0xDFFFU - 0xDC00U))) {
+        *aBuffer = cursor + 1;
+        return 0x10000U + (uint32_t(surrogateOffset) << 10) + trailOffset;
+      }
+    }
+    // Unpaired surrogate: consume just that one code unit.
+    *aBuffer = cursor;
+    if (aErr) {
+      *aErr = true;
+    }
+    return 0xFFFDU;
+  }
+};
+
+// Steps index back past UTF-8 continuation bytes (10xxxxxx), stopping at 0.
+template <typename Char, typename UnsignedT>
+inline UnsignedT RewindToPriorUTF8Codepoint(const Char* utf8Chars,
+                                            UnsignedT index) {
+  static_assert(std::is_unsigned_v<UnsignedT>, "index type must be unsigned");
+  static_assert(std::disjunction_v<std::is_same<Char, char>,
+                                   std::is_same<Char, signed char>,
+                                   std::is_same<Char, unsigned char>>,
+                "UTF-8 data must be in 8-bit units");
+  for (; index > 0 && (utf8Chars[index] & 0xC0) == 0x80; --index) {}
+  return index;
+}
+
+#undef UTF8UTILS_WARNING  // helper macro stays private to this header
+
+#endif /* !defined(nsUTF8Utils_h_) */