Adding upstream version 1:115.7.0.upstream/1%115.7.0 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
commit: 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree: a68f146d7fa01f0134297619fbe7e33db084e0aa /js/src/vm/CharacterEncoding.cpp
parent: Initial commit. (diff)
download: thunderbird-upstream.tar.xz
thunderbird-upstream.zip
1 files changed, 888 insertions, 0 deletions
diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp
new file mode 100644
index 0000000000..52edcae45e
--- /dev/null
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -0,0 +1,888 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "js/CharacterEncoding.h"
+
+#include "mozilla/CheckedInt.h"
+#include "mozilla/DebugOnly.h"
+#include "mozilla/Latin1.h"
+#include "mozilla/Maybe.h"
+#include "mozilla/Range.h"
+#include "mozilla/Span.h"
+#include "mozilla/Sprintf.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/Utf8.h"
+
+#ifndef XP_LINUX
+// We still support libstdc++ versions without codecvt support on Linux.
+#  include <codecvt>
+#endif
+#include <cwchar>
+#include <limits>
+#include <locale>
+#include <type_traits>
+
+#include "frontend/FrontendContext.h"
+#include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
+#include "util/StringBuffer.h"
+#include "util/Unicode.h"  // unicode::REPLACEMENT_CHARACTER
+#include "vm/JSContext.h"
+
+using mozilla::AsChars;
+using mozilla::AsciiValidUpTo;
+using mozilla::AsWritableChars;
+using mozilla::ConvertLatin1toUtf8Partial;
+using mozilla::ConvertUtf16toUtf8Partial;
+using mozilla::IsAscii;
+using mozilla::IsUtf8Latin1;
+using mozilla::LossyConvertUtf16toLatin1;
+using mozilla::Span;
+using mozilla::Utf8Unit;
+
+using JS::Latin1CharsZ;
+using JS::TwoByteCharsZ;
+using JS::UTF8Chars;
+using JS::UTF8CharsZ;
+
+using namespace js;
+using namespace js::unicode;
+
+Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ(
+    JSContext* cx, const mozilla::Range<const char16_t> tbchars) {
+  MOZ_ASSERT(cx);
+  size_t len = tbchars.length();
+  unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);  // +1: NUL
+  if (!latin1) {
+    return Latin1CharsZ();  // allocation failed: default-constructed result
+  }
+  LossyConvertUtf16toLatin1(tbchars, AsWritableChars(Span(latin1, len)));
+  latin1[len] = '\0';
+  return Latin1CharsZ(latin1, len);  // |len| excludes the terminating NUL
+}
+
+template <typename CharT>
+static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) {
+  size_t nbytes = nchars;  // start from one byte per input code unit
+  for (const CharT* end = chars + nchars; chars < end; chars++) {
+    char16_t c = *chars;
+    if (c < 0x80) {
+      continue;  // ASCII: the initial one-byte count is already right
+    }
+    char32_t v;
+    if (IsSurrogate(c)) {
+      /* nbytes already holds 1 for this unit; pairs are adjusted below. */
+      if (IsTrailSurrogate(c) || (chars + 1) == end) {
+        nbytes += 2; /* Bad Surrogate: counted as 3 bytes in total */
+        continue;
+      }
+      char16_t c2 = chars[1];
+      if (!IsTrailSurrogate(c2)) {
+        nbytes += 2; /* Bad Surrogate: counted as 3 bytes in total */
+        continue;
+      }
+      v = UTF16Decode(c, c2);
+      nbytes--;  // the trail unit is consumed here, not counted on its own
+      chars++;
+    } else {
+      v = c;
+    }
+    v >>= 11;  // values below 0x800 take two bytes...
+    nbytes++;
+    while (v) {
+      v >>= 5;  // ...and each further 5 bits of magnitude adds one more
+      nbytes++;
+    }
+  }
+  return nbytes;
+}
+
+JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSLinearString* s) {
+  JS::AutoCheckCannotGC nogc;  // handed to the character accessors below
+  return s->hasLatin1Chars()
+             ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
+             : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc),
+                                             s->length());
+}
+
+JS_PUBLIC_API size_t JS::DeflateStringToUTF8Buffer(JSLinearString* src,
+                                                   mozilla::Span<char> dst) {
+  JS::AutoCheckCannotGC nogc;
+  if (src->hasLatin1Chars()) {
+    auto source = AsChars(Span(src->latin1Chars(nogc), src->length()));
+    auto [read, written] = ConvertLatin1toUtf8Partial(source, dst);
+    (void)read;  // only the written count is reported to the caller
+    return written;
+  }
+  auto source = Span(src->twoByteChars(nogc), src->length());
+  auto [read, written] = ConvertUtf16toUtf8Partial(source, dst);
+  (void)read;  // only the written count is reported to the caller
+  return written;
+}
+
+template <typename CharT>
+void ConvertToUTF8(mozilla::Span<CharT> src, mozilla::Span<char> dst);
+// Only the two const element types below are given a definition here.
+template <>
+void ConvertToUTF8<const char16_t>(mozilla::Span<const char16_t> src,
+                                   mozilla::Span<char> dst) {
+  (void)ConvertUtf16toUtf8Partial(src, dst);  // result deliberately ignored
+}
+
+template <>
+void ConvertToUTF8<const Latin1Char>(mozilla::Span<const Latin1Char> src,
+                                     mozilla::Span<char> dst) {
+  (void)ConvertLatin1toUtf8Partial(AsChars(src), dst);  // result ignored
+}
+
+template <typename CharT, typename Allocator>
+UTF8CharsZ JS::CharsToNewUTF8CharsZ(Allocator* alloc,
+                                    const mozilla::Range<CharT> chars) {
+  /* Get required buffer size: UTF-8 bytes, not counting the NUL. */
+  const CharT* str = chars.begin().get();
+  size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
+
+  /* Allocate buffer, with one extra byte for the NUL terminator. */
+  char* utf8 = alloc->template pod_malloc<char>(len + 1);
+  if (!utf8) {
+    return UTF8CharsZ();
+  }
+
+  /* Encode to UTF8 into exactly the |len| bytes computed above. */
+  ::ConvertToUTF8(Span(str, chars.length()), Span(utf8, len));
+  utf8[len] = '\0';
+
+  return UTF8CharsZ(utf8, len);
+}
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    JSContext* cx, const mozilla::Range<Latin1Char> chars);
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    JSContext* cx, const mozilla::Range<char16_t> chars);
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    JSContext* cx, const mozilla::Range<const Latin1Char> chars);
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    JSContext* cx, const mozilla::Range<const char16_t> chars);
+// Instantiations for JSContext above; for FrontendAllocator below.
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    FrontendAllocator* cx, const mozilla::Range<Latin1Char> chars);
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    FrontendAllocator* cx, const mozilla::Range<char16_t> chars);
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    FrontendAllocator* cx, const mozilla::Range<const Latin1Char> chars);
+
+template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
+    FrontendAllocator* cx, const mozilla::Range<const char16_t> chars);
+
+static constexpr uint32_t INVALID_UTF8 = std::numeric_limits<char32_t>::max();
+
+/*
+ * Convert a UTF-8 character sequence into a UCS-4 character and return that
+ * character. The caller must have checked the byte structure (asserted below);
+ * overlong encodings and surrogate values instead yield INVALID_UTF8.
+ */
+static char32_t Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer,
+                                      int utf8Length) {
+  MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
+
+  if (utf8Length == 1) {
+    MOZ_ASSERT(!(*utf8Buffer & 0x80));
+    return *utf8Buffer;
+  }
+
+  /* from Unicode 3.1, non-shortest form is illegal */
+  static const char32_t minucs4Table[] = {0x80, 0x800, NonBMPMin};
+  // The lead byte's marker bits must agree with the claimed length.
+  MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
+             (0x100 - (1 << (8 - utf8Length))));
+  char32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
+  char32_t minucs4Char = minucs4Table[utf8Length - 2];
+  while (--utf8Length) {
+    MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
+    ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
+  }
+  // Reject non-shortest (overlong) encodings.
+  if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
+    return INVALID_UTF8;
+  }
+  // Reject code points in the surrogate range.
+  if (MOZ_UNLIKELY(IsSurrogate(ucs4Char))) {
+    return INVALID_UTF8;
+  }
+
+  return ucs4Char;
+}
+
+char32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) {
+  return Utf8ToOneUcs4CharImpl(utf8Buffer, utf8Length);  // file-local impl
+}
+
+static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) {
+  char buffer[11];  // up to 10 decimal digits of a uint32_t, plus the NUL
+  SprintfLiteral(buffer, "%u", offset);
+  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                            JSMSG_MALFORMED_UTF8_CHAR, buffer);
+}
+
+static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) {  // unused
+  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                            JSMSG_BUFFER_TOO_SMALL);
+}
+
+static void ReportTooBigCharacter(JSContext* cx, uint32_t v) {
+  char buffer[11];  // "0x" + at most 8 hex digits + NUL
+  SprintfLiteral(buffer, "0x%x", v);
+  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                            JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
+}
+
+enum class LoopDisposition {
+  Break,     // the output callback asks the scan to stop
+  Continue,  // the output callback asks the scan to go on
+};
+
+enum class OnUTF8Error {
+  InsertReplacementCharacter,  // emit REPLACEMENT_CHARACTER and continue
+  InsertQuestionMark,          // emit '?' and continue
+  Throw,                       // report an error on cx and return false
+  Crash,                       // MOZ_CRASH
+};
+
+// Scan UTF-8 |src| and hand each resulting UTF-16 code unit to |dst|, which
+// returns a LoopDisposition; Break ends the scan early. |dst| may ignore its
+// argument, in which case the only effect is error-checking. Returns false
+// only in OnUTF8Error::Throw mode, after reporting the error on |cx|.
+template <OnUTF8Error ErrorAction, typename OutputFn>
+static bool InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src,
+                               OutputFn dst) {
+  size_t srclen = src.length();
+  for (uint32_t i = 0; i < srclen; i++) {
+    uint32_t v = uint32_t(src[i]);
+    if (!(v & 0x80)) {
+      // ASCII code unit.  Simple copy.
+      if (dst(uint16_t(v)) == LoopDisposition::Break) {
+        break;
+      }
+    } else {
+      // Non-ASCII code unit.  Determine its length in bytes (n).
+      uint32_t n = 1;
+      while (v & (0x80 >> n)) {
+        n++;
+      }
+
+#define INVALID(report, arg, n2)                                    \
+  do {                                                              \
+    if (ErrorAction == OnUTF8Error::Throw) {                        \
+      report(cx, arg);                                              \
+      return false;                                                 \
+    } else if (ErrorAction == OnUTF8Error::Crash) {                 \
+      MOZ_CRASH("invalid UTF-8 string: " #report);                  \
+    } else {                                                        \
+      char16_t replacement;                                         \
+      if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \
+        replacement = REPLACEMENT_CHARACTER;                        \
+      } else {                                                      \
+        MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
+        replacement = '?';                                          \
+      }                                                             \
+      if (dst(replacement) == LoopDisposition::Break) {             \
+        return true; /* |break| would only leave do/while */       \
+      }                                                             \
+      n = n2;                                                       \
+      goto invalidMultiByteCodeUnit;                                \
+    }                                                               \
+  } while (0)
+
+      // Check the leading byte.
+      if (n < 2 || n > 4) {
+        INVALID(ReportInvalidCharacter, i, 1);
+      }
+
+      // Check that |src| is large enough to hold an n-byte code unit.
+      if (i + n > srclen) {
+        INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
+      }
+
+      // Check the second byte.  From Unicode Standard v6.2, Table 3-7
+      // Well-Formed UTF-8 Byte Sequences.
+      if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
+          (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
+          (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
+          (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
+      {
+        INVALID(ReportInvalidCharacter, i, 1);
+      }
+
+      // Check the continuation bytes.
+      for (uint32_t m = 1; m < n; m++) {
+        if ((src[i + m] & 0xC0) != 0x80) {
+          INVALID(ReportInvalidCharacter, i, m);
+        }
+      }
+
+      // Decode the code point and emit it as one or two UTF-16 units.
+      v = Utf8ToOneUcs4CharImpl((uint8_t*)&src[i], n);
+      if (v < NonBMPMin) {
+        // The code point fits in a single UTF-16 code unit.
+        if (dst(char16_t(v)) == LoopDisposition::Break) {
+          break;
+        }
+      } else if (v <= NonBMPMax) {
+        // The code point needs a lead/trail surrogate pair.
+        if (dst(LeadSurrogate(v)) == LoopDisposition::Break) {
+          break;
+        }
+        if (dst(TrailSurrogate(v)) == LoopDisposition::Break) {
+          break;
+        }
+      } else {
+        // Above NonBMPMax (INVALID_UTF8 from the decoder lands here too).
+        INVALID(ReportTooBigCharacter, v, 1);
+      }
+
+    invalidMultiByteCodeUnit:
+      // Move i to the last byte of the multi-byte code unit; the loop
+      // header will do the final i++ to move to the start of the next
+      // code unit.
+      i += n - 1;
+    }
+  }
+
+  return true;
+}
+
+template <OnUTF8Error ErrorAction, typename CharT>
+static void CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src,
+                                         CharT* dst, size_t outlen,
+                                         bool allASCII) {
+  if (allASCII) {  // every byte maps 1:1 to one output unit
+    size_t srclen = src.length();
+    MOZ_ASSERT(outlen == srclen);
+    for (uint32_t i = 0; i < srclen; i++) {
+      dst[i] = CharT(src[i]);
+    }
+  } else {
+    size_t j = 0;
+    auto push = [dst, &j](char16_t c) -> LoopDisposition {
+      dst[j++] = CharT(c);  // converted to CharT; |dst| is not bounds-checked
+      return LoopDisposition::Continue;
+    };
+    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
+    MOZ_ASSERT(j == outlen);  // must match the caller-supplied length
+  }
+}
+
+template <OnUTF8Error ErrorAction, typename CharsT>
+static CharsT InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src,
+                                      size_t* outlen, arena_id_t destArenaId) {
+  using CharT = typename CharsT::CharT;
+  static_assert(
+      std::is_same_v<CharT, char16_t> || std::is_same_v<CharT, Latin1Char>,
+      "bad CharT");
+
+  *outlen = 0;
+  // First pass: count the output units, applying |ErrorAction| to bad input.
+  size_t len = 0;
+  bool allASCII = true;
+  auto count = [&len, &allASCII](char16_t c) -> LoopDisposition {
+    len++;
+    allASCII &= (c < 0x80);
+    return LoopDisposition::Continue;
+  };
+  if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) {
+    return CharsT();
+  }
+  *outlen = len;
+
+  CharT* dst = cx->pod_arena_malloc<CharT>(destArenaId,
+                                           *outlen + 1);  // +1 for NUL
+
+  if (!dst) {
+    ReportOutOfMemory(cx);
+    return CharsT();
+  }
+  // Second pass: fill |dst|; bad input is replaced here rather than reported.
+  constexpr OnUTF8Error errorMode =
+      std::is_same_v<CharT, Latin1Char>
+          ? OnUTF8Error::InsertQuestionMark
+          : OnUTF8Error::InsertReplacementCharacter;
+  CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);
+  dst[*outlen] = CharT('\0');
+
+  return CharsT(dst, *outlen);
+}
+
+TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
+                                              const UTF8Chars utf8,
+                                              size_t* outlen,
+                                              arena_id_t destArenaId) {
+  return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
+      cx, utf8, outlen, destArenaId);  // Throw: malformed input is an error
+}
+
+TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
+                                              const ConstUTF8CharsZ& utf8,
+                                              size_t* outlen,
+                                              arena_id_t destArenaId) {
+  UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));  // length without NUL
+  return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
+      cx, chars, outlen, destArenaId);
+}
+// Lossy variants: malformed input yields REPLACEMENT_CHARACTER, not an error.
+TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx,
+                                                   const JS::UTF8Chars utf8,
+                                                   size_t* outlen,
+                                                   arena_id_t destArenaId) {
+  return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
+                                 TwoByteCharsZ>(cx, utf8, outlen, destArenaId);
+}
+
+TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(
+    JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen,
+    arena_id_t destArenaId) {
+  UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));  // length without NUL
+  return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
+                                 TwoByteCharsZ>(cx, chars, outlen, destArenaId);
+}
+
+static void UpdateSmallestEncodingForChar(char16_t c,
+                                          JS::SmallestEncoding* encoding) {
+  JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
+  if (c >= 0x80) {  // not ASCII
+    if (c < 0x100) {  // still representable in a single byte
+      newEncoding = JS::SmallestEncoding::Latin1;
+    } else {
+      newEncoding = JS::SmallestEncoding::UTF16;
+    }
+  }
+  if (newEncoding > *encoding) {  // only ever raise; relies on enum order
+    *encoding = newEncoding;
+  }
+}
+
+JS::SmallestEncoding JS::FindSmallestEncoding(UTF8Chars utf8) {
+  Span<unsigned char> unsignedSpan = utf8;
+  auto charSpan = AsChars(unsignedSpan);
+  size_t upTo = AsciiValidUpTo(charSpan);  // == Length() when all ASCII
+  if (upTo == charSpan.Length()) {
+    return SmallestEncoding::ASCII;
+  }
+  if (IsUtf8Latin1(charSpan.From(upTo))) {  // only the tail is re-checked
+    return SmallestEncoding::Latin1;
+  }
+  return SmallestEncoding::UTF16;
+}
+
+Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8,
+                                            size_t* outlen,
+                                            arena_id_t destArenaId) {
+  return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>(
+      cx, utf8, outlen, destArenaId);  // Throw: malformed input is an error
+}
+
+Latin1CharsZ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx,
+                                                 const UTF8Chars utf8,
+                                                 size_t* outlen,
+                                                 arena_id_t destArenaId) {
+  return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark, Latin1CharsZ>(
+      cx, utf8, outlen, destArenaId);  // malformed input becomes '?'
+}
+
+/**
+ * Atomization Helpers.
+ *
+ * These functions are extremely single-use, and are not intended for general
+ * consumption.
+ */
+
+bool GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8,
+                            size_t* outlen, JS::SmallestEncoding* encoding,
+                            HashNumber* hashNum) {
+  *outlen = 0;
+  *encoding = JS::SmallestEncoding::ASCII;
+  *hashNum = 0;
+
+  auto getMetadata = [outlen, encoding,
+                      hashNum](char16_t c) -> LoopDisposition {
+    (*outlen)++;  // counts UTF-16 code units, not input bytes
+    UpdateSmallestEncodingForChar(c, encoding);
+    *hashNum = mozilla::AddToHash(*hashNum, c);
+    return LoopDisposition::Continue;
+  };
+  if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) {
+    return false;  // malformed input; the error was reported on cx
+  }
+
+  return true;
+}
+
+template <typename CharT>
+bool UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars) {
+  size_t ind = 0;
+  bool isEqual = true;
+
+  auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition {
+#ifdef DEBUG
+    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
+    UpdateSmallestEncodingForChar(c, &encoding);
+    if (std::is_same_v<CharT, JS::Latin1Char>) {
+      MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1);
+    } else if (!std::is_same_v<CharT, char16_t>) {
+      MOZ_CRASH("Invalid character type in UTF8EqualsChars");
+    }
+#endif
+
+    if (CharT(c) != chars[ind]) {  // |chars| is not bounds-checked here
+      isEqual = false;
+      return LoopDisposition::Break;
+    }
+
+    ind++;
+    return LoopDisposition::Continue;
+  };
+
+  // Callers must pass pre-validated UTF-8: malformed input crashes here.
+  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars,
+                                         checkEqual);
+
+  return isEqual;
+}
+
+template bool UTF8EqualsChars(const JS::UTF8Chars, const char16_t*);
+template bool UTF8EqualsChars(const JS::UTF8Chars, const JS::Latin1Char*);
+
+template <typename CharT>
+void InflateUTF8CharsToBuffer(const JS::UTF8Chars src, CharT* dst,
+                              size_t dstLen, JS::SmallestEncoding encoding) {
+  CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(
+      /* cx = */ nullptr, src, dst, dstLen,
+      encoding == JS::SmallestEncoding::ASCII);  // selects the byte-copy path
+}
+
+template void InflateUTF8CharsToBuffer(const UTF8Chars src, char16_t* dst,
+                                       size_t dstLen,
+                                       JS::SmallestEncoding encoding);
+template void InflateUTF8CharsToBuffer(const UTF8Chars src, JS::Latin1Char* dst,
+                                       size_t dstLen,
+                                       JS::SmallestEncoding encoding);
+
+#ifdef DEBUG
+void JS::ConstUTF8CharsZ::validate(size_t aLength) {
+  MOZ_ASSERT(data_);
+  UTF8Chars chars(data_, aLength);
+  auto nop = [](char16_t) -> LoopDisposition {
+    return LoopDisposition::Continue;  // discard output; only validate
+  };
+  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, chars, nop);
+}
+#endif
+
+bool JS::StringIsASCII(const char* s) {
+  while (*s) {  // scan up to the terminating NUL
+    if (*s & 0x80) {  // high bit set: not ASCII
+      return false;
+    }
+    s++;
+  }
+  return true;
+}
+// Span overload: defers to mozilla::IsAscii.
+bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); }
+
+JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx,
+                                                     const char* chars) {
+  // Convert the narrow multibyte character string to a wide string and then
+  // use EncodeWideToUtf8() to convert the wide string to a UTF-8 string.
+
+  std::mbstate_t mb{};
+
+  // NOTE: The 2nd parameter is overwritten even if the 1st parameter is nullptr
+  //       on Android NDK older than v16.  Use a temporary variable to save the
+  //       `chars` for the subsequent call.  See bug 1492090.
+  const char* tmpChars = chars;
+  // The first mbsrtowcs call (null destination) only measures the length.
+  size_t wideLen = std::mbsrtowcs(nullptr, &tmpChars, 0, &mb);
+  if (wideLen == size_t(-1)) {
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_TO_WIDE);
+    return nullptr;
+  }
+  MOZ_ASSERT(std::mbsinit(&mb),
+             "multi-byte state is in its initial state when no conversion "
+             "error occured");
+
+  size_t bufLen = wideLen + 1;
+  auto wideChars = cx->make_pod_array<wchar_t>(bufLen);
+  if (!wideChars) {
+    return nullptr;
+  }
+  // The second call performs the conversion into |wideChars|.
+  mozilla::DebugOnly<size_t> actualLen =
+      std::mbsrtowcs(wideChars.get(), &chars, bufLen, &mb);
+  MOZ_ASSERT(wideLen == actualLen);
+  MOZ_ASSERT(wideChars[actualLen] == '\0');
+
+  return EncodeWideToUtf8(cx, wideChars.get());
+}
+
+JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx,
+                                                   const wchar_t* chars) {
+  using CheckedSizeT = mozilla::CheckedInt<size_t>;
+
+#ifndef XP_LINUX
+  // Use the standard codecvt facet to convert a wide string to UTF-8.
+  std::codecvt_utf8<wchar_t> cv;
+  // Size the buffer for the worst case, using overflow-checked arithmetic.
+  size_t len = std::wcslen(chars);
+  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length();
+  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
+  if (!utf8BufLen.isValid()) {
+    JS_ReportAllocationOverflow(cx);
+    return nullptr;
+  }
+  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
+  if (!utf8) {
+    return nullptr;
+  }
+
+  // STL returns |codecvt_base::partial| for empty strings.
+  if (len == 0) {
+    return utf8;
+  }
+
+  std::mbstate_t mb{};
+  const wchar_t* fromNext;
+  char* toNext;
+  std::codecvt_base::result result =
+      cv.out(mb, chars, chars + len, fromNext, utf8.get(),
+             utf8.get() + utf8MaxLen.value(), toNext);
+  if (result != std::codecvt_base::ok) {
+    MOZ_ASSERT(result == std::codecvt_base::error);
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
+    return nullptr;
+  }
+  *toNext = '\0';  // Explicit null-termination required.
+
+  // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead
+  // of UTF-8 on some platforms when the input contains unpaired surrogate
+  // characters. We don't allow this.
+  if (!mozilla::IsUtf8(
+          mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) {
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
+    return nullptr;
+  }
+
+  return utf8;
+#else
+  static_assert(sizeof(wchar_t) == 4,
+                "Assume wchar_t is UTF-32 on Linux systems");
+
+  constexpr size_t MaxUtf8CharLength = 4;
+  // Size the buffer for the worst case, using overflow-checked arithmetic.
+  size_t len = std::wcslen(chars);
+  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength;
+  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
+  if (!utf8BufLen.isValid()) {
+    JS_ReportAllocationOverflow(cx);
+    return nullptr;
+  }
+  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
+  if (!utf8) {
+    return nullptr;
+  }
+  // Encode each wide character in turn; input is not validated here.
+  char* dst = utf8.get();
+  for (size_t i = 0; i < len; i++) {
+    uint8_t utf8buf[MaxUtf8CharLength];
+    uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, chars[i]);
+    for (size_t j = 0; j < utf8Len; j++) {
+      *dst++ = char(utf8buf[j]);
+    }
+  }
+  *dst = '\0';
+
+  return utf8;
+#endif
+}
+
+JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx,
+                                                     const char* chars) {
+  // Convert the UTF-8 string to a wide string via EncodeUtf8ToWide() and
+  // then convert the resulting wide string to a narrow multibyte character
+  // string.
+
+  auto wideChars = EncodeUtf8ToWide(cx, chars);
+  if (!wideChars) {
+    return nullptr;
+  }
+  // The first wcsrtombs call (null destination) only measures the length.
+  const wchar_t* cWideChars = wideChars.get();
+  std::mbstate_t mb{};
+  size_t narrowLen = std::wcsrtombs(nullptr, &cWideChars, 0, &mb);
+  if (narrowLen == size_t(-1)) {
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_TO_NARROW);
+    return nullptr;
+  }
+  MOZ_ASSERT(std::mbsinit(&mb),
+             "multi-byte state is in its initial state when no conversion "
+             "error occured");
+
+  size_t bufLen = narrowLen + 1;
+  auto narrow = cx->make_pod_array<char>(bufLen);
+  if (!narrow) {
+    return nullptr;
+  }
+  // The second call performs the conversion into |narrow|.
+  mozilla::DebugOnly<size_t> actualLen =
+      std::wcsrtombs(narrow.get(), &cWideChars, bufLen, &mb);
+  MOZ_ASSERT(narrowLen == actualLen);
+  MOZ_ASSERT(narrow[actualLen] == '\0');
+
+  return narrow;
+}
+
+JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx,
+                                                       const char* chars) {
+  // Only valid UTF-8 strings should be passed to this function.
+  MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, strlen(chars))));
+
+#ifndef XP_LINUX
+  // Use the standard codecvt facet to convert from UTF-8 to a wide string.
+  std::codecvt_utf8<wchar_t> cv;
+  // Allocate one wide character per input byte, plus one for the NUL.
+  size_t len = strlen(chars);
+  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
+  if (!wideChars) {
+    return nullptr;
+  }
+
+  // STL returns |codecvt_base::partial| for empty strings.
+  if (len == 0) {
+    return wideChars;
+  }
+
+  std::mbstate_t mb{};
+  const char* fromNext;
+  wchar_t* toNext;
+  std::codecvt_base::result result =
+      cv.in(mb, chars, chars + len, fromNext, wideChars.get(),
+            wideChars.get() + len, toNext);
+  if (result != std::codecvt_base::ok) {
+    MOZ_ASSERT(result == std::codecvt_base::error);
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
+    return nullptr;
+  }
+  *toNext = '\0';  // Explicit null-termination required.
+
+  return wideChars;
+#else
+  static_assert(sizeof(wchar_t) == 4,
+                "Assume wchar_t is UTF-32 on Linux systems");
+  // Allocate one wide character per input byte, plus one for the NUL.
+  size_t len = strlen(chars);
+  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
+  if (!wideChars) {
+    return nullptr;
+  }
+
+  const auto* s = reinterpret_cast<const unsigned char*>(chars);
+  const auto* const limit = s + len;
+
+  wchar_t* dst = wideChars.get();
+  while (s < limit) {
+    unsigned char c = *s++;
+
+    if (mozilla::IsAscii(c)) {
+      *dst++ = wchar_t(c);
+      continue;
+    }
+    // Non-ASCII: decode one code point starting from lead unit |c|.
+    mozilla::Utf8Unit utf8(c);
+    mozilla::Maybe<char32_t> codePoint =
+        mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit);
+    MOZ_ASSERT(codePoint.isSome());
+    *dst++ = wchar_t(*codePoint);
+  }
+  *dst++ = '\0';
+
+  return wideChars;
+#endif
+}
+
+bool StringBuffer::append(const Utf8Unit* units, size_t len) {
+  MOZ_ASSERT(maybeCx_);
+
+  if (isLatin1()) {
+    Latin1CharBuffer& latin1 = latin1Chars();
+    // Copy the leading run of ASCII units straight into Latin-1 storage.
+    while (len > 0) {
+      if (!IsAscii(*units)) {
+        break;
+      }
+
+      if (!latin1.append(units->toUnsignedChar())) {
+        return false;
+      }
+
+      ++units;
+      --len;
+    }
+    if (len == 0) {
+      return true;
+    }
+
+    // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
+    // |latin1|, but it's only possible for [U+0080, U+0100) code points,
+    // and handling the full complexity of UTF-8 only for that very small
+    // additional range isn't worth it.  Inflate to two-byte storage before
+    // appending the remaining code points.
+    if (!inflateChars()) {
+      return false;
+    }
+  }
+
+  UTF8Chars remainingUtf8(units, len);
+
+  // Determine how many UTF-16 code units are required to represent the
+  // remaining units.
+  size_t utf16Len = 0;
+  auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
+    utf16Len++;
+    return LoopDisposition::Continue;
+  };
+  if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(maybeCx_, remainingUtf8,
+                                              countInflated)) {
+    return false;  // malformed input; reported on maybeCx_ (Throw mode)
+  }
+
+  TwoByteCharBuffer& buf = twoByteChars();
+
+  size_t i = buf.length();
+  if (!buf.growByUninitialized(utf16Len)) {
+    return false;
+  }
+  MOZ_ASSERT(i + utf16Len == buf.length(),
+             "growByUninitialized assumed to increase length immediately");
+  // Second pass: write the code units into the newly grown region.
+  char16_t* toFill = &buf[i];
+  auto appendUtf16 = [&toFill](char16_t unit) {
+    *toFill++ = unit;
+    return LoopDisposition::Continue;
+  };
+
+  MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>(
+      maybeCx_, remainingUtf8, appendUtf16));
+  MOZ_ASSERT(toFill == buf.end());
+  return true;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:32:43 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:32:43 +0000
commit	6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree	a68f146d7fa01f0134297619fbe7e33db084e0aa /js/src/vm/CharacterEncoding.cpp
parent	Initial commit. (diff)
download	thunderbird-upstream.tar.xz thunderbird-upstream.zip