/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "js/CharacterEncoding.h"

#include "mozilla/Latin1.h"
#include "mozilla/Range.h"
#include "mozilla/Span.h"
#include "mozilla/Sprintf.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"

// NOTE(review): the two system header names below were missing from the
// text under review and are restored by inference -- confirm.
#include <algorithm>
#include <type_traits>  // std::is_same_v

#include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
#include "util/StringBuffer.h"
#include "util/Unicode.h"  // unicode::REPLACEMENT_CHARACTER
#include "vm/JSContext.h"

using mozilla::AsChars;
using mozilla::AsciiValidUpTo;
using mozilla::AsWritableChars;
using mozilla::ConvertLatin1toUtf8Partial;
using mozilla::ConvertUtf16toUtf8Partial;
using mozilla::IsAscii;
using mozilla::IsUtf8Latin1;
using mozilla::LossyConvertUtf16toLatin1;
using mozilla::Span;
using mozilla::Tie;
using mozilla::Tuple;
using mozilla::Unused;
using mozilla::Utf8Unit;

using JS::Latin1CharsZ;
using JS::TwoByteCharsZ;
using JS::UTF8Chars;
using JS::UTF8CharsZ;
using JS::WTF8Chars;

using namespace js;
using namespace js::unicode;

/*
 * Allocate a new NUL-terminated Latin-1 buffer of |tbchars.length()| units
 * and fill it with the lossy conversion of |tbchars|. Returns an empty
 * Latin1CharsZ if the allocation fails.
 */
Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ(
    JSContext* cx, const mozilla::Range<const char16_t> tbchars) {
  MOZ_ASSERT(cx);
  size_t len = tbchars.length();
  unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
  if (!latin1) {
    return Latin1CharsZ();
  }
  LossyConvertUtf16toLatin1(tbchars, AsWritableChars(Span(latin1, len)));
  latin1[len] = '\0';
  return Latin1CharsZ(latin1, len);
}

/*
 * Compute how many bytes |nchars| code units starting at |chars| occupy once
 * encoded as UTF-8. Unpaired surrogates are counted as three bytes.
 */
template <typename CharT>
static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) {
  // Start from one byte per input unit and adjust for non-ASCII units.
  size_t nbytes = nchars;
  for (const CharT* end = chars + nchars; chars < end; chars++) {
    char16_t c = *chars;
    if (c < 0x80) {
      continue;
    }
    uint32_t v;
    if (IsSurrogate(c)) {
      /* Each unit was already counted as one byte in |nbytes|. */
      if (IsTrailSurrogate(c) || (chars + 1) == end) {
        nbytes += 2; /* Bad Surrogate */
        continue;
      }
      char16_t c2 = chars[1];
      if (!IsTrailSurrogate(c2)) {
        nbytes += 2; /* Bad Surrogate */
        continue;
      }
      v = UTF16Decode(c, c2);
      // The pair was counted as two bytes above; drop one and skip the
      // trail unit so the pair is sized as a single code point below.
      nbytes--;
      chars++;
    } else {
      v = c;
    }
    // One extra byte for anything >= 0x80, then one more for every further
    // five bits left after discarding the low eleven.
    v >>= 11;
    nbytes++;
    while (v) {
      v >>= 5;
      nbytes++;
    }
  }
  return nbytes;
}

/* Public entry point: UTF-8 encoded length of the linear string |s|. */
JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSLinearString* s) {
  JS::AutoCheckCannotGC nogc;
  return s->hasLatin1Chars()
             ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
             : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc),
                                             s->length());
}

/*
 * Encode |src| as UTF-8 into |dst| using the partial converters and return
 * the number of bytes written. The number of units read is discarded.
 */
JS_PUBLIC_API size_t JS::DeflateStringToUTF8Buffer(JSLinearString* src,
                                                   mozilla::Span<char> dst) {
  JS::AutoCheckCannotGC nogc;
  if (src->hasLatin1Chars()) {
    auto source = AsChars(Span(src->latin1Chars(nogc), src->length()));
    size_t read;
    size_t written;
    Tie(read, written) = ConvertLatin1toUtf8Partial(source, dst);
    Unused << read;
    return written;
  }
  auto source = Span(src->twoByteChars(nogc), src->length());
  size_t read;
  size_t written;
  Tie(read, written) = ConvertUtf16toUtf8Partial(source, dst);
  Unused << read;
  return written;
}

/* Encode |src| as UTF-8 into |dst|; specialized per source character type. */
template <typename CharT>
void ConvertToUTF8(mozilla::Span<CharT> src, mozilla::Span<char> dst);

template <>
void ConvertToUTF8<const char16_t>(mozilla::Span<const char16_t> src,
                                   mozilla::Span<char> dst) {
  Unused << ConvertUtf16toUtf8Partial(src, dst);
}

template <>
void ConvertToUTF8<const JS::Latin1Char>(
    mozilla::Span<const JS::Latin1Char> src, mozilla::Span<char> dst) {
  Unused << ConvertLatin1toUtf8Partial(AsChars(src), dst);
}

/*
 * Allocate a NUL-terminated UTF-8 copy of |chars|. The buffer comes from
 * |maybeCx| when it is non-null and from js_pod_malloc otherwise. Returns an
 * empty UTF8CharsZ if the allocation fails.
 */
template <typename CharT>
UTF8CharsZ JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
                                    const mozilla::Range<CharT> chars) {
  /* Get required buffer size. */
  const CharT* str = chars.begin().get();
  size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());

  /* Allocate buffer. */
  char* utf8;
  if (maybeCx) {
    utf8 = maybeCx->pod_malloc<char>(len + 1);
  } else {
    utf8 = js_pod_malloc<char>(len + 1);
  }
  if (!utf8) {
    return UTF8CharsZ();
  }

  /* Encode to UTF8. */
  ::ConvertToUTF8(Span(str, chars.length()), Span(utf8, len));
  utf8[len] = '\0';

  return UTF8CharsZ(utf8, len);
}

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* maybeCx, const mozilla::Range<JS::Latin1Char> chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* maybeCx, const mozilla::Range<char16_t> chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* maybeCx, const mozilla::Range<const JS::Latin1Char> chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* maybeCx, const mozilla::Range<const char16_t> chars);

static const uint32_t INVALID_UTF8 = UINT32_MAX;

/*
 * Convert a UTF-8 or WTF-8 (depending on InputCharsT, which is either
 * UTF8Chars or WTF8Chars) character sequence into a UCS-4 character and return
 * that character.  It is assumed that the caller already checked that the
 * sequence is valid.
 *
 * Returns INVALID_UTF8 for a non-shortest-form encoding and, for UTF8Chars
 * only, for a surrogate code point.
 */
template <class InputCharsT>
static uint32_t Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer,
                                      int utf8Length) {
  static_assert(std::is_same_v<InputCharsT, UTF8Chars> ||
                    std::is_same_v<InputCharsT, WTF8Chars>,
                "must be either UTF-8 or WTF-8");
  MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);

  if (utf8Length == 1) {
    MOZ_ASSERT(!(*utf8Buffer & 0x80));
    return *utf8Buffer;
  }

  /* from Unicode 3.1, non-shortest form is illegal */
  static const uint32_t minucs4Table[] = {0x80, 0x800, NonBMPMin};

  MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
             (0x100 - (1 << (8 - utf8Length))));
  uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
  uint32_t minucs4Char = minucs4Table[utf8Length - 2];
  while (--utf8Length) {
    MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
    ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
  }

  if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
    return INVALID_UTF8;
  }

  // WTF-8 allows lone surrogate.
  if (std::is_same_v<InputCharsT, UTF8Chars> &&
      MOZ_UNLIKELY(IsSurrogate(ucs4Char))) {
    return INVALID_UTF8;
  }

  return ucs4Char;
}

uint32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) {
  return Utf8ToOneUcs4CharImpl<UTF8Chars>(utf8Buffer, utf8Length);
}

/* Report JSMSG_MALFORMED_UTF8_CHAR for the byte at |offset|. */
static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) {
  // UINT32_MAX prints as ten decimal digits; leave room for the NUL so
  // large offsets are not truncated.
  char buffer[11];
  SprintfLiteral(buffer, "%u", offset);
  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                            JSMSG_MALFORMED_UTF8_CHAR, buffer);
}

/* Report JSMSG_BUFFER_TOO_SMALL; |dummy| only matches the reporter shape. */
static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) {
  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                            JSMSG_BUFFER_TOO_SMALL);
}

/* Report JSMSG_UTF8_CHAR_TOO_LARGE for the decoded value |v|. */
static void ReportTooBigCharacter(JSContext* cx, uint32_t v) {
  // "0x" plus up to eight hex digits plus the NUL.
  char buffer[11];
  SprintfLiteral(buffer, "0x%x", v);
  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                            JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
}

// Value returned by an output callback to tell the inflation loop whether to
// keep going.
enum class LoopDisposition {
  Break,
  Continue,
};

// What InflateUTF8ToUTF16 does when it meets malformed input.
enum class OnUTF8Error {
  InsertReplacementCharacter,
  InsertQuestionMark,
  Throw,
  Crash,
};

// Scan UTF-8 or WTF-8 input and (internally, at least) convert it to a series
// of UTF-16 code units. But you can also do odd things like pass an empty
// lambda for `dst`, in which case the output is discarded entirely--the only
// effect of calling the template that way is error-checking.
//
// Returns false only when ErrorAction is Throw and an error was reported on
// |cx|; otherwise returns true, including when |dst| asked to stop early.
template <OnUTF8Error ErrorAction, typename OutputFn, class InputCharsT>
static bool InflateUTF8ToUTF16(JSContext* cx, const InputCharsT src,
                               OutputFn dst) {
  size_t srclen = src.length();
  for (uint32_t i = 0; i < srclen; i++) {
    uint32_t v = uint32_t(src[i]);
    if (!(v & 0x80)) {
      // ASCII code unit.  Simple copy.
      if (dst(uint16_t(v)) == LoopDisposition::Break) {
        break;
      }
    } else {
      // Non-ASCII code unit.  Determine its length in bytes (n).
      uint32_t n = 1;
      while (v & (0x80 >> n)) {
        n++;
      }

// Handle one malformed sequence according to ErrorAction. In the replacement
// modes, emit the replacement unit and resume scanning |n2| bytes further on.
//
// A `break` here would only leave this macro's own do/while(0) wrapper and
// fall through into the checks that follow the macro use, so stopping early
// is spelled `return true`, which is what leaving the outer loop does.
#define INVALID(report, arg, n2)                                    \
  do {                                                              \
    if (ErrorAction == OnUTF8Error::Throw) {                        \
      report(cx, arg);                                              \
      return false;                                                 \
    } else if (ErrorAction == OnUTF8Error::Crash) {                 \
      MOZ_CRASH("invalid UTF-8 string: " #report);                  \
    } else {                                                        \
      char16_t replacement;                                         \
      if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \
        replacement = REPLACEMENT_CHARACTER;                        \
      } else {                                                      \
        MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
        replacement = '?';                                          \
      }                                                             \
      if (dst(replacement) == LoopDisposition::Break) {             \
        return true;                                                \
      }                                                             \
      n = n2;                                                       \
      goto invalidMultiByteCodeUnit;                                \
    }                                                               \
  } while (0)

      // Check the leading byte.
      if (n < 2 || n > 4) {
        INVALID(ReportInvalidCharacter, i, 1);
      }

      // Check that |src| is large enough to hold an n-byte code unit.
      if (i + n > srclen) {
        INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
      }

      // Check the second byte.  From Unicode Standard v6.2, Table 3-7
      // Well-Formed UTF-8 Byte Sequences.
      if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
          (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
          (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
          (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
      {
        if constexpr (std::is_same_v<InputCharsT, UTF8Chars>) {
          INVALID(ReportInvalidCharacter, i, 1);
        } else {
          // WTF-8 allows lone surrogate as ED A0~BF 80~BF.
          static_assert(std::is_same_v<InputCharsT, WTF8Chars>);
          if (v == 0xED &&
              ((uint8_t)src[i + 1] & 0xE0) != 0xA0) {  // ED A0~BF
            INVALID(ReportInvalidCharacter, i, 1);
          }
        }
      }

      // Check the continuation bytes.
      for (uint32_t m = 1; m < n; m++) {
        if ((src[i + m] & 0xC0) != 0x80) {
          INVALID(ReportInvalidCharacter, i, m);
        }
      }

      // Determine the code unit's length in CharT and act accordingly.
      v = Utf8ToOneUcs4CharImpl<InputCharsT>((uint8_t*)&src[i], n);
      if (v < NonBMPMin) {
        // The n-byte UTF8 code unit will fit in a single CharT.
        if (dst(char16_t(v)) == LoopDisposition::Break) {
          break;
        }
      } else if (v <= NonBMPMax) {
        // The n-byte UTF8 code unit will fit in two CharT units.
        if (dst(LeadSurrogate(v)) == LoopDisposition::Break) {
          break;
        }
        if (dst(TrailSurrogate(v)) == LoopDisposition::Break) {
          break;
        }
      } else {
        // The n-byte UTF8 code unit won't fit in two CharT units.
        INVALID(ReportTooBigCharacter, v, 1);
      }

    invalidMultiByteCodeUnit:
      // Move i to the last byte of the multi-byte code unit; the loop
      // header will do the final i++ to move to the start of the next
      // code unit.
      i += n - 1;
    }
  }

  return true;
}

#undef INVALID

/*
 * Write the inflated form of |src| into |dst| and NUL-terminate it at
 * |dst[outlen]|, so |dst| must have room for |outlen + 1| units. When
 * |allASCII| is true the bytes are copied one-to-one; otherwise the input is
 * decoded, and decoding is asserted to succeed and to produce |outlen| units.
 */
template <OnUTF8Error ErrorAction, typename CharT, class InputCharsT>
static void CopyAndInflateUTF8IntoBuffer(JSContext* cx, const InputCharsT src,
                                         CharT* dst, size_t outlen,
                                         bool allASCII) {
  if (allASCII) {
    size_t srclen = src.length();
    MOZ_ASSERT(outlen == srclen);
    for (uint32_t i = 0; i < srclen; i++) {
      dst[i] = CharT(src[i]);
    }
  } else {
    size_t j = 0;
    auto push = [dst, &j](char16_t c) -> LoopDisposition {
      dst[j++] = CharT(c);
      return LoopDisposition::Continue;
    };
    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
    MOZ_ASSERT(j == outlen);
  }
  dst[outlen] = CharT('\0');  // NUL char
}

/*
 * Two-pass inflation: first scan |src| to count the output units (reporting
 * errors per ErrorAction), then allocate |*outlen + 1| units from
 * |destArenaId| and fill them. Returns an empty CharsT on error or OOM.
 */
template <OnUTF8Error ErrorAction, typename CharsT, class InputCharsT>
static CharsT InflateUTF8StringHelper(JSContext* cx, const InputCharsT src,
                                      size_t* outlen,
                                      arena_id_t destArenaId) {
  using CharT = typename CharsT::CharT;
  static_assert(std::is_same_v<CharT, char16_t> ||
                    std::is_same_v<CharT, JS::Latin1Char>,
                "bad CharT");

  *outlen = 0;

  size_t len = 0;
  bool allASCII = true;
  auto count = [&len, &allASCII](char16_t c) -> LoopDisposition {
    len++;
    allASCII &= (c < 0x80);
    return LoopDisposition::Continue;
  };
  if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) {
    return CharsT();
  }
  *outlen = len;

  CharT* dst = cx->pod_arena_malloc<CharT>(destArenaId,
                                           *outlen + 1);  // +1 for NUL
  if (!dst) {
    ReportOutOfMemory(cx);
    return CharsT();
  }

  // The second pass substitutes for bad input instead of reporting it:
  // '?' for Latin-1 output, the replacement character otherwise.
  constexpr OnUTF8Error errorMode =
      std::is_same_v<CharT, JS::Latin1Char>
          ? OnUTF8Error::InsertQuestionMark
          : OnUTF8Error::InsertReplacementCharacter;
  CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);

  return CharsT(dst, *outlen);
}

TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
                                              const UTF8Chars utf8,
                                              size_t* outlen,
                                              arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
      cx, utf8, outlen, destArenaId);
}

TwoByteCharsZ JS::WTF8CharsToNewTwoByteCharsZ(JSContext* cx,
                                              const WTF8Chars wtf8,
                                              size_t* outlen,
                                              arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
      cx, wtf8, outlen, destArenaId);
}

TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
                                              const ConstUTF8CharsZ& utf8,
                                              size_t* outlen,
                                              arena_id_t destArenaId) {
  UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
  return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
      cx, chars, outlen, destArenaId);
}

TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx,
                                                   const JS::UTF8Chars utf8,
                                                   size_t* outlen,
                                                   arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
                                 TwoByteCharsZ>(cx, utf8, outlen, destArenaId);
}

TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(
    JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen,
    arena_id_t destArenaId) {
  UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
  return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
                                 TwoByteCharsZ>(cx, chars, outlen,
                                                destArenaId);
}

/* Raise |*encoding| to the smallest encoding able to hold |c|, if larger. */
static void UpdateSmallestEncodingForChar(char16_t c,
                                          JS::SmallestEncoding* encoding) {
  JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
  if (c >= 0x80) {
    if (c < 0x100) {
      newEncoding = JS::SmallestEncoding::Latin1;
    } else {
      newEncoding = JS::SmallestEncoding::UTF16;
    }
  }
  if (newEncoding > *encoding) {
    *encoding = newEncoding;
  }
}

/* Classify |utf8| as ASCII, Latin1 or UTF16 by scanning its bytes. */
JS::SmallestEncoding JS::FindSmallestEncoding(UTF8Chars utf8) {
  Span<const unsigned char> unsignedSpan = utf8;
  auto charSpan = AsChars(unsignedSpan);
  size_t upTo = AsciiValidUpTo(charSpan);
  if (upTo == charSpan.Length()) {
    return SmallestEncoding::ASCII;
  }
  // Only the tail after the ASCII prefix needs the Latin-1 check.
  if (IsUtf8Latin1(charSpan.From(upTo))) {
    return SmallestEncoding::Latin1;
  }
  return SmallestEncoding::UTF16;
}

Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx,
                                            const UTF8Chars utf8,
                                            size_t* outlen,
                                            arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>(
      cx, utf8, outlen, destArenaId);
}

Latin1CharsZ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx,
                                                 const UTF8Chars utf8,
                                                 size_t* outlen,
                                                 arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark,
                                 Latin1CharsZ>(cx, utf8, outlen, destArenaId);
}

/**
 * Atomization Helpers.
 *
 * These functions are extremely single-use, and are not intended for general
 * consumption.
 */

/*
 * Scan |utf8| once and report its inflated length, the smallest encoding
 * that can hold it, and the hash of its UTF-16 code units. Returns false if
 * the input is malformed.
 */
template <typename InputCharsT>
bool GetUTF8AtomizationData(JSContext* cx, const InputCharsT utf8,
                            size_t* outlen, JS::SmallestEncoding* encoding,
                            HashNumber* hashNum) {
  *outlen = 0;
  *encoding = JS::SmallestEncoding::ASCII;
  *hashNum = 0;

  auto getMetadata = [outlen, encoding,
                      hashNum](char16_t c) -> LoopDisposition {
    (*outlen)++;
    UpdateSmallestEncodingForChar(c, encoding);
    *hashNum = mozilla::AddToHash(*hashNum, c);
    return LoopDisposition::Continue;
  };
  if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) {
    return false;
  }

  return true;
}

template bool GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8,
                                     size_t* outlen,
                                     JS::SmallestEncoding* encoding,
                                     HashNumber* hashNum);

template bool GetUTF8AtomizationData(JSContext* cx, const JS::WTF8Chars utf8,
                                     size_t* outlen,
                                     JS::SmallestEncoding* encoding,
                                     HashNumber* hashNum);

/*
 * Compare the inflated form of |utfChars| against |chars| unit by unit,
 * stopping at the first mismatch. |chars| is indexed once per decoded unit
 * with no length check, so it must hold at least that many units.
 */
template <typename CharT, class CharsT>
bool UTF8OrWTF8EqualsChars(const CharsT utfChars, const CharT* chars) {
  size_t ind = 0;
  bool isEqual = true;

  auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition {
#ifdef DEBUG
    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
    UpdateSmallestEncodingForChar(c, &encoding);
    if (std::is_same_v<CharT, JS::Latin1Char>) {
      MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1);
    } else if (!std::is_same_v<CharT, char16_t>) {
      MOZ_CRASH("Invalid character type in UTF8EqualsChars");
    }
#endif

    if (CharT(c) != chars[ind]) {
      isEqual = false;
      return LoopDisposition::Break;
    }

    ind++;
    return LoopDisposition::Continue;
  };

  // To get here, you must have checked your work.
  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars,
                                         checkEqual);

  return isEqual;
}

template bool UTF8OrWTF8EqualsChars(const JS::UTF8Chars, const char16_t*);
template bool UTF8OrWTF8EqualsChars(const JS::UTF8Chars,
                                    const JS::Latin1Char*);
template bool UTF8OrWTF8EqualsChars(const JS::WTF8Chars, const char16_t*);
template bool UTF8OrWTF8EqualsChars(const JS::WTF8Chars,
                                    const JS::Latin1Char*);

/*
 * Inflate already-validated |src| into |dst| and NUL-terminate it at
 * |dst[dstLen]|. Malformed input crashes rather than being reported.
 */
template <typename CharT, class InputCharsT>
void InflateUTF8CharsToBufferAndTerminate(const InputCharsT src, CharT* dst,
                                          size_t dstLen,
                                          JS::SmallestEncoding encoding) {
  CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(
      /* cx = */ nullptr, src, dst, dstLen,
      encoding == JS::SmallestEncoding::ASCII);
}

template void InflateUTF8CharsToBufferAndTerminate(
    const UTF8Chars src, char16_t* dst, size_t dstLen,
    JS::SmallestEncoding encoding);
template void InflateUTF8CharsToBufferAndTerminate(
    const UTF8Chars src, JS::Latin1Char* dst, size_t dstLen,
    JS::SmallestEncoding encoding);
template void InflateUTF8CharsToBufferAndTerminate(
    const WTF8Chars src, char16_t* dst, size_t dstLen,
    JS::SmallestEncoding encoding);
template void InflateUTF8CharsToBufferAndTerminate(
    const WTF8Chars src, JS::Latin1Char* dst, size_t dstLen,
    JS::SmallestEncoding encoding);

#ifdef DEBUG
/* Debug-only check: crash if the first |aLength| bytes are malformed. */
void JS::ConstUTF8CharsZ::validate(size_t aLength) {
  MOZ_ASSERT(data_);
  UTF8Chars chars(data_, aLength);
  auto nop = [](char16_t) -> LoopDisposition {
    return LoopDisposition::Continue;
  };
  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, chars, nop);
}
#endif

/* True when no byte of the NUL-terminated string |s| has its high bit set. */
bool JS::StringIsASCII(const char* s) {
  while (*s) {
    if (*s & 0x80) {
      return false;
    }
    s++;
  }
  return true;
}

bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); }

/*
 * Append |len| UTF-8 code units to this buffer. While the buffer is Latin-1,
 * leading ASCII units are appended directly; the first non-ASCII unit forces
 * inflation to two-byte storage. Returns false on allocation failure or on
 * malformed UTF-8.
 */
bool StringBuffer::append(const Utf8Unit* units, size_t len) {
  if (isLatin1()) {
    Latin1CharBuffer& latin1 = latin1Chars();

    while (len > 0) {
      if (!IsAscii(*units)) {
        break;
      }

      if (!latin1.append(units->toUnsignedChar())) {
        return false;
      }

      ++units;
      --len;
    }
    if (len == 0) {
      return true;
    }

    // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
    // |latin1|, but it's only possible for [U+0080, U+0100) code points,
    // and handling the full complexity of UTF-8 only for that very small
    // additional range isn't worth it.  Inflate to two-byte storage before
    // appending the remaining code points.
    if (!inflateChars()) {
      return false;
    }
  }

  UTF8Chars remainingUtf8(units, len);

  // Determine how many UTF-16 code units are required to represent the
  // remaining units.
  size_t utf16Len = 0;
  auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
    utf16Len++;
    return LoopDisposition::Continue;
  };
  if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx_, remainingUtf8,
                                              countInflated)) {
    return false;
  }

  TwoByteCharBuffer& buf = twoByteChars();

  size_t i = buf.length();
  if (!buf.growByUninitialized(utf16Len)) {
    return false;
  }
  MOZ_ASSERT(i + utf16Len == buf.length(),
             "growByUninitialized assumed to increase length immediately");

  // The counting pass above already validated the input, so this second
  // pass cannot fail.
  char16_t* toFill = &buf[i];
  auto appendUtf16 = [&toFill](char16_t unit) {
    *toFill++ = unit;
    return LoopDisposition::Continue;
  };

  MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx_, remainingUtf8,
                                                         appendUtf16));
  MOZ_ASSERT(toFill == buf.end());
  return true;
}