summary refs log tree commit diff stats
path: root/js/src/frontend/TokenStream.cpp
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree f435a8308119effd964b339f76abb83a57c29483 /js/src/frontend/TokenStream.cpp
parent Initial commit. (diff)
download firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'js/src/frontend/TokenStream.cpp')
-rw-r--r-- js/src/frontend/TokenStream.cpp 3733
1 files changed, 3733 insertions, 0 deletions
diff --git a/js/src/frontend/TokenStream.cpp b/js/src/frontend/TokenStream.cpp
new file mode 100644
index 0000000000..2134972bf4
--- /dev/null
+++ b/js/src/frontend/TokenStream.cpp
@@ -0,0 +1,3733 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// JS lexical scanner.
+
+#include "frontend/TokenStream.h"
+
+#include "mozilla/ArrayUtils.h"
+#include "mozilla/Attributes.h"
+#include "mozilla/Likely.h"
+#include "mozilla/Maybe.h"
+#include "mozilla/MemoryChecking.h"
+#include "mozilla/ScopeExit.h"
+#include "mozilla/Span.h"
+#include "mozilla/TemplateLib.h"
+#include "mozilla/TextUtils.h"
+#include "mozilla/Utf8.h"
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <type_traits>
+#include <utility>
+
+#include "jsnum.h"
+
+#include "frontend/FrontendContext.h"
+#include "frontend/Parser.h"
+#include "frontend/ParserAtom.h"
+#include "frontend/ReservedWords.h"
+#include "js/CharacterEncoding.h" // JS::ConstUTF8CharsZ
+#include "js/ColumnNumber.h" // JS::LimitedColumnNumberOneOrigin, JS::ColumnNumberOneOrigin, JS::TaggedColumnNumberOneOrigin
+#include "js/ErrorReport.h" // JSErrorBase
+#include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_*
+#include "js/Printf.h" // JS_smprintf
+#include "js/RegExpFlags.h" // JS::RegExpFlags
+#include "js/UniquePtr.h"
+#include "util/Text.h"
+#include "util/Unicode.h"
+#include "vm/FrameIter.h" // js::{,NonBuiltin}FrameIter
+#include "vm/JSContext.h"
+#include "vm/Realm.h"
+
+using mozilla::AsciiAlphanumericToNumber;
+using mozilla::AssertedCast;
+using mozilla::DecodeOneUtf8CodePoint;
+using mozilla::IsAscii;
+using mozilla::IsAsciiAlpha;
+using mozilla::IsAsciiDigit;
+using mozilla::IsAsciiHexDigit;
+using mozilla::IsTrailingUnit;
+using mozilla::MakeScopeExit;
+using mozilla::Maybe;
+using mozilla::PointerRangeSize;
+using mozilla::Span;
+using mozilla::Utf8Unit;
+
+using JS::ReadOnlyCompileOptions;
+using JS::RegExpFlag;
+using JS::RegExpFlags;
+
+struct ReservedWordInfo {  // one row of the |reservedWords| table: spelling plus token kind
+ const char* chars; // C string with reserved word text
+ js::frontend::TokenKind tokentype;  // token kind paired with |chars| in |reservedWords|
+};
+
+static const ReservedWordInfo reservedWords[] = {  // one {text, TokenKind} entry per FOR_EACH_JAVASCRIPT_RESERVED_WORD item, in macro order
+#define RESERVED_WORD_INFO(word, name, type) {#word, js::frontend::type},
+ FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
+#undef RESERVED_WORD_INFO
+};
+
+enum class ReservedWordsIndex : size_t {  // generated from the same macro list as |reservedWords|, so each enumerator is that word's index there
+#define ENTRY_(_1, NAME, _3) NAME,
+ FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
+#undef ENTRY_
+};
+
+// Returns a ReservedWordInfo for the specified characters, or nullptr if the
+// string is not a reserved word. |length| must be nonzero (asserted below).
+template <typename CharT>
+static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
+ MOZ_ASSERT(length != 0);
+
+ size_t i;  // index into |reservedWords|; assigned by JSRW_GOT_MATCH / JSRW_TEST_GUESS before they jump
+ const ReservedWordInfo* rw;
+ const char* chars;
+
+#define JSRW_LENGTH() length
+#define JSRW_AT(column) s[column]
+#define JSRW_GOT_MATCH(index) \
+ i = (index); \
+ goto got_match;
+#define JSRW_TEST_GUESS(index) \
+ i = (index); \
+ goto test_guess;
+#define JSRW_NO_MATCH() goto no_match;
+#include "frontend/ReservedWordsGenerated.h"  // NOTE(review): presumably expands to the matching code that invokes the JSRW_* hooks above -- confirm against the generator
+#undef JSRW_NO_MATCH
+#undef JSRW_TEST_GUESS
+#undef JSRW_GOT_MATCH
+#undef JSRW_AT
+#undef JSRW_LENGTH
+
+got_match:  // reached via JSRW_GOT_MATCH: entry |i| is returned with no further comparison here
+ return &reservedWords[i];
+
+test_guess:  // reached via JSRW_TEST_GUESS: entry |i| is only a candidate and is verified below
+ rw = &reservedWords[i];
+ chars = rw->chars;
+ do {  // compare exactly |length| input units against the candidate's text
+ if (*s++ != static_cast<unsigned char>(*chars++)) {
+ goto no_match;
+ }
+ } while (--length != 0);
+ return rw;
+
+no_match:
+ return nullptr;
+}
+
+// Utf8Unit specialization: views the UTF-8 units as unsigned chars and hands
+// them to the FindReservedWord overload selected for that character type.
+template <>
+MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
+    const Utf8Unit* units, size_t length) {
+  const auto* rawUnits = Utf8AsUnsignedChars(units);
+  return FindReservedWord(rawUnits, length);
+}
+
+static const ReservedWordInfo* FindReservedWord(  // atom-based lookup: switches on the atom's raw data
+ const js::frontend::TaggedParserAtomIndex atom) {
+ switch (atom.rawData()) {  // one generated case per reserved word, each returning that word's |reservedWords| entry
+#define CASE_(_1, NAME, _3) \
+ case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
+ return &reservedWords[size_t(ReservedWordsIndex::NAME)];
+ FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
+#undef CASE_
+ }
+
+ return nullptr;  // no generated case matched: |atom| is not a reserved word
+}
+
+// Whether |c| is the ASCII digit '0' or '1'.  The value is converted to the
+// unsigned counterpart of CharT before it is compared.
+template <typename CharT>
+static constexpr bool IsAsciiBinary(CharT c) {
+  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
+  return '0' <= unit && unit <= '1';
+}
+
+// Whether |c| is one of the ASCII digits '0' through '7'.  The value is
+// converted to the unsigned counterpart of CharT before it is compared.
+template <typename CharT>
+static constexpr bool IsAsciiOctal(CharT c) {
+  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
+  return !(unit < '0' || unit > '7');
+}
+
+// Maps an ASCII digit character to its numeric value by subtracting '0'.
+// No range check is performed here, so the result is only meaningful when the
+// caller has already established that |c| is a digit.
+template <typename CharT>
+static constexpr uint8_t AsciiOctalToNumber(CharT c) {
+  using Unsigned = std::make_unsigned_t<CharT>;
+  return static_cast<uint8_t>(static_cast<Unsigned>(c) - '0');
+}
+
+namespace js {
+
+namespace frontend {
+
+// True when |atom| is a reserved word whose token kind satisfies
+// TokenKindIsKeyword; false for every other atom.
+bool IsKeyword(TaggedParserAtomIndex atom) {
+  const ReservedWordInfo* info = FindReservedWord(atom);
+  return info != nullptr && TokenKindIsKeyword(info->tokentype);
+}
+
+// Token kind of the reserved word |name|, or TokenKind::Limit when |name| is
+// not a reserved word.
+TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
+  const ReservedWordInfo* info = FindReservedWord(name);
+  return info ? info->tokentype : TokenKind::Limit;
+}
+
+// Spelling of the reserved word |name| as a C string, or nullptr when |name|
+// is not a reserved word.  The spelling is obtained from the TokenKind
+// overload.
+const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
+  const ReservedWordInfo* info = FindReservedWord(name);
+  if (!info) {
+    return nullptr;
+  }
+
+  return ReservedWordToCharZ(info->tokentype);
+}
+
+// Returns the source spelling of the reserved word whose token kind is |tt|.
+// |tt| must not be TokenKind::Name (asserted).  A kind with no generated case
+// hits MOZ_ASSERT_UNREACHABLE; if control continues past that assertion,
+// nullptr is returned.
+const char* ReservedWordToCharZ(TokenKind tt) {
+  MOZ_ASSERT(tt != TokenKind::Name);
+  switch (tt) {
+#define EMIT_CASE(word, name, type) \
+  case type:                        \
+    return #word;
+    FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
+#undef EMIT_CASE
+    default:
+      // This function is keyed by TokenKind, so the diagnostic names that
+      // type, matching reservedWordToPropertyName's message.
+      MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
+  }
+  return nullptr;
+}
+
+TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(  // maps a reserved word's TokenKind to its well-known atom
+ TokenKind tt) const {
+ MOZ_ASSERT(tt != TokenKind::Name);
+ switch (tt) {  // one generated case per reserved word
+#define EMIT_CASE(word, name, type) \
+ case type: \
+ return TaggedParserAtomIndex::WellKnown::name();
+ FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
+#undef EMIT_CASE
+ default:
+ MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
+ }
+ return TaggedParserAtomIndex::null();  // only reached for a kind with no generated case
+}
+
+// Starts the line table with two entries: the offset at which the first line
+// begins, followed by the MAX_PTR sentinel.
+SourceCoords::SourceCoords(FrontendContext* fc, uint32_t initialLineNumber,
+                           uint32_t initialOffset)
+    : lineStartOffsets_(fc), initialLineNum_(initialLineNumber), lastIndex_(0) {
+  // Do not pass MAX_PTR to the append call directly: copying it into a local
+  // first is required.  Without the copy GCC and clang fail to build, and the
+  // alternative of adding an out-of-class definition
+  //
+  //   const uint32_t SourceCoords::MAX_PTR;
+  //
+  // satisfies GCC/clang but breaks the Windows build.
+  uint32_t sentinel = MAX_PTR;
+
+  // |lineStartOffsets_| has statically-allocated elements, so reserving and
+  // appending two entries cannot fail.
+  MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
+  MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
+
+  // Entry 0: buffer offset where the first line begins.  Entry 1: sentinel.
+  lineStartOffsets_.infallibleAppend(initialOffset);
+  lineStartOffsets_.infallibleAppend(sentinel);
+}
+
+MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,  // records |lineStartOffset| as the start of line |lineNum|; returns false only when the append fails
+ uint32_t lineStartOffset) {
+ uint32_t index = indexFromLineNumber(lineNum);
+ uint32_t sentinelIndex = lineStartOffsets_.length() - 1;  // the last entry is always the MAX_PTR sentinel (asserted below)
+
+ MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
+ MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);
+
+ if (index == sentinelIndex) {
+ // We haven't seen this newline before. Update lineStartOffsets_
+ // only if lineStartOffsets_.append succeeds, to keep sentinel.
+ // Otherwise return false to tell TokenStream about OOM.
+ uint32_t maxPtr = MAX_PTR;
+ if (!lineStartOffsets_.append(maxPtr)) {  // append the new sentinel first ...
+ static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
+ TempAllocPolicy&>,
+ "this function's caller depends on it reporting an "
+ "error on failure, as TempAllocPolicy ensures");
+ return false;
+ }
+
+ lineStartOffsets_[index] = lineStartOffset;  // ... then overwrite the old sentinel slot with the real line start
+ } else {
+ // We have seen this newline before (and ungot it). Do nothing (other
+ // than checking it hasn't mysteriously changed).
+ // This path can be executed after hitting OOM, so check index.
+ MOZ_ASSERT_IF(index < sentinelIndex,
+ lineStartOffsets_[index] == lineStartOffset);
+ }
+ return true;
+}
+
+// Extends this line table with the entries of |other| that it does not have
+// yet.  Both tables must begin at the same offset and end with the MAX_PTR
+// sentinel (asserted).  Returns false if an append fails; returns true
+// immediately when this table is already at least as long as |other|.
+MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
+  MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
+  MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
+  MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);
+
+  const size_t otherLength = other.lineStartOffsets_.length();
+  if (otherLength <= lineStartOffsets_.length()) {
+    return true;
+  }
+
+  // Replace our sentinel with |other|'s entry for the same slot, then copy
+  // everything after it (|other|'s own sentinel included).
+  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
+  lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];
+
+  size_t next = sentinelIndex + 1;
+  while (next < otherLength) {
+    if (!lineStartOffsets_.append(other.lineStartOffsets_[next])) {
+      return false;
+    }
+    next++;
+  }
+
+  return true;
+}
+
+MOZ_ALWAYS_INLINE uint32_t
+SourceCoords::indexFromOffset(uint32_t offset) const {  // returns the index i with lineStartOffsets_[i] <= offset < lineStartOffsets_[i + 1] (see the closing asserts)
+ uint32_t iMin, iMax, iMid;
+
+ if (lineStartOffsets_[lastIndex_] <= offset) {  // |lastIndex_| caches the result of the previous lookup
+ // If we reach here, offset is on a line the same as or higher than
+ // last time. Check first for the +0, +1, +2 cases, because they
+ // typically cover 85--98% of cases.
+ if (offset < lineStartOffsets_[lastIndex_ + 1]) {
+ return lastIndex_; // index is same as last time
+ }
+
+ // If we reach here, there must be at least one more entry (plus the
+ // sentinel). Try it.
+ lastIndex_++;
+ if (offset < lineStartOffsets_[lastIndex_ + 1]) {
+ return lastIndex_; // index is one higher than last time
+ }
+
+ // The same logic applies here.
+ lastIndex_++;
+ if (offset < lineStartOffsets_[lastIndex_ + 1]) {
+ return lastIndex_; // index is two higher than last time
+ }
+
+ // No luck. Oh well, we have a better-than-default starting point for
+ // the binary search.
+ iMin = lastIndex_ + 1;
+ MOZ_ASSERT(iMin <
+ lineStartOffsets_.length() - 1); // -1 due to the sentinel
+
+ } else {
+ iMin = 0;  // offset precedes the cached line: search from the first entry
+ }
+
+ // This is a binary search with deferred detection of equality, which was
+ // marginally faster in this case than a standard binary search.
+ // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
+ // want one before that.
+ iMax = lineStartOffsets_.length() - 2;
+ while (iMax > iMin) {
+ iMid = iMin + (iMax - iMin) / 2;
+ if (offset >= lineStartOffsets_[iMid + 1]) {
+ iMin = iMid + 1; // offset is above lineStartOffsets_[iMid]
+ } else {
+ iMax = iMid; // offset is below or within lineStartOffsets_[iMid]
+ }
+ }
+
+ MOZ_ASSERT(iMax == iMin);
+ MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
+ MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
+
+ lastIndex_ = iMin;  // remember the answer for the next lookup's fast path
+ return iMin;
+}
+
+// Wraps the line-table index found for |offset|, together with |offset|
+// itself, in a LineToken.
+SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
+  const uint32_t lineIndex = indexFromOffset(offset);
+  return LineToken(lineIndex, offset);
+}
+
+// Initializes the character-type-independent token stream state from
+// |options| and flags the token kinds that can end an expression.
+TokenStreamAnyChars::TokenStreamAnyChars(FrontendContext* fc,
+                                         const ReadOnlyCompileOptions& options,
+                                         StrictModeGetter* smg)
+    : fc(fc),
+      options_(options),
+      strictModeGetter_(smg),
+      filename_(options.filename()),
+      longLineColumnInfo_(fc),
+      srcCoords(fc, options.lineno, options.scriptSourceOffset),
+      lineno(options.lineno),
+      mutedErrors(options.mutedErrors()) {
+  // Every |isExprEnding| entry starts out zeroed, so only the true entries
+  // need to be written.
+  constexpr TokenKind ExprEndingKinds[] = {
+      TokenKind::Comma,      TokenKind::Semi,         TokenKind::Colon,
+      TokenKind::RightParen, TokenKind::RightBracket, TokenKind::RightCurly,
+  };
+  for (TokenKind kind : ExprEndingKinds) {
+    isExprEnding[size_t(kind)] = true;
+  }
+}
+
+template <typename Unit>
+TokenStreamCharsBase<Unit>::TokenStreamCharsBase(FrontendContext* fc,  // |fc| and |parserAtoms| are forwarded to TokenStreamCharsShared
+ ParserAtomsTable* parserAtoms,
+ const Unit* units,
+ size_t length,
+ size_t startOffset)
+ : TokenStreamCharsShared(fc, parserAtoms),
+ sourceUnits(units, length, startOffset) {}  // |sourceUnits| is built from the unit buffer, its length and the start offset; no other work is done
+
+// Copies the UTF-16 units in [cur, end) into the (initially empty)
+// |charBuffer|, writing a single '\n' for every "\r\n" pair and for every
+// lone '\r'.  Returns false if an append fails.
+bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
+                                                        const char16_t* cur,
+                                                        const char16_t* end) {
+  MOZ_ASSERT(charBuffer.length() == 0);
+
+  while (cur < end) {
+    char16_t unit = *cur;
+    cur++;
+
+    if (unit == '\r') {
+      // Swallow the '\n' of a "\r\n" pair so the pair yields one line break.
+      if (cur < end && *cur == '\n') {
+        cur++;
+      }
+      unit = '\n';
+    }
+
+    if (!charBuffer.append(unit)) {
+      return false;
+    }
+  }
+
+  MOZ_ASSERT(cur == end);
+  return true;
+}
+
+// Decodes the UTF-8 units in [cur, end) into the (initially empty)
+// |charBuffer|.  ASCII units are appended directly, with "\r\n" and lone '\r'
+// both written as a single '\n'; non-ASCII code points are decoded and
+// appended via AppendCodePointToCharBuffer.  Returns false if an append
+// fails.
+bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
+                                                        const Utf8Unit* cur,
+                                                        const Utf8Unit* end) {
+  MOZ_ASSERT(charBuffer.length() == 0);
+
+  while (cur < end) {
+    const Utf8Unit lead = *cur++;
+
+    if (MOZ_UNLIKELY(!IsAscii(lead))) {
+      // Multi-unit code point: decode it, advancing |cur| past its units.
+      Maybe<char32_t> codePoint = DecodeOneUtf8CodePoint(lead, &cur, end);
+      MOZ_ASSERT(codePoint.isSome(),
+                 "provided source text should already have been validated");
+
+      if (!AppendCodePointToCharBuffer(charBuffer, codePoint.value())) {
+        return false;
+      }
+      continue;
+    }
+
+    char16_t ch = lead.toUint8();
+    if (ch == '\r') {
+      // Swallow the '\n' of a "\r\n" pair so the pair yields one line break.
+      if (cur < end && *cur == Utf8Unit('\n')) {
+        cur++;
+      }
+      ch = '\n';
+    }
+
+    if (!charBuffer.append(ch)) {
+      return false;
+    }
+  }
+
+  MOZ_ASSERT(cur == end);
+  return true;
+}
+
+template <typename Unit, class AnyCharsAccess>
+TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
+ FrontendContext* fc, ParserAtomsTable* parserAtoms,
+ const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
+ : TokenStreamChars<Unit, AnyCharsAccess>(fc, parserAtoms, units, length,
+ options.scriptSourceOffset) {}  // everything is forwarded to the base class; the start offset comes from |options.scriptSourceOffset|
+
+// Validates the compile options.  The starting column must not exceed
+// JS::LimitedColumnNumberOneOrigin::Limit; if it does, JSMSG_BAD_COLUMN_NUMBER
+// is reported and false is returned.
+bool TokenStreamAnyChars::checkOptions() {
+  // Constrain starting columns to where they will saturate.
+  const bool columnInRange = options().column.oneOriginValue() <=
+                             JS::LimitedColumnNumberOneOrigin::Limit;
+  if (columnInRange) {
+    return true;
+  }
+
+  reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
+  return false;
+}
+
+// Variadic front end for reportErrorNoOffsetVA: packages the arguments that
+// follow |errorNumber| into a va_list and forwards them.
+void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) const {
+  va_list ap;
+  va_start(ap, errorNumber);
+  reportErrorNoOffsetVA(errorNumber, &ap);
+  va_end(ap);
+}
+
+void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
+ va_list* args) const {
+ ErrorMetadata metadata;
+ computeErrorMetadataNoOffset(&metadata);
+
+ ReportCompileErrorLatin1VA(fc, std::move(metadata), nullptr, errorNumber,
+ args);
+}
+
+// Advances the line bookkeeping past an end-of-line: the current line base
+// becomes the previous one, |lineStartOffset| becomes the new line base, and
+// the line number is incremented.  Returns false (after reporting
+// JSMSG_BAD_LINE_NUMBER) if the line number wraps to zero, or if recording the
+// new line start in |srcCoords| fails.
+[[nodiscard]] MOZ_ALWAYS_INLINE bool
+TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
+  prevLinebase = linebase;
+  linebase = lineStartOffset;
+
+  // On overflow, report error.
+  if (MOZ_UNLIKELY(++lineno == 0)) {
+    reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
+    return false;
+  }
+
+  return srcCoords.add(lineno, linebase);
+}
+
+#ifdef DEBUG
+
+template <>
+inline void SourceUnits<char16_t>::assertNextCodePoint(  // DEBUG-only check that the units at |ptr| encode |peeked|'s code point
+ const PeekedCodePoint<char16_t>& peeked) {
+ char32_t c = peeked.codePoint();
+ if (c < unicode::NonBMPMin) {  // below NonBMPMin: expect one unit equal to the code point
+ MOZ_ASSERT(peeked.lengthInUnits() == 1);
+ MOZ_ASSERT(ptr[0] == c);
+ } else {  // otherwise: expect the lead/trail pair produced by UTF16Encode
+ MOZ_ASSERT(peeked.lengthInUnits() == 2);
+ char16_t lead, trail;
+ unicode::UTF16Encode(c, &lead, &trail);
+ MOZ_ASSERT(ptr[0] == lead);
+ MOZ_ASSERT(ptr[1] == trail);
+ }
+}
+
+template <>
+inline void SourceUnits<Utf8Unit>::assertNextCodePoint(  // DEBUG-only check that the units at |ptr| encode |peeked|'s code point
+ const PeekedCodePoint<Utf8Unit>& peeked) {
+ char32_t c = peeked.codePoint();
+
+ // This is all roughly indulgence of paranoia only for assertions, so the
+ // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
+ uint8_t expectedUnits[4] = {};
+ if (c < 0x80) {  // one unit: the code point itself
+ expectedUnits[0] = AssertedCast<uint8_t>(c);
+ } else if (c < 0x800) {  // two units: 110xxxxx 10xxxxxx
+ expectedUnits[0] = 0b1100'0000 | (c >> 6);
+ expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
+ } else if (c < 0x10000) {  // three units: 1110xxxx 10xxxxxx 10xxxxxx
+ expectedUnits[0] = 0b1110'0000 | (c >> 12);
+ expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
+ expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
+ } else {  // four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ expectedUnits[0] = 0b1111'0000 | (c >> 18);
+ expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
+ expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
+ expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
+ }
+
+ MOZ_ASSERT(peeked.lengthInUnits() <= 4);
+ for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {  // only the first lengthInUnits() expected units are compared
+ MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
+ }
+}
+
+#endif // DEBUG
+
+static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(  // moves |*ptr| backward until it no longer points at a UTF-8 trailing unit
+ const Utf8Unit** ptr, const Utf8Unit* limit) {
+ MOZ_ASSERT(*ptr <= limit);
+
+ // |limit| is a code point boundary.
+ if (MOZ_UNLIKELY(*ptr == limit)) {
+ return;  // also avoids reading the unit at |limit|
+ }
+
+ // Otherwise rewind past trailing units to the start of the code point.
+#ifdef DEBUG
+ size_t retracted = 0;  // DEBUG-only count of units stepped back, consumed by the assertion below
+#endif
+ while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
+ --*ptr;
+#ifdef DEBUG
+ retracted++;
+#endif
+ }
+
+ MOZ_ASSERT(retracted < 4,
+ "the longest UTF-8 code point is four units, so this should never "
+ "retract more than three units");
+}
+
+// UTF-16 overload: if |*ptr| points at the trail half of a surrogate pair,
+// moves it back one unit onto the lead half.  |*ptr| is left untouched when it
+// equals |limit| (itself a code point boundary), when it does not point at a
+// trail surrogate, or when the preceding unit is not a lead surrogate.
+static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
+    const char16_t** ptr, const char16_t* limit) {
+  MOZ_ASSERT(*ptr <= limit);
+
+  const char16_t* const unit = *ptr;
+  if (MOZ_UNLIKELY(unit == limit)) {
+    return;
+  }
+
+  // Only a trail surrogate can be the second unit of a two-unit code point.
+  if (MOZ_LIKELY(!unicode::IsTrailSurrogate(unit[0]))) {
+    return;
+  }
+
+  // Outside test suites feeding garbage WTF-16, the preceding unit is
+  // basically guaranteed to be the matching lead surrogate.
+  if (MOZ_LIKELY(unicode::IsLeadSurrogate(unit[-1]))) {
+    *ptr = unit - 1;
+  }
+}
+
+// Column offset of |offset| from the start of its line.  For char16_t source
+// this is simply the distance in code units from the line start; for any
+// other unit type the work is delegated to computeColumnOffsetForUTF8.
+template <typename Unit>
+JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffset(
+    const LineToken lineToken, const uint32_t offset,
+    const SourceUnits<Unit>& sourceUnits) const {
+  lineToken.assertConsistentOffset(offset);
+
+  const uint32_t lineStart = srcCoords.lineStart(lineToken);
+  const uint32_t unitsIntoLine = offset - lineStart;
+
+  constexpr bool unitsAreUtf16 = std::is_same_v<Unit, char16_t>;
+  if constexpr (unitsAreUtf16) {
+    // Column offset is in UTF-16 code units.
+    return JS::ColumnNumberUnsignedOffset(unitsIntoLine);
+  }
+
+  return computeColumnOffsetForUTF8(lineToken, offset, lineStart,
+                                    unitsIntoLine, sourceUnits);
+}
+
+template <typename Unit>  // Returns the column offset of |offset| on its line, counted in UTF-16 code units, using per-line caches to avoid recounting from |start|.
+JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffsetForUTF8(
+ const LineToken lineToken, const uint32_t offset, const uint32_t start,
+ const uint32_t offsetInLine, const SourceUnits<Unit>& sourceUnits) const {
+ const uint32_t line = lineNumber(lineToken);
+
+ // Reset the previous offset/column number offset cache for this line, if the
+ // previous lookup wasn't on this line.
+ if (line != lineOfLastColumnComputation_) {
+ lineOfLastColumnComputation_ = line;
+ lastChunkVectorForLine_ = nullptr;
+ lastOffsetOfComputedColumn_ = start;
+ lastComputedColumnOffset_ = JS::ColumnNumberUnsignedOffset::zero();
+ }
+
+ // Compute and return the final column number offset from a partially
+ // calculated offset/column number offset, using the last-cached
+ // offset/column number offset if they're more optimal.
+ auto OffsetFromPartial =
+ [this, offset, &sourceUnits](
+ uint32_t partialOffset,
+ JS::ColumnNumberUnsignedOffset partialColumnOffset,
+ UnitsType unitsType) {
+ MOZ_ASSERT(partialOffset <= offset);
+
+ // If the last lookup on this line was closer to |offset|, use it.
+ if (partialOffset < this->lastOffsetOfComputedColumn_ &&
+ this->lastOffsetOfComputedColumn_ <= offset) {
+ partialOffset = this->lastOffsetOfComputedColumn_;
+ partialColumnOffset = this->lastComputedColumnOffset_;
+ }
+
+ const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
+ const Unit* end = sourceUnits.codeUnitPtrAt(offset);
+
+ size_t offsetDelta =
+ AssertedCast<uint32_t>(PointerRangeSize(begin, end));
+ partialOffset += offsetDelta;  // |partialOffset| now equals |offset|
+
+ if (unitsType == UnitsType::GuaranteedSingleUnit) {
+ MOZ_ASSERT(unicode::CountUTF16CodeUnits(begin, end) == offsetDelta,
+ "guaranteed-single-units also guarantee pointer distance "
+ "equals UTF-16 code unit count");
+ partialColumnOffset += JS::ColumnNumberUnsignedOffset(offsetDelta);
+ } else {
+ partialColumnOffset += JS::ColumnNumberUnsignedOffset(
+ AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end)));
+ }
+
+ this->lastOffsetOfComputedColumn_ = partialOffset;  // refresh the last-lookup cache for this line
+ this->lastComputedColumnOffset_ = partialColumnOffset;
+ return partialColumnOffset;
+ };
+
+ // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
+ // column has offset less than this value. The most common (non-minified)
+ // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
+ // the next power of two for efficient division/multiplication below.
+ constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
+
+ // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
+ const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
+ if (chunkIndex == 0) {
+ // We don't know from an |offset| in the zeroth chunk that this line is even
+ // long. First-chunk info is mostly useless, anyway -- we have |start|
+ // already. So if we have *easy* access to that zeroth chunk, use it --
+ // otherwise just count pessimally. (This will still benefit from caching
+ // the last column/offset for computations for successive offsets, so it's
+ // not *always* worst-case.)
+ UnitsType unitsType;
+ if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
+ MOZ_ASSERT((*lastChunkVectorForLine_)[0].columnOffset() ==
+ JS::ColumnNumberUnsignedOffset::zero());
+ unitsType = (*lastChunkVectorForLine_)[0].unitsType();
+ } else {
+ unitsType = UnitsType::PossiblyMultiUnit;
+ }
+
+ return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
+ unitsType);
+ }
+
+ // If this line has no chunk vector yet, insert one in the hash map. (The
+ // required index is allocated and filled further down.)
+ if (!lastChunkVectorForLine_) {
+ auto ptr = longLineColumnInfo_.lookupForAdd(line);
+ if (!ptr) {
+ // This could rehash and invalidate a cached vector pointer, but the outer
+ // condition means we don't have a cached pointer.
+ if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(fc))) {
+ // In case of OOM, just count columns from the start of the line.
+ fc->recoverFromOutOfMemory();
+ return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
+ UnitsType::PossiblyMultiUnit);
+ }
+ }
+
+ // Note that adding elements to this vector won't invalidate this pointer.
+ lastChunkVectorForLine_ = &ptr->value();
+ }
+
+ const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
+
+ auto RetractedOffsetOfChunk = [  // offset of chunk |index|'s start, pulled back to a code point boundary if the naive offset splits one
+#ifdef DEBUG
+ this,
+#endif
+ start, limit,
+ &sourceUnits](uint32_t index) {
+ MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
+
+ uint32_t naiveOffset = start + index * ColumnChunkLength;
+ const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
+
+ const Unit* actualPtr = naivePtr;
+ RetractPointerToCodePointBoundary(&actualPtr, limit);
+
+#ifdef DEBUG
+ if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
+ UnitsType::GuaranteedSingleUnit) {
+ MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
+ }
+#endif
+
+ return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
+ };
+
+ uint32_t partialOffset;
+ JS::ColumnNumberUnsignedOffset partialColumnOffset;
+ UnitsType unitsType;
+
+ auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
+ if (chunkIndex < entriesLen) {
+ // We've computed the chunk |offset| resides in. Compute the column number
+ // from the chunk.
+ partialOffset = RetractedOffsetOfChunk(chunkIndex);
+ partialColumnOffset = (*lastChunkVectorForLine_)[chunkIndex].columnOffset();
+
+ // This is exact if |chunkIndex| isn't the last chunk.
+ unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
+
+ // Otherwise the last chunk is pessimistically assumed to contain multi-unit
+ // code points because we haven't fully examined its contents yet -- they
+ // may not have been tokenized yet, they could contain encoding errors, or
+ // they might not even exist.
+ MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
+ (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
+ UnitsType::PossiblyMultiUnit);
+ } else {
+ // Extend the vector from its last entry or the start of the line. (This is
+ // also a suitable partial start point if we must recover from OOM.)
+ if (entriesLen > 0) {
+ partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
+ partialColumnOffset =
+ (*lastChunkVectorForLine_)[entriesLen - 1].columnOffset();
+ } else {
+ partialOffset = start;
+ partialColumnOffset = JS::ColumnNumberUnsignedOffset::zero();
+ }
+
+ if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
+ // As earlier, just start from the greatest offset/column in case of OOM.
+ fc->recoverFromOutOfMemory();
+ return OffsetFromPartial(partialOffset, partialColumnOffset,
+ UnitsType::PossiblyMultiUnit);
+ }
+
+ // OOM is no longer possible now. \o/
+
+ // The vector always begins with the column of the line start, i.e. zero,
+ // with chunk units pessimally assumed not single-unit.
+ if (entriesLen == 0) {
+ lastChunkVectorForLine_->infallibleAppend(
+ ChunkInfo(JS::ColumnNumberUnsignedOffset::zero(),
+ UnitsType::PossiblyMultiUnit));
+ entriesLen++;
+ }
+
+ do {  // each iteration finishes the current last chunk and appends an entry for the next one
+ const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
+ const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
+ start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
+
+ MOZ_ASSERT(begin < chunkLimit);
+ MOZ_ASSERT(chunkLimit <= limit);
+
+ static_assert(
+ ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
+ "any retraction below is assumed to never underflow to the "
+ "preceding chunk, even for the longest code point");
+
+ // Prior tokenizing ensured that [begin, limit) is validly encoded, and
+ // |begin < chunkLimit|, so any retraction here can't underflow.
+ RetractPointerToCodePointBoundary(&chunkLimit, limit);
+
+ MOZ_ASSERT(begin < chunkLimit);
+ MOZ_ASSERT(chunkLimit <= limit);
+
+ size_t numUnits = PointerRangeSize(begin, chunkLimit);
+ size_t numUTF16CodeUnits =
+ unicode::CountUTF16CodeUnits(begin, chunkLimit);
+
+ // If this chunk (which will become non-final at the end of the loop) is
+ // all single-unit code points, annotate the chunk accordingly.
+ if (numUnits == numUTF16CodeUnits) {
+ lastChunkVectorForLine_->back().guaranteeSingleUnits();
+ }
+
+ partialOffset += numUnits;
+ partialColumnOffset += JS::ColumnNumberUnsignedOffset(numUTF16CodeUnits);
+
+ lastChunkVectorForLine_->infallibleEmplaceBack(
+ partialColumnOffset, UnitsType::PossiblyMultiUnit);
+ } while (entriesLen < chunkIndex + 1);
+
+ // We're at a spot in the current final chunk, and final chunks never have
+ // complete units information, so be pessimistic.
+ unitsType = UnitsType::PossiblyMultiUnit;
+ }
+
+ return OffsetFromPartial(partialOffset, partialColumnOffset, unitsType);
+}
+
+// One-origin, limited column number of |offset| on the line identified by
+// |lineToken|.  On every line but the first, the column is one plus the
+// column offset within the line.  On the first line the column offset is
+// added to the starting column from the compile options, and the result is
+// the limit value when one plus the column offset exceeds the limit.
+template <typename Unit, class AnyCharsAccess>
+JS::LimitedColumnNumberOneOrigin
+GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
+    LineToken lineToken, uint32_t offset) const {
+  lineToken.assertConsistentOffset(offset);
+
+  const TokenStreamAnyChars& anyChars = anyCharsAccess();
+
+  JS::ColumnNumberUnsignedOffset offsetInColumns =
+      anyChars.computeColumnOffset(lineToken, offset, this->sourceUnits);
+
+  if (lineToken.isFirstLine()) {
+    if (1 + offsetInColumns.value() > JS::LimitedColumnNumberOneOrigin::Limit) {
+      return JS::LimitedColumnNumberOneOrigin::limit();
+    }
+
+    return JS::LimitedColumnNumberOneOrigin::fromUnlimited(
+        (anyChars.options_.column + offsetInColumns).oneOriginValue());
+  }
+
+  return JS::LimitedColumnNumberOneOrigin::fromUnlimited(
+      JS::ColumnNumberOneOrigin() + offsetInColumns);
+}
+
+// Looks up the line containing |offset| and stores its line number in |*line|
+// and the column of |offset| on that line in |*column|.
+template <typename Unit, class AnyCharsAccess>
+void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
+    uint32_t offset, uint32_t* line,
+    JS::LimitedColumnNumberOneOrigin* column) const {
+  const TokenStreamAnyChars& anyChars = anyCharsAccess();
+  const auto token = anyChars.lineToken(offset);
+
+  *line = anyChars.lineNumber(token);
+  *column = computeColumn(token, offset);
+}
+
+template <class AnyCharsAccess>
+MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(  // reports |errorNumber| with a note listing the next |relevantUnits| code units in hex
+ uint8_t relevantUnits, unsigned errorNumber, ...) {
+ va_list args;
+ va_start(args, errorNumber);
+
+ do {  // runs once (while (false)); every |break| below skips straight to va_end
+ size_t offset = this->sourceUnits.offset();
+
+ ErrorMetadata err;
+
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+
+ bool canAddLineOfContext = fillExceptingContext(&err, offset);
+ if (canAddLineOfContext) {
+ if (!internalComputeLineOfContext(&err, offset)) {
+ break;
+ }
+
+ // As this is an encoding error, the computed window-end must be
+ // identical to the location of the error -- any further on and the
+ // window would contain invalid Unicode.
+ MOZ_ASSERT_IF(err.lineOfContext != nullptr,
+ err.lineLength == err.tokenOffset);
+ }
+
+ auto notes = MakeUnique<JSErrorNotes>();
+ if (!notes) {
+ ReportOutOfMemory(anyChars.fc);
+ break;
+ }
+
+ // The largest encoding of a UTF-8 code point is 4 units. (Encoding an
+ // obsolete 5- or 6-byte code point will complain only about a bad lead
+ // code unit.)
+ constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
+
+ MOZ_ASSERT(relevantUnits > 0);
+
+ char badUnitsStr[MaxWidth];
+ char* ptr = badUnitsStr;
+ while (relevantUnits > 0) {  // each unit takes five chars of the buffer: four from byteToString plus a separator
+ byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
+ ptr[4] = ' ';
+
+ ptr += 5;
+ relevantUnits--;
+ }
+
+ ptr[-1] = '\0';  // replace the final separator with the terminator
+
+ uint32_t line;
+ JS::LimitedColumnNumberOneOrigin column;
+ computeLineAndColumn(offset, &line, &column);
+
+ if (!notes->addNoteASCII(anyChars.fc, anyChars.getFilename().c_str(), 0,
+ line, JS::ColumnNumberOneOrigin(column),
+ GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS,
+ badUnitsStr)) {
+ break;
+ }
+
+ ReportCompileErrorLatin1VA(anyChars.fc, std::move(err), std::move(notes),
+ errorNumber, &args);
+ } while (false);
+
+ va_end(args);
+}
+
+// Reports JSMSG_BAD_LEADING_UTF8_UNIT for |lead|, passing the unit's value
+// formatted by byteToTerminatedString as the message argument.  One code unit
+// is marked as relevant to the error.
+template <class AnyCharsAccess>
+MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
+    Utf8Unit lead) {
+  char leadByteStr[5];
+  byteToTerminatedString(lead.toUint8(), leadByteStr);
+
+  internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
+}
+
+// Reports JSMSG_NOT_ENOUGH_CODE_UNITS for a code point starting with |lead|
+// that needs |required| units in total when only |remaining| are available.
+// The message arguments are the lead unit, the expected and the actual number
+// of units after the lead, and the matching singular/plural word endings.
+template <class AnyCharsAccess>
+MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
+    Utf8Unit lead, uint8_t remaining, uint8_t required) {
+  uint8_t leadValue = lead.toUint8();
+
+  MOZ_ASSERT(required == 2 || required == 3 || required == 4);
+  MOZ_ASSERT(remaining < 4);
+  MOZ_ASSERT(remaining < required);
+
+  char leadByteStr[5];
+  byteToTerminatedString(leadValue, leadByteStr);
+
+  // |toHexChar| produces the desired decimal numbers for values < 4.
+  const char expectedStr[] = {toHexChar(required - 1), '\0'};
+  const char actualStr[] = {toHexChar(remaining - 1), '\0'};
+
+  // Word endings that agree with the two counts above.
+  const char* const expectedPlural = required == 2 ? "" : "s";
+  const char* const actualEnding = remaining == 2 ? " was" : "s were";
+
+  internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
+                        expectedStr, expectedPlural, actualStr, actualEnding);
+}
+
+// Reports JSMSG_BAD_TRAILING_UTF8_UNIT for the unit at index
+// |unitsObserved - 1| counting from the next unread code unit, marking
+// |unitsObserved| units as relevant to the error.
+template <class AnyCharsAccess>
+MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
+    uint8_t unitsObserved) {
+  const Utf8Unit* const nextUnits = this->sourceUnits.addressOfNextCodeUnit();
+  Utf8Unit badUnit = nextUnits[unitsObserved - 1];
+
+  char badByteStr[5];
+  byteToTerminatedString(badUnit.toUint8(), badByteStr);
+
+  internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
+                        badByteStr);
+}
+
+// Report a code point that decoded structurally but is forbidden for
+// |reason|.  The code point is shown in the message as "0x" followed by its
+// uppercase-or-lowercase hex digits as produced by |toHexChar|.
+template <class AnyCharsAccess>
+MOZ_COLD void
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
+    char32_t codePoint, uint8_t codePointLength, const char* reason) {
+  // The buffer is filled back to front: terminator first, then hex digits
+  // from least to most significant, then the "0x" prefix.  |writePos| is the
+  // index of the most recently written character.
+  //
+  // 0x1F'FFFF is the largest value expressible in the 3+6+6+6 payload bits
+  // of a four-byte UTF-8 sequence, so this size (terminator included) is
+  // always enough.
+  constexpr size_t HexBufferSize = sizeof("0x1FFFFF");
+  char hexBuffer[HexBufferSize];
+
+  size_t writePos = HexBufferSize;
+  hexBuffer[--writePos] = '\0';
+
+  // A do-while guarantees that at least one digit is emitted, so zero is
+  // rendered as "0x0".
+  do {
+    MOZ_ASSERT(writePos > 0);
+    hexBuffer[--writePos] = toHexChar(codePoint & 0xF);
+    codePoint >>= 4;
+  } while (codePoint != 0);
+
+  MOZ_ASSERT(writePos >= 2);
+  hexBuffer[--writePos] = 'x';
+  hexBuffer[--writePos] = '0';
+
+  internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
+                        hexBuffer + writePos, reason);
+}
+
+// Decode the non-ASCII code point that starts with |lead| and store it in
+// |*codePoint|, without translating line separators.  Returns false if
+// decoding failed, in which case one of the error reporters below has run.
+template <class AnyCharsAccess>
+[[nodiscard]] bool
+TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
+    Utf8Unit lead, char32_t* codePoint) {
+  // One callback per failure mode; each forwards to the member function
+  // that reports that specific encoding error.
+  auto reportBadLead = [this, &lead]() { this->badLeadUnit(lead); };
+
+  auto reportTooFewUnits = [this, &lead](uint8_t remaining,
+                                         uint8_t required) {
+    this->notEnoughUnits(lead, remaining, required);
+  };
+
+  auto reportBadTrail = [this](uint8_t unitsObserved) {
+    this->badTrailingUnit(unitsObserved);
+  };
+
+  auto reportBadCodePoint = [this](char32_t badCodePoint,
+                                   uint8_t unitsObserved) {
+    this->badCodePoint(badCodePoint, unitsObserved);
+  };
+
+  auto reportOverlong = [this](char32_t badCodePoint, uint8_t unitsObserved) {
+    this->notShortestForm(badCodePoint, unitsObserved);
+  };
+
+  // On success the decoder consumes the code point's units.  On failure it
+  // ungets the lead unit and calls the matching reporter, so the only thing
+  // left to do here is return false right away.
+  SourceUnitsIterator unitsIter(this->sourceUnits);
+  Maybe<char32_t> decoded = DecodeOneUtf8CodePointInline(
+      lead, &unitsIter, SourceUnitsEnd(), reportBadLead, reportTooFewUnits,
+      reportBadTrail, reportBadCodePoint, reportOverlong);
+  if (decoded.isSome()) {
+    *codePoint = decoded.value();
+    return true;
+  }
+
+  return false;
+}
+
+// Given the just-consumed non-ASCII UTF-16 code unit |lead|, compute the full
+// code point and store it in |*codePoint|.  U+2028 and U+2029 are reported as
+// '\n' after line info is updated.  Returns false only if that update fails.
+template <class AnyCharsAccess>
+bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
+    int32_t lead, char32_t* codePoint) {
+  MOZ_ASSERT(lead != EOF);
+  MOZ_ASSERT(!isAsciiCodePoint(lead),
+             "ASCII code unit/point must be handled separately");
+  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
+             "getNonAsciiCodePoint called incorrectly");
+
+  // Start from the common answer, |lead| itself, and refine it below.
+  *codePoint = AssertedCast<char32_t>(lead);
+
+  // ECMAScript requires unpaired surrogates to be treated as the matching
+  // code point rather than as an error; see
+  // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
+  // Consequently no sequence of 16-bit units is an encoding error here.
+
+  if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(lead))) {
+    // Combine with the following unit only when it is a trail surrogate;
+    // otherwise the lone lead surrogate stands for itself.
+    const bool hasTrail =
+        !this->sourceUnits.atEnd() &&
+        unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit());
+    if (MOZ_LIKELY(hasTrail)) {
+      *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
+    }
+
+    MOZ_ASSERT(!IsLineTerminator(*codePoint));
+    return true;
+  }
+
+  // Single-unit code points and lone trail surrogates end up here.
+  const bool isUnicodeLineBreak =
+      lead == unicode::LINE_SEPARATOR || lead == unicode::PARA_SEPARATOR;
+  if (MOZ_LIKELY(!isUnicodeLineBreak)) {
+    MOZ_ASSERT(!IsLineTerminator(*codePoint));
+    return true;
+  }
+
+  if (updateLineInfoForEOL()) {
+    *codePoint = '\n';
+    return true;
+  }
+
+#ifdef DEBUG
+  // Store a sentinel so accidental use of the result is more likely noticed.
+  *codePoint = std::numeric_limits<char32_t>::max();
+#endif
+  MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
+  return false;
+}
+
+// Given the just-consumed non-ASCII UTF-8 code unit |unit|, decode the full
+// code point and store it in |*codePoint|.  U+2028 and U+2029 are reported as
+// '\n' after line info is updated.  Returns false on an encoding error or if
+// the line-info update fails.
+template <class AnyCharsAccess>
+bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
+    int32_t unit, char32_t* codePoint) {
+  MOZ_ASSERT(unit != EOF);
+  MOZ_ASSERT(!isAsciiCodePoint(unit),
+             "ASCII code unit/point must be handled separately");
+
+  Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
+  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
+             "getNonAsciiCodePoint called incorrectly");
+
+  // One callback per failure mode, each forwarding to the member function
+  // that reports that specific encoding error.
+  auto reportBadLead = [this, &lead]() { this->badLeadUnit(lead); };
+
+  auto reportTooFewUnits = [this, &lead](uint_fast8_t remaining,
+                                         uint_fast8_t required) {
+    this->notEnoughUnits(lead, remaining, required);
+  };
+
+  auto reportBadTrail = [this](uint_fast8_t unitsObserved) {
+    this->badTrailingUnit(unitsObserved);
+  };
+
+  auto reportBadCodePoint = [this](char32_t badCodePoint,
+                                   uint_fast8_t unitsObserved) {
+    this->badCodePoint(badCodePoint, unitsObserved);
+  };
+
+  auto reportOverlong = [this](char32_t badCodePoint,
+                               uint_fast8_t unitsObserved) {
+    this->notShortestForm(badCodePoint, unitsObserved);
+  };
+
+  // Either the whole valid code point is consumed, or |lead| is ungotten and
+  // the appropriate reporter above is invoked.
+  SourceUnitsIterator unitsIter(this->sourceUnits);
+  Maybe<char32_t> decoded = DecodeOneUtf8CodePoint(
+      lead, &unitsIter, SourceUnitsEnd(), reportBadLead, reportTooFewUnits,
+      reportBadTrail, reportBadCodePoint, reportOverlong);
+  if (decoded.isNothing()) {
+    return false;
+  }
+
+  const char32_t value = decoded.value();
+  const bool isUnicodeLineBreak =
+      value == unicode::LINE_SEPARATOR || value == unicode::PARA_SEPARATOR;
+  if (MOZ_LIKELY(!isUnicodeLineBreak)) {
+    MOZ_ASSERT(!IsLineTerminator(value));
+    *codePoint = value;
+    return true;
+  }
+
+  if (updateLineInfoForEOL()) {
+    *codePoint = '\n';
+    return true;
+  }
+
+#ifdef DEBUG
+  // Store a sentinel so accidental use of the result is more likely noticed.
+  *codePoint = std::numeric_limits<char32_t>::max();
+#endif
+  MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
+  return false;
+}
+
+// Compute the offset at which a context window ending at |offset| begins.
+// The window extends backward at most |WindowRadius| code units, never before
+// |startOffset_|, and stops at a line terminator or at a surrogate that is
+// not part of a complete pair.
+template <>
+size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
+  // JS's notion of UTF-16 permits lone surrogates, so keeping them out of
+  // [windowStart, offset) is this function's job.
+
+  const char16_t* const lowerBound = codeUnitPtrAt(startOffset_);
+  const char16_t* const anchor = codeUnitPtrAt(offset);
+  const char16_t* cursor = anchor;
+
+  // Number of code units currently included before |offset|.
+  auto unitsBehind = [&cursor, &anchor]() {
+    return PointerRangeSize(cursor, anchor);
+  };
+
+  for (;;) {
+    MOZ_ASSERT(lowerBound <= cursor);
+    MOZ_ASSERT(unitsBehind() <= WindowRadius);
+    if (cursor <= lowerBound || unitsBehind() >= WindowRadius) {
+      break;
+    }
+
+    const char16_t prev = cursor[-1];
+
+    // Line terminators end the window.  That includes U+2028/U+2029 inside
+    // string and template literals, which still affect line/column
+    // coordinates.  A lead surrogate seen first while walking backward has
+    // no trail after it inside the window, so it ends the window as well.
+    if (IsLineTerminator(prev) ||
+        MOZ_UNLIKELY(unicode::IsLeadSurrogate(prev))) {
+      break;
+    }
+
+    // Tentatively take the unit; this is undone below if it turns out to be
+    // an unusable trail surrogate.
+    cursor--;
+
+    if (MOZ_LIKELY(!unicode::IsTrailSurrogate(prev))) {
+      continue;
+    }
+
+    // A trail surrogate is kept only if there is still room in the window,
+    // source text before it, and that text is a lead surrogate.
+    const bool completesPair = unitsBehind() < WindowRadius &&
+                               cursor > lowerBound &&
+                               unicode::IsLeadSurrogate(cursor[-1]);
+    if (!completesPair) {
+      cursor++;
+      break;
+    }
+
+    cursor--;
+  }
+
+  MOZ_ASSERT(unitsBehind() <= WindowRadius);
+  return offset - unitsBehind();
+}
+
+// Compute the offset at which a context window ending at |offset| begins.
+// The window extends backward at most |WindowRadius| code units, never before
+// |startOffset_|, stops at a line terminator, and always begins on a code
+// point boundary.
+template <>
+size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
+  // |offset| is at or before the error location, so everything preceding it
+  // is known to be valid UTF-8.
+
+  const Utf8Unit* const lowerBound = codeUnitPtrAt(startOffset_);
+  const Utf8Unit* const anchor = codeUnitPtrAt(offset);
+  const Utf8Unit* cursor = anchor;
+
+  // Number of code units currently included before |offset|.
+  auto unitsBehind = [&cursor, &anchor]() {
+    return PointerRangeSize(cursor, anchor);
+  };
+
+  for (;;) {
+    MOZ_ASSERT(lowerBound <= cursor);
+    MOZ_ASSERT(unitsBehind() <= WindowRadius);
+    if (cursor <= lowerBound || unitsBehind() >= WindowRadius) {
+      break;
+    }
+
+    // Inspect the preceding unit without moving yet: a line break there
+    // ends the window.
+    const uint8_t prevByte = cursor[-1].toUint8();
+
+    // ASCII LineTerminators.
+    if (prevByte == '\r' || prevByte == '\n') {
+      break;
+    }
+
+    // Non-ASCII LineTerminators: U+2028 LINE SEPARATOR (0xE2 0x80 0xA8) and
+    // U+2029 PARAGRAPH SEPARATOR (0xE2 0x80 0xA9).  When fewer than three
+    // units precede |cursor|, an earlier comparison fails before any read
+    // could go out of range.
+    if (MOZ_UNLIKELY((prevByte == 0xA8 || prevByte == 0xA9) &&
+                     cursor[-2].toUint8() == 0x80 &&
+                     cursor[-3].toUint8() == 0xE2)) {
+      break;
+    }
+
+    // Step back over one whole code point.  The walk ends on a lead unit,
+    // which cannot lie before |lowerBound|.
+    do {
+      --cursor;
+    } while (IsTrailingUnit(*cursor));
+
+    MOZ_ASSERT(lowerBound <= cursor);
+
+    // If that code point pushed the window past |WindowRadius|, drop it
+    // again by moving forward to the next code point boundary, then stop.
+    if (unitsBehind() > WindowRadius) {
+      static_assert(WindowRadius > 3,
+                    "skipping over non-lead code units below must not "
+                    "advance past |offset|");
+
+      do {
+        ++cursor;
+      } while (IsTrailingUnit(*cursor));
+
+      MOZ_ASSERT(unitsBehind() < WindowRadius);
+      break;
+    }
+  }
+
+  MOZ_ASSERT(unitsBehind() <= WindowRadius);
+  return offset - unitsBehind();
+}
+
+// Compute the offset at which a context window starting at |offset| ends.
+// The window extends forward at most |WindowRadius| code units, never past
+// |limit_|, and stops at a line terminator or at a surrogate that is not part
+// of a complete pair.
+template <>
+size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
+  const char16_t* const anchor = codeUnitPtrAt(offset);
+  const char16_t* cursor = anchor;
+
+  // Number of code units currently included after |offset|.
+  auto unitsAhead = [&anchor, &cursor]() {
+    return PointerRangeSize(anchor, cursor);
+  };
+
+  for (;;) {
+    MOZ_ASSERT(cursor <= limit_);
+    MOZ_ASSERT(unitsAhead() <= WindowRadius);
+    if (cursor >= limit_ || unitsAhead() >= WindowRadius) {
+      break;
+    }
+
+    const char16_t unit = *cursor;
+
+    // Line terminators end the window.  That includes U+2028/U+2029 inside
+    // string and template literals, which still affect line/column
+    // coordinates.  A trail surrogate seen first while walking forward has
+    // no lead before it inside the window, so it ends the window as well.
+    if (IsLineTerminator(unit) ||
+        MOZ_UNLIKELY(unicode::IsTrailSurrogate(unit))) {
+      break;
+    }
+
+    // Tentatively take the unit; this is undone below if it turns out to be
+    // an unusable lead surrogate.
+    cursor++;
+
+    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(unit))) {
+      continue;
+    }
+
+    // A lead surrogate is kept only if the pair is not split by the window
+    // edge, source text follows it, and that text is a trail surrogate.
+    const bool completesPair = unitsAhead() < WindowRadius &&
+                               cursor < limit_ &&
+                               unicode::IsTrailSurrogate(*cursor);
+    if (!completesPair) {
+      cursor--;
+      break;
+    }
+
+    cursor++;
+  }
+
+  return offset + unitsAhead();
+}
+
+// Compute the offset at which a context window starting at |offset| ends.
+// The window extends forward at most |WindowRadius| code units, never past
+// |limit_|, and stops at a line terminator or at the first encoding error.
+template <>
+size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
+  const Utf8Unit* const anchor = codeUnitPtrAt(offset);
+  const Utf8Unit* cursor = anchor;
+
+  // Number of code units currently included after |offset|.
+  auto unitsAhead = [&anchor, &cursor]() {
+    return PointerRangeSize(anchor, cursor);
+  };
+
+  for (;;) {
+    MOZ_ASSERT(cursor <= limit_);
+    MOZ_ASSERT(unitsAhead() <= WindowRadius);
+    if (cursor >= limit_ || unitsAhead() >= WindowRadius) {
+      break;
+    }
+
+    // Text after a non-encoding error may itself contain an encoding error,
+    // so each code point is validated before being admitted to the window.
+
+    const Utf8Unit lead = *cursor;
+    if (mozilla::IsAscii(lead)) {
+      if (IsSingleUnitLineTerminator(lead)) {
+        break;
+      }
+
+      cursor++;
+      continue;
+    }
+
+    PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(cursor, limit_);
+    if (peeked.isNone()) {
+      // Encoding error: the window ends just before it.
+      break;
+    }
+
+    const char32_t value = peeked.codePoint();
+    if (MOZ_UNLIKELY(value == unicode::LINE_SEPARATOR ||
+                     value == unicode::PARA_SEPARATOR)) {
+      break;
+    }
+
+    MOZ_ASSERT(!IsLineTerminator(value));
+
+    // Admit the code point only if all of its units fit in the window.
+    const uint8_t unitCount = peeked.lengthInUnits();
+    if (unitsAhead() + unitCount > WindowRadius) {
+      break;
+    }
+
+    cursor += unitCount;
+  }
+
+  MOZ_ASSERT(unitsAhead() <= WindowRadius);
+  return offset + unitsAhead();
+}
+
+// Consume code points until the next unread code unit is at or beyond
+// |position|, then reset the current token to an empty range at the new
+// offset and discard any lookahead.  Returns false if reading a code point
+// fails.
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
+  const Unit* const target = this->sourceUnits.codeUnitPtrAt(position);
+  while (this->sourceUnits.addressOfNextCodeUnit() < target) {
+    if (!getCodePoint()) {
+      return false;
+    }
+  }
+
+  TokenStreamAnyChars& anyChars = anyCharsAccess();
+
+  // The current token is exposed as const, but it is rewritten in place here.
+  Token* current = const_cast<Token*>(&anyChars.currentToken());
+  current->pos.begin = this->sourceUnits.offset();
+  current->pos.end = current->pos.begin;
+#ifdef DEBUG
+  current->type = TokenKind::Limit;
+#endif
+  // The token's kind is deliberately left without a meaningful value.
+  MOZ_MAKE_MEM_UNDEFINED(&current->type, sizeof(current->type));
+
+  anyChars.lookahead = 0;
+  return true;
+}
+
+// Restore the scanning state recorded in |pos|: the read position, flags,
+// line bookkeeping, the current token, and all saved lookahead tokens.
+template <typename Unit, class AnyCharsAccess>
+void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
+  TokenStreamAnyChars& state = anyCharsAccess();
+
+  this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
+                                             /* allowPoisoned = */ true);
+
+  state.flags = pos.flags;
+  state.lineno = pos.lineno;
+  state.linebase = pos.linebase;
+  state.prevLinebase = pos.prevLinebase;
+  state.lookahead = pos.lookahead;
+
+  // The current token goes into the cursor slot; saved lookahead token |n|
+  // goes into the slot |n + 1| positions ahead of it.
+  state.tokens[state.cursor()] = pos.currentToken;
+  for (unsigned n = 0; n < state.lookahead; n++) {
+    const auto slot = state.aheadCursor(1 + n);
+    state.tokens[slot] = pos.lookaheadTokens[n];
+  }
+}
+
+// Fill this stream's source coordinates from |other|'s and then seek to
+// |pos|.  Returns false, without seeking, if filling the coordinates fails.
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
+    const Position& pos, const TokenStreamAnyChars& other) {
+  const bool coordsFilled = anyCharsAccess().srcCoords.fill(other.srcCoords);
+  if (coordsFilled) {
+    seekTo(pos);
+  }
+
+  return coordsFilled;
+}
+
+void TokenStreamAnyChars::computeErrorMetadataNoOffset(
+ ErrorMetadata* err) const {
+ err->isMuted = mutedErrors;
+ err->filename = filename_;
+ err->lineNumber = 0;
+ err->columnNumber = JS::ColumnNumberOneOrigin();
+
+ MOZ_ASSERT(err->lineOfContext == nullptr);
+}
+
+// Fill the muted flag and filename of |err|.  Returns true when the filename
+// came from this object, and false when filename, line and column were taken
+// from the innermost non-builtin frame of the current JSContext instead.
+bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
+                                               uint32_t offset) const {
+  err->isMuted = mutedErrors;
+
+  // With no filename of our own, fall back on the calling script's location
+  // when a context and a suitable frame are available.
+  if (!filename_) {
+    if (JSContext* cx = context()->maybeCurrentJSContext()) {
+      NonBuiltinFrameIter frames(cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
+                                 cx->realm()->principals());
+      const bool frameHasFilename = !frames.done() && frames.filename();
+      if (frameHasFilename) {
+        err->filename = JS::ConstUTF8CharsZ(frames.filename());
+
+        JS::TaggedColumnNumberOneOrigin taggedColumn;
+        err->lineNumber = frames.computeLine(&taggedColumn);
+        // NOTE: Wasm frame cannot appear here.
+        err->columnNumber =
+            JS::ColumnNumberOneOrigin(taggedColumn.toLimitedColumnNumber());
+        return false;
+      }
+    }
+  }
+
+  // Use our own filename, which may be null if the fallback did not apply.
+  err->filename = filename_;
+  return true;
+}
+
+// UTF-16 specialization.  It exists so that both specializations are
+// defined; being called is treated as a bug.
+template <>
+inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
+    const char16_t* encodedWindow, size_t encodedTokenOffset,
+    size_t* utf16TokenOffset, size_t encodedWindowLength,
+    size_t* utf16WindowLength) const {
+  MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
+}
+
+// Convert a token offset and a window length, both measured in UTF-8 code
+// units from |encodedWindow|, into the equivalent counts of UTF-16 code
+// units, storing them in |*utf16TokenOffset| and |*utf16WindowLength|.
+template <>
+inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
+    const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
+    size_t* utf16TokenOffset, size_t encodedWindowLength,
+    size_t* utf16WindowLength) const {
+  MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
+             "token offset must be within the window, and the two lambda "
+             "calls below presume this ordering of values");
+
+  const Utf8Unit* const tokenStart = encodedWindow + encodedTokenOffset;
+  const Utf8Unit* const windowEnd = encodedWindow + encodedWindowLength;
+  MOZ_ASSERT(tokenStart <= windowEnd);
+
+  // Running state shared by both calls below: the read position and the
+  // number of UTF-16 code units seen so far.
+  const Utf8Unit* cursor = encodedWindow;
+  size_t utf16Units = 0;
+
+  // Advance |cursor| up to |stop|, adding to |utf16Units| along the way, and
+  // return the running total.
+  auto countUpTo = [&utf16Units, &cursor](const Utf8Unit* stop) {
+    while (cursor < stop) {
+      Utf8Unit lead = *cursor++;
+      if (MOZ_LIKELY(IsAscii(lead))) {
+        // One ASCII unit is one UTF-16 code unit.
+        utf16Units++;
+        continue;
+      }
+
+      Maybe<char32_t> decoded = DecodeOneUtf8CodePoint(lead, &cursor, stop);
+      MOZ_ASSERT(decoded.isSome(),
+                 "computed window should only contain valid UTF-8");
+
+      // Supplementary code points take a surrogate pair; others one unit.
+      utf16Units += unicode::IsSupplementary(decoded.value()) ? 2 : 1;
+    }
+
+    return utf16Units;
+  };
+
+  // First pass: from the window start up to the token.
+  *utf16TokenOffset = countUpTo(tokenStart);
+
+  // Second pass: continue from wherever the first pass stopped to the end of
+  // the window, so the total covers the entire window.
+  *utf16WindowLength = countUpTo(windowEnd);
+}
+
+// Attach to |err| a null-terminated UTF-16 copy of the source text around
+// |offset|, together with the window's length and the offset of the error
+// within it (both in UTF-16 code units).  Returns false if building the
+// window fails; returns true without attaching anything if the window is
+// empty.
+template <typename Unit>
+bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
+                                                  uint32_t offset) const {
+  // Everything named "encoded" below counts source units in |Unit| encoding.
+  const size_t encodedOffset = offset;
+
+  const size_t windowBegin = sourceUnits.findWindowStart(encodedOffset);
+  const size_t windowFinish = sourceUnits.findWindowEnd(encodedOffset);
+
+  const size_t encodedWindowLength = windowFinish - windowBegin;
+  MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);
+
+  // An empty window (e.g. invalid encoding right at the start of a line)
+  // would make a useless context line, so none is added.
+  if (encodedWindowLength == 0) {
+    MOZ_ASSERT(err->lineOfContext == nullptr,
+               "ErrorMetadata::lineOfContext must be null so we don't "
+               "have to set the lineLength/tokenOffset fields");
+    return true;
+  }
+
+  CharBuffer contextChars(fc);
+
+  const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(windowBegin);
+  const Unit* const encodedWindowLimit = encodedWindow + encodedWindowLength;
+  if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
+          contextChars, encodedWindow, encodedWindowLimit)) {
+    return false;
+  }
+
+  // Measure before the terminator is appended.
+  const size_t utf16WindowLength = contextChars.length();
+
+  // The string handed to |err| is null-terminated.
+  if (!contextChars.append('\0')) {
+    return false;
+  }
+
+  err->lineOfContext.reset(contextChars.extractOrCopyRawBuffer());
+  if (!err->lineOfContext) {
+    return false;
+  }
+
+  const size_t encodedTokenOffset = encodedOffset - windowBegin;
+
+  MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
+             "token offset must be inside the window");
+
+  // A code point never takes fewer UTF-8 code units than UTF-16 code units:
+  // ASCII is 1 unit in both; [U+0080, U+10000) is 2-3 UTF-8 units versus 1
+  // UTF-16 unit; [U+10000, U+10FFFF] is 4 UTF-8 units versus 2 UTF-16 units.
+  //
+  // So when the encoded window length equals the UTF-16 length (always true
+  // for Unit=char16_t), encoded offsets are already UTF-16 offsets.  In any
+  // other case the offset and length must be converted from UTF-8.
+  if constexpr (std::is_same_v<Unit, char16_t>) {
+    MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
+               "UTF-16 to UTF-16 shouldn't change window length");
+    err->tokenOffset = encodedTokenOffset;
+    err->lineLength = encodedWindowLength;
+  } else {
+    static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");
+
+    const bool sameLength = utf16WindowLength == encodedWindowLength;
+#ifdef DEBUG
+    auto unitIsAscii = [](Unit u) { return IsAscii(u); };
+    MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
+                           unitIsAscii) == sameLength,
+               "equal window lengths in UTF-8 should correspond only to "
+               "wholly-ASCII text");
+#endif
+    if (!sameLength) {
+      sourceUnits.computeWindowOffsetAndLength(
+          encodedWindow, encodedTokenOffset, &err->tokenOffset,
+          encodedWindowLength, &err->lineLength);
+    } else {
+      err->tokenOffset = encodedTokenOffset;
+      err->lineLength = encodedWindowLength;
+    }
+  }
+
+  return true;
+}
+
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
+ ErrorMetadata* err, const ErrorOffset& errorOffset) const {
+ if (errorOffset.is<NoOffset>()) {
+ anyCharsAccess().computeErrorMetadataNoOffset(err);
+ return true;
+ }
+
+ uint32_t offset;
+ if (errorOffset.is<uint32_t>()) {
+ offset = errorOffset.as<uint32_t>();
+ } else {
+ offset = this->sourceUnits.offset();
+ }
+
+ // This function's return value isn't a success/failure indication: it
+ // returns true if this TokenStream can be used to provide a line of
+ // context.
+ if (fillExceptingContext(err, offset)) {
+ // Add a line of context from this TokenStream to help with debugging.
+ return internalComputeLineOfContext(err, offset);
+ }
+
+ // We can't fill in any more here.
+ return true;
+}
+
+template <typename Unit, class AnyCharsAccess>
+void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
+ int32_t cp) {
+ UniqueChars display = JS_smprintf("U+%04X", cp);
+ if (!display) {
+ ReportOutOfMemory(anyCharsAccess().fc);
+ return;
+ }
+ error(JSMSG_ILLEGAL_CHARACTER, display.get());
+}
+
+// A '\' has just been consumed: try to match a Unicode escape sequence
+// following it.  On a match, every code unit involved is consumed, the
+// encoded code point is stored in |*codePoint|, and the number of code units
+// consumed after the '\' is returned.  Otherwise 0 is returned and the read
+// position is left just after the '\'.
+template <typename Unit, class AnyCharsAccess>
+uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
+    char32_t* codePoint) {
+  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+
+  const int32_t marker = getCodeUnit();
+  if (marker != 'u') {
+    // NOTE: |marker| may be EOF here.
+    ungetCodeUnit(marker);
+    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+    return 0;
+  }
+
+  const int32_t first = getCodeUnit();
+
+  // Braced form: \u{...}.
+  if (first == '{') {
+    return matchExtendedUnicodeEscape(codePoint);
+  }
+
+  // Fixed-width form: \uXXXX.  |first| supplies the top nibble and the next
+  // three hex digits supply the rest.
+  char16_t lowDigits;
+  if (IsAsciiHexDigit(first) &&
+      this->sourceUnits.matchHexDigits(3, &lowDigits)) {
+    *codePoint = (AsciiAlphanumericToNumber(first) << 12) | lowDigits;
+    return 5;
+  }
+
+  // NOTE: |first| may be EOF here, so this ungets either one or two units.
+  ungetCodeUnit(first);
+  ungetCodeUnit('u');
+  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+  return 0;
+}
+
+// The "\u{" of a braced Unicode escape has just been consumed: try to match
+// the hex digits and closing '}'.  On a match the code point is stored in
+// |*codePoint| and the number of code units consumed after the '\' is
+// returned.  Otherwise everything after the '\' is unread and 0 is returned.
+template <typename Unit, class AnyCharsAccess>
+uint32_t
+GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
+    char32_t* codePoint) {
+  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
+
+  int32_t unit = getCodeUnit();
+
+  // Leading zeroes are counted but contribute nothing to the value.
+  uint32_t zeroCount = 0;
+  for (; unit == '0'; unit = getCodeUnit()) {
+    zeroCount++;
+  }
+
+  // Accumulate at most six significant hex digits.
+  size_t digitCount = 0;
+  uint32_t value = 0;
+  for (; IsAsciiHexDigit(unit) && digitCount < 6; digitCount++) {
+    value = (value << 4) | AsciiAlphanumericToNumber(unit);
+    unit = getCodeUnit();
+  }
+
+  // Code units consumed since the '\': "u{", the zeroes, the digits, and the
+  // unit read last, which is counted only if it was not EOF.
+  uint32_t consumed = 2 + zeroCount + digitCount + (unit != EOF);
+
+  const bool closed = unit == '}';
+  const bool sawDigit = zeroCount > 0 || digitCount > 0;
+  if (!closed || !sawDigit || value > unicode::NonBMPMax) {
+    this->sourceUnits.unskipCodeUnits(consumed);
+    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+    return 0;
+  }
+
+  *codePoint = value;
+  return consumed;
+}
+
+// Match a Unicode escape after a consumed '\' whose code point may start an
+// identifier.  Returns the escape's length on success.  Otherwise returns 0
+// with the read position left just after the '\'.
+template <typename Unit, class AnyCharsAccess>
+uint32_t
+GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
+    char32_t* codePoint) {
+  const uint32_t escapeLength = matchUnicodeEscape(codePoint);
+  if (MOZ_UNLIKELY(escapeLength == 0)) {
+    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+    return 0;
+  }
+
+  if (MOZ_UNLIKELY(!unicode::IsIdentifierStart(*codePoint))) {
+    // A well-formed escape for the wrong kind of code point: unread it.
+    this->sourceUnits.unskipCodeUnits(escapeLength);
+    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+    return 0;
+  }
+
+  return escapeLength;
+}
+
+// Match a Unicode escape after a consumed '\' whose code point may continue
+// an identifier.  Returns true on success.  Otherwise returns false with the
+// read position left just after the '\'.
+template <typename Unit, class AnyCharsAccess>
+bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
+    char32_t* codePoint) {
+  const uint32_t escapeLength = matchUnicodeEscape(codePoint);
+  if (MOZ_UNLIKELY(escapeLength == 0)) {
+    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+    return false;
+  }
+
+  if (MOZ_UNLIKELY(!unicode::IsIdentifierPart(*codePoint))) {
+    // A well-formed escape for the wrong kind of code point: unread it.
+    this->sourceUnits.unskipCodeUnits(escapeLength);
+    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
+    return false;
+  }
+
+  return true;
+}
+
+// Consume one identifier-start code point, written either literally or as a
+// Unicode escape, and record in |*sawEscape| which form was used.  Reports an
+// error and returns false if the upcoming text does not start an identifier.
+template <typename Unit, class AnyCharsAccess>
+[[nodiscard]] bool
+TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart(
+    IdentifierEscapes* sawEscape) {
+  const int32_t first = getCodeUnit();
+  if (first == EOF) {
+    error(JSMSG_MISSING_PRIVATE_NAME);
+    return false;
+  }
+
+  if (MOZ_LIKELY(isAsciiCodePoint(first))) {
+    if (unicode::IsIdentifierStart(char16_t(first))) {
+      *sawEscape = IdentifierEscapes::None;
+      return true;
+    }
+
+    if (first == '\\') {
+      char32_t escaped;
+      if (matchUnicodeEscapeIdStart(&escaped) != 0) {
+        *sawEscape = IdentifierEscapes::SawUnicodeEscape;
+        return true;
+      }
+
+      // The error could point inside a malformed escape (at the 'H' in
+      // "\u{41H}", say), but that isn't done today.  The unit after the '\'
+      // isn't necessarily the bad one, so the error is reported at the start
+      // of the invalid escape.
+      ungetCodeUnit('\\');
+      error(JSMSG_BAD_ESCAPE);
+      return false;
+    }
+  }
+
+  // Put the lead unit back so the whole code point can be peeked at once.
+  ungetCodeUnit(first);
+
+  PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
+  const bool startsIdentifier =
+      !peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint());
+  if (!startsIdentifier) {
+    error(JSMSG_MISSING_PRIVATE_NAME);
+    return false;
+  }
+
+  this->sourceUnits.consumeKnownCodePoint(peeked);
+  *sawEscape = IdentifierEscapes::None;
+  return true;
+}
+
+// Look for debugging directive comments such as "//# sourceURL" and
+// "//# sourceMappingURL" ("//@" in place of "//#" is deprecated).  On failure
+// the stream is marked as having produced a bad token and false is returned.
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives(
+    bool isMultiline, bool shouldWarnDeprecated) {
+  // Some JavaScript transpilers wrap a single-line source-mapping comment in
+  // a multiline comment to dodge a crashing bug in IE.  To keep lookahead and
+  // backtracking cheap, that case is only examined once a '#' code unit has
+  // been encountered.
+
+  // The source-map directive is only attempted if the display-URL attempt
+  // did not fail.
+  if (getDisplayURL(isMultiline, shouldWarnDeprecated) &&
+      getSourceMappingURL(isMultiline, shouldWarnDeprecated)) {
+    return true;
+  }
+
+  badToken();
+  return false;
+}
+
+// Store in |*destination| a freshly allocated, null-terminated copy of the
+// current contents of |charBuffer|.  Returns false if allocation fails.
+[[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo(
+    UniquePtr<char16_t[], JS::FreePolicy>* destination) {
+  const size_t unitCount = charBuffer.length();
+
+  // One extra slot for the terminator.
+  *destination = fc->getAllocator()->make_pod_array<char16_t>(unitCount + 1);
+  if (!*destination) {
+    return false;
+  }
+
+  char16_t* const copy = destination->get();
+  std::copy(charBuffer.begin(), charBuffer.end(), copy);
+  copy[unitCount] = '\0';
+  return true;
+}
+
+// If the upcoming source text is |directive|, consume it and the value that
+// follows (everything up to whitespace, EOF, an encoding error, or, inside a
+// multi-line comment, the comment terminator), and store a copy of a
+// non-empty value in |*destination|.  Returns false only if a warning or an
+// allocation fails.
+template <typename Unit, class AnyCharsAccess>
+[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
+    bool isMultiline, bool shouldWarnDeprecated, const char* directive,
+    uint8_t directiveLength, const char* errorMsgPragma,
+    UniquePtr<char16_t[], JS::FreePolicy>* destination) {
+  // No directive, nothing to do.  (|directive| must be ASCII, so matching it
+  // is the same in UTF-8 and UTF-16.)
+  if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
+    return true;
+  }
+
+  if (shouldWarnDeprecated &&
+      !warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
+    return false;
+  }
+
+  this->charBuffer.clear();
+
+  for (;;) {
+    const int32_t unit = peekCodeUnit();
+    if (unit == EOF) {
+      break;
+    }
+
+    if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+      // Encoding errors are not reported here: the caller's handling of the
+      // remaining comment text will do that.
+      PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
+      if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
+        break;
+      }
+
+      MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
+                 "!IsSpace must imply !IsLineTerminator or else we'll fail to "
+                 "maintain line-info/flags for EOL");
+      this->sourceUnits.consumeKnownCodePoint(peeked);
+
+      if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
+        return false;
+      }
+
+      continue;
+    }
+
+    if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
+      break;
+    }
+
+    consumeKnownCodeUnit(unit);
+
+    // Directives may appear in single- and multi-line comments alike.  In
+    // the multi-line case the comment terminator must be recognized too, and
+    // the '*' that begins it is given back.
+    if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
+      ungetCodeUnit('*');
+      break;
+    }
+
+    if (!this->charBuffer.append(unit)) {
+      return false;
+    }
+  }
+
+  // A directive without a value is not an error, since comments may contain
+  // anything.  In that case |*destination| is left untouched.
+  return this->charBuffer.empty() || copyCharBufferTo(destination);
+}
+
+// Match a comment of the form "//# sourceURL=<url>" or
+// "/\* //# sourceURL=<url> *\/" and record the URL as the display URL.
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL(
+    bool isMultiline, bool shouldWarnDeprecated) {
+  // Source text calls this "sourceURL".  Internally it is the "displayURL":
+  // the name the developer wants the source shown under, as opposed to the
+  // URL the source was actually loaded from.
+
+  static constexpr char directiveText[] = " sourceURL=";
+  constexpr uint8_t directiveTextLength = js_strlen(directiveText);
+
+  auto* target = &anyCharsAccess().displayURL_;
+  return getDirective(isMultiline, shouldWarnDeprecated, directiveText,
+                      directiveTextLength, "sourceURL", target);
+}
+
+// Match a comment of the form "//# sourceMappingURL=<url>" or
+// "/\* //# sourceMappingURL=<url> *\/" and record the URL as the source map
+// URL.
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL(
+    bool isMultiline, bool shouldWarnDeprecated) {
+  static constexpr char directiveText[] = " sourceMappingURL=";
+  constexpr uint8_t directiveTextLength = js_strlen(directiveText);
+
+  auto* target = &anyCharsAccess().sourceMapURL_;
+  return getDirective(isMultiline, shouldWarnDeprecated, directiveText,
+                      directiveTextLength, "sourceMappingURL", target);
+}
+
+// Allocate the next token, give it |kind| and the range from |start| to the
+// current read offset, mark the line dirty, and store the kind in |*out|.
+template <typename Unit, class AnyCharsAccess>
+MOZ_ALWAYS_INLINE Token*
+GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal(
+    TokenKind kind, TokenStart start, TokenKind* out) {
+  MOZ_ASSERT(kind < TokenKind::Limit);
+  MOZ_ASSERT(kind != TokenKind::Eol,
+             "TokenKind::Eol should never be used in an actual Token, only "
+             "returned by peekTokenSameLine()");
+
+  TokenStreamAnyChars& state = anyCharsAccess();
+  state.flags.isDirtyLine = true;
+
+  Token* fresh = state.allocateToken();
+
+  fresh->type = kind;
+  *out = fresh->type;
+
+  fresh->pos = TokenPos(start.offset(), this->sourceUnits.offset());
+  MOZ_ASSERT(fresh->pos.begin <= fresh->pos.end);
+
+  // NOTE: the token's modifier is set in |newToken()|, so optimized
+  // non-debug code does no work passing a modifier argument that would never
+  // be used.
+
+  return fresh;
+}
+
+// Record that tokenizing failed.  Sets the |hadError| flag, poisons the
+// source units in debug builds, and always returns false so that callers can
+// write |return badToken();|.
+template <typename Unit, class AnyCharsAccess>
+MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
+  // We didn't get a token, so don't set |flags.isDirtyLine|.
+  anyCharsAccess().flags.hadError = true;
+
+  // Poisoning sourceUnits on error establishes an invariant: once an
+  // erroneous token has been seen, sourceUnits will not be consulted again.
+  // This is true because the parser will deal with the illegal token by
+  // aborting parsing immediately.
+  this->sourceUnits.poisonInDebug();
+
+  return false;
+}
+
+ bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, char32_t codePoint) {
+   // Encodes |codePoint| as UTF-16 and appends the resulting one or two code
+   // units to |charBuffer|. Returns false as soon as an append fails.
+   MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
+              "should only be processing code points validly decoded from UTF-8 "
+              "or WTF-16 source text (surrogate code points permitted)");
+
+   char16_t encoded[2];
+   unsigned encodedLength = 0;
+   unicode::UTF16Encode(codePoint, encoded, &encodedLength);
+
+   MOZ_ASSERT(encodedLength == 1 || encodedLength == 2,
+              "UTF-16 code points are only encoded in one or two units");
+
+   // The first unit is always present; the second only for a two-unit
+   // encoding.
+   if (!charBuffer.append(encoded[0])) {
+     return false;
+   }
+
+   return encodedLength == 1 || charBuffer.append(encoded[1]);
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
+     const Unit* identStart) {
+   // Re-reads the identifier beginning at |identStart| and stores its code
+   // points in |charBuffer|, taking the code point produced by
+   // matchUnicodeEscapeIdent() for any escape. The scanner position is
+   // rewound to where it was on entry, whatever the outcome. Returns false
+   // if appending or decoding fails.
+   const Unit* const resumeAt = this->sourceUnits.addressOfNextCodeUnit();
+   this->sourceUnits.setAddressOfNextCodeUnit(identStart);
+
+   auto rewindToResumePoint = MakeScopeExit(
+       [this, resumeAt]() {
+         this->sourceUnits.setAddressOfNextCodeUnit(resumeAt);
+       });
+
+   this->charBuffer.clear();
+   for (;;) {
+     int32_t unit = getCodeUnit();
+     if (unit == EOF) {
+       return true;
+     }
+
+     char32_t codePoint;
+     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+       // |rewindToResumePoint| undoes all gets, and this function doesn't
+       // update line/column info, so the non-normalizing getter suffices.
+       if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &codePoint)) {
+         return false;
+       }
+
+       if (!unicode::IsIdentifierPart(codePoint)) {
+         return true;
+       }
+     } else if (unit == '#' || unicode::IsIdentifierPart(char16_t(unit))) {
+       // Plain ASCII identifier unit (or the '#' of a private name): it is a
+       // single code unit, so append it directly.
+       if (!this->charBuffer.append(unit)) {
+         return false;
+       }
+
+       continue;
+     } else if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
+       // Neither an identifier unit nor a valid escape: the identifier ended.
+       return true;
+     }
+
+     if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
+       return false;
+     }
+   }
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName(
+     TokenStart start, const Unit* identStart, IdentifierEscapes escaping,
+     Modifier modifier, NameVisibility visibility, TokenKind* out) {
+   // Scans the remainder of an identifier whose first code point has already
+   // been consumed, then emits a reserved-word, private-name, or name token.
+   //
+   // The bad-token code runs on every path out of this function except the
+   // success cases, which release the guard first.
+   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
+
+   // We've already consumed an initial code point in the identifier, to
+   // *know* that this is an identifier. So no need to worry about not
+   // consuming any code points in the loop below.
+   for (int32_t unit = peekCodeUnit(); unit != EOF; unit = peekCodeUnit()) {
+     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+       // This ignores encoding errors: subsequent caller-side code to
+       // handle source text after the IdentifierName will do so.
+       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
+       if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) {
+         break;
+       }
+
+       MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
+                  "IdentifierPart must guarantee !IsLineTerminator or "
+                  "else we'll fail to maintain line-info/flags for EOL");
+
+       this->sourceUnits.consumeKnownCodePoint(peeked);
+       continue;
+     }
+
+     consumeKnownCodeUnit(unit);
+     if (MOZ_LIKELY(unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) {
+       continue;
+     }
+
+     // Only a Unicode escape can still extend the identifier; anything else
+     // is put back and ends the scan.
+     char32_t escapedCodePoint;
+     if (unit != '\\' || !matchUnicodeEscapeIdent(&escapedCodePoint)) {
+       ungetCodeUnit(unit);
+       break;
+     }
+
+     escaping = IdentifierEscapes::SawUnicodeEscape;
+   }
+
+   TaggedParserAtomIndex atom;
+   if (MOZ_LIKELY(escaping != IdentifierEscapes::SawUnicodeEscape)) {
+     // Escape-free identifiers can be created directly from sourceUnits.
+     const Unit* chars = identStart;
+     size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart;
+
+     // Private identifiers start with a '#', and so cannot be reserved words.
+     // Reserved words lacking escapes are represented as reserved word tokens.
+     const ReservedWordInfo* reserved = visibility == NameVisibility::Public
+                                            ? FindReservedWord(chars, length)
+                                            : nullptr;
+     if (reserved) {
+       noteBadToken.release();
+       newSimpleToken(reserved->tokentype, start, modifier, out);
+       return true;
+     }
+
+     atom = atomizeSourceChars(Span(chars, length));
+   } else {
+     // Identifiers containing Unicode escapes have to be converted into
+     // the char buffer before atomizing.
+     if (!putIdentInCharBuffer(identStart)) {
+       return false;
+     }
+
+     atom = drainCharBufferIntoAtom();
+   }
+
+   if (!atom) {
+     return false;
+   }
+
+   noteBadToken.release();
+   if (visibility == NameVisibility::Private) {
+     newPrivateNameToken(atom, start, modifier, out);
+   } else {
+     newNameToken(atom, start, modifier, out);
+   }
+   return true;
+ }
+
+ enum FirstCharKind {
+ // Classification of a token's first ASCII code unit (see firstCharKinds[]).
+ //
+ // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid
+ // token that cannot also be a prefix of a longer token. E.g. ';' has the
+ // OneChar kind, but '+' does not, because '++' and '+=' are valid longer
+ // tokens that begin with '+'.
+ //
+ // The few token kinds satisfying these properties cover roughly 35--45%
+ // of the tokens seen in practice.
+ //
+ // We represent the 'OneChar' kind with any value in the range
+ // [OneChar_Min, OneChar_Max], i.e. below TokenKind::Limit. This lets us
+ // store the TokenKind itself and avoid a char16_t-to-TokenKind conversion.
+ OneChar_Min = 0,
+ OneChar_Max = size_t(TokenKind::Limit) - 1,
+
+ Space = size_t(TokenKind::Limit),  // '\t', '\v', '\f', ' '; first non-OneChar
+ Ident,      // '$', '_', 'A'..'Z', 'a'..'z'
+ Dec,        // '1'..'9'
+ String,     // '"', '\'', '`'
+ EOL,        // '\n', '\r'
+ ZeroDigit,  // '0'
+ Other,      // every other ASCII code unit
+
+ LastCharKind = Other
+ };
+
+ // OneChar: 40, 41, 44, 58, 59, 91, 93, 123, 125, 126:
+ // '(', ')', ',', ':', ';', '[', ']', '{', '}', '~'
+ // Ident: 36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
+ // String: 34, 39, 96: '"', '\'', '`'
+ // Dec: 49..57: '1'..'9'
+ // ZeroDigit: 48: '0'
+ // Space: 9, 11, 12, 32: '\t', '\v', '\f', ' '
+ // EOL: 10, 13: '\n', '\r'
+ // Other: every remaining ASCII value, including '.', '=' and '+', which
+ // have no dedicated kind in FirstCharKind and are spelled _______ in
+ // the table below.
+ //
+ #define T_COMMA size_t(TokenKind::Comma)
+ #define T_COLON size_t(TokenKind::Colon)
+ #define T_BITNOT size_t(TokenKind::BitNot)
+ #define T_LP size_t(TokenKind::LeftParen)
+ #define T_RP size_t(TokenKind::RightParen)
+ #define T_SEMI size_t(TokenKind::Semi)
+ #define T_LB size_t(TokenKind::LeftBracket)
+ #define T_RB size_t(TokenKind::RightBracket)
+ #define T_LC size_t(TokenKind::LeftCurly)
+ #define T_RC size_t(TokenKind::RightCurly)
+ #define _______ Other
+ static const uint8_t firstCharKinds[] = {
+ // clang-format off
+ /* 0 1 2 3 4 5 6 7 8 9 */
+ /* 0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, Space,
+ /* 10+ */ EOL, Space, Space, EOL, _______, _______, _______, _______, _______, _______,
+ /* 20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
+ /* 30+ */ _______, _______, Space, _______, String, _______, Ident, _______, _______, String,
+ /* 40+ */ T_LP, T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit, Dec,
+ /* 50+ */ Dec, Dec, Dec, Dec, Dec, Dec, Dec, Dec, T_COLON, T_SEMI,
+ /* 60+ */ _______, _______, _______, _______, _______, Ident, Ident, Ident, Ident, Ident,
+ /* 70+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
+ /* 80+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
+ /* 90+ */ Ident, T_LB, _______, T_RB, _______, Ident, String, Ident, Ident, Ident,
+ /* 100+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
+ /* 110+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
+ /* 120+ */ Ident, Ident, Ident, T_LC, _______, T_RC,T_BITNOT, _______
+ // clang-format on
+ };
+ #undef T_COMMA
+ #undef T_COLON
+ #undef T_BITNOT
+ #undef T_LP
+ #undef T_RP
+ #undef T_SEMI
+ #undef T_LB
+ #undef T_RB
+ #undef T_LC
+ #undef T_RC
+ #undef _______
+
+ static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
+ "Elements of firstCharKinds[] are too small");
+
+ template <>
+ void SourceUnits<char16_t>::consumeRestOfSingleLineComment() {
+   // Consume code units until the next one is a LineTerminator or the input
+   // is exhausted. The terminator itself is left unconsumed.
+   for (;;) {
+     if (MOZ_UNLIKELY(atEnd())) {
+       return;
+     }
+
+     char16_t next = peekCodeUnit();
+     if (IsLineTerminator(next)) {
+       return;
+     }
+
+     consumeKnownCodeUnit(next);
+   }
+ }
+
+ template <>
+ void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() {
+   // Consume the rest of the line, stopping (without consuming) at a line
+   // terminator, at end of input, or just before a code point that
+   // peekCodePoint() could not decode.
+   for (;;) {
+     if (MOZ_UNLIKELY(atEnd())) {
+       return;
+     }
+
+     const Utf8Unit lead = peekCodeUnit();
+     if (IsSingleUnitLineTerminator(lead)) {
+       return;
+     }
+
+     if (MOZ_UNLIKELY(!IsAscii(lead))) {
+       // Multi-unit code point: decode it so LINE SEPARATOR and PARAGRAPH
+       // SEPARATOR can be recognized as line terminators too.
+       PeekedCodePoint<Utf8Unit> peeked = peekCodePoint();
+       if (peeked.isNone()) {
+         return;
+       }
+
+       char32_t codePoint = peeked.codePoint();
+       if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
+                        codePoint == unicode::PARA_SEPARATOR)) {
+         return;
+       }
+
+       consumeKnownCodePoint(peeked);
+     } else {
+       consumeKnownCodeUnit(lead);
+     }
+   }
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ [[nodiscard]] MOZ_ALWAYS_INLINE bool
+ TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger(
+     IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
+   // Reads one code unit. If |isIntegerUnit| accepts it, the rest of the
+   // digit sequence is handled by matchIntegerAfterFirstDigit(); otherwise
+   // the unit just read is reported through |*nextUnit| and the call
+   // succeeds without having matched any digit.
+   int32_t first = getCodeUnit();
+   if (isIntegerUnit(first)) {
+     return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit);
+   }
+
+   *nextUnit = first;
+   return true;
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ [[nodiscard]] MOZ_ALWAYS_INLINE bool
+ TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit(
+     IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
+   // Consumes digits accepted by |isIntegerUnit|, allowing single '_'
+   // separators between digits. On success the first code unit that is
+   // neither digit nor separator has been read and is stored in |*nextUnit|.
+   // Reports an error and returns false when a '_' is not followed by a
+   // digit.
+   for (;;) {
+     int32_t unit = getCodeUnit();
+     if (isIntegerUnit(unit)) {
+       continue;
+     }
+
+     if (unit != '_') {
+       *nextUnit = unit;
+       return true;
+     }
+
+     // A separator was read: a digit has to come next.
+     unit = getCodeUnit();
+     if (isIntegerUnit(unit)) {
+       continue;
+     }
+
+     // Put back what was read so the error points at the right place.
+     ungetCodeUnit(unit);
+     if (unit == '_') {
+       error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES);
+     } else {
+       ungetCodeUnit('_');
+       error(JSMSG_NUMBER_END_WITH_UNDERSCORE);
+     }
+     return false;
+   }
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber(
+     int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier,
+     TokenKind* out) {
+   // Scans a decimal literal whose first code unit |unit| was already
+   // consumed and whose text begins at |numStart|, then emits either a
+   // number token or (for an 'n' suffix) a BigInt token.
+   //
+   // The bad-token code runs on every path out of this function except the
+   // success case, which releases the guard first.
+   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
+
+   // Consume integral component digits.
+   if (IsAsciiDigit(unit) &&
+       !matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
+     return false;
+   }
+
+   // Numbers contain no escapes, so we can read directly from |sourceUnits|.
+   double dval;
+   bool isBigInt = false;
+   DecimalPoint decimalPoint = NoDecimal;
+   if (unit == 'n') {
+     // BigInt suffix: only remember it and look at what follows.
+     isBigInt = true;
+     unit = peekCodeUnit();
+   } else if (unit == '.' || unit == 'e' || unit == 'E') {
+     // Consume any decimal dot and fractional component.
+     if (unit == '.') {
+       decimalPoint = HasDecimal;
+       if (!matchInteger(IsAsciiDigit, &unit)) {
+         return false;
+       }
+     }
+
+     // Consume any exponential notation.
+     if (unit == 'e' || unit == 'E') {
+       unit = getCodeUnit();
+       if (unit == '+' || unit == '-') {
+         unit = getCodeUnit();
+       }
+
+       // Exponential notation must contain at least one digit.
+       if (!IsAsciiDigit(unit)) {
+         ungetCodeUnit(unit);
+         error(JSMSG_MISSING_EXPONENT);
+         return false;
+       }
+
+       // Consume exponential digits.
+       if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
+         return false;
+       }
+     }
+
+     ungetCodeUnit(unit);
+
+     const Unit* numEnd = this->sourceUnits.addressOfNextCodeUnit();
+     if (!GetDecimal(numStart, numEnd, &dval)) {
+       ReportOutOfMemory(this->fc);
+       return false;
+     }
+   } else {
+     // NOTE: |unit| may be EOF here.
+     ungetCodeUnit(unit);
+
+     // Most numbers are pure decimal integers without fractional component
+     // or exponential notation. Handle that with optimized code.
+     const Unit* numEnd = this->sourceUnits.addressOfNextCodeUnit();
+     if (!GetDecimalInteger(numStart, numEnd, &dval)) {
+       ReportOutOfMemory(this->fc);
+       return false;
+     }
+   }
+
+   // Number followed by IdentifierStart is an error. (This is the only place
+   // in ECMAScript where token boundary is inadequate to properly separate
+   // two tokens, necessitating this unaesthetic lookahead.)
+   bool identifierStartFollows = false;
+   if (unit != EOF) {
+     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
+       identifierStartFollows = unicode::IsIdentifierStart(char16_t(unit));
+     } else {
+       // This ignores encoding errors: subsequent caller-side code to
+       // handle source text after the number will do so.
+       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
+       identifierStartFollows =
+           !peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint());
+     }
+   }
+
+   if (identifierStartFollows) {
+     error(JSMSG_IDSTART_AFTER_NUMBER);
+     return false;
+   }
+
+   noteBadToken.release();
+
+   if (isBigInt) {
+     return bigIntLiteral(start, modifier, out);
+   }
+
+   newNumberToken(dval, decimalPoint, start, modifier, out);
+   return true;
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
+     TokenStart start, TokenKind* out) {
+   // Scans a regular expression literal whose opening '/' was just consumed:
+   // the pattern text is collected in |charBuffer|, then the trailing flags
+   // are validated and a RegExp token is emitted. Every failure path goes
+   // through badToken().
+   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/'));
+   this->charBuffer.clear();
+
+   // Decodes the non-ASCII code point introduced by |lead| and appends it to
+   // the pattern; LINE/PARAGRAPH SEPARATOR terminate the literal early.
+   auto appendNonAsciiCodePoint = [this](int32_t lead) {
+     MOZ_ASSERT(lead != EOF);
+     MOZ_ASSERT(!this->isAsciiCodePoint(lead));
+
+     char32_t codePoint;
+     if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead),
+                                                  &codePoint)) {
+       return false;
+     }
+
+     if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
+                      codePoint == unicode::PARA_SEPARATOR)) {
+       this->sourceUnits.ungetLineOrParagraphSeparator();
+       this->error(JSMSG_UNTERMINATED_REGEXP);
+       return false;
+     }
+
+     return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
+   };
+
+   // Puts |unit| back, reports the unterminated literal, and yields the
+   // badToken() result for the caller to return.
+   auto failUnterminated = [this](int32_t unit) {
+     this->ungetCodeUnit(unit);
+     this->error(JSMSG_UNTERMINATED_REGEXP);
+     return this->badToken();
+   };
+
+   bool insideClass = false;
+   for (;;) {
+     int32_t unit = getCodeUnit();
+     if (unit == EOF) {
+       return failUnterminated(unit);
+     }
+
+     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+       if (!appendNonAsciiCodePoint(unit)) {
+         return badToken();
+       }
+
+       continue;
+     }
+
+     // For IE compat, allow unescaped / in char classes.
+     if (unit == '/' && !insideClass) {
+       break;
+     }
+
+     if (unit == '[' || unit == ']') {
+       insideClass = unit == '[';
+     } else if (unit == '\\') {
+       // Keep the backslash, then look at the escaped unit.
+       if (!this->charBuffer.append(unit)) {
+         return badToken();
+       }
+
+       unit = getCodeUnit();
+       if (unit == EOF) {
+         return failUnterminated(unit);
+       }
+
+       // The code below only handles ASCII code points, so deal with
+       // non-ASCII here and skip everything else.
+       if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+         if (!appendNonAsciiCodePoint(unit)) {
+           return badToken();
+         }
+
+         continue;
+       }
+     }
+
+     // NOTE: Non-ASCII LineTerminators were handled by
+     //       appendNonAsciiCodePoint calls above.
+     if (unit == '\r' || unit == '\n') {
+       return failUnterminated(unit);
+     }
+
+     MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit)));
+     if (!this->charBuffer.append(unit)) {
+       return badToken();
+     }
+   }
+
+   // Maps a code unit to its flag bit. Returns false when |candidate| is not
+   // an ASCII letter at all (the flag sequence is over); unknown letters
+   // map to NoFlags so the caller can reject them.
+   auto flagForUnit = [](int32_t candidate, uint8_t* flagOut) {
+     switch (candidate) {
+       case 'd':
+         *flagOut = RegExpFlag::HasIndices;
+         break;
+       case 'g':
+         *flagOut = RegExpFlag::Global;
+         break;
+       case 'i':
+         *flagOut = RegExpFlag::IgnoreCase;
+         break;
+       case 'm':
+         *flagOut = RegExpFlag::Multiline;
+         break;
+       case 's':
+         *flagOut = RegExpFlag::DotAll;
+         break;
+       case 'u':
+         *flagOut = RegExpFlag::Unicode;
+         break;
+       case 'v':
+         *flagOut = RegExpFlag::UnicodeSets;
+         break;
+       case 'y':
+         *flagOut = RegExpFlag::Sticky;
+         break;
+       default:
+         if (!IsAsciiAlpha(candidate)) {
+           return false;
+         }
+         *flagOut = RegExpFlag::NoFlags;
+         break;
+     }
+     return true;
+   };
+
+   int32_t unit;
+   RegExpFlags reflags = RegExpFlag::NoFlags;
+   for (;;) {
+     unit = getCodeUnit();
+
+     uint8_t flag;
+     if (!flagForUnit(unit, &flag)) {
+       break;
+     }
+
+     // Reject repeated flags, unknown letters, and the mutually exclusive
+     // /u and /v pair; all three are reported the same way.
+     if ((reflags & flag) || flag == RegExpFlag::NoFlags ||
+         ((reflags & RegExpFlag::Unicode) && (flag & RegExpFlag::UnicodeSets)) ||
+         ((reflags & RegExpFlag::UnicodeSets) && (flag & RegExpFlag::Unicode))) {
+       ungetCodeUnit(unit);
+       char buf[2] = {char(unit), '\0'};
+       error(JSMSG_BAD_REGEXP_FLAG, buf);
+       return badToken();
+     }
+
+     reflags |= flag;
+   }
+   ungetCodeUnit(unit);
+
+   newRegExpToken(reflags, start, out);
+   return true;
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
+     TokenStart start, Modifier modifier, TokenKind* out) {
+   // Copies the text of a BigInt literal (everything from |start| up to, but
+   // excluding, the trailing 'n') into |charBuffer| with '_' separators
+   // dropped, then emits the BigInt token. Returns false if an append fails.
+   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n'));
+   MOZ_ASSERT(this->sourceUnits.offset() > start.offset());
+   uint32_t length = this->sourceUnits.offset() - start.offset();
+   MOZ_ASSERT(length >= 2);
+
+   this->charBuffer.clear();
+   mozilla::Range<const Unit> chars(
+       this->sourceUnits.codeUnitPtrAt(start.offset()), length);
+
+   // The last unit is the 'n' suffix and is not part of the digits.
+   const uint32_t suffixIndex = length - 1;
+   uint32_t idx = 0;
+   while (idx < suffixIndex) {
+     int32_t unit = CodeUnitValue(chars[idx]);
+     idx++;
+
+     // Char buffer may start with a 0[bBoOxX] prefix, then follows with
+     // binary, octal, decimal, or hex digits. Already checked by caller, as
+     // the "n" indicating bigint comes at the end.
+     MOZ_ASSERT(isAsciiCodePoint(unit));
+
+     // Separators are skipped; everything else is kept.
+     if (unit != '_' &&
+         !AppendCodePointToCharBuffer(this->charBuffer, unit)) {
+       return false;
+     }
+   }
+
+   newBigIntToken(start, modifier, out);
+   return true;
+ }
+
+ template <typename Unit, class AnyCharsAccess>
+ void GeneralTokenStreamChars<Unit,
+                              AnyCharsAccess>::consumeOptionalHashbangComment() {
+   // Skips a leading "#!..." comment, if there is one.
+   //
+   // HashbangComment ::
+   //   #! SingleLineCommentChars_opt
+   MOZ_ASSERT(this->sourceUnits.atStart(),
+              "HashBangComment can only appear immediately at the start of a "
+              "Script or Module");
+
+   // HashbangComment is optional at start of Script or Module, so a missing
+   // '#' simply means there is nothing to do.
+   if (matchCodeUnit('#')) {
+     if (matchCodeUnit('!')) {
+       // This doesn't consume a concluding LineTerminator, and it stops
+       // consuming just before any encoding error. The subsequent |getToken|
+       // call will call |getTokenInternal| below which will handle these
+       // possibilities.
+       this->sourceUnits.consumeRestOfSingleLineComment();
+     } else {
+       // # not followed by ! at start of Script or Module is an error, but
+       // normal parsing code will handle that error just fine if we let it.
+       ungetCodeUnit('#');
+     }
+   }
+ }
+
+template <typename Unit, class AnyCharsAccess>
+[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal(
+ TokenKind* const ttp, const Modifier modifier) {
+ // Assume we'll fail: success cases will overwrite this.
+#ifdef DEBUG
+ *ttp = TokenKind::Limit;
+#endif
+ MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
+
+ // This loop runs more than once only when whitespace or comments are
+ // encountered.
+ do {
+ int32_t unit = peekCodeUnit();
+ if (MOZ_UNLIKELY(unit == EOF)) {
+ MOZ_ASSERT(this->sourceUnits.atEnd());
+ anyCharsAccess().flags.isEOF = true;
+ TokenStart start(this->sourceUnits, 0);
+ newSimpleToken(TokenKind::Eof, start, modifier, ttp);
+ return true;
+ }
+
+ if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+ // Non-ASCII code points can only be identifiers or whitespace. It would
+ // be nice to compute these *after* discarding whitespace, but IN A WORLD
+ // where |unicode::IsSpace| requires consuming a variable number of code
+ // units, it's easier to assume it's an identifier and maybe do a little
+ // wasted work, than to unget and compute and reget if whitespace.
+ TokenStart start(this->sourceUnits, 0);
+ const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit();
+
+ PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
+ if (peeked.isNone()) {
+ MOZ_ALWAYS_FALSE(getCodePoint());
+ return badToken();
+ }
+
+ char32_t cp = peeked.codePoint();
+ if (unicode::IsSpace(cp)) {
+ this->sourceUnits.consumeKnownCodePoint(peeked);
+ if (IsLineTerminator(cp)) {
+ if (!updateLineInfoForEOL()) {
+ return badToken();
+ }
+
+ anyCharsAccess().updateFlagsForEOL();
+ }
+
+ continue;
+ }
+
+ static_assert(isAsciiCodePoint('$'),
+ "IdentifierStart contains '$', but as "
+ "!IsUnicodeIDStart('$'), ensure that '$' is never "
+ "handled here");
+ static_assert(isAsciiCodePoint('_'),
+ "IdentifierStart contains '_', but as "
+ "!IsUnicodeIDStart('_'), ensure that '_' is never "
+ "handled here");
+
+ if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
+ this->sourceUnits.consumeKnownCodePoint(peeked);
+ MOZ_ASSERT(!IsLineTerminator(cp),
+ "IdentifierStart must guarantee !IsLineTerminator "
+ "or else we'll fail to maintain line-info/flags "
+ "for EOL here");
+
+ return identifierName(start, identStart, IdentifierEscapes::None,
+ modifier, NameVisibility::Public, ttp);
+ }
+
+ reportIllegalCharacter(cp);
+ return badToken();
+ } // !isAsciiCodePoint(unit)
+
+ consumeKnownCodeUnit(unit);
+
+ // Get the token kind, based on the first char. The ordering of c1kind
+ // comparison is based on the frequency of tokens in real code:
+ // Parsemark (which represents typical JS code on the web) and the
+ // Unreal demo (which represents asm.js code).
+ //
+ // Parsemark Unreal
+ // OneChar 32.9% 39.7%
+ // Space 25.0% 0.6%
+ // Ident 19.2% 36.4%
+ // Dec 7.2% 5.1%
+ // String 7.9% 0.0%
+ // EOL 1.7% 0.0%
+ // ZeroDigit 0.4% 4.9%
+ // Other 5.7% 13.3%
+ //
+ // The ordering is based mostly on Parsemark frequencies, with Unreal
+ // frequencies used to break close categories (e.g. |Dec| and
+ // |String|). |Other| is biggish, but no other token kind is common
+ // enough for it to be worth adding extra values to FirstCharKind.
+ FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]);
+
+ // Look for an unambiguous single-char token.
+ //
+ if (c1kind <= OneChar_Max) {
+ TokenStart start(this->sourceUnits, -1);
+ newSimpleToken(TokenKind(c1kind), start, modifier, ttp);
+ return true;
+ }
+
+ // Skip over non-EOL whitespace chars.
+ //
+ if (c1kind == Space) {
+ continue;
+ }
+
+ // Look for an identifier.
+ //
+ if (c1kind == Ident) {
+ TokenStart start(this->sourceUnits, -1);
+ return identifierName(
+ start, this->sourceUnits.addressOfNextCodeUnit() - 1,
+ IdentifierEscapes::None, modifier, NameVisibility::Public, ttp);
+ }
+
+ // Look for a decimal number.
+ //
+ if (c1kind == Dec) {
+ TokenStart start(this->sourceUnits, -1);
+ const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
+ return decimalNumber(unit, start, numStart, modifier, ttp);
+ }
+
+ // Look for a string or a template string.
+ //
+ if (c1kind == String) {
+ return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp);
+ }
+
+ // Skip over EOL chars, updating line state along the way.
+ //
+ if (c1kind == EOL) {
+ if (unit == '\r') {
+ matchLineTerminator('\n');
+ }
+
+ if (!updateLineInfoForEOL()) {
+ return badToken();
+ }
+
+ anyCharsAccess().updateFlagsForEOL();
+ continue;
+ }
+
+ // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a
+ // number starting with '0' that contains '8' or '9' and is treated as
+ // decimal) number.
+ //
+ if (c1kind == ZeroDigit) {
+ TokenStart start(this->sourceUnits, -1);
+ int radix;
+ bool isBigInt = false;
+ const Unit* numStart;
+ unit = getCodeUnit();
+ if (unit == 'x' || unit == 'X') {
+ radix = 16;
+ unit = getCodeUnit();
+ if (!IsAsciiHexDigit(unit)) {
+ // NOTE: |unit| may be EOF here.
+ ungetCodeUnit(unit);
+ error(JSMSG_MISSING_HEXDIGITS);
+ return badToken();
+ }
+
+ // one past the '0x'
+ numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
+
+ if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) {
+ return badToken();
+ }
+ } else if (unit == 'b' || unit == 'B') {
+ radix = 2;
+ unit = getCodeUnit();
+ if (!IsAsciiBinary(unit)) {
+ // NOTE: |unit| may be EOF here.
+ ungetCodeUnit(unit);
+ error(JSMSG_MISSING_BINARY_DIGITS);
+ return badToken();
+ }
+
+ // one past the '0b'
+ numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
+
+ if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) {
+ return badToken();
+ }
+ } else if (unit == 'o' || unit == 'O') {
+ radix = 8;
+ unit = getCodeUnit();
+ if (!IsAsciiOctal(unit)) {
+ // NOTE: |unit| may be EOF here.
+ ungetCodeUnit(unit);
+ error(JSMSG_MISSING_OCTAL_DIGITS);
+ return badToken();
+ }
+
+ // one past the '0o'
+ numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
+
+ if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) {
+ return badToken();
+ }
+ } else if (IsAsciiDigit(unit)) {
+ // Reject octal literals that appear in strict mode code.
+ if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) {
+ return badToken();
+ }
+
+ // The above test doesn't catch a few edge cases; see
+ // |GeneralParser::maybeParseDirective|. Record the violation so that
+ // that function can handle them.
+ anyCharsAccess().setSawDeprecatedOctalLiteral();
+
+ radix = 8;
+ // one past the '0'
+ numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
+
+ bool nonOctalDecimalIntegerLiteral = false;
+ do {
+ if (unit >= '8') {
+ nonOctalDecimalIntegerLiteral = true;
+ }
+ unit = getCodeUnit();
+ } while (IsAsciiDigit(unit));
+
+ if (unit == '_') {
+ ungetCodeUnit(unit);
+ error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
+ return badToken();
+ }
+
+ if (unit == 'n') {
+ ungetCodeUnit(unit);
+ error(JSMSG_BIGINT_INVALID_SYNTAX);
+ return badToken();
+ }
+
+ if (nonOctalDecimalIntegerLiteral) {
+ // Use the decimal scanner for the rest of the number.
+ return decimalNumber(unit, start, numStart, modifier, ttp);
+ }
+ } else if (unit == '_') {
+ // Give a more explicit error message when '_' is used after '0'.
+ ungetCodeUnit(unit);
+ error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
+ return badToken();
+ } else {
+ // '0' not followed by [XxBbOo0-9_]; scan as a decimal number.
+ ungetCodeUnit(unit);
+ numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; // The '0'.
+ return decimalNumber('0', start, numStart, modifier, ttp);
+ }
+
+ if (unit == 'n') {
+ isBigInt = true;
+ unit = peekCodeUnit();
+ } else {
+ ungetCodeUnit(unit);
+ }
+
+ // Error if an identifier-start code point appears immediately
+ // after the number. Somewhat surprisingly, if we don't check
+ // here, we'll never check at all.
+ if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
+ if (unicode::IsIdentifierStart(char16_t(unit))) {
+ error(JSMSG_IDSTART_AFTER_NUMBER);
+ return badToken();
+ }
+ } else if (MOZ_LIKELY(unit != EOF)) {
+ // This ignores encoding errors: subsequent caller-side code to
+ // handle source text after the number will do so.
+ PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
+ if (!peeked.isNone() &&
+ unicode::IsIdentifierStart(peeked.codePoint())) {
+ error(JSMSG_IDSTART_AFTER_NUMBER);
+ return badToken();
+ }
+ }
+
+ if (isBigInt) {
+ return bigIntLiteral(start, modifier, ttp);
+ }
+
+ double dval;
+ if (!GetFullInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(),
+ radix, IntegerSeparatorHandling::SkipUnderscore,
+ &dval)) {
+ ReportOutOfMemory(this->fc);
+ return badToken();
+ }
+ newNumberToken(dval, NoDecimal, start, modifier, ttp);
+ return true;
+ }
+
+ MOZ_ASSERT(c1kind == Other);
+
+ // This handles everything else. Simple tokens distinguished solely by
+ // TokenKind should set |simpleKind| and break, to share simple-token
+ // creation code for all such tokens. All other tokens must be handled
+ // by returning (or by continuing from the loop enclosing this).
+ //
+ TokenStart start(this->sourceUnits, -1);
+ TokenKind simpleKind;
+#ifdef DEBUG
+ simpleKind = TokenKind::Limit; // sentinel value for code after switch
+#endif
+
+ // The block a ways above eliminated all non-ASCII, so cast to the
+ // smallest type possible to assist the C++ compiler.
+ switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
+ case '.':
+ if (IsAsciiDigit(peekCodeUnit())) {
+ return decimalNumber('.', start,
+ this->sourceUnits.addressOfNextCodeUnit() - 1,
+ modifier, ttp);
+ }
+
+ unit = getCodeUnit();
+ if (unit == '.') {
+ if (matchCodeUnit('.')) {
+ simpleKind = TokenKind::TripleDot;
+ break;
+ }
+ }
+
+ // NOTE: |unit| may be EOF here. A stray '.' at EOF would be an
+ // error, but subsequent code will handle it.
+ ungetCodeUnit(unit);
+
+ simpleKind = TokenKind::Dot;
+ break;
+
+ case '#': {
+#ifdef ENABLE_RECORD_TUPLE
+ if (matchCodeUnit('{')) {
+ simpleKind = TokenKind::HashCurly;
+ break;
+ }
+ if (matchCodeUnit('[')) {
+ simpleKind = TokenKind::HashBracket;
+ break;
+ }
+#endif
+
+ TokenStart start(this->sourceUnits, -1);
+ const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
+ IdentifierEscapes sawEscape;
+ if (!matchIdentifierStart(&sawEscape)) {
+ return badToken();
+ }
+ return identifierName(start, identStart, sawEscape, modifier,
+ NameVisibility::Private, ttp);
+ }
+
+ case '=':
+ if (matchCodeUnit('=')) {
+ simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq;
+ } else if (matchCodeUnit('>')) {
+ simpleKind = TokenKind::Arrow;
+ } else {
+ simpleKind = TokenKind::Assign;
+ }
+ break;
+
+ case '+':
+ if (matchCodeUnit('+')) {
+ simpleKind = TokenKind::Inc;
+ } else {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add;
+ }
+ break;
+
+ case '\\': {
+ char32_t codePoint;
+ if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) {
+ return identifierName(
+ start,
+ this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1,
+ IdentifierEscapes::SawUnicodeEscape, modifier,
+ NameVisibility::Public, ttp);
+ }
+
+ // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
+ // could point at the 'H'. But we don't do that now, so the code
+ // unit after the '\' isn't necessarily bad, so just point at the
+ // start of the actually-invalid escape.
+ ungetCodeUnit('\\');
+ error(JSMSG_BAD_ESCAPE);
+ return badToken();
+ }
+
+ case '|':
+ if (matchCodeUnit('|')) {
+ simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or;
+ } else {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr;
+ }
+ break;
+
+ case '^':
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor;
+ break;
+
+ case '&':
+ if (matchCodeUnit('&')) {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And;
+ } else {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd;
+ }
+ break;
+
+ case '?':
+ if (matchCodeUnit('.')) {
+ unit = getCodeUnit();
+ if (IsAsciiDigit(unit)) {
+ // If the "?." is followed by a digit, for example in the form
+ // `<...> ?.5 <...>`, then it should be treated as a ternary
+ // (a Hook followed by a decimal number) rather than an optional chain.
+ simpleKind = TokenKind::Hook;
+ ungetCodeUnit(unit);
+ ungetCodeUnit('.');
+ } else {
+ ungetCodeUnit(unit);
+ simpleKind = TokenKind::OptionalChain;
+ }
+ } else if (matchCodeUnit('?')) {
+ simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign
+ : TokenKind::Coalesce;
+ } else {
+ simpleKind = TokenKind::Hook;
+ }
+ break;
+
+ case '!':
+ if (matchCodeUnit('=')) {
+ simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne;
+ } else {
+ simpleKind = TokenKind::Not;
+ }
+ break;
+
+ case '<':
+ if (anyCharsAccess().options().allowHTMLComments) {
+ // Treat HTML begin-comment as comment-till-end-of-line.
+ if (matchCodeUnit('!')) {
+ if (matchCodeUnit('-')) {
+ if (matchCodeUnit('-')) {
+ this->sourceUnits.consumeRestOfSingleLineComment();
+ continue;
+ }
+ ungetCodeUnit('-');
+ }
+ ungetCodeUnit('!');
+ }
+ }
+ if (matchCodeUnit('<')) {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh;
+ } else {
+ simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt;
+ }
+ break;
+
+ case '>':
+ if (matchCodeUnit('>')) {
+ if (matchCodeUnit('>')) {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh;
+ } else {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh;
+ }
+ } else {
+ simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt;
+ }
+ break;
+
+ case '*':
+ if (matchCodeUnit('*')) {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow;
+ } else {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul;
+ }
+ break;
+
+ case '/':
+ // Look for a single-line comment.
+ if (matchCodeUnit('/')) {
+ unit = getCodeUnit();
+ if (unit == '@' || unit == '#') {
+ bool shouldWarn = unit == '@';
+ if (!getDirectives(false, shouldWarn)) {
+ return false;
+ }
+ } else {
+ // NOTE: |unit| may be EOF here.
+ ungetCodeUnit(unit);
+ }
+
+ this->sourceUnits.consumeRestOfSingleLineComment();
+ continue;
+ }
+
+ // Look for a multi-line comment.
+ if (matchCodeUnit('*')) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ unsigned linenoBefore = anyChars.lineno;
+
+ do {
+ int32_t unit = getCodeUnit();
+ if (unit == EOF) {
+ error(JSMSG_UNTERMINATED_COMMENT);
+ return badToken();
+ }
+
+ if (unit == '*' && matchCodeUnit('/')) {
+ break;
+ }
+
+ if (unit == '@' || unit == '#') {
+ bool shouldWarn = unit == '@';
+ if (!getDirectives(true, shouldWarn)) {
+ return badToken();
+ }
+ } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
+ if (!getFullAsciiCodePoint(unit)) {
+ return badToken();
+ }
+ } else {
+ char32_t codePoint;
+ if (!getNonAsciiCodePoint(unit, &codePoint)) {
+ return badToken();
+ }
+ }
+ } while (true);
+
+ if (linenoBefore != anyChars.lineno) {
+ anyChars.updateFlagsForEOL();
+ }
+
+ continue;
+ }
+
+ // Look for a regexp.
+ if (modifier == SlashIsRegExp) {
+ return regexpLiteral(start, ttp);
+ }
+
+ simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div;
+ break;
+
+ case '%':
+ simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod;
+ break;
+
+ case '-':
+ if (matchCodeUnit('-')) {
+ if (anyCharsAccess().options().allowHTMLComments &&
+ !anyCharsAccess().flags.isDirtyLine) {
+ if (matchCodeUnit('>')) {
+ this->sourceUnits.consumeRestOfSingleLineComment();
+ continue;
+ }
+ }
+
+ simpleKind = TokenKind::Dec;
+ } else {
+ simpleKind =
+ matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
+ }
+ break;
+
+#ifdef ENABLE_DECORATORS
+ case '@':
+ simpleKind = TokenKind::At;
+ break;
+#endif
+
+ default:
+ // We consumed a bad ASCII code point/unit. Put it back so the
+ // error location is the bad code point.
+ ungetCodeUnit(unit);
+ reportIllegalCharacter(unit);
+ return badToken();
+ } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
+
+ MOZ_ASSERT(simpleKind != TokenKind::Limit,
+ "switch-statement should have set |simpleKind| before "
+ "breaking");
+
+ newSimpleToken(simpleKind, start, modifier, ttp);
+ return true;
+ } while (true);
+}
+/*
+ * Lexes the remainder of a string literal or of one template-literal chunk and
+ * creates the corresponding atom token.
+ *
+ * NOTE(review): the opening delimiter is presumably consumed by the caller
+ * already (the -1 passed to TokenStart below suggests the token starts one
+ * unit back) -- confirm against the call sites.
+ *
+ * untilChar: closing delimiter to scan for. Asserted to be a single quote, a
+ * double quote or a backtick; a backtick selects template mode.
+ * modifier: passed through unchanged to newAtomToken.
+ * out: passed through to newAtomToken together with the computed kind, which
+ * is String outside template mode, TemplateHead when a template chunk
+ * ends at a dollar-sign/open-brace pair, and NoSubsTemplate otherwise.
+ *
+ * Returns true once the token has been created and false on any failure.
+ * Every false return also runs badToken() through the noteBadToken scope-exit
+ * guard, which is released only on the success path.
+ *
+ * Escape handling differs by mode. In template mode a malformed escape is only
+ * recorded with setInvalidTemplateEscape and scanning continues without
+ * appending anything for it. Outside template mode it is reported with
+ * reportInvalidEscapeError, or with strictModeError for legacy octal and 8/9
+ * escapes, in which case scanning continues if strictModeError returns true.
+ */
+template <typename Unit, class AnyCharsAccess>
+bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
+ char untilChar, Modifier modifier, TokenKind* out) {
+ MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`',
+ "unexpected string/template literal delimiter");
+
+ bool parsingTemplate = (untilChar == '`');
+ bool templateHead = false;
+
+ TokenStart start(this->sourceUnits, -1);
+ this->charBuffer.clear();
+
+ // Run the bad-token code for every path out of this function except the
+ // one success-case.
+ auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
+
+ auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) {
+ // Unicode separators aren't end-of-line in template or (as of
+ // recently) string literals, so this assertion doesn't allow them.
+ MOZ_ASSERT(this->sourceUnits.atEnd() ||
+ this->sourceUnits.peekCodeUnit() == Unit('\r') ||
+ this->sourceUnits.peekCodeUnit() == Unit('\n'),
+ "must be parked at EOF or EOL to call this function");
+
+ // The various errors reported here include language like "in a ''
+ // literal" or similar, with '' being '', "", or `` as appropriate.
+ const char delimiters[] = {untilChar, untilChar, '\0'};
+
+ this->error(errnum, delimiters);
+ return;
+ };
+
+ // We need to detect any of these chars: " or ', \n (or its
+ // equivalents), \\, EOF. Because we detect EOL sequences here and
+ // put them back immediately, we can use getCodeUnit().
+ int32_t unit;
+ while ((unit = getCodeUnit()) != untilChar) {
+ if (unit == EOF) {
+ ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL);
+ return false;
+ }
+
+ // Non-ASCII code points are always directly appended -- even
+ // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are
+ // ordinarily LineTerminatorSequences. (They contribute their literal
+ // values to template and [as of recently] string literals, but they're
+ // line terminators when computing line/column coordinates.) Handle
+ // the non-ASCII case early for readability.
+ if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+ char32_t cp;
+ if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
+ return false;
+ }
+
+ if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
+ cp == unicode::PARA_SEPARATOR)) {
+ if (!updateLineInfoForEOL()) {
+ return false;
+ }
+
+ anyCharsAccess().updateFlagsForEOL();
+ } else {
+ MOZ_ASSERT(!IsLineTerminator(cp));
+ }
+
+ if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
+ return false;
+ }
+
+ continue;
+ }
+
+ if (unit == '\\') {
+ // When parsing templates, we don't immediately report errors for
+ // invalid escapes; these are handled by the parser. We don't
+ // append to charBuffer in those cases because it won't be read.
+ unit = getCodeUnit();
+ if (unit == EOF) {
+ ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
+ return false;
+ }
+
+ // Non-ASCII |unit| isn't handled by code after this, so dedicate
+ // an unlikely special-case to it and then continue.
+ if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
+ char32_t codePoint;
+ if (!getNonAsciiCodePoint(unit, &codePoint)) {
+ return false;
+ }
+
+ // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH
+ // SEPARATOR, they'll be normalized to '\n'. '\' followed by
+ // LineContinuation represents no code points, so don't append
+ // in this case.
+ if (codePoint != '\n') {
+ if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
+ return false;
+ }
+ }
+
+ continue;
+ }
+
+ // The block above eliminated all non-ASCII, so cast to the
+ // smallest type possible to assist the C++ compiler.
+ switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
+ case 'b':
+ unit = '\b';
+ break;
+ case 'f':
+ unit = '\f';
+ break;
+ case 'n':
+ unit = '\n';
+ break;
+ case 'r':
+ unit = '\r';
+ break;
+ case 't':
+ unit = '\t';
+ break;
+ case 'v':
+ unit = '\v';
+ break;
+
+ case '\r':
+ matchLineTerminator('\n');
+ [[fallthrough]];
+ case '\n': {
+ // LineContinuation represents no code points. We're manually
+ // consuming a LineTerminatorSequence, so we must manually
+ // update line/column info.
+ if (!updateLineInfoForEOL()) {
+ return false;
+ }
+
+ continue;
+ }
+
+ // Unicode character specification.
+ case 'u': {
+ int32_t c2 = getCodeUnit();
+ if (c2 == EOF) {
+ ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
+ return false;
+ }
+
+ // First handle a delimited Unicode escape, e.g. \u{1F4A9}.
+ if (c2 == '{') {
+ uint32_t start = this->sourceUnits.offset() - 3;  // three units consumed so far (backslash, u, open brace): offset of the backslash
+ uint32_t code = 0;
+ bool first = true;
+ bool valid = true;
+ do {
+ int32_t u3 = getCodeUnit();
+ if (u3 == EOF) {
+ if (parsingTemplate) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ anyChars.setInvalidTemplateEscape(start,
+ InvalidEscapeType::Unicode);
+ valid = false;
+ break;
+ }
+ reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
+ return false;
+ }
+ if (u3 == '}') {
+ if (first) {  // closing brace with no digits at all: empty escape
+ if (parsingTemplate) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ anyChars.setInvalidTemplateEscape(
+ start, InvalidEscapeType::Unicode);
+ valid = false;
+ break;
+ }
+ reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
+ return false;
+ }
+ break;
+ }
+
+ // Beware: |u3| may be a non-ASCII code point here; if
+ // so it'll pass into this |if|-block.
+ if (!IsAsciiHexDigit(u3)) {
+ if (parsingTemplate) {
+ // We put the code unit back so that we read it
+ // on the next pass, which matters if it was
+ // '`' or '\'.
+ ungetCodeUnit(u3);
+
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ anyChars.setInvalidTemplateEscape(start,
+ InvalidEscapeType::Unicode);
+ valid = false;
+ break;
+ }
+ reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
+ return false;
+ }
+
+ code = (code << 4) | AsciiAlphanumericToNumber(u3);
+ if (code > unicode::NonBMPMax) {
+ if (parsingTemplate) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ anyChars.setInvalidTemplateEscape(
+ start + 3, InvalidEscapeType::UnicodeOverflow);  // start + 3: first unit after the open brace
+ valid = false;
+ break;
+ }
+ reportInvalidEscapeError(start + 3,
+ InvalidEscapeType::UnicodeOverflow);
+ return false;
+ }
+
+ first = false;
+ } while (true);
+
+ if (!valid) {  // template mode only: escape was recorded as invalid, append nothing
+ continue;
+ }
+
+ MOZ_ASSERT(code <= unicode::NonBMPMax);
+ if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
+ return false;
+ }
+
+ continue;
+ } // end of delimited Unicode escape handling
+
+ // Otherwise it must be a fixed-length \uXXXX Unicode escape.
+ // If it isn't, this is usually an error -- but if this is a
+ // template literal, we must defer error reporting because
+ // malformed escapes are okay in *tagged* template literals.
+ char16_t v;
+ if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) {
+ unit = (AsciiAlphanumericToNumber(c2) << 12) | v;  // c2 is the top nibble, v holds the remaining three digits
+ } else {
+ // Beware: |c2| may not be an ASCII code point here!
+ ungetCodeUnit(c2);
+ uint32_t start = this->sourceUnits.offset() - 2;  // c2 was put back, so only backslash and u are consumed
+ if (parsingTemplate) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ anyChars.setInvalidTemplateEscape(start,
+ InvalidEscapeType::Unicode);
+ continue;
+ }
+ reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
+ return false;
+ }
+ break;
+ } // case 'u'
+
+ // Hexadecimal character specification.
+ case 'x': {
+ char16_t v;
+ if (this->sourceUnits.matchHexDigits(2, &v)) {
+ unit = v;
+ } else {
+ uint32_t start = this->sourceUnits.offset() - 2;  // NOTE(review): assumes matchHexDigits consumed nothing on failure -- confirm
+ if (parsingTemplate) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ anyChars.setInvalidTemplateEscape(start,
+ InvalidEscapeType::Hexadecimal);
+ continue;
+ }
+ reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal);
+ return false;
+ }
+ break;
+ }
+
+ default: {
+ if (!IsAsciiOctal(unit)) {
+ // \8 or \9 in an untagged template literal is a syntax error,
+ // reported in GeneralParser::noSubstitutionUntaggedTemplate.
+ //
+ // Tagged template literals, however, may contain \8 and \9. The
+ // "cooked" representation of such a part will be |undefined|, and
+ // the "raw" representation will contain the literal characters.
+ //
+ // function f(parts) {
+ // assertEq(parts[0], undefined);
+ // assertEq(parts.raw[0], "\\8");
+ // return "composed";
+ // }
+ // assertEq(f`\8`, "composed");
+ if (unit == '8' || unit == '9') {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ if (parsingTemplate) {
+ anyChars.setInvalidTemplateEscape(
+ this->sourceUnits.offset() - 2,
+ InvalidEscapeType::EightOrNine);
+ continue;
+ }
+
+ // \8 and \9 are forbidden in string literals in strict mode code.
+ if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) {
+ return false;
+ }
+
+ // The above test doesn't catch a few edge cases; see
+ // |GeneralParser::maybeParseDirective|. Record the violation so
+ // that that function can handle them.
+ anyChars.setSawDeprecatedEightOrNineEscape();
+ }
+ break;  // any other escaped ASCII unit stands for itself and is appended below
+ }
+
+ // Octal character specification.
+ int32_t val = AsciiOctalToNumber(unit);
+
+ unit = peekCodeUnit();  // peek only: consumed below just when it extends the octal escape
+ if (MOZ_UNLIKELY(unit == EOF)) {
+ ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
+ return false;
+ }
+
+ // Strict mode code allows only \0 followed by a non-digit.
+ if (val != 0 || IsAsciiDigit(unit)) {
+ TokenStreamAnyChars& anyChars = anyCharsAccess();
+ if (parsingTemplate) {
+ anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2,
+ InvalidEscapeType::Octal);
+ continue;
+ }
+
+ if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) {
+ return false;
+ }
+
+ // The above test doesn't catch a few edge cases; see
+ // |GeneralParser::maybeParseDirective|. Record the violation so
+ // that that function can handle them.
+ anyChars.setSawDeprecatedOctalEscape();
+ }
+
+ if (IsAsciiOctal(unit)) {
+ val = 8 * val + AsciiOctalToNumber(unit);
+ consumeKnownCodeUnit(unit);
+
+ unit = peekCodeUnit();
+ if (MOZ_UNLIKELY(unit == EOF)) {
+ ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
+ return false;
+ }
+
+ if (IsAsciiOctal(unit)) {
+ int32_t save = val;
+ val = 8 * val + AsciiOctalToNumber(unit);
+ if (val <= 0xFF) {  // take a third octal digit only while the value still fits in 0xFF
+ consumeKnownCodeUnit(unit);
+ } else {
+ val = save;
+ }
+ }
+ }
+
+ unit = char16_t(val);
+ break;
+ } // default
+ } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
+ // Append the unit the escape selected: its decoded value, or the escaped unit itself.
+ if (!this->charBuffer.append(unit)) {
+ return false;
+ }
+
+ continue;
+ } // (unit == '\\')
+
+ if (unit == '\r' || unit == '\n') {
+ if (!parsingTemplate) {
+ // String literals don't allow ASCII line breaks.
+ ungetCodeUnit(unit);
+ ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING);
+ return false;
+ }
+
+ if (unit == '\r') {
+ unit = '\n';  // a CR or CR LF pair inside a template is appended as a single LF
+ matchLineTerminator('\n');
+ }
+
+ if (!updateLineInfoForEOL()) {
+ return false;
+ }
+
+ anyCharsAccess().updateFlagsForEOL();
+ } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) {
+ templateHead = true;
+ break;
+ }
+ // Ordinary ASCII unit, or a template line break normalized to LF above: append it.
+ if (!this->charBuffer.append(unit)) {
+ return false;
+ }
+ }
+
+ TaggedParserAtomIndex atom = drainCharBufferIntoAtom();
+ if (!atom) {
+ return false;
+ }
+ // Success from here on: cancel the scope-exit guard so badToken() is not run.
+ noteBadToken.release();
+
+ MOZ_ASSERT_IF(!parsingTemplate, !templateHead);
+
+ TokenKind kind = !parsingTemplate ? TokenKind::String
+ : templateHead ? TokenKind::TemplateHead
+ : TokenKind::NoSubsTemplate;
+ newAtomToken(kind, atom, start, modifier, out);
+ return true;
+}
+/*
+ * Maps a TokenKind to the description string paired with it in
+ * FOR_EACH_TOKEN_KIND: the macro list expands to one case per kind, each
+ * returning its desc argument. TokenKind::Limit is not a valid argument; that
+ * case hits MOZ_ASSERT_UNREACHABLE and then breaks out to the fallback string
+ * returned at the end of the function.
+ */
+const char* TokenKindToDesc(TokenKind tt) {
+ switch (tt) {
+#define EMIT_CASE(name, desc) \
+ case TokenKind::name: \
+ return desc;
+ FOR_EACH_TOKEN_KIND(EMIT_CASE)
+#undef EMIT_CASE
+ case TokenKind::Limit:
+ MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
+ break;
+ }
+
+ return "<bad TokenKind>";
+}
+/*
+ * Compiled only when DEBUG is defined. Maps a TokenKind to the spelling of its
+ * enumerator prefixed with TokenKind::, produced by stringizing the name
+ * argument of each FOR_EACH_TOKEN_KIND entry (the desc argument is unused).
+ * Unlike TokenKindToDesc, TokenKind::Limit does not assert here; it simply
+ * breaks out to the fallback string.
+ */
+#ifdef DEBUG
+const char* TokenKindToString(TokenKind tt) {
+ switch (tt) {
+# define EMIT_CASE(name, desc) \
+ case TokenKind::name: \
+ return "TokenKind::" #name;
+ FOR_EACH_TOKEN_KIND(EMIT_CASE)
+# undef EMIT_CASE
+ case TokenKind::Limit:
+ break;
+ }
+
+ return "<bad TokenKind>";
+}
+#endif
+/*
+ * Explicit instantiations of the token stream class templates.
+ * TokenStreamCharsBase is instantiated once per code-unit type (Utf8Unit and
+ * char16_t). GeneralTokenStreamChars, TokenStreamChars and TokenStreamSpecific
+ * are each instantiated for char16_t with TokenStreamAnyCharsAccess, and for
+ * both unit types with ParserAnyCharsAccess over GeneralParser using
+ * FullParseHandler and SyntaxParseHandler. There is no Utf8Unit instantiation
+ * with TokenStreamAnyCharsAccess in this list.
+ * NOTE(review): presumably these exist so that the member definitions in this
+ * file are emitted for every configuration used elsewhere -- confirm.
+ */
+template class TokenStreamCharsBase<Utf8Unit>;
+template class TokenStreamCharsBase<char16_t>;
+
+template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
+template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
+template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>;
+
+template class GeneralTokenStreamChars<
+ Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
+template class GeneralTokenStreamChars<
+ Utf8Unit,
+ ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
+template class GeneralTokenStreamChars<
+ char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
+template class GeneralTokenStreamChars<
+ char16_t,
+ ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
+
+template class TokenStreamChars<
+ Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
+template class TokenStreamChars<
+ Utf8Unit,
+ ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
+template class TokenStreamChars<
+ char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
+template class TokenStreamChars<
+ char16_t,
+ ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
+
+template class TokenStreamSpecific<
+ Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
+template class TokenStreamSpecific<
+ Utf8Unit,
+ ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
+template class TokenStreamSpecific<
+ char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
+template class TokenStreamSpecific<
+ char16_t,
+ ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
+
+} // namespace frontend
+
+} // namespace js