/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// JS lexical scanner.

#include "frontend/TokenStream.h"

#include "mozilla/ArrayUtils.h"
#include "mozilla/Attributes.h"
#include "mozilla/Likely.h"
#include "mozilla/Maybe.h"
#include "mozilla/MemoryChecking.h"
#include "mozilla/ScopeExit.h"
#include "mozilla/Span.h"
#include "mozilla/TemplateLib.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"

// NOTE(review): the next seven #include directives have lost their header
// names in this copy of the file (presumably the angle-bracket system
// headers); they must be restored before this compiles.
#include
#include
#include
#include
#include
#include
#include

#include "jsnum.h"

#include "frontend/BytecodeCompiler.h"
#include "frontend/FrontendContext.h"
#include "frontend/Parser.h"
#include "frontend/ParserAtom.h"
#include "frontend/ReservedWords.h"
#include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
#include "js/Printf.h"                // JS_smprintf
#include "js/RegExpFlags.h"           // JS::RegExpFlags
#include "js/UniquePtr.h"
#include "util/Text.h"
#include "util/Unicode.h"
#include "vm/FrameIter.h"  // js::{,NonBuiltin}FrameIter
#include "vm/JSContext.h"
#include "vm/Realm.h"
#include "vm/WellKnownAtom.h"  // js_*_str

using mozilla::AsciiAlphanumericToNumber;
using mozilla::AssertedCast;
using mozilla::DecodeOneUtf8CodePoint;
using mozilla::IsAscii;
using mozilla::IsAsciiAlpha;
using mozilla::IsAsciiDigit;
using mozilla::IsAsciiHexDigit;
using mozilla::IsTrailingUnit;
using mozilla::MakeScopeExit;
using mozilla::Maybe;
using mozilla::PointerRangeSize;
using mozilla::Span;
using mozilla::Utf8Unit;

using JS::ReadOnlyCompileOptions;
using JS::RegExpFlag;
using JS::RegExpFlags;

// One entry of the reserved-word table: the word's text and the token kind
// it maps to.
struct ReservedWordInfo {
  const char* chars;  // C string with reserved word text
  js::frontend::TokenKind tokentype;
};

// Table of reserved words, with one entry generated per word listed by
// FOR_EACH_JAVASCRIPT_RESERVED_WORD.
static const ReservedWordInfo reservedWords[] = {
#define RESERVED_WORD_INFO(word, name, type) \
  {js_##word##_str, js::frontend::type},
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
#undef RESERVED_WORD_INFO
};

// Enumerators generated from the same macro list as |reservedWords|, used
// below as indexes into that table.
enum class ReservedWordsIndex : size_t {
#define ENTRY_(_1, NAME, _3) NAME,
  FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
#undef ENTRY_
};

// Returns a ReservedWordInfo for the specified characters, or nullptr if the
// string is not a reserved word.
//
// The matching logic itself comes from the included generated header, which
// is driven through the JSRW_* macros defined below: it either jumps to
// |got_match| with a table index, jumps to |test_guess| with a candidate
// index that is then compared character by character, or jumps to
// |no_match|.
//
// NOTE(review): the template parameter list after |template| (and the target
// type of the |static_cast| below) is missing from this copy of the file.
template
static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
  MOZ_ASSERT(length != 0);

  size_t i;
  const ReservedWordInfo* rw;
  const char* chars;

#define JSRW_LENGTH() length
#define JSRW_AT(column) s[column]
#define JSRW_GOT_MATCH(index) \
  i = (index);                \
  goto got_match;
#define JSRW_TEST_GUESS(index) \
  i = (index);                 \
  goto test_guess;
#define JSRW_NO_MATCH() goto no_match;
#include "frontend/ReservedWordsGenerated.h"
#undef JSRW_NO_MATCH
#undef JSRW_TEST_GUESS
#undef JSRW_GOT_MATCH
#undef JSRW_AT
#undef JSRW_LENGTH

got_match:
  return &reservedWords[i];

test_guess:
  // Verify the guess by comparing all |length| input characters against the
  // candidate word's text.
  rw = &reservedWords[i];
  chars = rw->chars;
  do {
    if (*s++ != static_cast(*chars++)) {
      goto no_match;
    }
  } while (--length != 0);
  return rw;

no_match:
  return nullptr;
}

// Specialization for UTF-8 code units: forwards to the generic lookup on the
// units viewed through Utf8AsUnsignedChars.
// NOTE(review): explicit template arguments appear to have been stripped
// from this specialization and from the call it makes.
template <>
MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord(
    const Utf8Unit* units, size_t length) {
  return FindReservedWord(Utf8AsUnsignedChars(units), length);
}

// Atom-based lookup: returns the table entry whose well-known atom raw data
// equals |atom|'s, or nullptr when no case matches.
static const ReservedWordInfo* FindReservedWord(
    const js::frontend::TaggedParserAtomIndex atom) {
  switch (atom.rawData()) {
#define CASE_(_1, NAME, _3)                                           \
  case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
    return &reservedWords[size_t(ReservedWordsIndex::NAME)];
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
#undef CASE_
  }

  return nullptr;
}

// Reads one code point starting at |*p| and advances |*p| past it.
//
// A lead surrogate immediately followed (before |end|) by a trail surrogate
// is decoded as one code point and consumes two units. Anything else,
// including an unpaired surrogate, is returned as-is and consumes one unit.
// |*p| is dereferenced unconditionally, so the caller must ensure at least
// one unit is available.
static char32_t GetSingleCodePoint(const char16_t** p, const char16_t* end) {
  using namespace js;

  if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(**p)) && *p + 1 < end) {
    char16_t lead = **p;
    char16_t maybeTrail = *(*p + 1);
    if (unicode::IsTrailSurrogate(maybeTrail)) {
      *p += 2;
      return unicode::UTF16Decode(lead, maybeTrail);
    }
  }

  char32_t
      codePoint = **p;
  (*p)++;
  return codePoint;
}

// True iff |c| is the ASCII character '0' or '1'.
// NOTE(review): in this and the next two helpers the template parameter
// list and the template arguments of |std::make_unsigned_t| and
// |static_cast| are missing from this copy of the file.
template
static constexpr bool IsAsciiBinary(CharT c) {
  using UnsignedCharT = std::make_unsigned_t;
  auto uc = static_cast(c);
  return uc == '0' || uc == '1';
}

// True iff |c| is an ASCII digit in the range '0'..'7'.
template
static constexpr bool IsAsciiOctal(CharT c) {
  using UnsignedCharT = std::make_unsigned_t;
  auto uc = static_cast(c);
  return '0' <= uc && uc <= '7';
}

// Returns |c - '0'|. No range check is performed here.
template
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  using UnsignedCharT = std::make_unsigned_t;
  auto uc = static_cast(c);
  return uc - '0';
}

namespace js {

namespace frontend {

// Dispatches to the Latin-1 or two-byte IsIdentifier overload according to
// how |str| stores its characters.
bool IsIdentifier(JSLinearString* str) {
  JS::AutoCheckCannotGC nogc;
  MOZ_ASSERT(str);
  if (str->hasLatin1Chars()) {
    return IsIdentifier(str->latin1Chars(nogc), str->length());
  }
  return IsIdentifier(str->twoByteChars(nogc), str->length());
}

// Dispatches to the Latin-1 or two-byte IsIdentifierNameOrPrivateName
// overload according to how |str| stores its characters.
bool IsIdentifierNameOrPrivateName(JSLinearString* str) {
  JS::AutoCheckCannotGC nogc;
  MOZ_ASSERT(str);
  if (str->hasLatin1Chars()) {
    return IsIdentifierNameOrPrivateName(str->latin1Chars(nogc), str->length());
  }
  return IsIdentifierNameOrPrivateName(str->twoByteChars(nogc), str->length());
}

// True iff |chars| is non-empty, its first character satisfies
// unicode::IsIdentifierStart, and every later character satisfies
// unicode::IsIdentifierPart.
bool IsIdentifier(const Latin1Char* chars, size_t length) {
  if (length == 0) {
    return false;
  }

  if (!unicode::IsIdentifierStart(char16_t(*chars))) {
    return false;
  }

  const Latin1Char* end = chars + length;
  while (++chars != end) {
    if (!unicode::IsIdentifierPart(char16_t(*chars))) {
      return false;
    }
  }

  return true;
}

// Single-character form: true iff |c| is an ASCII identifier start.
bool IsIdentifierASCII(char c) { return unicode::IsIdentifierStartASCII(c); }

// Two-character form: |c1| must be an ASCII identifier start and |c2| an
// ASCII identifier part.
bool IsIdentifierASCII(char c1, char c2) {
  return unicode::IsIdentifierStartASCII(c1) &&
         unicode::IsIdentifierPartASCII(c2);
}

// Like IsIdentifier, but a single leading '#' is skipped first. A lone "#"
// is rejected because IsIdentifier returns false for the empty remainder.
bool IsIdentifierNameOrPrivateName(const Latin1Char* chars, size_t length) {
  if (length == 0) {
    return false;
  }

  // Skip over any private name marker.
  if (*chars == '#') {
    ++chars;
    --length;
  }

  return IsIdentifier(chars, length);
}

// Two-byte variant of IsIdentifier. Characters are read one code point at a
// time through GetSingleCodePoint, so a surrogate pair is classified as a
// single code point.
bool IsIdentifier(const char16_t* chars, size_t length) {
  if (length == 0) {
    return false;
  }

  const char16_t* p = chars;
  const char16_t* end = chars + length;
  char32_t codePoint;

  codePoint = GetSingleCodePoint(&p, end);
  if (!unicode::IsIdentifierStart(codePoint)) {
    return false;
  }

  while (p < end) {
    codePoint = GetSingleCodePoint(&p, end);
    if (!unicode::IsIdentifierPart(codePoint)) {
      return false;
    }
  }

  return true;
}

// Two-byte variant of IsIdentifierNameOrPrivateName: an optional leading '#'
// followed by a non-empty identifier.
bool IsIdentifierNameOrPrivateName(const char16_t* chars, size_t length) {
  if (length == 0) {
    return false;
  }

  const char16_t* p = chars;
  const char16_t* end = chars + length;
  char32_t codePoint;

  codePoint = GetSingleCodePoint(&p, end);

  // Skip over any private name marker.
  if (codePoint == '#') {
    // The identifier part of a private name mustn't be empty.
    if (length == 1) {
      return false;
    }

    codePoint = GetSingleCodePoint(&p, end);
  }

  if (!unicode::IsIdentifierStart(codePoint)) {
    return false;
  }

  while (p < end) {
    codePoint = GetSingleCodePoint(&p, end);
    if (!unicode::IsIdentifierPart(codePoint)) {
      return false;
    }
  }

  return true;
}

// True iff |atom| is a reserved word whose token kind satisfies
// TokenKindIsKeyword.
bool IsKeyword(TaggedParserAtomIndex atom) {
  if (const ReservedWordInfo* rw = FindReservedWord(atom)) {
    return TokenKindIsKeyword(rw->tokentype);
  }

  return false;
}

// Returns the token kind of the reserved word |name|, or TokenKind::Limit if
// |name| is not a reserved word.
TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
  if (const ReservedWordInfo* rw = FindReservedWord(name)) {
    return rw->tokentype;
  }

  return TokenKind::Limit;
}

// Returns the C string for the reserved word |name|, or nullptr if |name| is
// not a reserved word.
const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
  if (const ReservedWordInfo* rw = FindReservedWord(name)) {
    return ReservedWordToCharZ(rw->tokentype);
  }

  return nullptr;
}

// Returns the C string for the reserved word with token kind |tt|. A kind
// with no generated case asserts in debug builds and yields nullptr.
const char* ReservedWordToCharZ(TokenKind tt) {
  MOZ_ASSERT(tt != TokenKind::Name);
  switch (tt) {
#define EMIT_CASE(word, name, type) \
  case type:                        \
    return js_##word##_str;
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
#undef EMIT_CASE
    default:
      MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
  }
  return nullptr;
}

TaggedParserAtomIndex
TokenStreamAnyChars::reservedWordToPropertyName(
    TokenKind tt) const {
  // Maps a reserved-word token kind to its well-known atom. A kind with no
  // generated case asserts in debug builds and yields the null atom index.
  MOZ_ASSERT(tt != TokenKind::Name);
  switch (tt) {
#define EMIT_CASE(word, name, type) \
  case type:                        \
    return TaggedParserAtomIndex::WellKnown::name();
    FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
#undef EMIT_CASE
    default:
      MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
  }
  return TaggedParserAtomIndex::null();
}

// Builds the line-start table with two entries: the offset at which the
// first line begins, followed by the MAX_PTR sentinel.
SourceCoords::SourceCoords(FrontendContext* fc, uint32_t initialLineNumber,
                           uint32_t initialOffset)
    : lineStartOffsets_(fc),
      initialLineNum_(initialLineNumber),
      lastIndex_(0) {
  // This is actually necessary!  Removing it causes compile errors on
  // GCC and clang.  You could try declaring this:
  //
  //   const uint32_t SourceCoords::MAX_PTR;
  //
  // which fixes the GCC/clang error, but causes bustage on Windows.  Sigh.
  //
  uint32_t maxPtr = MAX_PTR;

  // The first line begins at buffer offset |initialOffset|.  MAX_PTR is the
  // sentinel.  The appends cannot fail because |lineStartOffsets_| has
  // statically-allocated elements.
  MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
  MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
  lineStartOffsets_.infallibleAppend(initialOffset);
  lineStartOffsets_.infallibleAppend(maxPtr);
}

// Records that line |lineNum| starts at |lineStartOffset|.
//
// When the line's index is the current sentinel slot, a new sentinel is
// appended and the old slot is overwritten with the offset. Otherwise the
// table is left untouched. Returns false only if the append fails.
MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
                                         uint32_t lineStartOffset) {
  uint32_t index = indexFromLineNumber(lineNum);
  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;

  MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
  MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);

  if (index == sentinelIndex) {
    // We haven't seen this newline before.  Update lineStartOffsets_
    // only if lineStartOffsets_.append succeeds, to keep sentinel.
    // Otherwise return false to tell TokenStream about OOM.
    uint32_t maxPtr = MAX_PTR;
    if (!lineStartOffsets_.append(maxPtr)) {
      // NOTE(review): the type arguments of |std::is_same_v| are missing from
      // this copy of the file.
      static_assert(std::is_same_v,
                    "this function's caller depends on it reporting an "
                    "error on failure, as TempAllocPolicy ensures");
      return false;
    }

    lineStartOffsets_[index] = lineStartOffset;
  } else {
    // We have seen this newline before (and ungot it).  Do nothing (other
    // than checking it hasn't mysteriously changed).
    // This path can be executed after hitting OOM, so check index.
    MOZ_ASSERT_IF(index < sentinelIndex,
                  lineStartOffsets_[index] == lineStartOffset);
  }
  return true;
}

// Extends this table with the entries of |other| that it lacks.
//
// Nothing happens if this table is already at least as long as |other|.
// Otherwise this table's sentinel slot is overwritten with |other|'s entry
// at that index and the remaining entries of |other| (ending with its
// sentinel) are appended. Returns false if an append fails.
MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
  MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
  MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
  MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);

  if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) {
    return true;
  }

  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
  lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];

  for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
       i++) {
    if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
      return false;
    }
  }
  return true;
}

// Returns the index |i| in |lineStartOffsets_| such that
// |lineStartOffsets_[i] <= offset < lineStartOffsets_[i + 1]|.
//
// |lastIndex_| caches the previous answer: lookups at or just after the
// previous line are answered without searching, and any other lookup falls
// back to a binary search. |lastIndex_| is updated on the way.
MOZ_ALWAYS_INLINE uint32_t
SourceCoords::indexFromOffset(uint32_t offset) const {
  uint32_t iMin, iMax, iMid;

  if (lineStartOffsets_[lastIndex_] <= offset) {
    // If we reach here, offset is on a line the same as or higher than
    // last time.  Check first for the +0, +1, +2 cases, because they
    // typically cover 85--98% of cases.
    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
      return lastIndex_;  // index is same as last time
    }

    // If we reach here, there must be at least one more entry (plus the
    // sentinel).  Try it.
    lastIndex_++;
    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
      return lastIndex_;  // index is one higher than last time
    }

    // The same logic applies here.
    lastIndex_++;
    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
      return lastIndex_;  // index is two higher than last time
    }

    // No luck.  Oh well, we have a better-than-default starting point for
    // the binary search.
    iMin = lastIndex_ + 1;
    MOZ_ASSERT(iMin <
               lineStartOffsets_.length() - 1);  // -1 due to the sentinel

  } else {
    iMin = 0;
  }

  // This is a binary search with deferred detection of equality, which was
  // marginally faster in this case than a standard binary search.
  // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
  // want one before that.
  iMax = lineStartOffsets_.length() - 2;
  while (iMax > iMin) {
    iMid = iMin + (iMax - iMin) / 2;
    if (offset >= lineStartOffsets_[iMid + 1]) {
      iMin = iMid + 1;  // offset is above lineStartOffsets_[iMid]
    } else {
      iMax = iMid;  // offset is below or within lineStartOffsets_[iMid]
    }
  }

  MOZ_ASSERT(iMax == iMin);
  MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
  MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);

  lastIndex_ = iMin;
  return iMin;
}

// Wraps the line index found for |offset| in a LineToken, together with the
// offset itself.
SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
  return LineToken(indexFromOffset(offset), offset);
}

// Initializes the character-type-independent token stream state from the
// compile options, then marks the token kinds that can end an expression.
TokenStreamAnyChars::TokenStreamAnyChars(FrontendContext* fc,
                                         const ReadOnlyCompileOptions& options,
                                         StrictModeGetter* smg)
    : fc(fc),
      options_(options),
      strictModeGetter_(smg),
      filename_(options.filename()),
      longLineColumnInfo_(fc),
      srcCoords(fc, options.lineno, options.scriptSourceOffset),
      lineno(options.lineno),
      mutedErrors(options.mutedErrors()) {
  // |isExprEnding| was initially zeroed: overwrite the true entries here.
isExprEnding[size_t(TokenKind::Comma)] = true;
  isExprEnding[size_t(TokenKind::Semi)] = true;
  isExprEnding[size_t(TokenKind::Colon)] = true;
  isExprEnding[size_t(TokenKind::RightParen)] = true;
  isExprEnding[size_t(TokenKind::RightBracket)] = true;
  isExprEnding[size_t(TokenKind::RightCurly)] = true;
}

// Forwards |fc| and |parserAtoms| to the shared base and initializes
// |sourceUnits| over the |length| units at |units|, with |startOffset|.
// NOTE(review): the template parameter list and the class's template
// arguments are missing from this copy of the file.
template
TokenStreamCharsBase::TokenStreamCharsBase(FrontendContext* fc,
                                                 ParserAtomsTable* parserAtoms,
                                                 const Unit* units,
                                                 size_t length,
                                                 size_t startOffset)
    : TokenStreamCharsShared(fc, parserAtoms),
      sourceUnits(units, length, startOffset) {}

// Appends the UTF-16 units in [cur, end) to the (initially empty)
// |charBuffer|, replacing each "\r\n" pair or lone '\r' with a single '\n'.
// All other units are copied unchanged. Returns false if an append fails.
bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
                                                        const char16_t* cur,
                                                        const char16_t* end) {
  MOZ_ASSERT(charBuffer.length() == 0);

  while (cur < end) {
    char16_t ch = *cur++;
    if (ch == '\r') {
      ch = '\n';
      // Swallow the '\n' of a "\r\n" pair so it yields only one line break.
      if (cur < end && *cur == '\n') {
        cur++;
      }
    }

    if (!charBuffer.append(ch)) {
      return false;
    }
  }

  MOZ_ASSERT(cur == end);
  return true;
}

// UTF-8 counterpart of the function above. ASCII units get the same
// line-break normalization; each non-ASCII sequence is decoded to one code
// point and appended through AppendCodePointToCharBuffer. Decoding is
// asserted to succeed, since the input is expected to be validated already.
bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
                                                        const Utf8Unit* cur,
                                                        const Utf8Unit* end) {
  MOZ_ASSERT(charBuffer.length() == 0);

  while (cur < end) {
    Utf8Unit unit = *cur++;
    if (MOZ_LIKELY(IsAscii(unit))) {
      char16_t ch = unit.toUint8();
      if (ch == '\r') {
        ch = '\n';
        if (cur < end && *cur == Utf8Unit('\n')) {
          cur++;
        }
      }

      if (!charBuffer.append(ch)) {
        return false;
      }

      continue;
    }

    // NOTE(review): the template argument of |Maybe| is missing from this
    // copy of the file.
    Maybe ch = DecodeOneUtf8CodePoint(unit, &cur, end);
    MOZ_ASSERT(ch.isSome(),
               "provided source text should already have been validated");

    if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
      return false;
    }
  }

  MOZ_ASSERT(cur == end);
  return true;
}

// Forwards to the TokenStreamChars base, passing
// |options.scriptSourceOffset| as the start offset.
// NOTE(review): the template parameter list and the template arguments of
// the class and its base are missing from this copy of the file.
template
TokenStreamSpecific::TokenStreamSpecific(
    FrontendContext* fc, ParserAtomsTable* parserAtoms,
    const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
    : TokenStreamChars(fc, parserAtoms, units, length,
                                           options.scriptSourceOffset) {}

// Validates the compile options; reports an error and returns false when
// the starting column exceeds ColumnLimit.
bool TokenStreamAnyChars::checkOptions() {
  // Constrain starting columns to where they will saturate.
if (options().column > ColumnLimit) {
    reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
    return false;
  }

  return true;
}

// Variadic front end for reportErrorNoOffsetVA: packages the trailing
// arguments into a va_list and forwards them.
void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) const {
  va_list args;
  va_start(args, errorNumber);

  reportErrorNoOffsetVA(errorNumber, &args);

  va_end(args);
}

// Reports compile error |errorNumber| using metadata filled in by
// computeErrorMetadataNoOffset and no notes.
void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
                                                va_list* args) const {
  ErrorMetadata metadata;
  computeErrorMetadataNoOffset(&metadata);

  ReportCompileErrorLatin1(fc, std::move(metadata), nullptr, errorNumber, args);
}

// Advances the line bookkeeping past a line terminator: the current
// |linebase| is saved in |prevLinebase|, the new line starts at
// |lineStartOffset|, and |lineno| is incremented. Returns false if |lineno|
// wraps to zero (after reporting an error) or if srcCoords.add fails.
[[nodiscard]] MOZ_ALWAYS_INLINE bool
TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
  prevLinebase = linebase;
  linebase = lineStartOffset;
  lineno++;

  // On overflow, report error.
  if (MOZ_UNLIKELY(!lineno)) {
    reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
    return false;
  }

  return srcCoords.add(lineno, linebase);
}

#ifdef DEBUG

// Debug-only check that the units at |ptr| are the UTF-16 encoding of the
// peeked code point: one unit below unicode::NonBMPMin, otherwise a
// lead/trail surrogate pair.
// NOTE(review): the explicit template arguments of both specializations
// below (on |SourceUnits| and |PeekedCodePoint|) are missing from this copy
// of the file. Judging by the bodies, this one is the UTF-16 specialization
// and the next is the UTF-8 one -- confirm.
template <>
inline void SourceUnits::assertNextCodePoint(
    const PeekedCodePoint& peeked) {
  char32_t c = peeked.codePoint();
  if (c < unicode::NonBMPMin) {
    MOZ_ASSERT(peeked.lengthInUnits() == 1);
    MOZ_ASSERT(ptr[0] == c);
  } else {
    MOZ_ASSERT(peeked.lengthInUnits() == 2);
    char16_t lead, trail;
    unicode::UTF16Encode(c, &lead, &trail);
    MOZ_ASSERT(ptr[0] == lead);
    MOZ_ASSERT(ptr[1] == trail);
  }
}

// Debug-only check that the units at |ptr| are the UTF-8 encoding of the
// peeked code point. The expected bytes are re-derived here independently
// and compared unit by unit.
template <>
inline void SourceUnits::assertNextCodePoint(
    const PeekedCodePoint& peeked) {
  char32_t c = peeked.codePoint();

  // This is all roughly indulgence of paranoia only for assertions, so the
  // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
  uint8_t expectedUnits[4] = {};
  if (c < 0x80) {
    // One unit: the code point itself.
    expectedUnits[0] = AssertedCast(c);
  } else if (c < 0x800) {
    // Two units: 110xxxxx 10xxxxxx.
    expectedUnits[0] = 0b1100'0000 | (c >> 6);
    expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
  } else if (c < 0x10000) {
    // Three units: 1110xxxx 10xxxxxx 10xxxxxx.
    expectedUnits[0] = 0b1110'0000 | (c >> 12);
    expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
    expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
  } else {
    // Four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    expectedUnits[0] = 0b1111'0000 | (c >> 18);
    expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
    expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
    expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
  }

  MOZ_ASSERT(peeked.lengthInUnits() <= 4);
  for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
    MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
  }
}

#endif  // DEBUG

// UTF-8 version: if |*ptr| is not |limit| and points at a trailing unit,
// moves it backward until it no longer does, i.e. to the start of the code
// point it was inside. There is no lower bound on the rewind other than the
// debug-only assertion that fewer than four units were retracted.
static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    const Utf8Unit** ptr, const Utf8Unit* limit) {
  MOZ_ASSERT(*ptr <= limit);

  // |limit| is a code point boundary.
  if (MOZ_UNLIKELY(*ptr == limit)) {
    return;
  }

  // Otherwise rewind past trailing units to the start of the code point.
#ifdef DEBUG
  size_t retracted = 0;
#endif
  while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
    --*ptr;
#ifdef DEBUG
    retracted++;
#endif
  }

  MOZ_ASSERT(retracted < 4,
             "the longest UTF-8 code point is four units, so this should never "
             "retract more than three units");
}

// UTF-16 version: if |*ptr| is not |limit| and points at a trail surrogate
// preceded by a lead surrogate, moves it back by one unit. Note that
// |(*ptr)[-1]| is read whenever |*ptr| points at a trail surrogate.
static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    const char16_t** ptr, const char16_t* limit) {
  MOZ_ASSERT(*ptr <= limit);

  // |limit| is a code point boundary.
  if (MOZ_UNLIKELY(*ptr == limit)) {
    return;
  }

  // Otherwise the pointer must be retracted by one iff it splits a two-unit
  // code point.
  if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) {
    // Outside test suites testing garbage WTF-16, it's basically guaranteed
    // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair.
if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
      --*ptr;
    }
  }
}

// Computes the column, counted in code points from the start of the line,
// of |offset| on the line identified by |lineToken|.
//
// Two caches keep repeated lookups cheap:
//   * the offset/column pair of the last lookup on the same line, and
//   * for long lines, a per-line vector in |longLineColumnInfo_| holding the
//     column at the start of each ColumnChunkLength-unit chunk, plus whether
//     that chunk is known to contain only single-unit code points.
// On OOM while growing either structure, the function recovers and falls
// back to counting from the best starting point it already has.
//
// NOTE(review): the template parameter list, and several template arguments
// below (|SourceUnits|, |AssertedCast|, |Vector|, |SourceUnitTraits|), are
// missing from this copy of the file.
template
uint32_t TokenStreamAnyChars::computePartialColumn(
    const LineToken lineToken, const uint32_t offset,
    const SourceUnits& sourceUnits) const {
  lineToken.assertConsistentOffset(offset);

  const uint32_t line = lineNumber(lineToken);
  const uint32_t start = srcCoords.lineStart(lineToken);

  // Reset the previous offset/column cache for this line, if the previous
  // lookup wasn't on this line.
  if (line != lineOfLastColumnComputation_) {
    lineOfLastColumnComputation_ = line;
    lastChunkVectorForLine_ = nullptr;
    lastOffsetOfComputedColumn_ = start;
    lastComputedColumn_ = 0;
  }

  // Compute and return the final column number from a partial offset/column,
  // using the last-cached offset/column if they're more optimal.
  auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
                                                        uint32_t partialCols,
                                                        UnitsType unitsType) {
    MOZ_ASSERT(partialOffset <= offset);

    // If the last lookup on this line was closer to |offset|, use it.
    if (partialOffset < this->lastOffsetOfComputedColumn_ &&
        this->lastOffsetOfComputedColumn_ <= offset) {
      partialOffset = this->lastOffsetOfComputedColumn_;
      partialCols = this->lastComputedColumn_;
    }

    const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
    const Unit* end = sourceUnits.codeUnitPtrAt(offset);

    size_t offsetDelta =
        AssertedCast(PointerRangeSize(begin, end));
    partialOffset += offsetDelta;

    if (unitsType == UnitsType::GuaranteedSingleUnit) {
      // One unit per code point: the unit distance is the column delta.
      MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
                 "guaranteed-single-units also guarantee pointer distance "
                 "equals code point count");
      partialCols += offsetDelta;
    } else {
      partialCols +=
          AssertedCast(unicode::CountCodePoints(begin, end));
    }

    // Remember this result for the next lookup on this line.
    this->lastOffsetOfComputedColumn_ = partialOffset;
    this->lastComputedColumn_ = partialCols;
    return partialCols;
  };

  const uint32_t offsetInLine = offset - start;

  // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
  // column has offset less than this value.  The most common (non-minified)
  // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
  // the next power of two for efficient division/multiplication below.
  constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;

  // The index within any associated |Vector| of |offset|'s chunk.
  const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
  if (chunkIndex == 0) {
    // We don't know from an |offset| in the zeroth chunk that this line is even
    // long.  First-chunk info is mostly useless, anyway -- we have |start|
    // already.  So if we have *easy* access to that zeroth chunk, use it --
    // otherwise just count pessimally.  (This will still benefit from caching
    // the last column/offset for computations for successive offsets, so it's
    // not *always* worst-case.)
    UnitsType unitsType;
    if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
      MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
      unitsType = (*lastChunkVectorForLine_)[0].unitsType();
    } else {
      unitsType = UnitsType::PossiblyMultiUnit;
    }

    return ColumnFromPartial(start, 0, unitsType);
  }

  // If this line has no chunk vector yet, insert one in the hash map.  (The
  // required index is allocated and filled further down.)
  if (!lastChunkVectorForLine_) {
    auto ptr = longLineColumnInfo_.lookupForAdd(line);
    if (!ptr) {
      // This could rehash and invalidate a cached vector pointer, but the outer
      // condition means we don't have a cached pointer.
      if (!longLineColumnInfo_.add(ptr, line, Vector(fc))) {
        // In case of OOM, just count columns from the start of the line.
        fc->recoverFromOutOfMemory();
        return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
      }
    }

    // Note that adding elements to this vector won't invalidate this pointer.
    lastChunkVectorForLine_ = &ptr->value();
  }

  const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);

  // Returns the offset at which chunk |index| really starts: the naive
  // |start + index * ColumnChunkLength|, pulled back to a code point boundary
  // if it would otherwise split a code point. (|this| is captured only in
  // debug builds, where it is needed by the assertions.)
  auto RetractedOffsetOfChunk = [
#ifdef DEBUG
                                    this,
#endif
                                    start, limit,
                                    &sourceUnits](uint32_t index) {
    MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());

    uint32_t naiveOffset = start + index * ColumnChunkLength;
    const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);

    const Unit* actualPtr = naivePtr;
    RetractPointerToCodePointBoundary(&actualPtr, limit);

#ifdef DEBUG
    if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
        UnitsType::GuaranteedSingleUnit) {
      MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
    }
#endif

    return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
  };

  uint32_t partialOffset;
  uint32_t partialColumn;
  UnitsType unitsType;

  auto entriesLen = AssertedCast(lastChunkVectorForLine_->length());
  if (chunkIndex < entriesLen) {
    // We've computed the chunk |offset| resides in.  Compute the column number
    // from the chunk.
    partialOffset = RetractedOffsetOfChunk(chunkIndex);
    partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();

    // This is exact if |chunkIndex| isn't the last chunk.
    unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();

    // Otherwise the last chunk is pessimistically assumed to contain multi-unit
    // code points because we haven't fully examined its contents yet -- they
    // may not have been tokenized yet, they could contain encoding errors, or
    // they might not even exist.
    MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
                  (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
                      UnitsType::PossiblyMultiUnit);
  } else {
    // Extend the vector from its last entry or the start of the line.  (This is
    // also a suitable partial start point if we must recover from OOM.)
    if (entriesLen > 0) {
      partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
      partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
    } else {
      partialOffset = start;
      partialColumn = 0;
    }

    if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
      // As earlier, just start from the greatest offset/column in case of OOM.
      fc->recoverFromOutOfMemory();
      return ColumnFromPartial(partialOffset, partialColumn,
                               UnitsType::PossiblyMultiUnit);
    }

    // OOM is no longer possible now.  \o/

    // The vector always begins with the column of the line start, i.e. zero,
    // with chunk units pessimally assumed not single-unit.
    if (entriesLen == 0) {
      lastChunkVectorForLine_->infallibleAppend(
          ChunkInfo(0, UnitsType::PossiblyMultiUnit));
      entriesLen++;
    }

    // Each iteration measures one chunk (the current last entry) and appends
    // the entry for the chunk after it, until the entry for |chunkIndex|
    // exists.
    do {
      const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
      const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
          start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));

      MOZ_ASSERT(begin < chunkLimit);
      MOZ_ASSERT(chunkLimit <= limit);

      static_assert(
          ColumnChunkLength > SourceUnitTraits::maxUnitsLength - 1,
          "any retraction below is assumed to never underflow to the "
          "preceding chunk, even for the longest code point");

      // Prior tokenizing ensured that [begin, limit) is validly encoded, and
      // |begin < chunkLimit|, so any retraction here can't underflow.
      RetractPointerToCodePointBoundary(&chunkLimit, limit);

      MOZ_ASSERT(begin < chunkLimit);
      MOZ_ASSERT(chunkLimit <= limit);

      size_t numUnits = PointerRangeSize(begin, chunkLimit);
      size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);

      // If this chunk (which will become non-final at the end of the loop) is
      // all single-unit code points, annotate the chunk accordingly.
      if (numUnits == numCodePoints) {
        lastChunkVectorForLine_->back().guaranteeSingleUnits();
      }

      partialOffset += numUnits;
      partialColumn += numCodePoints;

      lastChunkVectorForLine_->infallibleEmplaceBack(
          partialColumn, UnitsType::PossiblyMultiUnit);
    } while (entriesLen < chunkIndex + 1);

    // We're at a spot in the current final chunk, and final chunks never have
    // complete units information, so be pessimistic.
    unitsType = UnitsType::PossiblyMultiUnit;
  }

  return ColumnFromPartial(partialOffset, partialColumn, unitsType);
}

// Returns the column for |offset|, saturated at ColumnLimit. On the first
// line the starting column from the compile options is added to the
// computed column before the final clamp.
// NOTE(review): the template parameter list and the class's template
// arguments are missing from this copy of the file.
template
uint32_t GeneralTokenStreamChars::computeColumn(
    LineToken lineToken, uint32_t offset) const {
  lineToken.assertConsistentOffset(offset);

  const TokenStreamAnyChars& anyChars = anyCharsAccess();

  uint32_t column =
      anyChars.computePartialColumn(lineToken, offset, this->sourceUnits);

  if (lineToken.isFirstLine()) {
    // Clamp before adding so that the sum below cannot overflow.
    if (column > ColumnLimit) {
      return ColumnLimit;
    }

    static_assert(uint32_t(ColumnLimit + ColumnLimit) > ColumnLimit,
                  "Adding ColumnLimit should not overflow");

    uint32_t firstLineOffset = anyChars.options_.column;
    column += firstLineOffset;
  }

  if (column > ColumnLimit) {
    return ColumnLimit;
  }

  return column;
}

// Writes the line number and column of |offset| to |*line| and |*column|.
template
void GeneralTokenStreamChars::computeLineAndColumn(
    uint32_t offset, uint32_t* line, uint32_t* column) const {
  const TokenStreamAnyChars& anyChars = anyCharsAccess();

  auto lineToken = anyChars.lineToken(offset);
  *line = anyChars.lineNumber(lineToken);
  *column = computeColumn(lineToken, offset);
}

// Reports encoding error |errorNumber| at the current source offset, with an
// attached note listing the |relevantUnits| offending code units in hex.
// The |do { ... } while (false)| wrapper lets every failure path |break| to
// the single |va_end| at the bottom.
// NOTE(review): the template parameter list, the class's template arguments
// and the type argument of |MakeUnique| are missing from this copy of the
// file.
template
MOZ_COLD void TokenStreamChars::internalEncodingError(
    uint8_t relevantUnits, unsigned errorNumber, ...) {
  va_list args;
  va_start(args, errorNumber);

  do {
    size_t offset = this->sourceUnits.offset();

    ErrorMetadata err;

    TokenStreamAnyChars& anyChars = anyCharsAccess();

    bool canAddLineOfContext = fillExceptingContext(&err, offset);
    if (canAddLineOfContext) {
      if (!internalComputeLineOfContext(&err, offset)) {
        break;
      }

      // As this is an encoding error, the computed window-end must be
      // identical to the location of the error -- any further on and the
      // window would contain invalid Unicode.
      MOZ_ASSERT_IF(err.lineOfContext != nullptr,
                    err.lineLength == err.tokenOffset);
    }

    auto notes = MakeUnique();
    if (!notes) {
      ReportOutOfMemory(anyChars.fc);
      break;
    }

    // The largest encoding of a UTF-8 code point is 4 units.  (Encoding an
    // obsolete 5- or 6-byte code point will complain only about a bad lead
    // code unit.)
constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");

    MOZ_ASSERT(relevantUnits > 0);

    // Consume each bad unit and format it into |badUnitsStr|. Every unit
    // takes five chars: four written by byteToString plus a separating
    // space. The final space is then overwritten with the terminator.
    char badUnitsStr[MaxWidth];
    char* ptr = badUnitsStr;
    while (relevantUnits > 0) {
      byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
      ptr[4] = ' ';

      ptr += 5;
      relevantUnits--;
    }

    ptr[-1] = '\0';

    uint32_t line, column;
    computeLineAndColumn(offset, &line, &column);

    if (!notes->addNoteASCII(anyChars.fc, anyChars.getFilename(), 0, line,
                             column, GetErrorMessage, nullptr,
                             JSMSG_BAD_CODE_UNITS, badUnitsStr)) {
      break;
    }

    ReportCompileErrorLatin1(anyChars.fc, std::move(err), std::move(notes),
                             errorNumber, &args);
  } while (false);

  va_end(args);
}

// Reports that |lead| is not a valid UTF-8 leading unit.
// NOTE(review): in this and the following error reporters the template
// parameter list and the class's template arguments are missing from this
// copy of the file.
template
MOZ_COLD void TokenStreamChars::badLeadUnit(
    Utf8Unit lead) {
  uint8_t leadValue = lead.toUint8();

  // Five chars, presumably "0xHH" plus the terminator -- confirm against
  // byteToTerminatedString.
  char leadByteStr[5];
  byteToTerminatedString(leadValue, leadByteStr);

  internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
}

// Reports that the sequence introduced by |lead| needs |required| units in
// total but only |remaining| are available. The message receives the counts
// minus one (|required - 1| expected and |remaining - 1| observed, which
// presumably excludes the lead unit -- confirm), each with a matching
// singular or plural suffix.
template
MOZ_COLD void TokenStreamChars::notEnoughUnits(
    Utf8Unit lead, uint8_t remaining, uint8_t required) {
  uint8_t leadValue = lead.toUint8();

  MOZ_ASSERT(required == 2 || required == 3 || required == 4);
  MOZ_ASSERT(remaining < 4);
  MOZ_ASSERT(remaining < required);

  char leadByteStr[5];
  byteToTerminatedString(leadValue, leadByteStr);

  // |toHexChar| produces the desired decimal numbers for values < 4.
  const char expectedStr[] = {toHexChar(required - 1), '\0'};
  const char actualStr[] = {toHexChar(remaining - 1), '\0'};

  internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
                        expectedStr, required == 2 ? "" : "s", actualStr,
                        remaining == 2 ?
                        " was" : "s were");
}

// Reports a bad trailing unit: the unit at index |unitsObserved - 1| from
// the next code unit's address.
template
MOZ_COLD void TokenStreamChars::badTrailingUnit(
    uint8_t unitsObserved) {
  Utf8Unit badUnit =
      this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];

  char badByteStr[5];
  byteToTerminatedString(badUnit.toUint8(), badByteStr);

  internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
                        badByteStr);
}

// Reports a code point that was decoded from structurally valid UTF-8 but is
// nonetheless forbidden, passing |reason| through to the error message.
template
MOZ_COLD void
TokenStreamChars::badStructurallyValidCodePoint(
    char32_t codePoint, uint8_t codePointLength, const char* reason) {
  // Construct a string like "0x203D" (including null terminator) to include
  // in the error message.  Write the string backward, from the end to the
  // start of an adequately sized |char| array, shifting least significant
  // nibbles off the number and writing the corresponding hex digits until
  // done, then prefixing with "0x".  |codePointStr| points at the
  // incrementally computed string, within |codePointCharsArray|'s bounds.

  // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
  // bits in a four-byte UTF-8 code unit sequence.
  constexpr size_t MaxHexSize = sizeof(
      "0x1F"
      "FFFF");  // including '\0'
  char codePointCharsArray[MaxHexSize];

  char* codePointStr = std::end(codePointCharsArray);
  *--codePointStr = '\0';

  // Note that by do-while looping here rather than while-looping, this
  // writes a '0' when |codePoint == 0|.
do {
    MOZ_ASSERT(codePointCharsArray < codePointStr);
    *--codePointStr = toHexChar(codePoint & 0xF);
    codePoint >>= 4;
  } while (codePoint);

  // Room for the two-character "0x" prefix must remain.
  MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
  *--codePointStr = 'x';
  *--codePointStr = '0';

  internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
                        codePointStr, reason);
}

// Decodes the non-ASCII UTF-8 code point introduced by |lead| and stores it
// in |*codePoint| without any line-terminator handling. Returns false after
// the matching error handler has run if the sequence is invalid.
// NOTE(review): the template parameter list, the class's template
// arguments and the type argument of |Maybe| are missing from this copy of
// the file.
template
[[nodiscard]] bool
TokenStreamChars::getNonAsciiCodePointDontNormalize(
    Utf8Unit lead, char32_t* codePoint) {
  auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };

  auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
    this->notEnoughUnits(lead, remaining, required);
  };

  auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
    this->badTrailingUnit(unitsObserved);
  };

  auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
    this->badCodePoint(badCodePoint, unitsObserved);
  };

  auto onNotShortestForm = [this](char32_t badCodePoint,
                                  uint8_t unitsObserved) {
    this->notShortestForm(badCodePoint, unitsObserved);
  };

  // If a valid code point is decoded, this function call consumes its code
  // units.  If not, it ungets the lead code unit and invokes the right error
  // handler, so on failure we must immediately return false.
  SourceUnitsIterator iter(this->sourceUnits);
  Maybe maybeCodePoint = DecodeOneUtf8CodePointInline(
      lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
      onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
  if (maybeCodePoint.isNothing()) {
    return false;
  }

  *codePoint = maybeCodePoint.value();
  return true;
}

// Completes the non-ASCII code point whose first UTF-16 unit, |lead|, has
// just been consumed, and stores it in |*codePoint|.
//
// LINE SEPARATOR and PARAGRAPH SEPARATOR update the line info and are
// returned as '\n'. A lead surrogate followed by a trail surrogate consumes
// the trail and yields the decoded code point. Every other unit, including
// an unpaired surrogate, is returned unchanged. The only failure is
// updateLineInfoForEOL returning false.
// NOTE(review): the template parameter list, the class's template
// arguments, and the type arguments of |AssertedCast| and
// |std::numeric_limits| are missing from this copy of the file. Judging by
// the body, this is the UTF-16 implementation -- confirm.
template
bool TokenStreamChars::getNonAsciiCodePoint(
    int32_t lead, char32_t* codePoint) {
  MOZ_ASSERT(lead != EOF);
  MOZ_ASSERT(!isAsciiCodePoint(lead),
             "ASCII code unit/point must be handled separately");
  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
             "getNonAsciiCodePoint called incorrectly");

  // The code point is usually |lead|: overwrite later if needed.
  *codePoint = AssertedCast(lead);

  // ECMAScript specifically requires that unpaired UTF-16 surrogates be
  // treated as the corresponding code point and not as an error.  (The
  // reference link that originally followed here is missing from this copy
  // of the file.)
  // Thus this function does not consider any sequence of 16-bit numbers to
  // be intrinsically in error.

  // Dispense with single-unit code points and lone trailing surrogates.
  if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
    if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
                     lead == unicode::PARA_SEPARATOR)) {
      if (!updateLineInfoForEOL()) {
#ifdef DEBUG
        // Assign to a sentinel value to hopefully cause errors.
        *codePoint = std::numeric_limits::max();
#endif
        MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
        return false;
      }

      *codePoint = '\n';
    } else {
      MOZ_ASSERT(!IsLineTerminator(*codePoint));
    }

    return true;
  }

  // Also handle a lead surrogate not paired with a trailing surrogate.
  if (MOZ_UNLIKELY(
          this->sourceUnits.atEnd() ||
          !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
    MOZ_ASSERT(!IsLineTerminator(*codePoint));
    return true;
  }

  // Otherwise we have a multi-unit code point.
  *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
  MOZ_ASSERT(!IsLineTerminator(*codePoint));
  return true;
}

// UTF-8 counterpart: decodes the code point introduced by the already
// consumed unit |unit|, returning false if decoding fails.
// NOTE(review): this definition is truncated in this copy of the file; only
// the visible part is reproduced here. The template parameter list, the
// class's template arguments, and the type arguments of |static_cast| and
// |Maybe| are also missing.
template
bool TokenStreamChars::getNonAsciiCodePoint(
    int32_t unit, char32_t* codePoint) {
  MOZ_ASSERT(unit != EOF);
  MOZ_ASSERT(!isAsciiCodePoint(unit),
             "ASCII code unit/point must be handled separately");

  Utf8Unit lead = Utf8Unit(static_cast(unit));
  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
             "getNonAsciiCodePoint called incorrectly");

  auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };

  auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
                                        uint_fast8_t required) {
    this->notEnoughUnits(lead, remaining, required);
  };

  auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
    this->badTrailingUnit(unitsObserved);
  };

  auto onBadCodePoint = [this](char32_t badCodePoint,
                               uint_fast8_t unitsObserved) {
    this->badCodePoint(badCodePoint, unitsObserved);
  };

  auto onNotShortestForm = [this](char32_t badCodePoint,
                                  uint_fast8_t unitsObserved) {
    this->notShortestForm(badCodePoint, unitsObserved);
  };

  // This consumes the full, valid code point or ungets |lead| and calls the
  // appropriate error functor on failure.
  SourceUnitsIterator iter(this->sourceUnits);
  Maybe maybeCodePoint = DecodeOneUtf8CodePoint(
      lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
      onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
  if (maybeCodePoint.isNothing()) {
    return false;
  }

  char32_t cp = maybeCodePoint.value();
  if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
                   cp == unicode::PARA_SEPARATOR)) {
    if (!updateLineInfoForEOL()) {
#ifdef DEBUG
      // Assign to a sentinel value to hopefully cause errors.
*codePoint = std::numeric_limits::max(); #endif MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint)); return false; } *codePoint = '\n'; } else { MOZ_ASSERT(!IsLineTerminator(cp)); *codePoint = cp; } return true; } template <> size_t SourceUnits::findWindowStart(size_t offset) const { // This is JS's understanding of UTF-16 that allows lone surrogates, so // we have to exclude lone surrogates from [windowStart, offset) ourselves. const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_); const char16_t* const initial = codeUnitPtrAt(offset); const char16_t* p = initial; auto HalfWindowSize = [&p, &initial]() { return PointerRangeSize(p, initial); }; while (true) { MOZ_ASSERT(earliestPossibleStart <= p); MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) { break; } char16_t c = p[-1]; // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in // string and template literals. These code points do affect line and // column coordinates, even as they encode their literal values. if (IsLineTerminator(c)) { break; } // Don't allow invalid UTF-16 in pre-context. (Current users don't // require this, and this behavior isn't currently imposed on // pre-context, but these facts might change someday.) if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) { break; } // Optimistically include the code unit, reverting below if needed. p--; // If it's not a surrogate at all, keep going. if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) { continue; } // Stop if we don't have a usable surrogate pair. 
if (HalfWindowSize() >= WindowRadius || p <= earliestPossibleStart || // trail surrogate at low end !unicode::IsLeadSurrogate(p[-1])) // no paired lead surrogate { p++; break; } p--; } MOZ_ASSERT(HalfWindowSize() <= WindowRadius); return offset - HalfWindowSize(); } template <> size_t SourceUnits::findWindowStart(size_t offset) const { // |offset| must be the location of the error or somewhere before it, so we // know preceding data is valid UTF-8. const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_); const Utf8Unit* const initial = codeUnitPtrAt(offset); const Utf8Unit* p = initial; auto HalfWindowSize = [&p, &initial]() { return PointerRangeSize(p, initial); }; while (true) { MOZ_ASSERT(earliestPossibleStart <= p); MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) { break; } // Peek backward for a line break, and only decrement if there is none. uint8_t prev = p[-1].toUint8(); // First check for the ASCII LineTerminators. if (prev == '\r' || prev == '\n') { break; } // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9). If there // aren't three code units available, some comparison here will fail // before we'd underflow. if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) && p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) { break; } // Rewind over the non-LineTerminator. This can't underflow // |earliestPossibleStart| because it begins a code point. while (IsTrailingUnit(*--p)) { continue; } MOZ_ASSERT(earliestPossibleStart <= p); // But if we underflowed |WindowRadius|, adjust forward and stop. 
if (HalfWindowSize() > WindowRadius) { static_assert(WindowRadius > 3, "skipping over non-lead code units below must not " "advance past |offset|"); while (IsTrailingUnit(*++p)) { continue; } MOZ_ASSERT(HalfWindowSize() < WindowRadius); break; } } MOZ_ASSERT(HalfWindowSize() <= WindowRadius); return offset - HalfWindowSize(); } template <> size_t SourceUnits::findWindowEnd(size_t offset) const { const char16_t* const initial = codeUnitPtrAt(offset); const char16_t* p = initial; auto HalfWindowSize = [&initial, &p]() { return PointerRangeSize(initial, p); }; while (true) { MOZ_ASSERT(p <= limit_); MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p >= limit_ || HalfWindowSize() >= WindowRadius) { break; } char16_t c = *p; // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in // string and template literals. These code points do affect line and // column coordinates, even as they encode their literal values. if (IsLineTerminator(c)) { break; } // Don't allow invalid UTF-16 in post-context. (Current users don't // require this, and this behavior isn't currently imposed on // pre-context, but these facts might change someday.) if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) { break; } // Optimistically consume the code unit, ungetting it below if needed. p++; // If it's not a surrogate at all, keep going. if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) { continue; } // Retract if the lead surrogate would stand alone at the end of the // window. 
if (HalfWindowSize() >= WindowRadius || // split pair p >= limit_ || // half-pair at end of source !unicode::IsTrailSurrogate(*p)) // no paired trail surrogate { p--; break; } p++; } return offset + HalfWindowSize(); } template <> size_t SourceUnits::findWindowEnd(size_t offset) const { const Utf8Unit* const initial = codeUnitPtrAt(offset); const Utf8Unit* p = initial; auto HalfWindowSize = [&initial, &p]() { return PointerRangeSize(initial, p); }; while (true) { MOZ_ASSERT(p <= limit_); MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p >= limit_ || HalfWindowSize() >= WindowRadius) { break; } // A non-encoding error might be followed by an encoding error within // |maxEnd|, so we must validate as we go to not include invalid UTF-8 // in the computed window. What joy! Utf8Unit lead = *p; if (mozilla::IsAscii(lead)) { if (IsSingleUnitLineTerminator(lead)) { break; } p++; continue; } PeekedCodePoint peeked = PeekCodePoint(p, limit_); if (peeked.isNone()) { break; // encoding error } char32_t c = peeked.codePoint(); if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR || c == unicode::PARA_SEPARATOR)) { break; } MOZ_ASSERT(!IsLineTerminator(c)); uint8_t len = peeked.lengthInUnits(); if (HalfWindowSize() + len > WindowRadius) { break; } p += len; } MOZ_ASSERT(HalfWindowSize() <= WindowRadius); return offset + HalfWindowSize(); } template bool TokenStreamSpecific::advance(size_t position) { const Unit* end = this->sourceUnits.codeUnitPtrAt(position); while (this->sourceUnits.addressOfNextCodeUnit() < end) { if (!getCodePoint()) { return false; } } TokenStreamAnyChars& anyChars = anyCharsAccess(); Token* cur = const_cast(&anyChars.currentToken()); cur->pos.begin = this->sourceUnits.offset(); cur->pos.end = cur->pos.begin; #ifdef DEBUG cur->type = TokenKind::Limit; #endif MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type)); anyChars.lookahead = 0; return true; } template void TokenStreamSpecific::seekTo(const Position& pos) { TokenStreamAnyChars& anyChars = anyCharsAccess(); 
this->sourceUnits.setAddressOfNextCodeUnit(pos.buf, /* allowPoisoned = */ true); anyChars.flags = pos.flags; anyChars.lineno = pos.lineno; anyChars.linebase = pos.linebase; anyChars.prevLinebase = pos.prevLinebase; anyChars.lookahead = pos.lookahead; anyChars.tokens[anyChars.cursor()] = pos.currentToken; for (unsigned i = 0; i < anyChars.lookahead; i++) { anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i]; } } template bool TokenStreamSpecific::seekTo( const Position& pos, const TokenStreamAnyChars& other) { if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) { return false; } seekTo(pos); return true; } void TokenStreamAnyChars::computeErrorMetadataNoOffset( ErrorMetadata* err) const { err->isMuted = mutedErrors; err->filename = filename_; err->lineNumber = 0; err->columnNumber = 0; MOZ_ASSERT(err->lineOfContext == nullptr); } bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err, uint32_t offset) const { err->isMuted = mutedErrors; // If this TokenStreamAnyChars doesn't have location information, try to // get it from the caller. if (!filename_) { JSContext* maybeCx = context()->maybeCurrentJSContext(); if (maybeCx && !maybeCx->isHelperThreadContext()) { NonBuiltinFrameIter iter(maybeCx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK, maybeCx->realm()->principals()); if (!iter.done() && iter.filename()) { err->filename = iter.filename(); err->lineNumber = iter.computeLine(&err->columnNumber); return false; } } } // Otherwise use this TokenStreamAnyChars's location information. 
err->filename = filename_; return true; } template <> inline void SourceUnits::computeWindowOffsetAndLength( const char16_t* encodedWindow, size_t encodedTokenOffset, size_t* utf16TokenOffset, size_t encodedWindowLength, size_t* utf16WindowLength) const { MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16"); } template <> inline void SourceUnits::computeWindowOffsetAndLength( const Utf8Unit* encodedWindow, size_t encodedTokenOffset, size_t* utf16TokenOffset, size_t encodedWindowLength, size_t* utf16WindowLength) const { MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength, "token offset must be within the window, and the two lambda " "calls below presume this ordering of values"); const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength; size_t i = 0; auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) { while (encodedWindow < limit) { Utf8Unit lead = *encodedWindow++; if (MOZ_LIKELY(IsAscii(lead))) { // ASCII contributes a single UTF-16 code unit. i++; continue; } Maybe cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit); MOZ_ASSERT(cp.isSome(), "computed window should only contain valid UTF-8"); i += unicode::IsSupplementary(cp.value()) ? 2 : 1; } return i; }; // Compute the token offset from |i == 0| and the initial |encodedWindow|. const Utf8Unit* token = encodedWindow + encodedTokenOffset; MOZ_ASSERT(token <= encodedWindowEnd); *utf16TokenOffset = ComputeUtf16Count(token); // Compute the window length, picking up from |i| and |encodedWindow| that, // in general, were modified just above. *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd); } template bool TokenStreamCharsBase::addLineOfContext(ErrorMetadata* err, uint32_t offset) const { // Rename the variable to make meaning clearer: an offset into source units // in Unit encoding. size_t encodedOffset = offset; // These are also offsets into source units in Unit encoding. 
size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset); size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset); size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart; MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2); // Don't add a useless "line" of context when the window ends up empty // because of an invalid encoding at the start of a line. if (encodedWindowLength == 0) { MOZ_ASSERT(err->lineOfContext == nullptr, "ErrorMetadata::lineOfContext must be null so we don't " "have to set the lineLength/tokenOffset fields"); return true; } CharBuffer lineOfContext(fc); const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart); if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks( lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) { return false; } size_t utf16WindowLength = lineOfContext.length(); // The windowed string is null-terminated. if (!lineOfContext.append('\0')) { return false; } err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer()); if (!err->lineOfContext) { return false; } size_t encodedTokenOffset = encodedOffset - encodedWindowStart; MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength, "token offset must be inside the window"); // The length in UTF-8 code units of a code point is always greater than or // equal to the same code point's length in UTF-16 code points. ASCII code // points are 1 unit in either encoding. Code points in [U+0080, U+10000) // are 2-3 UTF-8 code units to 1 UTF-16 code unit. And code points in // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units. // // Therefore, if encoded window length equals the length in UTF-16 (this is // always the case for Unit=char16_t), the UTF-16 offsets are exactly the // encoded offsets. Otherwise we must convert offset/length from UTF-8 to // UTF-16. 
if constexpr (std::is_same_v) { MOZ_ASSERT(utf16WindowLength == encodedWindowLength, "UTF-16 to UTF-16 shouldn't change window length"); err->tokenOffset = encodedTokenOffset; err->lineLength = encodedWindowLength; } else { static_assert(std::is_same_v, "should only see UTF-8 here"); bool simple = utf16WindowLength == encodedWindowLength; #ifdef DEBUG auto isAscii = [](Unit u) { return IsAscii(u); }; MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength, isAscii) == simple, "equal window lengths in UTF-8 should correspond only to " "wholly-ASCII text"); #endif if (simple) { err->tokenOffset = encodedTokenOffset; err->lineLength = encodedWindowLength; } else { sourceUnits.computeWindowOffsetAndLength( encodedWindow, encodedTokenOffset, &err->tokenOffset, encodedWindowLength, &err->lineLength); } } return true; } template bool TokenStreamSpecific::computeErrorMetadata( ErrorMetadata* err, const ErrorOffset& errorOffset) const { if (errorOffset.is()) { anyCharsAccess().computeErrorMetadataNoOffset(err); return true; } uint32_t offset; if (errorOffset.is()) { offset = errorOffset.as(); } else { offset = this->sourceUnits.offset(); } // This function's return value isn't a success/failure indication: it // returns true if this TokenStream can be used to provide a line of // context. if (fillExceptingContext(err, offset)) { // Add a line of context from this TokenStream to help with debugging. return internalComputeLineOfContext(err, offset); } // We can't fill in any more here. return true; } template void TokenStreamSpecific::reportIllegalCharacter( int32_t cp) { UniqueChars display = JS_smprintf("U+%04X", cp); if (!display) { ReportOutOfMemory(anyCharsAccess().fc); return; } error(JSMSG_ILLEGAL_CHARACTER, display.get()); } // We have encountered a '\': check for a Unicode escape sequence after it. 
// Return the length of the escape sequence and the encoded code point (by // value) if we found a Unicode escape sequence, and skip all code units // involed. Otherwise, return 0 and don't advance along the buffer. template uint32_t GeneralTokenStreamChars::matchUnicodeEscape( char32_t* codePoint) { MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); int32_t unit = getCodeUnit(); if (unit != 'u') { // NOTE: |unit| may be EOF here. ungetCodeUnit(unit); MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); return 0; } char16_t v; unit = getCodeUnit(); if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) { *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v; return 5; } if (unit == '{') { return matchExtendedUnicodeEscape(codePoint); } // NOTE: |unit| may be EOF here, so this ungets either one or two units. ungetCodeUnit(unit); ungetCodeUnit('u'); MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); return 0; } template uint32_t GeneralTokenStreamChars::matchExtendedUnicodeEscape( char32_t* codePoint) { MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{')); int32_t unit = getCodeUnit(); // Skip leading zeroes. 
uint32_t leadingZeroes = 0; while (unit == '0') { leadingZeroes++; unit = getCodeUnit(); } size_t i = 0; uint32_t code = 0; while (IsAsciiHexDigit(unit) && i < 6) { code = (code << 4) | AsciiAlphanumericToNumber(unit); unit = getCodeUnit(); i++; } uint32_t gotten = 2 + // 'u{' leadingZeroes + i + // significant hexdigits (unit != EOF); // subtract a get if it didn't contribute to length if (unit == '}' && (leadingZeroes > 0 || i > 0) && code <= unicode::NonBMPMax) { *codePoint = code; return gotten; } this->sourceUnits.unskipCodeUnits(gotten); MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); return 0; } template uint32_t GeneralTokenStreamChars::matchUnicodeEscapeIdStart( char32_t* codePoint) { uint32_t length = matchUnicodeEscape(codePoint); if (MOZ_LIKELY(length > 0)) { if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) { return length; } this->sourceUnits.unskipCodeUnits(length); } MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); return 0; } template bool GeneralTokenStreamChars::matchUnicodeEscapeIdent( char32_t* codePoint) { uint32_t length = matchUnicodeEscape(codePoint); if (MOZ_LIKELY(length > 0)) { if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) { return true; } this->sourceUnits.unskipCodeUnits(length); } MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); return false; } template [[nodiscard]] bool TokenStreamSpecific::matchIdentifierStart( IdentifierEscapes* sawEscape) { int32_t unit = getCodeUnit(); if (unit == EOF) { error(JSMSG_MISSING_PRIVATE_NAME); return false; } if (MOZ_LIKELY(isAsciiCodePoint(unit))) { if (unicode::IsIdentifierStart(char16_t(unit))) { *sawEscape = IdentifierEscapes::None; return true; } if (unit == '\\') { char32_t codePoint; uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint); if (escapeLength != 0) { *sawEscape = IdentifierEscapes::SawUnicodeEscape; return true; } // We could point "into" a mistyped escape, e.g. for "\u{41H}" we // could point at the 'H'. 
But we don't do that now, so the code // unit after the '\' isn't necessarily bad, so just point at the // start of the actually-invalid escape. ungetCodeUnit('\\'); error(JSMSG_BAD_ESCAPE); return false; } } // Unget the lead code unit before peeking at the full code point. ungetCodeUnit(unit); PeekedCodePoint peeked = this->sourceUnits.peekCodePoint(); if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) { this->sourceUnits.consumeKnownCodePoint(peeked); *sawEscape = IdentifierEscapes::None; return true; } error(JSMSG_MISSING_PRIVATE_NAME); return false; } template bool TokenStreamSpecific::getDirectives( bool isMultiline, bool shouldWarnDeprecated) { // Match directive comments used in debugging, such as "//# sourceURL" and // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated. // // To avoid a crashing bug in IE, several JavaScript transpilers wrap single // line comments containing a source mapping URL inside a multiline // comment. To avoid potentially expensive lookahead and backtracking, we // only check for this case if we encounter a '#' code unit. bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) && getSourceMappingURL(isMultiline, shouldWarnDeprecated); if (!res) { badToken(); } return res; } [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo( UniquePtr* destination) { size_t length = charBuffer.length(); *destination = fc->getAllocator()->make_pod_array(length + 1); if (!*destination) { return false; } std::copy(charBuffer.begin(), charBuffer.end(), destination->get()); (*destination)[length] = '\0'; return true; } template [[nodiscard]] bool TokenStreamSpecific::getDirective( bool isMultiline, bool shouldWarnDeprecated, const char* directive, uint8_t directiveLength, const char* errorMsgPragma, UniquePtr* destination) { // Stop if we don't find |directive|. (Note that |directive| must be // ASCII, so there are no tricky encoding issues to consider in matching // UTF-8/16-agnostically.) 
if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) { return true; } if (shouldWarnDeprecated) { if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) { return false; } } this->charBuffer.clear(); do { int32_t unit = peekCodeUnit(); if (unit == EOF) { break; } if (MOZ_LIKELY(isAsciiCodePoint(unit))) { if (unicode::IsSpace(AssertedCast(unit))) { break; } consumeKnownCodeUnit(unit); // Debugging directives can occur in both single- and multi-line // comments. If we're currently inside a multi-line comment, we // also must recognize multi-line comment terminators. if (isMultiline && unit == '*' && peekCodeUnit() == '/') { ungetCodeUnit('*'); break; } if (!this->charBuffer.append(unit)) { return false; } continue; } // This ignores encoding errors: subsequent caller-side code to // handle the remaining source text in the comment will do so. PeekedCodePoint peeked = this->sourceUnits.peekCodePoint(); if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) { break; } MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()), "!IsSpace must imply !IsLineTerminator or else we'll fail to " "maintain line-info/flags for EOL"); this->sourceUnits.consumeKnownCodePoint(peeked); if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) { return false; } } while (true); if (this->charBuffer.empty()) { // The directive's URL was missing, but comments can contain anything, // so it isn't an error. return true; } return copyCharBufferTo(destination); } template bool TokenStreamSpecific::getDisplayURL( bool isMultiline, bool shouldWarnDeprecated) { // Match comments of the form "//# sourceURL=" or // "/\* //# sourceURL= *\/" // // Note that while these are labeled "sourceURL" in the source text, // internally we refer to it as a "displayURL" to distinguish what the // developer would like to refer to the source as from the source's actual // URL. 
static constexpr char sourceURLDirective[] = " sourceURL="; constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective); return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective, sourceURLDirectiveLength, "sourceURL", &anyCharsAccess().displayURL_); } template bool TokenStreamSpecific::getSourceMappingURL( bool isMultiline, bool shouldWarnDeprecated) { // Match comments of the form "//# sourceMappingURL=" or // "/\* //# sourceMappingURL= *\/" static constexpr char sourceMappingURLDirective[] = " sourceMappingURL="; constexpr uint8_t sourceMappingURLDirectiveLength = js_strlen(sourceMappingURLDirective); return getDirective(isMultiline, shouldWarnDeprecated, sourceMappingURLDirective, sourceMappingURLDirectiveLength, "sourceMappingURL", &anyCharsAccess().sourceMapURL_); } template MOZ_ALWAYS_INLINE Token* GeneralTokenStreamChars::newTokenInternal( TokenKind kind, TokenStart start, TokenKind* out) { MOZ_ASSERT(kind < TokenKind::Limit); MOZ_ASSERT(kind != TokenKind::Eol, "TokenKind::Eol should never be used in an actual Token, only " "returned by peekTokenSameLine()"); TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.flags.isDirtyLine = true; Token* token = anyChars.allocateToken(); *out = token->type = kind; token->pos = TokenPos(start.offset(), this->sourceUnits.offset()); MOZ_ASSERT(token->pos.begin <= token->pos.end); // NOTE: |token->modifier| is set in |newToken()| so that optimized, // non-debug code won't do any work to pass a modifier-argument that will // never be used. return token; } template MOZ_COLD bool GeneralTokenStreamChars::badToken() { // We didn't get a token, so don't set |flags.isDirtyLine|. anyCharsAccess().flags.hadError = true; // Poisoning sourceUnits on error establishes an invariant: once an // erroneous token has been seen, sourceUnits will not be consulted again. // This is true because the parser will deal with the illegal token by // aborting parsing immediately. 
this->sourceUnits.poisonInDebug(); return false; }; bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, char32_t codePoint) { MOZ_ASSERT(codePoint <= unicode::NonBMPMax, "should only be processing code points validly decoded from UTF-8 " "or WTF-16 source text (surrogate code points permitted)"); char16_t units[2]; unsigned numUnits = 0; unicode::UTF16Encode(codePoint, units, &numUnits); MOZ_ASSERT(numUnits == 1 || numUnits == 2, "UTF-16 code points are only encoded in one or two units"); if (!charBuffer.append(units[0])) { return false; } if (numUnits == 1) { return true; } return charBuffer.append(units[1]); } template bool TokenStreamSpecific::putIdentInCharBuffer( const Unit* identStart) { const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit(); this->sourceUnits.setAddressOfNextCodeUnit(identStart); auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() { this->sourceUnits.setAddressOfNextCodeUnit(originalAddress); }); this->charBuffer.clear(); do { int32_t unit = getCodeUnit(); if (unit == EOF) { break; } char32_t codePoint; if (MOZ_LIKELY(isAsciiCodePoint(unit))) { if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') { if (!this->charBuffer.append(unit)) { return false; } continue; } if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) { break; } } else { // |restoreNextRawCharAddress| undoes all gets, and this function // doesn't update line/column info. 
char32_t cp; if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) { return false; } codePoint = cp; if (!unicode::IsIdentifierPart(codePoint)) { break; } } if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) { return false; } } while (true); return true; } template [[nodiscard]] bool TokenStreamSpecific::identifierName( TokenStart start, const Unit* identStart, IdentifierEscapes escaping, Modifier modifier, NameVisibility visibility, TokenKind* out) { // Run the bad-token code for every path out of this function except the // two success-cases. auto noteBadToken = MakeScopeExit([this]() { this->badToken(); }); // We've already consumed an initial code point in the identifer, to *know* // that this is an identifier. So no need to worry about not consuming any // code points in the loop below. int32_t unit; while (true) { unit = peekCodeUnit(); if (unit == EOF) { break; } if (MOZ_LIKELY(isAsciiCodePoint(unit))) { consumeKnownCodeUnit(unit); if (MOZ_UNLIKELY( !unicode::IsIdentifierPart(static_cast(unit)))) { // Handle a Unicode escape -- otherwise it's not part of the // identifier. char32_t codePoint; if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) { ungetCodeUnit(unit); break; } escaping = IdentifierEscapes::SawUnicodeEscape; } } else { // This ignores encoding errors: subsequent caller-side code to // handle source text after the IdentifierName will do so. PeekedCodePoint peeked = this->sourceUnits.peekCodePoint(); if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) { break; } MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()), "IdentifierPart must guarantee !IsLineTerminator or " "else we'll fail to maintain line-info/flags for EOL"); this->sourceUnits.consumeKnownCodePoint(peeked); } } TaggedParserAtomIndex atom; if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) { // Identifiers containing Unicode escapes have to be converted into // tokenbuf before atomizing. 
if (!putIdentInCharBuffer(identStart)) { return false; } atom = drainCharBufferIntoAtom(); } else { // Escape-free identifiers can be created directly from sourceUnits. const Unit* chars = identStart; size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart; // Private identifiers start with a '#', and so cannot be reserved words. if (visibility == NameVisibility::Public) { // Represent reserved words lacking escapes as reserved word tokens. if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) { noteBadToken.release(); newSimpleToken(rw->tokentype, start, modifier, out); return true; } } atom = atomizeSourceChars(Span(chars, length)); } if (!atom) { return false; } noteBadToken.release(); if (visibility == NameVisibility::Private) { newPrivateNameToken(atom, start, modifier, out); return true; } newNameToken(atom, start, modifier, out); return true; } enum FirstCharKind { // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid // token that cannot also be a prefix of a longer token. E.g. ';' has the // OneChar kind, but '+' does not, because '++' and '+=' are valid longer // tokens // that begin with '+'. // // The few token kinds satisfying these properties cover roughly 35--45% // of the tokens seen in practice. // // We represent the 'OneChar' kind with any positive value less than // TokenKind::Limit. This representation lets us associate // each one-char token char16_t with a TokenKind and thus avoid // a subsequent char16_t-to-TokenKind conversion. OneChar_Min = 0, OneChar_Max = size_t(TokenKind::Limit) - 1, Space = size_t(TokenKind::Limit), Ident, Dec, String, EOL, ZeroDigit, Other, LastCharKind = Other }; // OneChar: 40, 41, 44, 58, 59, 91, 93, 123, 125, 126: // '(', ')', ',', ':', ';', '[', ']', '{', '}', '~' // Ident: 36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z' // Dot: 46: '.' 
// Equals: 61: '=' // String: 34, 39, 96: '"', '\'', '`' // Dec: 49..57: '1'..'9' // Plus: 43: '+' // ZeroDigit: 48: '0' // Space: 9, 11, 12, 32: '\t', '\v', '\f', ' ' // EOL: 10, 13: '\n', '\r' // #define T_COMMA size_t(TokenKind::Comma) #define T_COLON size_t(TokenKind::Colon) #define T_BITNOT size_t(TokenKind::BitNot) #define T_LP size_t(TokenKind::LeftParen) #define T_RP size_t(TokenKind::RightParen) #define T_SEMI size_t(TokenKind::Semi) #define T_LB size_t(TokenKind::LeftBracket) #define T_RB size_t(TokenKind::RightBracket) #define T_LC size_t(TokenKind::LeftCurly) #define T_RC size_t(TokenKind::RightCurly) #define _______ Other static const uint8_t firstCharKinds[] = { // clang-format off /* 0 1 2 3 4 5 6 7 8 9 */ /* 0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, Space, /* 10+ */ EOL, Space, Space, EOL, _______, _______, _______, _______, _______, _______, /* 20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______, /* 30+ */ _______, _______, Space, _______, String, _______, Ident, _______, _______, String, /* 40+ */ T_LP, T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit, Dec, /* 50+ */ Dec, Dec, Dec, Dec, Dec, Dec, Dec, Dec, T_COLON, T_SEMI, /* 60+ */ _______, _______, _______, _______, _______, Ident, Ident, Ident, Ident, Ident, /* 70+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, /* 80+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, /* 90+ */ Ident, T_LB, _______, T_RB, _______, Ident, String, Ident, Ident, Ident, /* 100+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, /* 110+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, /* 120+ */ Ident, Ident, Ident, T_LC, _______, T_RC,T_BITNOT, _______ // clang-format on }; #undef T_COMMA #undef T_COLON #undef T_BITNOT #undef T_LP #undef T_RP #undef T_SEMI #undef T_LB #undef T_RB #undef T_LC #undef T_RC #undef 
_______ static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)), "Elements of firstCharKinds[] are too small"); template <> void SourceUnits::consumeRestOfSingleLineComment() { while (MOZ_LIKELY(!atEnd())) { char16_t unit = peekCodeUnit(); if (IsLineTerminator(unit)) { return; } consumeKnownCodeUnit(unit); } } template <> void SourceUnits::consumeRestOfSingleLineComment() { while (MOZ_LIKELY(!atEnd())) { const Utf8Unit unit = peekCodeUnit(); if (IsSingleUnitLineTerminator(unit)) { return; } if (MOZ_LIKELY(IsAscii(unit))) { consumeKnownCodeUnit(unit); continue; } PeekedCodePoint peeked = peekCodePoint(); if (peeked.isNone()) { return; } char32_t c = peeked.codePoint(); if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR || c == unicode::PARA_SEPARATOR)) { return; } consumeKnownCodePoint(peeked); } } template [[nodiscard]] MOZ_ALWAYS_INLINE bool TokenStreamSpecific::matchInteger( IsIntegerUnit isIntegerUnit, int32_t* nextUnit) { int32_t unit = getCodeUnit(); if (!isIntegerUnit(unit)) { *nextUnit = unit; return true; } return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit); } template [[nodiscard]] MOZ_ALWAYS_INLINE bool TokenStreamSpecific::matchIntegerAfterFirstDigit( IsIntegerUnit isIntegerUnit, int32_t* nextUnit) { int32_t unit; while (true) { unit = getCodeUnit(); if (isIntegerUnit(unit)) { continue; } if (unit != '_') { break; } unit = getCodeUnit(); if (!isIntegerUnit(unit)) { if (unit == '_') { ungetCodeUnit(unit); error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES); } else { ungetCodeUnit(unit); ungetCodeUnit('_'); error(JSMSG_NUMBER_END_WITH_UNDERSCORE); } return false; } } *nextUnit = unit; return true; } template [[nodiscard]] bool TokenStreamSpecific::decimalNumber( int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier, TokenKind* out) { // Run the bad-token code for every path out of this function except the // one success-case. 
auto noteBadToken = MakeScopeExit([this]() { this->badToken(); }); // Consume integral component digits. if (IsAsciiDigit(unit)) { if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) { return false; } } // Numbers contain no escapes, so we can read directly from |sourceUnits|. double dval; bool isBigInt = false; DecimalPoint decimalPoint = NoDecimal; if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') { // NOTE: |unit| may be EOF here. ungetCodeUnit(unit); // Most numbers are pure decimal integers without fractional component // or exponential notation. Handle that with optimized code. if (!GetDecimalInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(), &dval)) { ReportOutOfMemory(this->fc); return false; } } else if (unit == 'n') { isBigInt = true; unit = peekCodeUnit(); } else { // Consume any decimal dot and fractional component. if (unit == '.') { decimalPoint = HasDecimal; if (!matchInteger(IsAsciiDigit, &unit)) { return false; } } // Consume any exponential notation. if (unit == 'e' || unit == 'E') { unit = getCodeUnit(); if (unit == '+' || unit == '-') { unit = getCodeUnit(); } // Exponential notation must contain at least one digit. if (!IsAsciiDigit(unit)) { ungetCodeUnit(unit); error(JSMSG_MISSING_EXPONENT); return false; } // Consume exponential digits. if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) { return false; } } ungetCodeUnit(unit); if (!GetDecimal(numStart, this->sourceUnits.addressOfNextCodeUnit(), &dval)) { ReportOutOfMemory(this->fc); return false; } } // Number followed by IdentifierStart is an error. (This is the only place // in ECMAScript where token boundary is inadequate to properly separate // two tokens, necessitating this unaesthetic lookahead.) 
if (unit != EOF) { if (MOZ_LIKELY(isAsciiCodePoint(unit))) { if (unicode::IsIdentifierStart(char16_t(unit))) { error(JSMSG_IDSTART_AFTER_NUMBER); return false; } } else { // This ignores encoding errors: subsequent caller-side code to // handle source text after the number will do so. PeekedCodePoint peeked = this->sourceUnits.peekCodePoint(); if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) { error(JSMSG_IDSTART_AFTER_NUMBER); return false; } } } noteBadToken.release(); if (isBigInt) { return bigIntLiteral(start, modifier, out); } newNumberToken(dval, decimalPoint, start, modifier, out); return true; } template [[nodiscard]] bool TokenStreamSpecific::regexpLiteral( TokenStart start, TokenKind* out) { MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/')); this->charBuffer.clear(); auto ProcessNonAsciiCodePoint = [this](int32_t lead) { MOZ_ASSERT(lead != EOF); MOZ_ASSERT(!this->isAsciiCodePoint(lead)); char32_t codePoint; if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead), &codePoint)) { return false; } if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR || codePoint == unicode::PARA_SEPARATOR)) { this->sourceUnits.ungetLineOrParagraphSeparator(); this->error(JSMSG_UNTERMINATED_REGEXP); return false; } return AppendCodePointToCharBuffer(this->charBuffer, codePoint); }; auto ReportUnterminatedRegExp = [this](int32_t unit) { this->ungetCodeUnit(unit); this->error(JSMSG_UNTERMINATED_REGEXP); }; bool inCharClass = false; do { int32_t unit = getCodeUnit(); if (unit == EOF) { ReportUnterminatedRegExp(unit); return badToken(); } if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { if (!ProcessNonAsciiCodePoint(unit)) { return badToken(); } continue; } if (unit == '\\') { if (!this->charBuffer.append(unit)) { return badToken(); } unit = getCodeUnit(); if (unit == EOF) { ReportUnterminatedRegExp(unit); return badToken(); } // Fallthrough only handles ASCII code points, so // deal with non-ASCII and skip everything else. 
if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { if (!ProcessNonAsciiCodePoint(unit)) { return badToken(); } continue; } } else if (unit == '[') { inCharClass = true; } else if (unit == ']') { inCharClass = false; } else if (unit == '/' && !inCharClass) { // For IE compat, allow unescaped / in char classes. break; } // NOTE: Non-ASCII LineTerminators were handled by // ProcessNonAsciiCodePoint calls above. if (unit == '\r' || unit == '\n') { ReportUnterminatedRegExp(unit); return badToken(); } MOZ_ASSERT(!IsLineTerminator(AssertedCast(unit))); if (!this->charBuffer.append(unit)) { return badToken(); } } while (true); int32_t unit; RegExpFlags reflags = RegExpFlag::NoFlags; while (true) { uint8_t flag; unit = getCodeUnit(); if (unit == 'd') { flag = RegExpFlag::HasIndices; } else if (unit == 'g') { flag = RegExpFlag::Global; } else if (unit == 'i') { flag = RegExpFlag::IgnoreCase; } else if (unit == 'm') { flag = RegExpFlag::Multiline; } else if (unit == 's') { flag = RegExpFlag::DotAll; } else if (unit == 'u') { flag = RegExpFlag::Unicode; } else if (unit == 'y') { flag = RegExpFlag::Sticky; } else if (IsAsciiAlpha(unit)) { flag = RegExpFlag::NoFlags; } else { break; } if ((reflags & flag) || flag == RegExpFlag::NoFlags) { ungetCodeUnit(unit); char buf[2] = {char(unit), '\0'}; error(JSMSG_BAD_REGEXP_FLAG, buf); return badToken(); } reflags |= flag; } ungetCodeUnit(unit); newRegExpToken(reflags, start, out); return true; } template [[nodiscard]] bool TokenStreamSpecific::bigIntLiteral( TokenStart start, Modifier modifier, TokenKind* out) { MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n')); MOZ_ASSERT(this->sourceUnits.offset() > start.offset()); uint32_t length = this->sourceUnits.offset() - start.offset(); MOZ_ASSERT(length >= 2); this->charBuffer.clear(); mozilla::Range chars( this->sourceUnits.codeUnitPtrAt(start.offset()), length); for (uint32_t idx = 0; idx < length - 1; idx++) { int32_t unit = CodeUnitValue(chars[idx]); // Char buffer may start with 
a 0[bBoOxX] prefix, then follows with // binary, octal, decimal, or hex digits. Already checked by caller, as // the "n" indicating bigint comes at the end. MOZ_ASSERT(isAsciiCodePoint(unit)); // Skip over any separators. if (unit == '_') { continue; } if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) { return false; } } newBigIntToken(start, modifier, out); return true; } template void GeneralTokenStreamChars::consumeOptionalHashbangComment() { MOZ_ASSERT(this->sourceUnits.atStart(), "HashBangComment can only appear immediately at the start of a " "Script or Module"); // HashbangComment :: // #! SingleLineCommentChars_opt if (!matchCodeUnit('#')) { // HashbangComment is optional at start of Script or Module. return; } if (!matchCodeUnit('!')) { // # not followed by ! at start of Script or Module is an error, but normal // parsing code will handle that error just fine if we let it. ungetCodeUnit('#'); return; } // This doesn't consume a concluding LineTerminator, and it stops consuming // just before any encoding error. The subsequent |getToken| call will call // |getTokenInternal| below which will handle these possibilities. this->sourceUnits.consumeRestOfSingleLineComment(); } template [[nodiscard]] bool TokenStreamSpecific::getTokenInternal( TokenKind* const ttp, const Modifier modifier) { // Assume we'll fail: success cases will overwrite this. #ifdef DEBUG *ttp = TokenKind::Limit; #endif MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp)); // This loop runs more than once only when whitespace or comments are // encountered. do { int32_t unit = peekCodeUnit(); if (MOZ_UNLIKELY(unit == EOF)) { MOZ_ASSERT(this->sourceUnits.atEnd()); anyCharsAccess().flags.isEOF = true; TokenStart start(this->sourceUnits, 0); newSimpleToken(TokenKind::Eof, start, modifier, ttp); return true; } if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { // Non-ASCII code points can only be identifiers or whitespace. 
It would // be nice to compute these *after* discarding whitespace, but IN A WORLD // where |unicode::IsSpace| requires consuming a variable number of code // units, it's easier to assume it's an identifier and maybe do a little // wasted work, than to unget and compute and reget if whitespace. TokenStart start(this->sourceUnits, 0); const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit(); PeekedCodePoint peeked = this->sourceUnits.peekCodePoint(); if (peeked.isNone()) { MOZ_ALWAYS_FALSE(getCodePoint()); return badToken(); } char32_t cp = peeked.codePoint(); if (unicode::IsSpace(cp)) { this->sourceUnits.consumeKnownCodePoint(peeked); if (IsLineTerminator(cp)) { if (!updateLineInfoForEOL()) { return badToken(); } anyCharsAccess().updateFlagsForEOL(); } continue; } static_assert(isAsciiCodePoint('$'), "IdentifierStart contains '$', but as " "!IsUnicodeIDStart('$'), ensure that '$' is never " "handled here"); static_assert(isAsciiCodePoint('_'), "IdentifierStart contains '_', but as " "!IsUnicodeIDStart('_'), ensure that '_' is never " "handled here"); if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) { this->sourceUnits.consumeKnownCodePoint(peeked); MOZ_ASSERT(!IsLineTerminator(cp), "IdentifierStart must guarantee !IsLineTerminator " "or else we'll fail to maintain line-info/flags " "for EOL here"); return identifierName(start, identStart, IdentifierEscapes::None, modifier, NameVisibility::Public, ttp); } reportIllegalCharacter(cp); return badToken(); } // !isAsciiCodePoint(unit) consumeKnownCodeUnit(unit); // Get the token kind, based on the first char. The ordering of c1kind // comparison is based on the frequency of tokens in real code: // Parsemark (which represents typical JS code on the web) and the // Unreal demo (which represents asm.js code). 
// // Parsemark Unreal // OneChar 32.9% 39.7% // Space 25.0% 0.6% // Ident 19.2% 36.4% // Dec 7.2% 5.1% // String 7.9% 0.0% // EOL 1.7% 0.0% // ZeroDigit 0.4% 4.9% // Other 5.7% 13.3% // // The ordering is based mostly only Parsemark frequencies, with Unreal // frequencies used to break close categories (e.g. |Dec| and // |String|). |Other| is biggish, but no other token kind is common // enough for it to be worth adding extra values to FirstCharKind. FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]); // Look for an unambiguous single-char token. // if (c1kind <= OneChar_Max) { TokenStart start(this->sourceUnits, -1); newSimpleToken(TokenKind(c1kind), start, modifier, ttp); return true; } // Skip over non-EOL whitespace chars. // if (c1kind == Space) { continue; } // Look for an identifier. // if (c1kind == Ident) { TokenStart start(this->sourceUnits, -1); return identifierName( start, this->sourceUnits.addressOfNextCodeUnit() - 1, IdentifierEscapes::None, modifier, NameVisibility::Public, ttp); } // Look for a decimal number. // if (c1kind == Dec) { TokenStart start(this->sourceUnits, -1); const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; return decimalNumber(unit, start, numStart, modifier, ttp); } // Look for a string or a template string. // if (c1kind == String) { return getStringOrTemplateToken(static_cast(unit), modifier, ttp); } // Skip over EOL chars, updating line state along the way. // if (c1kind == EOL) { if (unit == '\r') { matchLineTerminator('\n'); } if (!updateLineInfoForEOL()) { return badToken(); } anyCharsAccess().updateFlagsForEOL(); continue; } // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a // number starting with '0' that contains '8' or '9' and is treated as // decimal) number. 
// if (c1kind == ZeroDigit) { TokenStart start(this->sourceUnits, -1); int radix; bool isBigInt = false; const Unit* numStart; unit = getCodeUnit(); if (unit == 'x' || unit == 'X') { radix = 16; unit = getCodeUnit(); if (!IsAsciiHexDigit(unit)) { // NOTE: |unit| may be EOF here. ungetCodeUnit(unit); error(JSMSG_MISSING_HEXDIGITS); return badToken(); } // one past the '0x' numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) { return badToken(); } } else if (unit == 'b' || unit == 'B') { radix = 2; unit = getCodeUnit(); if (!IsAsciiBinary(unit)) { // NOTE: |unit| may be EOF here. ungetCodeUnit(unit); error(JSMSG_MISSING_BINARY_DIGITS); return badToken(); } // one past the '0b' numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) { return badToken(); } } else if (unit == 'o' || unit == 'O') { radix = 8; unit = getCodeUnit(); if (!IsAsciiOctal(unit)) { // NOTE: |unit| may be EOF here. ungetCodeUnit(unit); error(JSMSG_MISSING_OCTAL_DIGITS); return badToken(); } // one past the '0o' numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) { return badToken(); } } else if (IsAsciiDigit(unit)) { // Reject octal literals that appear in strict mode code. if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) { return badToken(); } // The above test doesn't catch a few edge cases; see // |GeneralParser::maybeParseDirective|. Record the violation so that // that function can handle them. 
anyCharsAccess().setSawDeprecatedOctalLiteral(); radix = 8; // one past the '0' numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; bool nonOctalDecimalIntegerLiteral = false; do { if (unit >= '8') { nonOctalDecimalIntegerLiteral = true; } unit = getCodeUnit(); } while (IsAsciiDigit(unit)); if (unit == '_') { ungetCodeUnit(unit); error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER); return badToken(); } if (unit == 'n') { ungetCodeUnit(unit); error(JSMSG_BIGINT_INVALID_SYNTAX); return badToken(); } if (nonOctalDecimalIntegerLiteral) { // Use the decimal scanner for the rest of the number. return decimalNumber(unit, start, numStart, modifier, ttp); } } else if (unit == '_') { // Give a more explicit error message when '_' is used after '0'. ungetCodeUnit(unit); error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER); return badToken(); } else { // '0' not followed by [XxBbOo0-9_]; scan as a decimal number. ungetCodeUnit(unit); numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; // The '0'. return decimalNumber('0', start, numStart, modifier, ttp); } if (unit == 'n') { isBigInt = true; unit = peekCodeUnit(); } else { ungetCodeUnit(unit); } // Error if an identifier-start code point appears immediately // after the number. Somewhat surprisingly, if we don't check // here, we'll never check at all. if (MOZ_LIKELY(isAsciiCodePoint(unit))) { if (unicode::IsIdentifierStart(char16_t(unit))) { error(JSMSG_IDSTART_AFTER_NUMBER); return badToken(); } } else if (MOZ_LIKELY(unit != EOF)) { // This ignores encoding errors: subsequent caller-side code to // handle source text after the number will do so. 
PeekedCodePoint peeked = this->sourceUnits.peekCodePoint(); if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) { error(JSMSG_IDSTART_AFTER_NUMBER); return badToken(); } } if (isBigInt) { return bigIntLiteral(start, modifier, ttp); } double dval; if (!GetFullInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(), radix, IntegerSeparatorHandling::SkipUnderscore, &dval)) { ReportOutOfMemory(this->fc); return badToken(); } newNumberToken(dval, NoDecimal, start, modifier, ttp); return true; } MOZ_ASSERT(c1kind == Other); // This handles everything else. Simple tokens distinguished solely by // TokenKind should set |simpleKind| and break, to share simple-token // creation code for all such tokens. All other tokens must be handled // by returning (or by continuing from the loop enclosing this). // TokenStart start(this->sourceUnits, -1); TokenKind simpleKind; #ifdef DEBUG simpleKind = TokenKind::Limit; // sentinel value for code after switch #endif // The block a ways above eliminated all non-ASCII, so cast to the // smallest type possible to assist the C++ compiler. switch (AssertedCast(CodeUnitValue(toUnit(unit)))) { case '.': if (IsAsciiDigit(peekCodeUnit())) { return decimalNumber('.', start, this->sourceUnits.addressOfNextCodeUnit() - 1, modifier, ttp); } unit = getCodeUnit(); if (unit == '.') { if (matchCodeUnit('.')) { simpleKind = TokenKind::TripleDot; break; } } // NOTE: |unit| may be EOF here. A stray '.' at EOF would be an // error, but subsequent code will handle it. 
ungetCodeUnit(unit); simpleKind = TokenKind::Dot; break; case '#': { #ifdef ENABLE_RECORD_TUPLE if (matchCodeUnit('{')) { simpleKind = TokenKind::HashCurly; break; } if (matchCodeUnit('[')) { simpleKind = TokenKind::HashBracket; break; } #endif TokenStart start(this->sourceUnits, -1); const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1; IdentifierEscapes sawEscape; if (!matchIdentifierStart(&sawEscape)) { return badToken(); } return identifierName(start, identStart, sawEscape, modifier, NameVisibility::Private, ttp); } case '=': if (matchCodeUnit('=')) { simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq; } else if (matchCodeUnit('>')) { simpleKind = TokenKind::Arrow; } else { simpleKind = TokenKind::Assign; } break; case '+': if (matchCodeUnit('+')) { simpleKind = TokenKind::Inc; } else { simpleKind = matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add; } break; case '\\': { char32_t codePoint; if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) { return identifierName( start, this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1, IdentifierEscapes::SawUnicodeEscape, modifier, NameVisibility::Public, ttp); } // We could point "into" a mistyped escape, e.g. for "\u{41H}" we // could point at the 'H'. But we don't do that now, so the code // unit after the '\' isn't necessarily bad, so just point at the // start of the actually-invalid escape. ungetCodeUnit('\\'); error(JSMSG_BAD_ESCAPE); return badToken(); } case '|': if (matchCodeUnit('|')) { simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or; } else { simpleKind = matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr; } break; case '^': simpleKind = matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor; break; case '&': if (matchCodeUnit('&')) { simpleKind = matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And; } else { simpleKind = matchCodeUnit('=') ? 
TokenKind::BitAndAssign : TokenKind::BitAnd; } break; case '?': if (matchCodeUnit('.')) { unit = getCodeUnit(); if (IsAsciiDigit(unit)) { // if the code unit is followed by a number, for example it has the // following form `<...> ?.5 <..> then it should be treated as a // ternary rather than as an optional chain simpleKind = TokenKind::Hook; ungetCodeUnit(unit); ungetCodeUnit('.'); } else { ungetCodeUnit(unit); simpleKind = TokenKind::OptionalChain; } } else if (matchCodeUnit('?')) { simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign : TokenKind::Coalesce; } else { simpleKind = TokenKind::Hook; } break; case '!': if (matchCodeUnit('=')) { simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne; } else { simpleKind = TokenKind::Not; } break; case '<': if (anyCharsAccess().options().allowHTMLComments) { // Treat HTML begin-comment as comment-till-end-of-line. if (matchCodeUnit('!')) { if (matchCodeUnit('-')) { if (matchCodeUnit('-')) { this->sourceUnits.consumeRestOfSingleLineComment(); continue; } ungetCodeUnit('-'); } ungetCodeUnit('!'); } } if (matchCodeUnit('<')) { simpleKind = matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh; } else { simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt; } break; case '>': if (matchCodeUnit('>')) { if (matchCodeUnit('>')) { simpleKind = matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh; } else { simpleKind = matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh; } } else { simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt; } break; case '*': if (matchCodeUnit('*')) { simpleKind = matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow; } else { simpleKind = matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul; } break; case '/': // Look for a single-line comment. 
if (matchCodeUnit('/')) { unit = getCodeUnit(); if (unit == '@' || unit == '#') { bool shouldWarn = unit == '@'; if (!getDirectives(false, shouldWarn)) { return false; } } else { // NOTE: |unit| may be EOF here. ungetCodeUnit(unit); } this->sourceUnits.consumeRestOfSingleLineComment(); continue; } // Look for a multi-line comment. if (matchCodeUnit('*')) { TokenStreamAnyChars& anyChars = anyCharsAccess(); unsigned linenoBefore = anyChars.lineno; do { int32_t unit = getCodeUnit(); if (unit == EOF) { error(JSMSG_UNTERMINATED_COMMENT); return badToken(); } if (unit == '*' && matchCodeUnit('/')) { break; } if (unit == '@' || unit == '#') { bool shouldWarn = unit == '@'; if (!getDirectives(true, shouldWarn)) { return badToken(); } } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) { if (!getFullAsciiCodePoint(unit)) { return badToken(); } } else { char32_t codePoint; if (!getNonAsciiCodePoint(unit, &codePoint)) { return badToken(); } } } while (true); if (linenoBefore != anyChars.lineno) { anyChars.updateFlagsForEOL(); } continue; } // Look for a regexp. if (modifier == SlashIsRegExp) { return regexpLiteral(start, ttp); } simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div; break; case '%': simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod; break; case '-': if (matchCodeUnit('-')) { if (anyCharsAccess().options().allowHTMLComments && !anyCharsAccess().flags.isDirtyLine) { if (matchCodeUnit('>')) { this->sourceUnits.consumeRestOfSingleLineComment(); continue; } } simpleKind = TokenKind::Dec; } else { simpleKind = matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub; } break; #ifdef ENABLE_DECORATORS case '@': simpleKind = TokenKind::At; break; #endif default: // We consumed a bad ASCII code point/unit. Put it back so the // error location is the bad code point. 
ungetCodeUnit(unit); reportIllegalCharacter(unit); return badToken(); } // switch (AssertedCast(CodeUnitValue(toUnit(unit)))) MOZ_ASSERT(simpleKind != TokenKind::Limit, "switch-statement should have set |simpleKind| before " "breaking"); newSimpleToken(simpleKind, start, modifier, ttp); return true; } while (true); } template bool TokenStreamSpecific::getStringOrTemplateToken( char untilChar, Modifier modifier, TokenKind* out) { MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`', "unexpected string/template literal delimiter"); bool parsingTemplate = (untilChar == '`'); bool templateHead = false; TokenStart start(this->sourceUnits, -1); this->charBuffer.clear(); // Run the bad-token code for every path out of this function except the // one success-case. auto noteBadToken = MakeScopeExit([this]() { this->badToken(); }); auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) { // Unicode separators aren't end-of-line in template or (as of // recently) string literals, so this assertion doesn't allow them. MOZ_ASSERT(this->sourceUnits.atEnd() || this->sourceUnits.peekCodeUnit() == Unit('\r') || this->sourceUnits.peekCodeUnit() == Unit('\n'), "must be parked at EOF or EOL to call this function"); // The various errors reported here include language like "in a '' // literal" or similar, with '' being '', "", or `` as appropriate. const char delimiters[] = {untilChar, untilChar, '\0'}; this->error(errnum, delimiters); return; }; // We need to detect any of these chars: " or ', \n (or its // equivalents), \\, EOF. Because we detect EOL sequences here and // put them back immediately, we can use getCodeUnit(). int32_t unit; while ((unit = getCodeUnit()) != untilChar) { if (unit == EOF) { ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL); return false; } // Non-ASCII code points are always directly appended -- even // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are // ordinarily LineTerminatorSequences. 
(They contribute their literal // values to template and [as of recently] string literals, but they're // line terminators when computing line/column coordinates.) Handle // the non-ASCII case early for readability. if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { char32_t cp; if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) { return false; } if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR || cp == unicode::PARA_SEPARATOR)) { if (!updateLineInfoForEOL()) { return false; } anyCharsAccess().updateFlagsForEOL(); } else { MOZ_ASSERT(!IsLineTerminator(cp)); } if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) { return false; } continue; } if (unit == '\\') { // When parsing templates, we don't immediately report errors for // invalid escapes; these are handled by the parser. We don't // append to charBuffer in those cases because it won't be read. unit = getCodeUnit(); if (unit == EOF) { ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); return false; } // Non-ASCII |unit| isn't handled by code after this, so dedicate // an unlikely special-case to it and then continue. if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { char32_t codePoint; if (!getNonAsciiCodePoint(unit, &codePoint)) { return false; } // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH // SEPARATOR, they'll be normalized to '\n'. '\' followed by // LineContinuation represents no code points, so don't append // in this case. if (codePoint != '\n') { if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) { return false; } } continue; } // The block above eliminated all non-ASCII, so cast to the // smallest type possible to assist the C++ compiler. 
switch (AssertedCast(CodeUnitValue(toUnit(unit)))) { case 'b': unit = '\b'; break; case 'f': unit = '\f'; break; case 'n': unit = '\n'; break; case 'r': unit = '\r'; break; case 't': unit = '\t'; break; case 'v': unit = '\v'; break; case '\r': matchLineTerminator('\n'); [[fallthrough]]; case '\n': { // LineContinuation represents no code points. We're manually // consuming a LineTerminatorSequence, so we must manually // update line/column info. if (!updateLineInfoForEOL()) { return false; } continue; } // Unicode character specification. case 'u': { int32_t c2 = getCodeUnit(); if (c2 == EOF) { ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); return false; } // First handle a delimited Unicode escape, e.g. \u{1F4A9}. if (c2 == '{') { uint32_t start = this->sourceUnits.offset() - 3; uint32_t code = 0; bool first = true; bool valid = true; do { int32_t u3 = getCodeUnit(); if (u3 == EOF) { if (parsingTemplate) { TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.setInvalidTemplateEscape(start, InvalidEscapeType::Unicode); valid = false; break; } reportInvalidEscapeError(start, InvalidEscapeType::Unicode); return false; } if (u3 == '}') { if (first) { if (parsingTemplate) { TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.setInvalidTemplateEscape( start, InvalidEscapeType::Unicode); valid = false; break; } reportInvalidEscapeError(start, InvalidEscapeType::Unicode); return false; } break; } // Beware: |u3| may be a non-ASCII code point here; if // so it'll pass into this |if|-block. if (!IsAsciiHexDigit(u3)) { if (parsingTemplate) { // We put the code unit back so that we read it // on the next pass, which matters if it was // '`' or '\'. 
ungetCodeUnit(u3); TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.setInvalidTemplateEscape(start, InvalidEscapeType::Unicode); valid = false; break; } reportInvalidEscapeError(start, InvalidEscapeType::Unicode); return false; } code = (code << 4) | AsciiAlphanumericToNumber(u3); if (code > unicode::NonBMPMax) { if (parsingTemplate) { TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.setInvalidTemplateEscape( start + 3, InvalidEscapeType::UnicodeOverflow); valid = false; break; } reportInvalidEscapeError(start + 3, InvalidEscapeType::UnicodeOverflow); return false; } first = false; } while (true); if (!valid) { continue; } MOZ_ASSERT(code <= unicode::NonBMPMax); if (!AppendCodePointToCharBuffer(this->charBuffer, code)) { return false; } continue; } // end of delimited Unicode escape handling // Otherwise it must be a fixed-length \uXXXX Unicode escape. // If it isn't, this is usually an error -- but if this is a // template literal, we must defer error reporting because // malformed escapes are okay in *tagged* template literals. char16_t v; if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) { unit = (AsciiAlphanumericToNumber(c2) << 12) | v; } else { // Beware: |c2| may not be an ASCII code point here! ungetCodeUnit(c2); uint32_t start = this->sourceUnits.offset() - 2; if (parsingTemplate) { TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.setInvalidTemplateEscape(start, InvalidEscapeType::Unicode); continue; } reportInvalidEscapeError(start, InvalidEscapeType::Unicode); return false; } break; } // case 'u' // Hexadecimal character specification. 
case 'x': { char16_t v; if (this->sourceUnits.matchHexDigits(2, &v)) { unit = v; } else { uint32_t start = this->sourceUnits.offset() - 2; if (parsingTemplate) { TokenStreamAnyChars& anyChars = anyCharsAccess(); anyChars.setInvalidTemplateEscape(start, InvalidEscapeType::Hexadecimal); continue; } reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal); return false; } break; } default: { if (!IsAsciiOctal(unit)) { // \8 or \9 in an untagged template literal is a syntax error, // reported in GeneralParser::noSubstitutionUntaggedTemplate. // // Tagged template literals, however, may contain \8 and \9. The // "cooked" representation of such a part will be |undefined|, and // the "raw" representation will contain the literal characters. // // function f(parts) { // assertEq(parts[0], undefined); // assertEq(parts.raw[0], "\\8"); // return "composed"; // } // assertEq(f`\8`, "composed"); if (unit == '8' || unit == '9') { TokenStreamAnyChars& anyChars = anyCharsAccess(); if (parsingTemplate) { anyChars.setInvalidTemplateEscape( this->sourceUnits.offset() - 2, InvalidEscapeType::EightOrNine); continue; } // \8 and \9 are forbidden in string literals in strict mode code. if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) { return false; } // The above test doesn't catch a few edge cases; see // |GeneralParser::maybeParseDirective|. Record the violation so // that that function can handle them. anyChars.setSawDeprecatedEightOrNineEscape(); } break; } // Octal character specification. int32_t val = AsciiOctalToNumber(unit); unit = peekCodeUnit(); if (MOZ_UNLIKELY(unit == EOF)) { ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); return false; } // Strict mode code allows only \0 followed by a non-digit. 
if (val != 0 || IsAsciiDigit(unit)) { TokenStreamAnyChars& anyChars = anyCharsAccess(); if (parsingTemplate) { anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2, InvalidEscapeType::Octal); continue; } if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) { return false; } // The above test doesn't catch a few edge cases; see // |GeneralParser::maybeParseDirective|. Record the violation so // that that function can handle them. anyChars.setSawDeprecatedOctalEscape(); } if (IsAsciiOctal(unit)) { val = 8 * val + AsciiOctalToNumber(unit); consumeKnownCodeUnit(unit); unit = peekCodeUnit(); if (MOZ_UNLIKELY(unit == EOF)) { ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); return false; } if (IsAsciiOctal(unit)) { int32_t save = val; val = 8 * val + AsciiOctalToNumber(unit); if (val <= 0xFF) { consumeKnownCodeUnit(unit); } else { val = save; } } } unit = char16_t(val); break; } // default } // switch (AssertedCast(CodeUnitValue(toUnit(unit)))) if (!this->charBuffer.append(unit)) { return false; } continue; } // (unit == '\\') if (unit == '\r' || unit == '\n') { if (!parsingTemplate) { // String literals don't allow ASCII line breaks. ungetCodeUnit(unit); ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING); return false; } if (unit == '\r') { unit = '\n'; matchLineTerminator('\n'); } if (!updateLineInfoForEOL()) { return false; } anyCharsAccess().updateFlagsForEOL(); } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) { templateHead = true; break; } if (!this->charBuffer.append(unit)) { return false; } } TaggedParserAtomIndex atom = drainCharBufferIntoAtom(); if (!atom) { return false; } noteBadToken.release(); MOZ_ASSERT_IF(!parsingTemplate, !templateHead); TokenKind kind = !parsingTemplate ? TokenKind::String : templateHead ? 
TokenKind::TemplateHead : TokenKind::NoSubsTemplate;
  // Tail of the string/template literal scanning routine that begins on
  // earlier lines: pass the kind selected above, together with the atom, to
  // newAtomToken() and report success to the caller.
  newAtomToken(kind, atom, start, modifier, out);
  return true;
}

// Return the description string that FOR_EACH_TOKEN_KIND pairs with |tt|.
//
// TokenKind::Limit is not an acceptable argument: that case hits
// MOZ_ASSERT_UNREACHABLE and then breaks out of the switch, reaching the
// empty string returned at the bottom of the function.
const char* TokenKindToDesc(TokenKind tt) {
  switch (tt) {
    // Expand FOR_EACH_TOKEN_KIND into one |case| per entry, each returning
    // the |desc| argument of that entry.
#define EMIT_CASE(name, desc) \
  case TokenKind::name:       \
    return desc;
    FOR_EACH_TOKEN_KIND(EMIT_CASE)
#undef EMIT_CASE
    case TokenKind::Limit:
      MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
      break;
  }

  // Reached only when no |case| above returned.
  return "";
}

#ifdef DEBUG
// Return the spelling of the enumerator for |tt|, prefixed with the text
// TokenKind:: (the macro stringizes |name| with the # operator and places it
// next to the prefix literal). Compiled only when DEBUG is defined.
//
// TokenKind::Limit breaks out of the switch and yields the empty string
// returned at the bottom of the function.
const char* TokenKindToString(TokenKind tt) {
  switch (tt) {
    // One |case| per FOR_EACH_TOKEN_KIND entry; |desc| is unused here.
#  define EMIT_CASE(name, desc) \
    case TokenKind::name:       \
      return "TokenKind::" #name;
    FOR_EACH_TOKEN_KIND(EMIT_CASE)
#  undef EMIT_CASE
    case TokenKind::Limit:
      break;
  }

  // Reached only when no |case| above returned.
  return "";
}
#endif

// Explicit instantiations of the token stream class templates.
//
// NOTE(review): as written here several of these statements are exact
// duplicates of one another, some name a class template without any argument
// list, and others end in an unbalanced >>. It looks like the template
// argument lists were lost from this copy of the file -- confirm against the
// upstream source before relying on these lines.
template class TokenStreamCharsBase;
template class TokenStreamCharsBase;

template class GeneralTokenStreamChars;
template class TokenStreamChars;
template class TokenStreamSpecific;

template class GeneralTokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess>>;
template class GeneralTokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess>>;
template class GeneralTokenStreamChars<
    char16_t, ParserAnyCharsAccess>>;
template class GeneralTokenStreamChars<
    char16_t, ParserAnyCharsAccess>>;

template class TokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess>>;
template class TokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess>>;
template class TokenStreamChars<
    char16_t, ParserAnyCharsAccess>>;
template class TokenStreamChars<
    char16_t, ParserAnyCharsAccess>>;

template class TokenStreamSpecific<
    Utf8Unit, ParserAnyCharsAccess>>;
template class TokenStreamSpecific<
    Utf8Unit, ParserAnyCharsAccess>>;
template class TokenStreamSpecific<
    char16_t, ParserAnyCharsAccess>>;
template class TokenStreamSpecific<
    char16_t, ParserAnyCharsAccess>>;

// NOTE(review): both namespace closers share one physical line in this copy
// of the file, so the second brace sits inside the trailing line comment. The
// line is kept verbatim because this edit only adds comments; restore the
// line break between the two closers when the file is re-flowed.
} // namespace frontend } // namespace js