Diffstat (limited to '')
-rw-r--r-- | js/src/frontend/Token.h | 226 |
1 file changed, 226 insertions, 0 deletions
diff --git a/js/src/frontend/Token.h b/js/src/frontend/Token.h
new file mode 100644
index 0000000000..5d8db585d8
--- /dev/null
+++ b/js/src/frontend/Token.h
@@ -0,0 +1,226 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * Token-affiliated data structures except for TokenKind (defined in its own
+ * header).
+ */
+
+#ifndef frontend_Token_h
+#define frontend_Token_h
+
+#include "mozilla/Assertions.h"  // MOZ_ASSERT
+
+#include <stdint.h>  // uint32_t
+
+#include "frontend/ParserAtom.h"  // js::frontend::{ParserAtom,ParserName}
+#include "frontend/TokenKind.h"   // js::frontend::TokenKind
+#include "js/RegExpFlags.h"       // JS::RegExpFlags
+
+namespace js {
+
+namespace frontend {
+
+struct TokenPos {
+  uint32_t begin = 0;  // Offset of the token's first code unit.
+  uint32_t end = 0;    // Offset of 1 past the token's last code unit.
+
+  TokenPos() = default;
+  TokenPos(uint32_t begin, uint32_t end) : begin(begin), end(end) {}
+
+  // Return a TokenPos that covers left, right, and anything in between.
+  static TokenPos box(const TokenPos& left, const TokenPos& right) {
+    MOZ_ASSERT(left.begin <= left.end);
+    MOZ_ASSERT(left.end <= right.begin);
+    MOZ_ASSERT(right.begin <= right.end);
+    return TokenPos(left.begin, right.end);
+  }
+
+  bool operator==(const TokenPos& bpos) const {
+    return begin == bpos.begin && end == bpos.end;
+  }
+
+  bool operator!=(const TokenPos& bpos) const {
+    return begin != bpos.begin || end != bpos.end;
+  }
+
+  bool operator<(const TokenPos& bpos) const { return begin < bpos.begin; }
+
+  bool operator<=(const TokenPos& bpos) const { return begin <= bpos.begin; }
+
+  bool operator>(const TokenPos& bpos) const { return !(*this <= bpos); }
+
+  bool operator>=(const TokenPos& bpos) const { return !(*this < bpos); }
+
+  bool encloses(const TokenPos& pos) const {
+    return begin <= pos.begin && pos.end <= end;
+  }
+};
+
+enum DecimalPoint { NoDecimal = false, HasDecimal = true };
+
+// The only escapes found in IdentifierName are of the Unicode flavor.
+enum class IdentifierEscapes { None, SawUnicodeEscape };
+
+enum class NameVisibility { Public, Private };
+
+class TokenStreamShared;
+
+struct Token {
+ private:
+  // The lexical grammar of JavaScript has a quirk around the '/' character.
+  // As the spec puts it:
+  //
+  // > There are several situations where the identification of lexical input
+  // > elements is sensitive to the syntactic grammar context that is consuming
+  // > the input elements. This requires multiple goal symbols for the lexical
+  // > grammar. [...] The InputElementRegExp goal symbol is used in all
+  // > syntactic grammar contexts where a RegularExpressionLiteral is permitted
+  // > [...] In all other contexts, InputElementDiv is used as the lexical
+  // > goal symbol.
+  //
+  // https://tc39.github.io/ecma262/#sec-lexical-and-regexp-grammars
+  //
+  // What "sensitive to the syntactic grammar context" means is, the parser has
+  // to tell the TokenStream whether to interpret '/' as division or
+  // RegExp. Because only one or the other (or neither) will be legal at that
+  // point in the program, and only the parser knows which one.
+  //
+  // But there's a problem: the parser often gets a token, puts it back, then
+  // consumes it later; or (equivalently) peeks at a token, leaves it, peeks
+  // again later, then finally consumes it. Of course we don't actually re-scan
+  // the token every time; we cache it in the TokenStream. This leads to the
+  // following rule:
+  //
+  // The parser must not pass SlashIsRegExp when getting/peeking at a token
+  // previously scanned with SlashIsDiv; or vice versa.
+  //
+  // That way, code that asks for a SlashIsRegExp mode will never get a cached
+  // Div token. But this rule is easy to screw up, because tokens are so often
+  // peeked at on Parser.cpp line A and consumed on line B, where |A-B| is
+  // thousands of lines. We therefore enforce it with the frontend's most
+  // annoying assertion (in verifyConsistentModifier), and provide
+  // Modifier::SlashIsInvalid to help avoid tripping it.
+  //
+  // This enum belongs in TokenStream, but C++, so we define it here and
+  // typedef it there.
+  enum Modifier {
+    // Parse `/` and `/=` as the division operators. (That is, use
+    // InputElementDiv as the goal symbol.)
+    SlashIsDiv,
+
+    // Parse `/` as the beginning of a RegExp literal. (That is, use
+    // InputElementRegExp.)
+    SlashIsRegExp,
+
+    // Neither a Div token nor a RegExp token is syntactically valid here. When
+    // the parser calls `getToken(SlashIsInvalid)`, it must be prepared to see
+    // either one (and throw a SyntaxError either way).
+    //
+    // It's OK to use SlashIsInvalid to get a token that was originally scanned
+    // with SlashIsDiv or SlashIsRegExp. The reverse--peeking with
+    // SlashIsInvalid, then getting with another mode--is not OK. If either Div
+    // or RegExp is syntactically valid here, use the appropriate modifier.
+    SlashIsInvalid,
+  };
+  friend class TokenStreamShared;
+
+ public:
+  // WARNING: TokenStreamPosition assumes that the only GC things a Token
+  //          includes are atoms. DON'T ADD NON-ATOM GC THING POINTERS HERE
+  //          UNLESS YOU ADD ADDITIONAL ROOTING TO THAT CLASS.
+
+  /** The type of this token. */
+  TokenKind type;
+
+  /** The token's position in the overall script. */
+  TokenPos pos;
+
+  union {
+   private:
+    friend struct Token;
+
+    /** Non-numeric atom. */
+    const ParserName* name;
+
+    /** Potentially-numeric atom. */
+    const ParserAtom* atom;
+
+    struct {
+      /** Numeric literal's value. */
+      double value;
+
+      /** Does the numeric literal contain a '.'? */
+      DecimalPoint decimalPoint;
+    } number;
+
+    /** Regular expression flags; use charBuffer to access source chars. */
+    JS::RegExpFlags reflags;
+  } u;
+
+#ifdef DEBUG
+  /** The modifier used to get this token. */
+  Modifier modifier;
+#endif
+
+  // Mutators
+
+  void setName(const ParserName* name) {
+    MOZ_ASSERT(type == TokenKind::Name || type == TokenKind::PrivateName);
+    u.name = name;
+  }
+
+  void setAtom(const ParserAtom* atom) {
+    MOZ_ASSERT(type == TokenKind::String || type == TokenKind::TemplateHead ||
+               type == TokenKind::NoSubsTemplate);
+    u.atom = atom;
+  }
+
+  void setRegExpFlags(JS::RegExpFlags flags) {
+    MOZ_ASSERT(type == TokenKind::RegExp);
+    u.reflags = flags;
+  }
+
+  void setNumber(double n, DecimalPoint decimalPoint) {
+    MOZ_ASSERT(type == TokenKind::Number);
+    u.number.value = n;
+    u.number.decimalPoint = decimalPoint;
+  }
+
+  // Type-safe accessors
+
+  const ParserName* name() const {
+    MOZ_ASSERT(type == TokenKind::Name || type == TokenKind::PrivateName);
+    return u.name->asName();  // poor-man's type verification
+  }
+
+  const ParserAtom* atom() const {
+    MOZ_ASSERT(type == TokenKind::String || type == TokenKind::TemplateHead ||
+               type == TokenKind::NoSubsTemplate);
+    return u.atom;
+  }
+
+  JS::RegExpFlags regExpFlags() const {
+    MOZ_ASSERT(type == TokenKind::RegExp);
+    return u.reflags;
+  }
+
+  double number() const {
+    MOZ_ASSERT(type == TokenKind::Number);
+    return u.number.value;
+  }
+
+  DecimalPoint decimalPoint() const {
+    MOZ_ASSERT(type == TokenKind::Number);
+    return u.number.decimalPoint;
+  }
+};
+
+}  // namespace frontend
+
+}  // namespace js
+
+#endif  // frontend_Token_h
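
Usage sketches (not part of the patch).

TokenPos gives each token a half-open [begin, end) range of source offsets, and box() computes the smallest range covering two positions plus everything between them. A minimal sketch of how a caller might combine and compare positions, assuming an in-tree (mozilla-central) caller that can include this header; the offsets are invented for illustration:

  #include "frontend/Token.h"  // js::frontend::TokenPos

  using js::frontend::TokenPos;

  void TokenPosSketch() {
    // Hypothetical offsets: a keyword at [0, 8) and a closing brace at [40, 41).
    TokenPos keyword(0, 8);
    TokenPos closeBrace(40, 41);

    // box() requires |left| to end at or before |right| begins; yields [0, 41).
    TokenPos whole = TokenPos::box(keyword, closeBrace);
    MOZ_ASSERT(whole.begin == 0 && whole.end == 41);

    // encloses() tests containment; the ordering operators compare only |begin|.
    MOZ_ASSERT(whole.encloses(keyword));
    MOZ_ASSERT(keyword < closeBrace);
  }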
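The Modifier comment boils down to a caching rule: the stream keeps a lookahead token together with the modifier that scanned it, and a later peek/get must not switch between SlashIsDiv and SlashIsRegExp for that cached token. Below is a deliberately simplified, self-contained model of that rule; ToyStream, its int tokens, and the single-slot cache are invented for illustration and are not SpiderMonkey's actual TokenStream:

  #include <cassert>
  #include <optional>

  enum class Modifier { SlashIsDiv, SlashIsRegExp, SlashIsInvalid };

  class ToyStream {
    // Single-token lookahead cache, remembering which modifier scanned it.
    struct Cached {
      int token;
      Modifier mod;
    };
    std::optional<Cached> lookahead_;
    int next_ = 0;

    int scan(Modifier) { return next_++; }  // stand-in for real scanning

    static void verifyConsistentModifier(Modifier mod, const Cached& cached) {
      // Consuming a cached token with SlashIsInvalid is fine; otherwise the
      // modifier must match the one used when the token was scanned.
      assert(mod == Modifier::SlashIsInvalid || mod == cached.mod);
    }

   public:
    int peekToken(Modifier mod) {
      if (!lookahead_) {
        lookahead_ = Cached{scan(mod), mod};
      } else {
        verifyConsistentModifier(mod, *lookahead_);
      }
      return lookahead_->token;
    }

    int getToken(Modifier mod) {
      int token = peekToken(mod);
      lookahead_.reset();
      return token;
    }
  };

In this model, peekToken(Modifier::SlashIsDiv) followed by getToken(Modifier::SlashIsRegExp) trips the assertion, which is exactly the mismatch the comment warns about; getToken(Modifier::SlashIsInvalid) on the same cached token would not.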
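Token itself is a small tagged union: `type` is the tag, and the DEBUG-only MOZ_ASSERTs in the mutators and accessors check that only the union member matching the tag is touched. A sketch of the numeric-literal case, again assuming an in-tree caller; the literal value and offsets are invented:

  #include "frontend/Token.h"      // js::frontend::{Token, TokenPos, DecimalPoint}
  #include "frontend/TokenKind.h"  // js::frontend::TokenKind

  using namespace js::frontend;

  void NumberTokenSketch() {
    Token tok{};
    tok.type = TokenKind::Number;  // set the tag before touching the union
    tok.pos = TokenPos(10, 14);    // e.g. the four code units of "3.25"

    // setNumber()/number()/decimalPoint() all assert type == TokenKind::Number.
    tok.setNumber(3.25, HasDecimal);

    double value = tok.number();           // 3.25
    DecimalPoint dp = tok.decimalPoint();  // HasDecimal
    (void)value;
    (void)dp;
  }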