diff options
Diffstat (limited to '')
-rw-r--r-- | devtools/shared/css/lexer.js | 1522 |
1 files changed, 1522 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// A CSS lexer. This file is unusual: it is a more or less direct
// translation of layout/style/nsCSSScanner.cpp and
// layout/style/CSSLexer.cpp into JS. It implements the CSSLexer.webidl
// interface, and the intent is to keep it in sync with changes to the
// platform CSS lexer. Because of that goal the file keeps the platform
// naming conventions and locally disables the eslint rules they violate.

/* eslint-disable camelcase, mozilla/no-aArgs, no-else-return, complexity */

"use strict";

// ---------------------------------------------------------------------------
// Token types. Each constant is the string exposed as |tokenType|.
// ---------------------------------------------------------------------------

// White space of any kind; no value fields are used. Comments do *not*
// count as white space: they separate tokens but are reported separately.
const eCSSToken_Whitespace = "whitespace"; //
// A comment.
const eCSSToken_Comment = "comment"; // /*...*/

// Identifier-like tokens; mIdent is the text of the identifier.
// ID versus Hash: when the text after the '#' would have been a valid
// Ident on its own, the scanner produces an ID token, otherwise a Hash
// token. (css3-selectors requires this distinction.)
const eCSSToken_Ident = "ident"; // word
const eCSSToken_Function = "function"; // word(
const eCSSToken_AtKeyword = "at"; // @word
const eCSSToken_ID = "id"; // #word
const eCSSToken_Hash = "hash"; // #0word

// Numeric tokens. mNumber is the floating-point value and mHasSign records
// whether an explicit sign (+ or -) preceded the number. When
// mIntegerValid is true the number had the lexical form of an integer and
// mInteger is its integer value. Lexically integer values outside the
// range of a 32-bit signed number are clamped to the maximum values;
// mNumber will indicate a 'truer' value in that case. Percentage tokens
// are never considered integers, even when their numeric value is integral
// (100% => mNumber = 1.0). For Dimension tokens, mIdent holds the unit.
const eCSSToken_Number = "number"; // 1 -5 +2e3 3.14159 7.297352e-3
const eCSSToken_Dimension = "dimension"; // 24px 8.5in
const eCSSToken_Percentage = "percentage"; // 85% 1280.4%

// String-like tokens. mIdent always holds the text belonging to the
// string, and mSymbol the delimiter character: ', ", or zero (unquoted
// URLs only). Bad_String and Bad_URL are emitted when the closing
// delimiter or parenthesis was missing.
const eCSSToken_String = "string"; // 'foo bar' "foo bar"
const eCSSToken_Bad_String = "bad_string"; // 'foo bar
const eCSSToken_URL = "url"; // url(foobar) url("foo bar")
const eCSSToken_Bad_URL = "bad_url"; // url(foo

// Any one-character symbol; mSymbol holds the character.
const eCSSToken_Symbol = "symbol"; // . ; { } ! *

// Match operators. These are single tokens rather than pairs of Symbol
// tokens because css3-selectors forbids comments between the two
// characters. No value fields are used; the type identifies the operator.
const eCSSToken_Includes = "includes"; // ~=
const eCSSToken_Dashmatch = "dashmatch"; // |=
const eCSSToken_Beginsmatch = "beginsmatch"; // ^=
const eCSSToken_Endsmatch = "endsmatch"; // $=
const eCSSToken_Containsmatch = "containsmatch"; // *=

// Unicode-range token, currently used only in @font-face. The lexical
// rule admits several semantically invalid forms, so mIdent always holds
// the complete original text (for accurate diagnostics) and mIntegerValid
// is true iff the token is semantically valid. In that case mInteger is
// the lowest value in the range and mInteger2 the highest.
const eCSSToken_URange = "urange"; // U+007e U+01?? U+2000-206F

// HTML comment delimiters, ignored as a unit at the top level of a style
// sheet for compatibility with sites written for pre-CSS browsers. This
// type subsumes the css2.1 CDO and CDC tokens, which the parser always
// treats alike. mIdent holds the token text, for diagnostics.
const eCSSToken_HTMLComment = "htmlcomment"; // <!-- -->

// ---------------------------------------------------------------------------
// EOF-fixup flags: one bit per kind of terminator that is implied when the
// input ends inside an unterminated construct.
// ---------------------------------------------------------------------------

const eEOFCharacters_None = 0;

// \<EOF> inside a string
const eEOFCharacters_DropBackslash = 1 << 0;

// \<EOF> outside a string
const eEOFCharacters_ReplacementChar = 1 << 1;

// closing an unterminated comment
const eEOFCharacters_Asterisk = 1 << 2;
const eEOFCharacters_Slash = 1 << 3;

// closing an unterminated double-quoted string
const eEOFCharacters_DoubleQuote = 1 << 4;

// closing an unterminated single-quoted string
const eEOFCharacters_SingleQuote = 1 << 5;

// closing an unterminated URL
const eEOFCharacters_CloseParen = 1 << 6;

// ---------------------------------------------------------------------------
// UTF-16 code units of the characters the scanner compares against.
// ---------------------------------------------------------------------------

const APOSTROPHE = 0x27; // '
const ASTERISK = 0x2a; // *
const CARRIAGE_RETURN = 0x0d; // \r
const CIRCUMFLEX_ACCENT = 0x5e; // ^
const COMMERCIAL_AT = 0x40; // @
const DIGIT_NINE = 0x39; // 9
const DIGIT_ZERO = 0x30; // 0
const DOLLAR_SIGN = 0x24; // $
const EQUALS_SIGN = 0x3d; // =
const EXCLAMATION_MARK = 0x21; // !
const FULL_STOP = 0x2e; // .
const GREATER_THAN_SIGN = 0x3e; // >
const HYPHEN_MINUS = 0x2d; // -
const LATIN_CAPITAL_LETTER_E = 0x45; // E
const LATIN_CAPITAL_LETTER_U = 0x55; // U
const LATIN_SMALL_LETTER_E = 0x65; // e
const LATIN_SMALL_LETTER_U = 0x75; // u
const LEFT_PARENTHESIS = 0x28; // (
const LESS_THAN_SIGN = 0x3c; // <
const LINE_FEED = 0x0a; // \n
const NUMBER_SIGN = 0x23; // #
const PERCENT_SIGN = 0x25; // %
const PLUS_SIGN = 0x2b; // +
const QUESTION_MARK = 0x3f; // ?
const QUOTATION_MARK = 0x22; // "
const REVERSE_SOLIDUS = 0x5c; // \
const RIGHT_PARENTHESIS = 0x29; // )
const SOLIDUS = 0x2f; // /
const TILDE = 0x7e; // ~
const VERTICAL_LINE = 0x7c; // |

const UCS2_REPLACEMENT_CHAR = 0xfffd;

// Characters implied by the eEOFCharacters_* flags, in flag-bit order
// starting at eEOFCharacters_ReplacementChar (DropBackslash appends
// nothing). The list ends with a 0 entry.
const kImpliedEOFCharacters = [
  UCS2_REPLACEMENT_CHAR,
  ASTERISK,
  SOLIDUS,
  QUOTATION_MARK,
  APOSTROPHE,
  RIGHT_PARENTHESIS,
  0,
];

// Largest number of arguments safeApply hands to a single apply() call.
const ARGS_LENGTH_MAX = 500 * 1000;
/**
 * Apply |method| to an argument array that may be longer than the engine
 * accepts in a single call (Firefox caps arguments at 500000, see
 * Bug 1414361).
 *
 * The array is cut into consecutive chunks of at most ARGS_LENGTH_MAX items
 * and |method| is applied once per chunk, in order.
 *
 * !! The return value is NOT the result of one big call: it is an array
 * holding the return value of each per-chunk call. Combining those into a
 * meaningful final result is up to the consumer !!
 *
 * @param {Function} method
 *        The method to apply.
 * @param {*} scope
 *        The scope ("this") to use when applying the method.
 * @param {Array} args
 *        The array of arguments to apply.
 *
 * @returns {Array}
 *        One return value per chunk; empty when |args| is empty.
 */
function safeApply(method, scope, args) {
  const results = [];
  const total = args.length;
  for (let start = 0; start < total; start += ARGS_LENGTH_MAX) {
    const chunk = args.slice(start, start + ARGS_LENGTH_MAX);
    results.push(method.apply(scope, chunk));
  }
  return results;
}

/**
 * Map an invalid character value to the replacement character.
 *
 * @param {Number} c the character to check
 * @return {Number} |c| itself, or U+FFFD when |c| is 0x110000 or above or
 *         lies in the surrogate range
 */
function ensureValidChar(c) {
  const outOfRange = c >= 0x00110000;
  const isSurrogate = (c & 0xfff800) === 0xd800;
  return outOfRange || isSurrogate ? UCS2_REPLACEMENT_CHAR : c;
}

/**
 * Turn a string into an array of character codes.
 *
 * @param {String} str the input string
 * @return {Array} the UTF-16 code units of |str|, one array entry per unit
 */
function stringToCodes(str) {
  // Hot path: a plain loop is faster than any other approach
  // (e.g. Array#map).
  const codes = [];
  const len = str.length;
  let idx = 0;
  while (idx < len) {
    codes.push(str.charCodeAt(idx));
    idx++;
  }
  return codes;
}

// Character-class bits stored in gLexTable.
const IS_HEX_DIGIT = 1 << 0;
const IS_IDSTART = 1 << 1;
const IS_IDCHAR = 1 << 2;
const IS_URL_CHAR = 1 << 3;
const IS_HSPACE = 1 << 4;
const IS_VSPACE = 1 << 5;
const IS_SPACE = IS_HSPACE | IS_VSPACE;
const IS_STRING = 1 << 6;

// One- and multi-letter shorthands that keep the table below readable.
const H = IS_HSPACE;
const V = IS_VSPACE;
const I = IS_IDCHAR;
const J = IS_IDSTART;
const U = IS_URL_CHAR;
const S = IS_STRING;
const X = IS_HEX_DIGIT;

const SH = S | H;
const SU = S | U;
const SUI = S | U | I;
const SUIJ = S | U | I | J;
const SUIX = S | U | I | X;
const SUIJX = S | U | I | J | X;

// Character classes of the 128 ASCII code points, indexed by code point.
/* eslint-disable indent, indent-legacy, no-multi-spaces, comma-spacing, spaced-comment */
// prettier-ignore
const gLexTable = [
  // 00    01     02     03     04     05     06     07
  0,       S,     S,     S,     S,     S,     S,     S,
  // 08    TAB    LF     0B     FF     CR     0E     0F
  S,       SH,    V,     S,     V,     V,     S,     S,
  // 10    11     12     13     14     15     16     17
  S,       S,     S,     S,     S,     S,     S,     S,
  // 18    19     1A     1B     1C     1D     1E     1F
  S,       S,     S,     S,     S,     S,     S,     S,
  // SPC   !      "      #      $      %      &      '
  SH,      SU,    0,     SU,    SU,    SU,    SU,    0,
  // (     )      *      +      ,      -      .      /
  S,       S,     SU,    SU,    SU,    SUI,   SU,    SU,
  // 0     1      2      3      4      5      6      7
  SUIX,    SUIX,  SUIX,  SUIX,  SUIX,  SUIX,  SUIX,  SUIX,
  // 8     9      :      ;      <      =      >      ?
  SUIX,    SUIX,  SU,    SU,    SU,    SU,    SU,    SU,
  // @     A      B      C      D      E      F      G
  SU,      SUIJX, SUIJX, SUIJX, SUIJX, SUIJX, SUIJX, SUIJ,
  // H     I      J      K      L      M      N      O
  SUIJ,    SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,
  // P     Q      R      S      T      U      V      W
  SUIJ,    SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,
  // X     Y      Z      [      \      ]      ^      _
  SUIJ,    SUIJ,  SUIJ,  SU,    J,     SU,    SU,    SUIJ,
  // `     a      b      c      d      e      f      g
  SU,      SUIJX, SUIJX, SUIJX, SUIJX, SUIJX, SUIJX, SUIJ,
  // h     i      j      k      l      m      n      o
  SUIJ,    SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,
  // p     q      r      s      t      u      v      w
  SUIJ,    SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,  SUIJ,
  // x     y      z      {      |      }      ~      7F
  SUIJ,    SUIJ,  SUIJ,  SU,    SU,    SU,    SU,    S,
];
/* eslint-enable indent, indent-legacy, no-multi-spaces, comma-spacing, spaced-comment */

/**
 * True if 'ch' is in character class 'cls', which should be one of the
 * IS_* constants or a combination of them. Every character above U+007F
 * is treated as a member of 'cls'. EOF (a negative value) never is.
 */
function IsOpenCharClass(ch, cls) {
  if (!(ch >= 0)) {
    return false;
  }
  if (ch >= 128) {
    return true;
  }
  return (gLexTable[ch] & cls) !== 0;
}

/**
 * True if 'ch' is in character class 'cls', which should be one of the
 * IS_* constants or a combination of them. No character above U+007F is
 * treated as a member of 'cls', and neither is EOF (a negative value).
 */
function IsClosedCharClass(ch, cls) {
  if (!(ch >= 0 && ch < 128)) {
    return false;
  }
  return (gLexTable[ch] & cls) !== 0;
}

/**
 * True if 'ch' is CSS whitespace, i.e. any of the ASCII characters
 * TAB, LF, FF, CR, or SPC.
 */
function IsWhitespace(ch) {
  return IsClosedCharClass(ch, IS_SPACE);
}

/**
 * True if 'ch' is horizontal whitespace, i.e. TAB or SPC.
 */
function IsHorzSpace(ch) {
  return IsClosedCharClass(ch, IS_HSPACE);
}
/**
 * True if 'ch' is vertical whitespace, i.e. LF, FF, or CR. Vertical
 * whitespace requires special handling when consumed, see AdvanceLine.
 */
function IsVertSpace(ch) {
  return IsClosedCharClass(ch, IS_VSPACE);
}

/**
 * True if 'ch' can appear in the middle of an identifier. U+0000 is
 * accepted here because it is handled as U+FFFD; it is tested separately
 * so that it stays out of the class GatherText matches through
 * IsOpenCharClass.
 */
function IsIdentChar(ch) {
  return ch === 0 || IsOpenCharClass(ch, IS_IDCHAR);
}

/**
 * True if 'ch' can begin an identifier by itself (a subset of
 * IsIdentChar). U+0000 is accepted for the same reason as in IsIdentChar.
 */
function IsIdentStart(ch) {
  return ch === 0 || IsOpenCharClass(ch, IS_IDSTART);
}

/**
 * True if the two-character sequence aFirstChar+aSecondChar begins an
 * identifier: either aFirstChar starts one by itself, or it is '-'
 * followed by another '-' or by an identifier-start character.
 */
function StartsIdent(aFirstChar, aSecondChar) {
  if (IsIdentStart(aFirstChar)) {
    return true;
  }
  if (aFirstChar !== HYPHEN_MINUS) {
    return false;
  }
  return aSecondChar === HYPHEN_MINUS || IsIdentStart(aSecondChar);
}

/**
 * True if 'ch' is a decimal digit.
 */
function IsDigit(ch) {
  return DIGIT_ZERO <= ch && ch <= DIGIT_NINE;
}

/**
 * True if 'ch' is a hexadecimal digit.
 */
function IsHexDigit(ch) {
  return IsClosedCharClass(ch, IS_HEX_DIGIT);
}

/**
 * Assuming that 'ch' is a decimal digit, return its numeric value.
 */
function DecimalDigitValue(ch) {
  return ch - DIGIT_ZERO;
}

/**
 * Assuming that 'ch' is a hexadecimal digit, return its numeric value.
 */
function HexDigitValue(ch) {
  // For letters, the low three bits are 1..6 for both 'A'..'F' and
  // 'a'..'f', so adding 9 yields 10..15 regardless of case.
  return IsDigit(ch) ? DecimalDigitValue(ch) : (ch & 0x7) + 9;
}

/**
 * If 'ch' can be the first character of a two-character match operator
 * token, return the token type of that operator; otherwise return
 * eCSSToken_Symbol to indicate that it can't.
 */
function MatchOperatorType(ch) {
  if (ch === TILDE) {
    return eCSSToken_Includes;
  }
  if (ch === VERTICAL_LINE) {
    return eCSSToken_Dashmatch;
  }
  if (ch === CIRCUMFLEX_ACCENT) {
    return eCSSToken_Beginsmatch;
  }
  if (ch === DOLLAR_SIGN) {
    return eCSSToken_Endsmatch;
  }
  if (ch === ASTERISK) {
    return eCSSToken_Containsmatch;
  }
  return eCSSToken_Symbol;
}

/**
 * Scanner state for one input string.
 *
 * @param {String} buffer the text to scan; a falsy value is treated as ""
 */
function Scanner(buffer) {
  const text = buffer || "";

  // Input and read position.
  this.mBuffer = text;
  this.mCount = text.length;
  this.mOffset = 0;

  // Position of the current line; line numbers start at 1.
  this.mLineNumber = 1;
  this.mLineOffset = 0;

  // Position of the most recently returned token.
  this.mTokenOffset = 0;
  this.mTokenLineNumber = 1;
  this.mTokenLineOffset = 0;

  // Terminators implied if the input ends inside an open construct.
  this.mEOFCharacters = eEOFCharacters_None;
}
+ * That is, the input |\| is transformed to |\\|, and the + * input |'\| is transformed to |'\\'|. + * + * Otherwise, preserveBackslash is false: + * If the backslash appears in a string context, then the trailing + * backslash is dropped from inputString. That is, |"\| is + * transformed to |""|. + * If the backslash appears outside of a string context, then + * U+FFFD is appended. That is, |\| is transformed to a string + * with two characters: backslash followed by U+FFFD. + * + * Passing false for preserveBackslash makes the result conform to + * the CSS Syntax specification. However, passing true may give + * somewhat more intuitive behavior. + * + * @param inputString the input string + * @param preserveBackslash how to handle trailing backslashes + * @return the input string with the termination characters appended + */ + performEOFFixup(aInputString, aPreserveBackslash) { + let result = aInputString; + + let eofChars = this.mEOFCharacters; + + if ( + aPreserveBackslash && + (eofChars & + (eEOFCharacters_DropBackslash | eEOFCharacters_ReplacementChar)) != + 0 + ) { + eofChars &= ~( + eEOFCharacters_DropBackslash | eEOFCharacters_ReplacementChar + ); + result += "\\"; + } + + if ( + (eofChars & eEOFCharacters_DropBackslash) != 0 && + !!result.length && + result.endsWith("\\") + ) { + result = result.slice(0, -1); + } + + const extra = []; + this.AppendImpliedEOFCharacters(eofChars, extra); + const asString = String.fromCharCode.apply(null, extra); + + return result + asString; + }, + + /** + * Return the next token, or null at EOF. + * + * The token object is described by the following WebIDL definition: + * + * dictionary CSSToken { + * // The token type. + * CSSTokenType tokenType = "whitespace"; + * + * // Offset of the first character of the token. + * unsigned long startOffset = 0; + * // Offset of the character after the final character of the token. 
+ * // This is chosen so that the offsets can be passed to |substring| + * // to yield the exact contents of the token. + * unsigned long endOffset = 0; + * + * // If the token is a number, percentage, or dimension, this holds + * // the value. This is not present for other token types. + * double number; + * // If the token is a number, percentage, or dimension, this is true + * // iff the number had an explicit sign. This is not present for + * // other token types. + * boolean hasSign; + * // If the token is a number, percentage, or dimension, this is true + * // iff the number was specified as an integer. This is not present + * // for other token types. + * boolean isInteger; + * + * // Text associated with the token. This is not present for all + * // token types. In particular it is: + * // + * // Token type Meaning + * // =============================== + * // ident The identifier. + * // function The function name. Note that the "(" is part + * // of the token but is not present in |text|. + * // at The word. + * // id The word. + * // hash The word. + * // dimension The dimension. + * // string The string contents after escape processing. + * // bad_string Ditto. + * // url The URL after escape processing. + * // bad_url Ditto. + * // symbol The symbol text. + * DOMString text; + * }; + */ + nextToken() { + const token = {}; + if (!this.Next(token)) { + return null; + } + + const resultToken = {}; + resultToken.tokenType = token.mType; + resultToken.startOffset = this.mTokenOffset; + resultToken.endOffset = this.mOffset; + const constructText = () => { + return safeApply(String.fromCharCode, null, token.mIdent).join(""); + }; + + switch (token.mType) { + case eCSSToken_Whitespace: + break; + + case eCSSToken_Ident: + case eCSSToken_Function: + case eCSSToken_AtKeyword: + case eCSSToken_ID: + case eCSSToken_Hash: + resultToken.text = constructText(); + break; + + case eCSSToken_Dimension: + resultToken.text = constructText(); + /* Fall through. 
*/ + case eCSSToken_Number: + case eCSSToken_Percentage: + resultToken.number = token.mNumber; + resultToken.hasSign = token.mHasSign; + resultToken.isInteger = token.mIntegerValid; + break; + + case eCSSToken_String: + case eCSSToken_Bad_String: + case eCSSToken_URL: + case eCSSToken_Bad_URL: + resultToken.text = constructText(); + /* Don't bother emitting the delimiter, as it is readily extracted + from the source string when needed. */ + break; + + case eCSSToken_Symbol: + resultToken.text = String.fromCharCode(token.mSymbol); + break; + + case eCSSToken_Includes: + case eCSSToken_Dashmatch: + case eCSSToken_Beginsmatch: + case eCSSToken_Endsmatch: + case eCSSToken_Containsmatch: + case eCSSToken_URange: + break; + + case eCSSToken_Comment: + case eCSSToken_HTMLComment: + /* The comment text is easily extracted from the source string, + and is rarely useful. */ + break; + } + + return resultToken; + }, + + /** + * Return the raw UTF-16 code unit at position |this.mOffset + n| within + * the read buffer. If that is beyond the end of the buffer, returns + * -1 to indicate end of input. + */ + Peek(n = 0) { + if (this.mOffset + n >= this.mCount) { + return -1; + } + return this.mBuffer.charCodeAt(this.mOffset + n); + }, + + /** + * Advance |this.mOffset| over |n| code units. Advance(0) is a no-op. + * If |n| is greater than the distance to end of input, will silently + * stop at the end. May not be used to advance over a line boundary; + * AdvanceLine() must be used instead. + */ + Advance(n = 1) { + if (this.mOffset + n >= this.mCount || this.mOffset + n < this.mOffset) { + this.mOffset = this.mCount; + } else { + this.mOffset += n; + } + }, + + /** + * Advance |this.mOffset| over a line boundary. + */ + AdvanceLine() { + // Advance over \r\n as a unit. 
+ if ( + this.mBuffer.charCodeAt(this.mOffset) == CARRIAGE_RETURN && + this.mOffset + 1 < this.mCount && + this.mBuffer.charCodeAt(this.mOffset + 1) == LINE_FEED + ) { + this.mOffset += 2; + } else { + this.mOffset += 1; + } + // 0 is a magical line number meaning that we don't know (i.e., script) + if (this.mLineNumber != 0) { + this.mLineNumber++; + } + this.mLineOffset = this.mOffset; + }, + + /** + * Skip over a sequence of whitespace characters (vertical or + * horizontal) starting at the current read position. + */ + SkipWhitespace() { + for (;;) { + const ch = this.Peek(); + if (!IsWhitespace(ch)) { + // EOF counts as non-whitespace + break; + } + if (IsVertSpace(ch)) { + this.AdvanceLine(); + } else { + this.Advance(); + } + } + }, + + /** + * Skip over one CSS comment starting at the current read position. + */ + SkipComment() { + this.Advance(2); + for (;;) { + let ch = this.Peek(); + if (ch < 0) { + this.SetEOFCharacters(eEOFCharacters_Asterisk | eEOFCharacters_Slash); + return; + } + if (ch == ASTERISK) { + this.Advance(); + ch = this.Peek(); + if (ch < 0) { + this.SetEOFCharacters(eEOFCharacters_Slash); + return; + } + if (ch == SOLIDUS) { + this.Advance(); + return; + } + } else if (IsVertSpace(ch)) { + this.AdvanceLine(); + } else { + this.Advance(); + } + } + }, + + /** + * If there is a valid escape sequence starting at the current read + * position, consume it, decode it, append the result to |aOutput|, + * and return true. Otherwise, consume nothing, leave |aOutput| + * unmodified, and return false. If |aInString| is true, accept the + * additional form of escape sequence allowed within string-like tokens. + */ + GatherEscape(aOutput, aInString) { + let ch = this.Peek(1); + if (ch < 0) { + // If we are in a string (or a url() containing a string), we want to drop + // the backslash on the floor. Otherwise, we want to treat it as a U+FFFD + // character. 
+ this.Advance(); + if (aInString) { + this.SetEOFCharacters(eEOFCharacters_DropBackslash); + } else { + aOutput.push(UCS2_REPLACEMENT_CHAR); + this.SetEOFCharacters(eEOFCharacters_ReplacementChar); + } + return true; + } + if (IsVertSpace(ch)) { + if (aInString) { + // In strings (and in url() containing a string), escaped + // newlines are completely removed, to allow splitting over + // multiple lines. + this.Advance(); + this.AdvanceLine(); + return true; + } + // Outside of strings, backslash followed by a newline is not an escape. + return false; + } + + if (!IsHexDigit(ch)) { + // "Any character (except a hexadecimal digit, linefeed, carriage + // return, or form feed) can be escaped with a backslash to remove + // its special meaning." -- CSS2.1 section 4.1.3 + this.Advance(2); + if (ch == 0) { + aOutput.push(UCS2_REPLACEMENT_CHAR); + } else { + aOutput.push(ch); + } + return true; + } + + // "[at most six hexadecimal digits following a backslash] stand + // for the ISO 10646 character with that number, which must not be + // zero. (It is undefined in CSS 2.1 what happens if a style sheet + // does contain a character with Unicode codepoint zero.)" + // -- CSS2.1 section 4.1.3 + + // At this point we know we have \ followed by at least one + // hexadecimal digit, therefore the escape sequence is valid and we + // can go ahead and consume the backslash. + this.Advance(); + let val = 0; + let i = 0; + do { + val = val * 16 + HexDigitValue(ch); + i++; + this.Advance(); + ch = this.Peek(); + } while (i < 6 && IsHexDigit(ch)); + + // "Interpret the hex digits as a hexadecimal number. If this + // number is zero, or is greater than the maximum allowed + // codepoint, return U+FFFD REPLACEMENT CHARACTER" -- CSS Syntax + // Level 3 + if (val == 0) { + aOutput.push(UCS2_REPLACEMENT_CHAR); + } else { + aOutput.push(ensureValidChar(val)); + } + + // Consume exactly one whitespace character after a + // hexadecimal escape sequence. 
+ if (IsVertSpace(ch)) { + this.AdvanceLine(); + } else if (IsHorzSpace(ch)) { + this.Advance(); + } + return true; + }, + + /** + * Consume a run of "text" beginning with the current read position, + * consisting of characters in the class |aClass| (which must be a + * suitable argument to IsOpenCharClass) plus escape sequences. + * Append the text to |aText|, after decoding escape sequences. + * + * Returns true if at least one character was appended to |aText|, + * false otherwise. + */ + GatherText(aClass, aText) { + const start = this.mOffset; + const inString = aClass == IS_STRING; + + for (;;) { + // Consume runs of unescaped characters in one go. + let n = this.mOffset; + while ( + n < this.mCount && + IsOpenCharClass(this.mBuffer.charCodeAt(n), aClass) + ) { + n++; + } + if (n > this.mOffset) { + const codes = stringToCodes(this.mBuffer.slice(this.mOffset, n)); + safeApply(Array.prototype.push, aText, codes); + this.mOffset = n; + } + if (n == this.mCount) { + break; + } + + const ch = this.Peek(); + if (ch == 0) { + this.Advance(); + aText.push(UCS2_REPLACEMENT_CHAR); + continue; + } + + if (ch != REVERSE_SOLIDUS) { + break; + } + if (!this.GatherEscape(aText, inString)) { + break; + } + } + + return this.mOffset > start; + }, + + /** + * Scan an Ident token. This also handles Function and URL tokens, + * both of which begin indistinguishably from an identifier. It can + * produce a Symbol token when an apparent identifier actually led + * into an invalid escape sequence. 
+ */ + ScanIdent(aToken) { + if (!this.GatherText(IS_IDCHAR, aToken.mIdent)) { + aToken.mSymbol = this.Peek(); + this.Advance(); + return true; + } + + if (this.Peek() != LEFT_PARENTHESIS) { + aToken.mType = eCSSToken_Ident; + return true; + } + + this.Advance(); + aToken.mType = eCSSToken_Function; + + const asString = String.fromCharCode.apply(null, aToken.mIdent); + if (asString.toLowerCase() === "url") { + this.NextURL(aToken); + } + return true; + }, + + /** + * Scan an AtKeyword token. Also handles production of Symbol when + * an '@' is not followed by an identifier. + */ + ScanAtKeyword(aToken) { + // Fall back for when '@' isn't followed by an identifier. + aToken.mSymbol = COMMERCIAL_AT; + this.Advance(); + + const ch = this.Peek(); + if (StartsIdent(ch, this.Peek(1))) { + if (this.GatherText(IS_IDCHAR, aToken.mIdent)) { + aToken.mType = eCSSToken_AtKeyword; + } + } + return true; + }, + + /** + * Scan a Hash token. Handles the distinction between eCSSToken_ID + * and eCSSToken_Hash, and handles production of Symbol when a '#' + * is not followed by identifier characters. + */ + ScanHash(aToken) { + // Fall back for when '#' isn't followed by identifier characters. + aToken.mSymbol = NUMBER_SIGN; + this.Advance(); + + const ch = this.Peek(); + if (IsIdentChar(ch) || ch == REVERSE_SOLIDUS) { + const type = StartsIdent(ch, this.Peek(1)) + ? eCSSToken_ID + : eCSSToken_Hash; + aToken.mIdent.length = 0; + if (this.GatherText(IS_IDCHAR, aToken.mIdent)) { + aToken.mType = type; + } + } + + return true; + }, + + /** + * Scan a Number, Percentage, or Dimension token (all of which begin + * like a Number). Can produce a Symbol when a '.' is not followed by + * digits, or when '+' or '-' are not followed by either a digit or a + * '.' and then a digit. Can also produce a HTMLComment when it + * encounters '-->'. + */ + ScanNumber(aToken) { + let c = this.Peek(); + + // Sign of the mantissa (-1 or 1). + const sign = c == HYPHEN_MINUS ? 
-1 : 1; + // Absolute value of the integer part of the mantissa. This is a double so + // we don't run into overflow issues for consumers that only care about our + // floating-point value while still being able to express the full int32_t + // range for consumers who want integers. + let intPart = 0; + // Fractional part of the mantissa. This is a double so that when + // we convert to float at the end we'll end up rounding to nearest + // float instead of truncating down (as we would if fracPart were + // a float and we just effectively lost the last several digits). + let fracPart = 0; + // Absolute value of the power of 10 that we should multiply by + // (only relevant for numbers in scientific notation). Has to be + // a signed integer, because multiplication of signed by unsigned + // converts the unsigned to signed, so if we plan to actually + // multiply by expSign... + let exponent = 0; + // Sign of the exponent. + let expSign = 1; + + aToken.mHasSign = c == PLUS_SIGN || c == HYPHEN_MINUS; + if (aToken.mHasSign) { + this.Advance(); + c = this.Peek(); + } + + let gotDot = c == FULL_STOP; + + if (!gotDot) { + // Scan the integer part of the mantissa. + do { + intPart = 10 * intPart + DecimalDigitValue(c); + this.Advance(); + c = this.Peek(); + } while (IsDigit(c)); + + gotDot = c == FULL_STOP && IsDigit(this.Peek(1)); + } + + if (gotDot) { + // Scan the fractional part of the mantissa. 
+ this.Advance(); + c = this.Peek(); + // Power of ten by which we need to divide our next digit + let divisor = 10; + do { + fracPart += DecimalDigitValue(c) / divisor; + divisor *= 10; + this.Advance(); + c = this.Peek(); + } while (IsDigit(c)); + } + + let gotE = false; + if (c == LATIN_SMALL_LETTER_E || c == LATIN_CAPITAL_LETTER_E) { + const expSignChar = this.Peek(1); + const nextChar = this.Peek(2); + if ( + IsDigit(expSignChar) || + ((expSignChar == HYPHEN_MINUS || expSignChar == PLUS_SIGN) && + IsDigit(nextChar)) + ) { + gotE = true; + if (expSignChar == HYPHEN_MINUS) { + expSign = -1; + } + this.Advance(); // consumes the E + if (expSignChar == HYPHEN_MINUS || expSignChar == PLUS_SIGN) { + this.Advance(); + c = nextChar; + } else { + c = expSignChar; + } + do { + exponent = 10 * exponent + DecimalDigitValue(c); + this.Advance(); + c = this.Peek(); + } while (IsDigit(c)); + } + } + + let type = eCSSToken_Number; + + // Set mIntegerValid for all cases (except %, below) because we need + // it for the "2n" in :nth-child(2n). + aToken.mIntegerValid = false; + + // Time to reassemble our number. + // Do all the math in double precision so it's truncated only once. + let value = sign * (intPart + fracPart); + if (gotE) { + // Explicitly cast expSign*exponent to double to avoid issues with + // overloaded pow() on Windows. + value *= Math.pow(10.0, expSign * exponent); + } else if (!gotDot) { + // Clamp values outside of integer range. + if (sign > 0) { + aToken.mInteger = Math.min(intPart, Number.MAX_SAFE_INTEGER); + } else { + aToken.mInteger = Math.max(-intPart, Number.MIN_SAFE_INTEGER); + } + aToken.mIntegerValid = true; + } + + const ident = aToken.mIdent; + + // Check for Dimension and Percentage tokens. 
+ if (c >= 0) { + if (StartsIdent(c, this.Peek(1))) { + if (this.GatherText(IS_IDCHAR, ident)) { + type = eCSSToken_Dimension; + } + } else if (c == PERCENT_SIGN) { + this.Advance(); + type = eCSSToken_Percentage; + value = value / 100.0; + aToken.mIntegerValid = false; + } + } + aToken.mNumber = value; + aToken.mType = type; + return true; + }, + + /** + * Scan a string constant ('foo' or "foo"). Will always produce + * either a String or a Bad_String token; the latter occurs when the + * close quote is missing. Always returns true (for convenience in Next()). + */ + ScanString(aToken) { + const aStop = this.Peek(); + aToken.mType = eCSSToken_String; + aToken.mSymbol = aStop; // Remember how it's quoted. + this.Advance(); + + for (;;) { + this.GatherText(IS_STRING, aToken.mIdent); + + const ch = this.Peek(); + if (ch == -1) { + this.AddEOFCharacters( + aStop == QUOTATION_MARK + ? eEOFCharacters_DoubleQuote + : eEOFCharacters_SingleQuote + ); + break; // EOF ends a string token with no error. + } + if (ch == aStop) { + this.Advance(); + break; + } + // Both " and ' are excluded from IS_STRING. + if (ch == QUOTATION_MARK || ch == APOSTROPHE) { + aToken.mIdent.push(ch); + this.Advance(); + continue; + } + + aToken.mType = eCSSToken_Bad_String; + break; + } + return true; + }, + + /** + * Scan a unicode-range token. These match the regular expression + * + * u\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})? + * + * However, some such tokens are "invalid". There are three valid forms: + * + * u+[0-9a-f]{x} 1 <= x <= 6 + * u+[0-9a-f]{x}\?{y} 1 <= x+y <= 6 + * u+[0-9a-f]{x}-[0-9a-f]{y} 1 <= x <= 6, 1 <= y <= 6 + * + * All unicode-range tokens have their text recorded in mIdent; valid ones + * are also decoded into mInteger and mInteger2, and mIntegerValid is set. + * Note that this does not validate the numeric range, only the syntactic + * form. 
+ */ + ScanURange(aResult) { + const intro1 = this.Peek(); + const intro2 = this.Peek(1); + let ch = this.Peek(2); + + aResult.mIdent.push(intro1); + aResult.mIdent.push(intro2); + this.Advance(2); + + let valid = true; + let haveQues = false; + let low = 0; + let high = 0; + let i = 0; + + do { + aResult.mIdent.push(ch); + if (IsHexDigit(ch)) { + if (haveQues) { + valid = false; // All question marks should be at the end. + } + low = low * 16 + HexDigitValue(ch); + high = high * 16 + HexDigitValue(ch); + } else { + haveQues = true; + low = low * 16 + 0x0; + high = high * 16 + 0xf; + } + + i++; + this.Advance(); + ch = this.Peek(); + } while (i < 6 && (IsHexDigit(ch) || ch == QUESTION_MARK)); + + if (ch == HYPHEN_MINUS && IsHexDigit(this.Peek(1))) { + if (haveQues) { + valid = false; + } + + aResult.mIdent.push(ch); + this.Advance(); + ch = this.Peek(); + high = 0; + i = 0; + do { + aResult.mIdent.push(ch); + high = high * 16 + HexDigitValue(ch); + + i++; + this.Advance(); + ch = this.Peek(); + } while (i < 6 && IsHexDigit(ch)); + } + + aResult.mInteger = low; + aResult.mInteger2 = high; + aResult.mIntegerValid = valid; + aResult.mType = eCSSToken_URange; + return true; + }, + + SetEOFCharacters(aEOFCharacters) { + this.mEOFCharacters = aEOFCharacters; + }, + + AddEOFCharacters(aEOFCharacters) { + this.mEOFCharacters = this.mEOFCharacters | aEOFCharacters; + }, + + AppendImpliedEOFCharacters(aEOFCharacters, aResult) { + // First, ignore eEOFCharacters_DropBackslash. + let c = aEOFCharacters >> 1; + + // All of the remaining EOFCharacters bits represent appended characters, + // and the bits are in the order that they need appending. + for (const p of kImpliedEOFCharacters) { + if (c & 1) { + aResult.push(p); + } + c >>= 1; + } + }, + + /** + * Consume the part of an URL token after the initial 'url('. Caller + * is assumed to have consumed 'url(' already. Will always produce + * either an URL or a Bad_URL token. 
+ * + * Exposed for use by nsCSSParser::ParseMozDocumentRule, which applies + * the special lexical rules for URL tokens in a nonstandard context. + */ + NextURL(aToken) { + this.SkipWhitespace(); + + // aToken.mIdent may be "url" at this point; clear that out + aToken.mIdent.length = 0; + + let hasString = false; + let ch = this.Peek(); + // Do we have a string? + if (ch == QUOTATION_MARK || ch == APOSTROPHE) { + this.ScanString(aToken); + if (aToken.mType == eCSSToken_Bad_String) { + aToken.mType = eCSSToken_Bad_URL; + return; + } + hasString = true; + } else { + // Otherwise, this is the start of a non-quoted url (which may be empty). + aToken.mSymbol = 0; + this.GatherText(IS_URL_CHAR, aToken.mIdent); + } + + // Consume trailing whitespace and then look for a close parenthesis. + this.SkipWhitespace(); + ch = this.Peek(); + // ch can be less than zero indicating EOF + if (ch < 0 || ch == RIGHT_PARENTHESIS) { + this.Advance(); + aToken.mType = eCSSToken_URL; + if (ch < 0) { + this.AddEOFCharacters(eEOFCharacters_CloseParen); + } + } else { + aToken.mType = eCSSToken_Bad_URL; + if (!hasString) { + // Consume until before the next right parenthesis, which follows + // how <bad-url-token> is consumed in CSS Syntax 3 spec. + // Note that, we only do this when "url(" is not followed by a + // string, because in the spec, "url(" followed by a string is + // handled as a url function rather than a <url-token>, so the + // rest of content before ")" should be consumed in balance, + // which will be done by the parser. + // The closing ")" is not consumed here. It is left to the parser + // so that the parser can handle both cases. + do { + if (IsVertSpace(ch)) { + this.AdvanceLine(); + } else { + this.Advance(); + } + ch = this.Peek(); + } while (ch >= 0 && ch != RIGHT_PARENTHESIS); + } + } + }, + + /** + * Primary scanner entry point. Consume one token and fill in + * |aToken| accordingly. 
Will skip over any number of comments first, + * and will also skip over rather than return whitespace and comment + * tokens, depending on the value of |aSkip|. + * + * Returns true if it successfully consumed a token, false if EOF has + * been reached. Will always advance the current read position by at + * least one character unless called when already at EOF. + */ + Next(aToken, aSkip) { + // do this here so we don't have to do it in dozens of other places + aToken.mIdent = []; + aToken.mType = eCSSToken_Symbol; + + this.mTokenOffset = this.mOffset; + this.mTokenLineOffset = this.mLineOffset; + this.mTokenLineNumber = this.mLineNumber; + + const ch = this.Peek(); + if (IsWhitespace(ch)) { + this.SkipWhitespace(); + aToken.mType = eCSSToken_Whitespace; + return true; + } + if ( + ch == SOLIDUS && // !IsSVGMode() && + this.Peek(1) == ASTERISK + ) { + this.SkipComment(); + aToken.mType = eCSSToken_Comment; + return true; + } + + // EOF + if (ch < 0) { + return false; + } + + // 'u' could be UNICODE-RANGE or an identifier-family token + if (ch == LATIN_SMALL_LETTER_U || ch == LATIN_CAPITAL_LETTER_U) { + const c2 = this.Peek(1); + const c3 = this.Peek(2); + if (c2 == PLUS_SIGN && (IsHexDigit(c3) || c3 == QUESTION_MARK)) { + return this.ScanURange(aToken); + } + return this.ScanIdent(aToken); + } + + // identifier family + if (IsIdentStart(ch)) { + return this.ScanIdent(aToken); + } + + // number family + if (IsDigit(ch)) { + return this.ScanNumber(aToken); + } + + if (ch == FULL_STOP && IsDigit(this.Peek(1))) { + return this.ScanNumber(aToken); + } + + if (ch == PLUS_SIGN) { + const c2 = this.Peek(1); + if (IsDigit(c2) || (c2 == FULL_STOP && IsDigit(this.Peek(2)))) { + return this.ScanNumber(aToken); + } + } + + // HYPHEN_MINUS can start an identifier-family token, a number-family token, + // or an HTML-comment + if (ch == HYPHEN_MINUS) { + const c2 = this.Peek(1); + const c3 = this.Peek(2); + if (IsIdentStart(c2) || (c2 == HYPHEN_MINUS && c3 != GREATER_THAN_SIGN)) 
{ + return this.ScanIdent(aToken); + } + if (IsDigit(c2) || (c2 == FULL_STOP && IsDigit(c3))) { + return this.ScanNumber(aToken); + } + if (c2 == HYPHEN_MINUS && c3 == GREATER_THAN_SIGN) { + this.Advance(3); + aToken.mType = eCSSToken_HTMLComment; + aToken.mIdent = stringToCodes("-->"); + return true; + } + } + + // the other HTML-comment token + if ( + ch == LESS_THAN_SIGN && + this.Peek(1) == EXCLAMATION_MARK && + this.Peek(2) == HYPHEN_MINUS && + this.Peek(3) == HYPHEN_MINUS + ) { + this.Advance(4); + aToken.mType = eCSSToken_HTMLComment; + aToken.mIdent = stringToCodes("<!--"); + return true; + } + + // AT_KEYWORD + if (ch == COMMERCIAL_AT) { + return this.ScanAtKeyword(aToken); + } + + // HASH + if (ch == NUMBER_SIGN) { + return this.ScanHash(aToken); + } + + // STRING + if (ch == QUOTATION_MARK || ch == APOSTROPHE) { + return this.ScanString(aToken); + } + + // Match operators: ~= |= ^= $= *= + const opType = MatchOperatorType(ch); + if (opType != eCSSToken_Symbol && this.Peek(1) == EQUALS_SIGN) { + aToken.mType = opType; + this.Advance(2); + return true; + } + + // Otherwise, a symbol (DELIM). + aToken.mSymbol = ch; + this.Advance(); + return true; + }, +}; + +/** + * Create and return a new CSS lexer. + * + * @param {String} input the CSS text to lex + * @return {CSSLexer} the new lexer + */ +function getCSSLexer(input) { + return new Scanner(input); +} + +exports.getCSSLexer = getCSSLexer; |