/* * Copyright (c) 2005-2007 Henri Sivonen * Copyright (c) 2007-2017 Mozilla Foundation * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla * Foundation, and Opera Software ASA. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ /* * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. * Please edit Tokenizer.java instead and regenerate. */ #ifndef nsHtml5Tokenizer_h #define nsHtml5Tokenizer_h #include "jArray.h" #include "nsAHtml5TreeBuilderState.h" #include "nsAtom.h" #include "nsGkAtoms.h" #include "nsHtml5ArrayCopy.h" #include "nsHtml5AtomTable.h" #include "nsHtml5DocumentMode.h" #include "nsHtml5Highlighter.h" #include "nsHtml5Macros.h" #include "nsHtml5NamedCharacters.h" #include "nsHtml5NamedCharactersAccel.h" #include "nsHtml5String.h" #include "nsIContent.h" #include "nsTraceRefcnt.h" class nsHtml5StreamParser; class nsHtml5AttributeName; class nsHtml5ElementName; class nsHtml5TreeBuilder; class nsHtml5UTF16Buffer; class nsHtml5StateSnapshot; class nsHtml5Portability; class nsHtml5Tokenizer { private: static const int32_t DATA_AND_RCDATA_MASK = ~1; public: static const int32_t DATA = 0; static const int32_t RCDATA = 1; static const int32_t SCRIPT_DATA = 2; static const int32_t RAWTEXT = 3; static const int32_t SCRIPT_DATA_ESCAPED = 4; static const int32_t ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; static const int32_t ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; static const int32_t ATTRIBUTE_VALUE_UNQUOTED = 7; static const int32_t PLAINTEXT = 8; static const int32_t TAG_OPEN = 9; static const int32_t CLOSE_TAG_OPEN = 10; static const int32_t TAG_NAME = 11; static const int32_t BEFORE_ATTRIBUTE_NAME = 12; static const int32_t ATTRIBUTE_NAME = 13; static const int32_t AFTER_ATTRIBUTE_NAME = 14; static const int32_t BEFORE_ATTRIBUTE_VALUE = 15; static const int32_t AFTER_ATTRIBUTE_VALUE_QUOTED = 16; static const int32_t BOGUS_COMMENT = 17; static const int32_t MARKUP_DECLARATION_OPEN = 18; static const int32_t DOCTYPE = 19; static const int32_t BEFORE_DOCTYPE_NAME = 20; static const int32_t DOCTYPE_NAME = 21; static const int32_t AFTER_DOCTYPE_NAME = 22; static const int32_t BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; static const int32_t AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; static const int32_t BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; static const int32_t AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; static const int32_t BOGUS_DOCTYPE = 31; static const int32_t COMMENT_START = 32; static const int32_t COMMENT_START_DASH = 33; static const int32_t COMMENT = 34; static const int32_t COMMENT_END_DASH = 35; static const int32_t COMMENT_END = 36; static const int32_t COMMENT_END_BANG = 37; static const int32_t NON_DATA_END_TAG_NAME = 38; static const int32_t MARKUP_DECLARATION_HYPHEN = 39; static const int32_t MARKUP_DECLARATION_OCTYPE = 40; static const int32_t DOCTYPE_UBLIC = 41; static const int32_t DOCTYPE_YSTEM = 42; static const int32_t AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; static const int32_t BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; static const int32_t AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; static const int32_t CONSUME_CHARACTER_REFERENCE = 46; static const int32_t CONSUME_NCR = 47; static const int32_t CHARACTER_REFERENCE_TAIL = 48; static const int32_t HEX_NCR_LOOP = 49; static const int32_t DECIMAL_NRC_LOOP = 50; static const int32_t HANDLE_NCR_VALUE = 51; static const int32_t HANDLE_NCR_VALUE_RECONSUME = 52; static const int32_t CHARACTER_REFERENCE_HILO_LOOKUP = 53; static const int32_t SELF_CLOSING_START_TAG = 54; static const int32_t CDATA_START = 55; static const int32_t CDATA_SECTION = 56; static const int32_t CDATA_RSQB = 57; static const int32_t CDATA_RSQB_RSQB = 58; static const int32_t SCRIPT_DATA_LESS_THAN_SIGN = 59; static const int32_t SCRIPT_DATA_ESCAPE_START = 60; static const int32_t SCRIPT_DATA_ESCAPE_START_DASH = 61; static const int32_t SCRIPT_DATA_ESCAPED_DASH = 62; static const int32_t SCRIPT_DATA_ESCAPED_DASH_DASH = 63; static const int32_t BOGUS_COMMENT_HYPHEN = 64; static const int32_t RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; static const int32_t SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED = 68; static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; static const int32_t PROCESSING_INSTRUCTION = 73; static const int32_t PROCESSING_INSTRUCTION_QUESTION_MARK = 74; static const int32_t COMMENT_LESSTHAN = 76; static const int32_t COMMENT_LESSTHAN_BANG = 77; static const int32_t COMMENT_LESSTHAN_BANG_DASH = 78; static const int32_t COMMENT_LESSTHAN_BANG_DASH_DASH = 79; private: static const int32_t LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); static char16_t LT_GT[]; static char16_t LT_SOLIDUS[]; static char16_t RSQB_RSQB[]; static char16_t REPLACEMENT_CHARACTER[]; static char16_t LF[]; static char16_t CDATA_LSQB[]; static char16_t OCTYPE[]; static char16_t UBLIC[]; static char16_t YSTEM[]; static staticJArray<char16_t, int32_t> TITLE_ARR; static staticJArray<char16_t, int32_t> SCRIPT_ARR; static staticJArray<char16_t, int32_t> STYLE_ARR; static staticJArray<char16_t, int32_t> PLAINTEXT_ARR; static staticJArray<char16_t, int32_t> XMP_ARR; static staticJArray<char16_t, int32_t> TEXTAREA_ARR; static staticJArray<char16_t, int32_t> IFRAME_ARR; static staticJArray<char16_t, int32_t> NOEMBED_ARR; static staticJArray<char16_t, int32_t> NOSCRIPT_ARR; static staticJArray<char16_t, int32_t> NOFRAMES_ARR; protected: nsHtml5TreeBuilder* tokenHandler; nsHtml5StreamParser* encodingDeclarationHandler; bool lastCR; int32_t stateSave; private: int32_t returnStateSave; protected: int32_t index; private: bool forceQuirks; char16_t additional; int32_t entCol; int32_t firstCharKey; int32_t lo; int32_t hi; int32_t candidate; int32_t charRefBufMark; protected: int32_t value; private: bool seenDigits; bool suspendAfterCurrentNonTextToken; protected: int32_t cstart; private: nsHtml5String publicId; nsHtml5String systemId; autoJArray<char16_t, int32_t> strBuf; int32_t strBufLen; autoJArray<char16_t, int32_t> charRefBuf; int32_t charRefBufLen; autoJArray<char16_t, int32_t> bmpChar; autoJArray<char16_t, int32_t> astralChar; protected: nsHtml5ElementName* endTagExpectation; private: jArray<char16_t, int32_t> endTagExpectationAsArray; protected: bool endTag; private: bool containsHyphen; nsHtml5ElementName* tagName; nsHtml5ElementName* nonInternedTagName; protected: nsHtml5AttributeName* attributeName; private: nsHtml5AttributeName* nonInternedAttributeName; RefPtr<nsAtom> doctypeName; nsHtml5String publicIdentifier; nsHtml5String systemIdentifier; nsHtml5HtmlAttributes* attributes; bool newAttributesEachTime; bool shouldSuspend; bool keepBuffer; protected: bool confident; private: int32_t line; int32_t attributeLine; nsHtml5AtomTable* interner; bool viewingXmlSource; public: nsHtml5Tokenizer(nsHtml5TreeBuilder* tokenHandler, bool viewingXmlSource); void setInterner(nsHtml5AtomTable* interner); void initLocation(nsHtml5String newPublicId, nsHtml5String newSystemId); bool isViewingXmlSource(); void setKeepBuffer(bool keepBuffer); bool dropBufferIfLongerThan(int32_t length); void setState(int32_t specialTokenizerState); void setStateAndEndTagExpectation(int32_t specialTokenizerState, nsHtml5ElementName* endTagExpectation); private: void endTagExpectationToArray(); public: void setLineNumber(int32_t line); inline int32_t getLineNumber() { return line; } nsHtml5HtmlAttributes* emptyAttributes(); private: inline void appendCharRefBuf(char16_t c) { MOZ_RELEASE_ASSERT(charRefBufLen < charRefBuf.length, "Attempted to overrun charRefBuf!"); charRefBuf[charRefBufLen++] = c; } void emitOrAppendCharRefBuf(int32_t returnState); inline void clearStrBufAfterUse() { strBufLen = 0; } inline void clearStrBufBeforeUse() { MOZ_ASSERT(!strBufLen, "strBufLen not reset after previous use!"); strBufLen = 0; } inline void clearStrBufAfterOneHyphen() { MOZ_ASSERT(strBufLen == 1, "strBufLen length not one!"); MOZ_ASSERT(strBuf[0] == '-', "strBuf does not start with a hyphen!"); strBufLen = 0; } inline void appendStrBuf(char16_t c) { MOZ_ASSERT(strBufLen < strBuf.length, "Previous buffer length insufficient."); if (MOZ_UNLIKELY(strBufLen == strBuf.length)) { if (MOZ_UNLIKELY(!EnsureBufferSpace(1))) { MOZ_CRASH("Unable to recover from buffer reallocation failure"); } } strBuf[strBufLen++] = c; } protected: nsHtml5String strBufToString(); private: void strBufToDoctypeName(); void emitStrBuf(); inline void appendSecondHyphenToBogusComment() { appendStrBuf('-'); } inline void adjustDoubleHyphenAndAppendToStrBufAndErr( char16_t c, bool reportedConsecutiveHyphens) { appendStrBuf(c); } void appendStrBuf(char16_t* buffer, int32_t offset, int32_t length); inline void appendCharRefBufToStrBuf() { appendStrBuf(charRefBuf, 0, charRefBufLen); charRefBufLen = 0; } void emitComment(int32_t provisionalHyphens, int32_t pos); protected: void flushChars(char16_t* buf, int32_t pos); private: void strBufToElementNameString(); int32_t emitCurrentTagToken(bool selfClosing, int32_t pos); void attributeNameComplete(); void addAttributeWithoutValue(); void addAttributeWithValue(); public: void start(); bool tokenizeBuffer(nsHtml5UTF16Buffer* buffer); private: template <class P> int32_t stateLoop(int32_t state, char16_t c, int32_t pos, char16_t* buf, bool reconsume, int32_t returnState, int32_t endPos); void initDoctypeFields(); template <class P> void adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); template <class P> void adjustDoubleHyphenAndAppendToStrBufLineFeed(); template <class P> void appendStrBufLineFeed(); template <class P> void appendStrBufCarriageReturn(); template <class P> void emitCarriageReturn(char16_t* buf, int32_t pos); void emitReplacementCharacter(char16_t* buf, int32_t pos); void maybeEmitReplacementCharacter(char16_t* buf, int32_t pos); void emitPlaintextReplacementCharacter(char16_t* buf, int32_t pos); void setAdditionalAndRememberAmpersandLocation(char16_t add); void bogusDoctype(); void bogusDoctypeWithoutQuirks(); void handleNcrValue(int32_t returnState); public: void eof(); private: void emitDoctypeToken(int32_t pos); void suspendIfRequestedAfterCurrentNonTextToken(); void suspendAfterCurrentTokenIfNotInText(); bool suspensionAfterCurrentNonTextTokenPending(); public: bool internalEncodingDeclaration(nsHtml5String internalCharset); private: void emitOrAppendTwo(const char16_t* val, int32_t returnState); void emitOrAppendOne(const char16_t* val, int32_t returnState); public: void end(); void requestSuspension(); bool isInDataState(); void resetToDataState(); void loadState(nsHtml5Tokenizer* other); void initializeWithoutStarting(); void setEncodingDeclarationHandler( nsHtml5StreamParser* encodingDeclarationHandler); ~nsHtml5Tokenizer(); static void initializeStatics(); static void releaseStatics(); #include "nsHtml5TokenizerHSupplement.h" }; #endif