diff options
Diffstat (limited to '')
-rw-r--r-- | parser/html/nsHtml5StreamParser.h | 766 |
1 files changed, 766 insertions, 0 deletions
diff --git a/parser/html/nsHtml5StreamParser.h b/parser/html/nsHtml5StreamParser.h new file mode 100644 index 0000000000..0dacf257bf --- /dev/null +++ b/parser/html/nsHtml5StreamParser.h @@ -0,0 +1,766 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsHtml5StreamParser_h +#define nsHtml5StreamParser_h + +#include <tuple> + +#include "MainThreadUtils.h" +#include "mozilla/AlreadyAddRefed.h" +#include "mozilla/Assertions.h" +#include "mozilla/Encoding.h" +#include "mozilla/Mutex.h" +#include "mozilla/NotNull.h" +#include "mozilla/RefPtr.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" +#include "nsCharsetSource.h" +#include "nsCOMPtr.h" +#include "nsCycleCollectionParticipant.h" +#include "nsDebug.h" +#include "nsHtml5AtomTable.h" +#include "nsIRequestObserver.h" +#include "nsISerialEventTarget.h" +#include "nsISupports.h" +#include "nsStringFwd.h" +#include "nsTArray.h" +#include "nscore.h" + +class nsCycleCollectionTraversalCallback; +class nsHtml5OwningUTF16Buffer; +class nsHtml5Parser; +class nsHtml5Speculation; +class nsHtml5String; +class nsHtml5Tokenizer; +class nsHtml5TreeBuilder; +class nsHtml5TreeOpExecutor; +class nsIChannel; +class nsIInputStream; +class nsIRequest; +class nsIRunnable; +class nsITimer; +class nsIURI; + +namespace mozilla { +class EncodingDetector; +template <typename T> +class Buffer; + +namespace dom { +class DocGroup; +} +} // namespace mozilla + +enum eParserMode { + /** + * Parse a document normally as HTML. + */ + NORMAL, + + /** + * View document as HTML source. 
+ */ + VIEW_SOURCE_HTML, + + /** + * View document as XML source + */ + VIEW_SOURCE_XML, + + /** + * View document as plain text source + */ + VIEW_SOURCE_PLAIN, + + /** + * View document as plain text + */ + PLAIN_TEXT, + + /** + * Load as data (XHR) + */ + LOAD_AS_DATA +}; + +enum eBomState { + /** + * BOM sniffing hasn't started. + */ + BOM_SNIFFING_NOT_STARTED, + + /** + * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been + * seen. + */ + SEEN_UTF_16_LE_FIRST_BYTE, + + /** + * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been + * seen. + */ + SEEN_UTF_16_BE_FIRST_BYTE, + + /** + * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been + * seen. + */ + SEEN_UTF_8_FIRST_BYTE, + + /** + * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM + * have been seen. + */ + SEEN_UTF_8_SECOND_BYTE, + + /** + * Seen \x00 in UTF-16BE bogo-XML declaration. + */ + SEEN_UTF_16_BE_XML_FIRST, + + /** + * Seen \x00< in UTF-16BE bogo-XML declaration. + */ + SEEN_UTF_16_BE_XML_SECOND, + + /** + * Seen \x00<\x00 in UTF-16BE bogo-XML declaration. + */ + SEEN_UTF_16_BE_XML_THIRD, + + /** + * Seen \x00<\x00? in UTF-16BE bogo-XML declaration. + */ + SEEN_UTF_16_BE_XML_FOURTH, + + /** + * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration. + */ + SEEN_UTF_16_BE_XML_FIFTH, + + /** + * Seen < in UTF-16LE bogo-XML declaration. + */ + SEEN_UTF_16_LE_XML_FIRST, + + /** + * Seen <\x00 in UTF-16LE bogo-XML declaration. + */ + SEEN_UTF_16_LE_XML_SECOND, + + /** + * Seen <\x00? in UTF-16LE bogo-XML declaration. + */ + SEEN_UTF_16_LE_XML_THIRD, + + /** + * Seen <\x00?\x00 in UTF-16LE bogo-XML declaration. + */ + SEEN_UTF_16_LE_XML_FOURTH, + + /** + * Seen <\x00?\x00x in UTF-16LE bogo-XML declaration. + */ + SEEN_UTF_16_LE_XML_FIFTH, + + /** + * BOM sniffing was started but is now over for whatever reason. 
+ */ + BOM_SNIFFING_OVER, +}; + +enum eHtml5StreamState { + STREAM_NOT_STARTED = 0, + STREAM_BEING_READ = 1, + STREAM_ENDED = 2 +}; + +class nsHtml5StreamParser final : public nsISupports { + template <typename T> + using NotNull = mozilla::NotNull<T>; + using Encoding = mozilla::Encoding; + + const uint32_t UNCONDITIONAL_META_SCAN_BOUNDARY = 1024; + const uint32_t READ_BUFFER_SIZE = 1024; + const uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE = 1024 * 1024 * 4; // 4 MB + + friend class nsHtml5RequestStopper; + friend class nsHtml5DataAvailable; + friend class nsHtml5StreamParserContinuation; + friend class nsHtml5TimerKungFu; + friend class nsHtml5StreamParserPtr; + friend class nsHtml5StreamListener; + + public: + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser) + + nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, nsHtml5Parser* aOwner, + eParserMode aMode); + + nsresult OnStartRequest(nsIRequest* aRequest); + + nsresult OnDataAvailable(nsIRequest* aRequest, nsIInputStream* aInStream, + uint64_t aSourceOffset, uint32_t aLength); + + nsresult OnStopRequest(nsIRequest* aRequest, nsresult status); + + // EncodingDeclarationHandler + // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java + /** + * Tree builder uses this to report a late <meta charset> + */ + bool internalEncodingDeclaration(nsHtml5String aEncoding); + + bool TemplatePushedOrHeadPopped(); + + void RememberGt(int32_t aPos); + + // Not from an external interface + + /** + * Post a runnable to the main thread to perform the speculative load + * operations without performing the tree operations. + * + * This should be called at the end of each data available or stop + * request runnable running on the parser thread. + */ + void PostLoadFlusher(); + + /** + * Pass a buffer to chardetng. + */ + void FeedDetector(mozilla::Span<const uint8_t> aBuffer); + + /** + * Report EOF to chardetng. 
+ */ + void DetectorEof(); + + /** + * Call this method once you've created a parser, and want to instruct it + * about what charset to load + * + * @param aEncoding the charset of a document + * @param aSource the source of the charset + * @param aForceAutoDetection stored in mForceAutoDetection; asserted not + * to be true when aSource is + * kCharsetFromOtherComponent or higher + */ + inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding, + nsCharsetSource aSource, + bool aForceAutoDetection) { + MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED, + "SetDocumentCharset called too late."); + MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!"); + MOZ_ASSERT(!(aForceAutoDetection && aSource >= kCharsetFromOtherComponent), + "Can't force with high-ranking source."); + mEncoding = aEncoding; + mCharsetSource = aSource; + mForceAutoDetection = aForceAutoDetection; + mChannelHadCharset = (aSource == kCharsetFromChannel); + } + + nsresult GetChannel(nsIChannel** aChannel); + + /** + * The owner parser must call this after script execution + * when no scripts are executing and the document.written + * buffer has been exhausted. + * + * If the first two arguments are nullptr, instead of + * continuing after scripts, this method commits to an + * internally-discovered encoding. + */ + void ContinueAfterScriptsOrEncodingCommitment( + nsHtml5Tokenizer* aTokenizer, nsHtml5TreeBuilder* aTreeBuilder, + bool aLastWasCR); + + /** + * Continues the stream parser if the charset switch failed. + */ + void ContinueAfterFailedCharsetSwitch(); + + void Terminate() { mTerminated = true; } + + void DropTimer(); + + /** + * Sets the URL for View Source title in case this parser ends up being + * used for View Source. If aURL is a view-source: URL, takes the inner + * URL. data: URLs are shown with an ellipsis instead of the actual data. 
+ */ + void SetViewSourceTitle(nsIURI* aURL); + + private: + virtual ~nsHtml5StreamParser(); + +#ifdef DEBUG + bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); } +#endif + + void MarkAsBroken(nsresult aRv); + + /** + * Marks the stream parser as interrupted. 
If you ever add calls to this + * method, be sure to review Uninterrupt usage very, very carefully to + * avoid having a previous in-flight runnable cancel your Interrupt() + * call on the other thread too soon. + */ + void Interrupt() { + MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!"); + mInterrupted = true; + } + + void Uninterrupt() MOZ_NO_THREAD_SAFETY_ANALYSIS { + MOZ_ASSERT(IsParserThread(), "Wrong thread!"); + mTokenizerMutex.AssertCurrentThreadOwns(); + mInterrupted = false; + } + + /** + * Flushes the tree ops from the tree builder and disarms the flush + * timer. + */ + void FlushTreeOpsAndDisarmTimer(); + + void SwitchDecoderIfAsciiSoFar(NotNull<const Encoding*> aEncoding) + MOZ_REQUIRES(mTokenizerMutex); + ; + + size_t CountGts(); + + void DiscardMetaSpeculation(); + + bool ProcessLookingForMetaCharset(bool aEof) MOZ_REQUIRES(mTokenizerMutex); + + void ParseAvailableData(); + + void DoStopRequest(); + + void DoDataAvailableBuffer(mozilla::Buffer<uint8_t>&& aBuffer) + MOZ_REQUIRES(mTokenizerMutex); + + void DoDataAvailable(mozilla::Span<const uint8_t> aBuffer) + MOZ_REQUIRES(mTokenizerMutex); + + static nsresult CopySegmentsToParser(nsIInputStream* aInStream, + void* aClosure, const char* aFromSegment, + uint32_t aToOffset, uint32_t aCount, + uint32_t* aWriteCount) + MOZ_REQUIRES(mTokenizerMutex); + + bool IsTerminatedOrInterrupted() { return mTerminated || mInterrupted; } + + bool IsTerminated() { return mTerminated; } + + /** + * True when there is a Unicode decoder already + */ + inline bool HasDecoder() { return !!mUnicodeDecoder; } + + /** + * Returns 0 if 1) there aren't at least 2 buffers in mBufferedBytes + * or 2) there is no byte '>' in the second buffer. + * Otherwise, returns the length of the prefix of the second buffer + * that is long enough to contain the first byte '>' in the second + * buffer (including the '>' byte). 
+ */ + size_t LengthOfLtContainingPrefixInSecondBuffer(); + + /** + * Push bytes from network when there is no Unicode decoder yet + */ + nsresult SniffStreamBytes(mozilla::Span<const uint8_t> aFromSegment, + bool aEof) MOZ_REQUIRES(mTokenizerMutex); + + /** + * Push bytes from network when there is a Unicode decoder already + */ + nsresult WriteStreamBytes(mozilla::Span<const uint8_t> aFromSegment) + MOZ_REQUIRES(mTokenizerMutex); + + /** + * Set up the Unicode decoder and write the sniffing buffer into it + * followed by the current network buffer. + * + * @param aPrefix the part of the stream that has already been seen + * prior to aFromSegment. In practice, these are the + * bytes that are baked into the state of the BOM + * and UTF-16 XML declaration-like sniffing state + * machine state. + * @param aFromSegment The current network buffer + */ + nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment( + mozilla::Span<const uint8_t> aPrefix, + mozilla::Span<const uint8_t> aFromSegment) MOZ_REQUIRES(mTokenizerMutex); + + /** + * Initialize the Unicode decoder, mark the BOM as the source and + * drop the sniffer. + * + * @param aEncoding The encoding for the decoder + * (UTF-16BE, UTF-16LE or UTF-8; the BOM has + * been swallowed) + */ + void SetupDecodingFromBom(NotNull<const Encoding*> aEncoding); + + void SetupDecodingFromUtf16BogoXml(NotNull<const Encoding*> aEncoding); + + /** + * When speculatively decoding from file: URL as UTF-8, commit + * to UTF-8 as the non-speculative encoding and start processing + * the decoded data. + */ + [[nodiscard]] nsresult CommitLocalFileToEncoding(); + + /** + * When speculatively decoding from file: URL as UTF-8, redecode + * using fallback and then continue normally with the fallback. + */ + [[nodiscard]] nsresult ReDecodeLocalFile() MOZ_REQUIRES(mTokenizerMutex); + + /** + * Potentially guess the encoding using mozilla::EncodingDetector. 
+ * Returns the guessed encoding and a telemetry-appropriate source. + */ + std::tuple<NotNull<const Encoding*>, nsCharsetSource> GuessEncoding( + bool aInitial); + + /** + * Become confident or resolve an encoding name to its preferred form. + * @param aEncoding the value of an internal encoding decl. Acts as an + * out param, too, when the method returns true. + * @return true if the parser needs to start using the new value of + * aEncoding and false if the parser became confident or if + * the encoding name did not specify a usable encoding + * + * NOTE(review): the declaration below takes aEncoding by const reference + * and returns a const Encoding*, so the out-param / boolean-return wording + * above looks stale -- confirm against the implementation. + */ + const Encoding* PreferredForInternalEncodingDecl(const nsAString& aEncoding); + + /** + * Callback for mFlushTimer. + */ + static void TimerCallback(nsITimer* aTimer, void* aClosure); + + /** + * Parser thread entry point for (maybe) flushing the ops and posting + * a flush runnable back on the main thread. + */ + void TimerFlush(); + + /** + * Called when speculation fails. + */ + void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; } + + /** + * Used to check whether we're getting too many speculation failures and + * should just stop trying. The 100 is picked pretty randomly to be not too + * small (so most pages are not affected) but small enough that we don't end + * up with failed speculations over and over in pathological cases. + */ + bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; } + + /** + * Dispatch an event to a Quantum DOM main thread-ish thread. + * (Not the parser thread.) + */ + nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable); + + /** + * Notify any devtools listeners about content newly received for parsing. 
+ */ + inline void OnNewContent(mozilla::Span<const char16_t> aData); + + /** + * Notify any devtools listeners after all parse content has been received. + */ + inline void OnContentComplete(); + + nsCOMPtr<nsIRequest> mRequest; + + /** + * The document title to use if this turns out to be a View Source parser. 
+ */ + nsCString mViewSourceTitle; + + /** + * The Unicode decoder + */ + mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder; + + /** + * BOM sniffing state + */ + eBomState mBomState; + + // encoding-related stuff + /** + * The source (confidence) of the character encoding in use + */ + nsCharsetSource mCharsetSource; + + nsCharsetSource mEncodingSwitchSource; + + /** + * The character encoding in use + */ + NotNull<const Encoding*> mEncoding; + + const Encoding* mNeedsEncodingSwitchTo; + + bool mSeenEligibleMetaCharset; + + bool mChardetEof; + +#ifdef DEBUG + + bool mStartedFeedingDetector; + + bool mStartedFeedingDevTools; + +#endif + + /** + * Whether reparse is forbidden + */ + bool mReparseForbidden; + + /** + * Whether the Repair Text Encoding menu item was invoked + */ + bool mForceAutoDetection; + + /** + * Whether there was a valid charset parameter on the HTTP layer. + */ + bool mChannelHadCharset; + + /** + * We are in the process of looking for <meta charset> + */ + bool mLookingForMetaCharset; + + /** + * Whether the byte stream started with ASCII <? + */ + bool mStartsWithLtQuestion; + + /** + * If we are viewing XML source and are waiting for a '>' from the network. + */ + bool mLookingForXmlDeclarationForXmlViewSource; + + /** + * Whether template has been pushed or head popped within the first 1024 + * bytes. + */ + bool mTemplatePushedOrHeadPopped; + + // Portable parser objects + /** + * The first buffer in the pending UTF-16 buffer queue + */ + RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer; + + /** + * Non-owning pointer to the most recent buffer that contains the most recent + * remembered greater-than sign. Used only while mLookingForMetaCharset is + * true. While mLookingForMetaCharset is true, mFirstBuffer is not changed and + * keeps the whole linked list of buffers alive. This pointer is non-owning to + * avoid frequent refcounting. 
+ */ + nsHtml5OwningUTF16Buffer* mGtBuffer; + + int32_t mGtPos; + + /** + * The last buffer in the pending UTF-16 buffer queue + */ + nsHtml5OwningUTF16Buffer* + mLastBuffer; // weak ref; always points to + // a buffer of the size + // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE + + /** + * The first buffer of the document if looking for <meta charset> or + * nullptr afterwards. + */ + RefPtr<nsHtml5OwningUTF16Buffer> mFirstBufferOfMetaScan; + + /** + * The tree operation executor + */ + nsHtml5TreeOpExecutor* mExecutor; + + /** + * Network event target for mExecutor->mDocument + */ + nsCOMPtr<nsISerialEventTarget> mNetworkEventTarget; + + /** + * The HTML5 tree builder + */ + mozilla::UniquePtr<nsHtml5TreeBuilder> mTreeBuilder; + + /** + * The HTML5 tokenizer + */ + mozilla::UniquePtr<nsHtml5Tokenizer> mTokenizer; + + /** + * Makes sure the main thread can't mess the tokenizer state while it's + * tokenizing. This mutex also protects the current speculation. + */ + mozilla::Mutex mTokenizerMutex; + + /** + * The scoped atom table + */ + nsHtml5AtomTable mAtomTable; + + /** + * The owner parser. + */ + RefPtr<nsHtml5Parser> mOwner; + + /** + * Whether the last character tokenized was a carriage return (for CRLF) + */ + bool mLastWasCR; + + /** + * For tracking stream life cycle + */ + eHtml5StreamState mStreamState; + + /** + * Whether we are speculating. + */ + bool mSpeculating; + + /** + * Whether the tokenizer has reached EOF. (Reset when stream rewound.) + */ + bool mAtEOF; + + /** + * The speculations. The mutex protects the nsTArray itself. + * To access the queue of current speculation, mTokenizerMutex must be + * obtained. + * The current speculation is the last element + */ + nsTArray<mozilla::UniquePtr<nsHtml5Speculation>> mSpeculations; + mozilla::Mutex mSpeculationMutex; + + /** + * Number of times speculation has failed for this parser. 
+ */ + mozilla::Atomic<uint32_t> mSpeculationFailureCount; + + /** + * Number of bytes already buffered into mBufferedBytes. + */ + uint32_t mNumBytesBuffered; + + nsTArray<mozilla::Buffer<uint8_t>> mBufferedBytes; + + /** + * True to terminate early. + */ + mozilla::Atomic<bool> mTerminated; + + /** + * True to release mTokenizerMutex early. + */ + mozilla::Atomic<bool> mInterrupted; + + /** + * The thread this stream parser runs on. + */ + nsCOMPtr<nsISerialEventTarget> mEventTarget; + + nsCOMPtr<nsIRunnable> mExecutorFlusher; + + nsCOMPtr<nsIRunnable> mLoadFlusher; + + /** + * This runnable is distinct from the regular flushers to + * signal the intent of encoding commitment without having to + * protect mPendingEncodingCommitment in the executor with a + * mutex. + */ + nsCOMPtr<nsIRunnable> mEncodingCommitter; + + /** + * The generic detector. + */ + mozilla::UniquePtr<mozilla::EncodingDetector> mDetector; + + /** + * The TLD we're loading from or empty if unknown. + */ + nsCString mTLD; + + /** + * Whether the initial charset source was kCharsetFromParentFrame + */ + bool mInitialEncodingWasFromParentFrame; + + bool mHasHadErrors; + + bool mDetectorHasSeenNonAscii; + + /** + * If true, we are decoding a local file that lacks an encoding + * declaration and we are not tokenizing yet. + */ + bool mDecodingLocalFileWithoutTokenizing; + + /** + * Whether we are keeping the incoming bytes. + */ + bool mBufferingBytes; + + /** + * Timer for flushing tree ops once in a while when not speculating. + */ + nsCOMPtr<nsITimer> mFlushTimer; + + /** + * Mutex for protecting access to mFlushTimer (but not for the two + * mFlushTimerFoo booleans below). + */ + mozilla::Mutex mFlushTimerMutex; + + /** + * Keeps track whether mFlushTimer has been armed. Unfortunately, + * nsITimer doesn't enable querying this from the timer itself. + */ + bool mFlushTimerArmed; + + /** + * False initially and true after the timer has fired at least once. 
+ */ + bool mFlushTimerEverFired; + + /** + * Whether the parser is doing a normal parse, view source or plain text. + */ + eParserMode mMode; + + /** + * If the associated docshell is being watched by the devtools, this is + * set to the URI associated with the parse. All parse data is sent to the + * devtools, along with this URI. This URI is cleared out after the parse has + * been marked as completed. + */ + nsCOMPtr<nsIURI> mURIToSendToDevtools; + + /** + * If content is being sent to the devtools, an encoded UUID for the parser. + */ + nsString mUUIDForDevtools; +}; + +#endif // nsHtml5StreamParser_h |