1 files changed, 327 insertions, 0 deletions
diff --git a/parser/htmlparser/nsScanner.cpp b/parser/htmlparser/nsScanner.cpp
new file mode 100644
index 0000000000..e0f5f9228c
--- /dev/null
+++ b/parser/htmlparser/nsScanner.cpp
@@ -0,0 +1,327 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=2 sw=2 et tw=78: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+//#define __INCREMENTAL 1
+
+#include "nsScanner.h"
+
+#include "mozilla/Attributes.h"
+#include "mozilla/DebugOnly.h"
+#include "mozilla/Encoding.h"
+#include "nsDebug.h"
+#include "nsReadableUtils.h"
+#include "nsUTF8Utils.h"  // for LossyConvertEncoding
+#include "nsCRT.h"
+#include "nsParser.h"
+#include "nsCharsetSource.h"
+
+nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars)
+    : mChars(aTerminateChars),
+      mFilter(char16_t(~0))  // Start from a mask with every bit set
+{
+  // mFilter is a cheap pre-check mask. Every bit that appears in at least
+  // one terminator is cleared below, so what remains are the bits that no
+  // terminator has. A character that shares a bit with mFilter therefore
+  // cannot be one of the terminators and can be ruled out with a single
+  // AND, without walking mChars.
+  //
+  // aTerminateChars must be null-terminated: the loop stops at the first
+  // zero code unit.
+  //
+  // The mask is most selective when the terminators only use a few low
+  // bits, because then many bits survive in mFilter.
+  for (const char16_t* cursor = aTerminateChars; *cursor; ++cursor) {
+    mFilter &= ~*cursor;
+  }
+}
+
+/**
+ *  Builds a scanner whose input is the single string handed in here.
+ *  The string is appended to the internal buffer right away; the result
+ *  of that append is not checked by this constructor.
+ *
+ *  @param   anHTMLString  the text to scan
+ *  @param   aIncremental  stored in mIncremental
+ *  @note    mMarkPosition is asserted to equal mCurrentPosition afterwards
+ */
+nsScanner::nsScanner(const nsAString& anHTMLString, bool aIncremental)
+    : mIncremental(aIncremental) {
+  MOZ_COUNT_CTOR(nsScanner);
+
+  AppendToBuffer(anHTMLString);
+  MOZ_ASSERT(mMarkPosition == mCurrentPosition);
+}
+
+/**
+ *  Use this constructor when the input arrives in pieces: the scanner
+ *  starts out with no buffer, and data is supplied afterwards through
+ *  Append(). aURI is stored in mURI; mIncremental is always set to true.
+ */
+nsScanner::nsScanner(nsIURI* aURI) : mURI(aURI), mIncremental(true) {
+  MOZ_COUNT_CTOR(nsScanner);
+
+  // XXX This is a big hack.  We need to initialize the iterators to something.
+  // What matters is that mCurrentPosition == mEndPosition, so that our methods
+  // believe that we are at EOF (see bug 182067).  We null out mCurrentPosition
+  // so that we have some hope of catching null pointer dereferences associated
+  // with this hack. --darin
+  memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
+  mMarkPosition = mCurrentPosition;
+  mEndPosition = mCurrentPosition;
+
+  // XML defaults to UTF-8 and about:blank is UTF-8, too.
+  SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault);
+}
+
+nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding,
+                                       int32_t aSource) {
+  // A source that ranks below the one already recorded never overrides it.
+  if (aSource < mCharsetSource) {
+    return NS_OK;
+  }
+  // Record the source even if the encoding name ends up unchanged.
+  mCharsetSource = aSource;
+
+  nsCString requestedName;
+  aEncoding->Name(requestedName);
+  const bool sameAsCurrent =
+      !mCharset.IsEmpty() && requestedName.Equals(mCharset);
+  if (!sameAsCurrent) {
+    // New (or first) encoding: remember its name, create a fresh decoder.
+    mCharset.Assign(requestedName);
+    mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval();
+  }
+  return NS_OK;
+}
+
+/**
+ *  Destructor.
+ *
+ *  The only explicit work is the MOZ_COUNT_DTOR bookkeeping, which pairs
+ *  with the MOZ_COUNT_CTOR call made by each constructor. Members clean
+ *  up through their own destructors.
+ */
+nsScanner::~nsScanner() { MOZ_COUNT_DTOR(nsScanner); }
+
+/**
+ *  Moves the current read position back to the marked position, so that
+ *  input consumed since the mark is read again. Useful when tokenization
+ *  gets interrupted part-way through.
+ *
+ *  Does nothing when no buffer has been created yet. Callers are expected
+ *  to have established a mark (see Mark()) before rewinding.
+ */
+void nsScanner::RewindToMark() {
+  if (!mSlidingBuffer) {
+    return;
+  }
+
+  mCurrentPosition = mMarkPosition;
+}
+
+/**
+ *  Records the current read position as the mark and discards the part
+ *  of the buffer that precedes it, so a later RewindToMark() returns here.
+ *
+ *  @return  the distance from the old start of the buffer to the current
+ *           position as it was on entry (computed before discarding), or
+ *           0 when no buffer exists
+ */
+int32_t nsScanner::Mark() {
+  if (!mSlidingBuffer) {
+    return 0;
+  }
+
+  // Measure first; the prefix is discarded right after.
+  nsScannerIterator bufferStart;
+  mSlidingBuffer->BeginReading(bufferStart);
+  const int32_t consumed = Distance(bufferStart, mCurrentPosition);
+
+  // Re-seat the cursor at the new beginning and remember it as the mark.
+  mSlidingBuffer->DiscardPrefix(mCurrentPosition);
+  mSlidingBuffer->BeginReading(mCurrentPosition);
+  mMarkPosition = mCurrentPosition;
+  return consumed;
+}
+
+/**
+ *  Inserts aBuffer into the underlying buffer at the current position.
+ *
+ *  @param   aBuffer  the text to insert
+ *  @return  false if there is no buffer to insert into, true otherwise
+ */
+bool nsScanner::UngetReadable(const nsAString& aBuffer) {
+  if (mSlidingBuffer) {
+    mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition);
+
+    // The insertion invalidated our iterators, so fetch fresh ones. Note
+    // that the current position is re-seated at the start of the buffer.
+    mSlidingBuffer->BeginReading(mCurrentPosition);
+    mSlidingBuffer->EndReading(mEndPosition);
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ *  Appends text to the underlying input buffer, as if it had been read
+ *  from an input stream.
+ *  @return  NS_OK, or NS_ERROR_OUT_OF_MEMORY if AppendToBuffer fails
+ */
+nsresult nsScanner::Append(const nsAString& aBuffer) {
+  if (AppendToBuffer(aBuffer)) {
+    return NS_OK;
+  }
+  return NS_ERROR_OUT_OF_MEMORY;
+}
+
+/**
+ *  Decodes aLen bytes at aBuffer with the current decoder and appends the
+ *  resulting UTF-16 text to the underlying input buffer.
+ *
+ *  @return  NS_OK once the text is appended (decoding errors included),
+ *           NS_ERROR_FAILURE without a decoder, or NS_ERROR_OUT_OF_MEMORY
+ */
+nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) {
+  if (!mUnicodeDecoder) {
+    NS_WARNING("No decoder found.");
+    return NS_ERROR_FAILURE;
+  }
+
+  // Upper bound on the UTF-16 code units the decoder may produce.
+  CheckedInt<size_t> maxUnits = mUnicodeDecoder->MaxUTF16BufferLength(aLen);
+  if (!maxUnits.isValid()) {
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  // One extra unit: null terminator due to legacy sadness.
+  CheckedInt<uint32_t> capacity(1);
+  capacity += maxUnits.value();
+  if (!capacity.isValid()) {
+    return NS_ERROR_OUT_OF_MEMORY;
+  }
+
+  nsScannerString::Buffer* chunk =
+      nsScannerString::AllocBuffer(capacity.value());
+  NS_ENSURE_TRUE(chunk, NS_ERROR_OUT_OF_MEMORY);
+  char16_t* out = chunk->DataStart();
+
+  uint32_t status;
+  size_t consumed;
+  size_t produced;
+  // Do not use structured binding lest deal with [-Werror=unused-variable]
+  std::tie(status, consumed, produced) =
+      mUnicodeDecoder->DecodeToUTF16WithoutReplacement(
+          AsBytes(Span(aBuffer, aLen)), Span(out, maxUnits.value()),
+          false);  // Retain bug about failure to handle EOF
+  MOZ_ASSERT(status != kOutputFull);
+  MOZ_ASSERT(consumed <= aLen);
+  MOZ_ASSERT(produced <= maxUnits.value());
+  if (status != kInputEmpty) {
+    // Malformed input. Since about:blank is empty, this only happens for
+    // XML: emit U+FFFF, which is illegal in XML, rather than U+FFFD so that
+    // expat flags the error. No need to keep converting; expat stops here.
+    out[produced++] = 0xFFFF;
+  }
+  chunk->SetDataLength(produced);
+
+  // The decoder's status is deliberately not propagated, since it does not
+  // reflect on our own success or failure (ref. bug 87110).
+  AppendToBuffer(chunk);
+  return NS_OK;
+}
+
+/**
+ *  Reads the next character from the scanner's buffer and advances the
+ *  current position past it.
+ *
+ *  @param   aChar  receives the character, or 0 at end of input
+ *  @return  NS_OK, or NS_ERROR_HTMLPARSER_EOF when nothing is left to read
+ */
+nsresult nsScanner::GetChar(char16_t& aChar) {
+  const bool hasInput = mSlidingBuffer && !(mCurrentPosition == mEndPosition);
+  if (hasInput) {
+    aChar = *mCurrentPosition++;
+    return NS_OK;
+  }
+
+  aChar = 0;
+  return NS_ERROR_HTMLPARSER_EOF;
+}
+
+void nsScanner::BindSubstring(nsScannerSubstring& aSubstring,
+                              const nsScannerIterator& aStart,
+                              const nsScannerIterator& aEnd) {
+  aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);  // buffer must exist
+}
+
+void nsScanner::CurrentPosition(nsScannerIterator& aPosition) {
+  aPosition = mCurrentPosition;  // copy out the current read position
+}
+
+void nsScanner::EndReading(nsScannerIterator& aPosition) {
+  aPosition = mEndPosition;  // copy out the cached end position
+}
+
+void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) {
+  if (!mSlidingBuffer) return;  // no buffer yet: aPosition is ignored
+
+  mCurrentPosition = aPosition;
+  if (!aTerminate || !(mCurrentPosition == mEndPosition)) return;
+  // Terminating at the very end: move the mark here and drop the prefix.
+  mMarkPosition = mCurrentPosition;
+  mSlidingBuffer->DiscardPrefix(mCurrentPosition);
+}
+
+void nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) {
+  if (mSlidingBuffer) {
+    mSlidingBuffer->AppendBuffer(aBuf);
+    if (mCurrentPosition == mEndPosition) {  // nothing was left unread
+      mSlidingBuffer->BeginReading(mCurrentPosition);
+    }
+  } else {  // first chunk: create the buffer, start and mark at its front
+    mSlidingBuffer = MakeUnique<nsScannerString>(aBuf);
+    mSlidingBuffer->BeginReading(mCurrentPosition);
+    mMarkPosition = mCurrentPosition;
+  }
+  mSlidingBuffer->EndReading(mEndPosition);  // the end moved in either case
+}
+
+/**
+ *  Copies the data that tokenization has not consumed yet, i.e. everything
+ *  from the current position to the end of the buffer, into aCopyBuffer.
+ *
+ *  @param   aCopyBuffer  receives the copy; it is truncated when no buffer
+ *                        has been created yet
+ *  @return  true if OK or false on OOM
+ */
+bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
+  if (mSlidingBuffer) {
+    // Hand copies to CopyUnicodeTo; the scanner's own iterators stay put.
+    nsScannerIterator unreadBegin = mCurrentPosition;
+    nsScannerIterator unreadEnd = mEndPosition;
+
+    return CopyUnicodeTo(unreadBegin, unreadEnd, aCopyBuffer);
+  }
+
+  aCopyBuffer.Truncate();
+  return true;
+}
+
+/**
+ *  Conduct self test.
+ *
+ *  This is currently a no-op: the _DEBUG section in the body is empty, so
+ *  no checks run in any build. According to the original authors, self
+ *  testing for this class actually occurs in the parser selftest rather
+ *  than here.
+ */
+
+void nsScanner::SelfTest(void) {
+#ifdef _DEBUG
+#endif
+}