diff options
Diffstat (limited to 'dom/serializers')
-rw-r--r-- | dom/serializers/crashtests/bug1747114.html | 30 | ||||
-rw-r--r-- | dom/serializers/crashtests/crashtests.list | 1 | ||||
-rw-r--r-- | dom/serializers/moz.build | 36 | ||||
-rw-r--r-- | dom/serializers/nsDOMSerializer.cpp | 112 | ||||
-rw-r--r-- | dom/serializers/nsDOMSerializer.h | 43 | ||||
-rw-r--r-- | dom/serializers/nsDocumentEncoder.cpp | 2109 | ||||
-rw-r--r-- | dom/serializers/nsHTMLContentSerializer.cpp | 445 | ||||
-rw-r--r-- | dom/serializers/nsHTMLContentSerializer.h | 53 | ||||
-rw-r--r-- | dom/serializers/nsIContentSerializer.h | 97 | ||||
-rw-r--r-- | dom/serializers/nsIDocumentEncoder.idl | 361 | ||||
-rw-r--r-- | dom/serializers/nsPlainTextSerializer.cpp | 1830 | ||||
-rw-r--r-- | dom/serializers/nsPlainTextSerializer.h | 386 | ||||
-rw-r--r-- | dom/serializers/nsXHTMLContentSerializer.cpp | 729 | ||||
-rw-r--r-- | dom/serializers/nsXHTMLContentSerializer.h | 143 | ||||
-rw-r--r-- | dom/serializers/nsXMLContentSerializer.cpp | 1827 | ||||
-rw-r--r-- | dom/serializers/nsXMLContentSerializer.h | 440 |
16 files changed, 8642 insertions, 0 deletions
diff --git a/dom/serializers/crashtests/bug1747114.html b/dom/serializers/crashtests/bug1747114.html new file mode 100644 index 0000000000..09b429a65e --- /dev/null +++ b/dom/serializers/crashtests/bug1747114.html @@ -0,0 +1,30 @@ +<script> +function go() { + a.appendChild(b) + b.setSelectionRange(1, 37, "1") + c.onselectionchange = () => { + b.wrap = "hard" + b.setRangeText(String.fromCodePoint( + 171825, + 2568, + 23726, + 391291, + 509063, + 163770, + 896774, + 556839, + 880943, + 606650, + 55697, + 95835, + 28852, + 507694, + 849936 + )) + } +} +</script> +<body onload=go()> +<textarea id="b" style="white-space: nowrap">&sJ<*\muT+hj</textarea> +<time id="c">a</tt> +<marquee id="a">a</marquee> diff --git a/dom/serializers/crashtests/crashtests.list b/dom/serializers/crashtests/crashtests.list new file mode 100644 index 0000000000..3324d1f473 --- /dev/null +++ b/dom/serializers/crashtests/crashtests.list @@ -0,0 +1 @@ +load bug1747114.html diff --git a/dom/serializers/moz.build b/dom/serializers/moz.build new file mode 100644 index 0000000000..b0a0a35b6f --- /dev/null +++ b/dom/serializers/moz.build @@ -0,0 +1,36 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +with Files("*"): + BUG_COMPONENT = ("Core", "DOM: Serializers") + +XPIDL_SOURCES += [ + "nsIDocumentEncoder.idl", +] + +XPIDL_MODULE = "dom_serializers" + +EXPORTS += [ + "nsDOMSerializer.h", + "nsHTMLContentSerializer.h", + "nsIContentSerializer.h", + "nsPlainTextSerializer.h", + "nsXHTMLContentSerializer.h", + "nsXMLContentSerializer.h", +] + +UNIFIED_SOURCES += [ + "nsDocumentEncoder.cpp", + "nsDOMSerializer.cpp", + "nsHTMLContentSerializer.cpp", + "nsPlainTextSerializer.cpp", + "nsXHTMLContentSerializer.cpp", + "nsXMLContentSerializer.cpp", +] + +FINAL_LIBRARY = "xul" + +CRASHTEST_MANIFESTS += ["crashtests/crashtests.list"] diff --git a/dom/serializers/nsDOMSerializer.cpp b/dom/serializers/nsDOMSerializer.cpp new file mode 100644 index 0000000000..98d92e64ce --- /dev/null +++ b/dom/serializers/nsDOMSerializer.cpp @@ -0,0 +1,112 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsDOMSerializer.h" + +#include "mozilla/Encoding.h" +#include "mozilla/dom/Document.h" +#include "nsIDocumentEncoder.h" +#include "nsComponentManagerUtils.h" +#include "nsContentCID.h" +#include "nsContentUtils.h" +#include "nsError.h" +#include "nsINode.h" + +using namespace mozilla; + +nsDOMSerializer::nsDOMSerializer() = default; + +/* Creates an "application/xhtml+xml" document encoder for aRoot's owner document, initialized with OutputRaw | OutputDontRewriteEncodingDeclaration. Uses aCharset, or the document's character set when aCharset is empty, and limits the encoder to aRoot unless aRoot is the document itself. On any failure the error is thrown on aRv and nullptr is returned. */ static already_AddRefed<nsIDocumentEncoder> SetUpEncoder( + nsINode& aRoot, const nsAString& aCharset, ErrorResult& aRv) { + nsCOMPtr<nsIDocumentEncoder> encoder = + do_createDocumentEncoder("application/xhtml+xml"); + if (!encoder) { + aRv.Throw(NS_ERROR_FAILURE); + return nullptr; + } + + dom::Document* doc = aRoot.OwnerDoc(); + bool entireDocument = (doc == &aRoot); + + // This method will fail if no document + nsresult rv = encoder->NativeInit( + doc, u"application/xhtml+xml"_ns, + nsIDocumentEncoder::OutputRaw | + nsIDocumentEncoder::OutputDontRewriteEncodingDeclaration); + + if (NS_FAILED(rv)) { + aRv.Throw(rv); + return nullptr; + } + + NS_ConvertUTF16toUTF8 charset(aCharset); + if (charset.IsEmpty()) { + doc->GetDocumentCharacterSet()->Name(charset); + } + rv = encoder->SetCharset(charset); + if (NS_FAILED(rv)) { + aRv.Throw(rv); + return nullptr; + } + + // If we are working on the entire document we do not need to + // specify which part to serialize + if (!entireDocument) { + rv = encoder->SetNode(&aRoot); + } + + if (NS_FAILED(rv)) { + aRv.Throw(rv); + return nullptr; + } + + return encoder.forget(); +} + +/* Truncates aStr, then serializes aRoot into it. Throws NS_ERROR_DOM_SECURITY_ERR on aRv when the caller may not access aRoot; encoder set-up and encoding failures are also reported through aRv. */ void nsDOMSerializer::SerializeToString(nsINode& aRoot, nsAString& aStr, + ErrorResult& aRv) { + aStr.Truncate(); + + if (!nsContentUtils::CanCallerAccess(&aRoot)) { + aRv.Throw(NS_ERROR_DOM_SECURITY_ERR); + return; + } + + nsCOMPtr<nsIDocumentEncoder> encoder = SetUpEncoder(aRoot, u""_ns, aRv); + if (aRv.Failed()) { + return; + } + + nsresult rv = encoder->EncodeToString(aStr); + if (NS_FAILED(rv)) { + aRv.Throw(rv); + } +} + +/* Serializes aRoot to aStream using aCharset (or the document's charset when aCharset is empty). Throws NS_ERROR_INVALID_ARG on aRv when aStream is null. */ void nsDOMSerializer::SerializeToStream(nsINode& aRoot, + nsIOutputStream* 
aStream, + const nsAString& aCharset, + ErrorResult& aRv) { + if (NS_WARN_IF(!aStream)) { + aRv.Throw(NS_ERROR_INVALID_ARG); + return; + } + + // The charset arg can be empty, in which case we get the document's + // charset and use that when serializing. + + // No point doing a CanCallerAccess check, because we can only be + // called by system JS or C++. + nsCOMPtr<nsIDocumentEncoder> encoder = SetUpEncoder(aRoot, aCharset, aRv); + if (aRv.Failed()) { + return; + } + + nsresult rv = encoder->EncodeToStream(aStream); + if (NS_FAILED(rv)) { + aRv.Throw(rv); + } +} diff --git a/dom/serializers/nsDOMSerializer.h b/dom/serializers/nsDOMSerializer.h new file mode 100644 index 0000000000..21a898f712 --- /dev/null +++ b/dom/serializers/nsDOMSerializer.h @@ -0,0 +1,43 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef nsDOMSerializer_h_ +#define nsDOMSerializer_h_ + +#include "mozilla/dom/NonRefcountedDOMObject.h" +#include "mozilla/dom/XMLSerializerBinding.h" + +class nsINode; +class nsIOutputStream; + +namespace mozilla { +class ErrorResult; +} + +/* Non-refcounted DOM object that WrapObject() exposes through XMLSerializer_Binding; serializes a node either to a string or to an output stream. */ class nsDOMSerializer final : public mozilla::dom::NonRefcountedDOMObject { + public: + nsDOMSerializer(); + + // WebIDL API + static mozilla::UniquePtr<nsDOMSerializer> Constructor( + const mozilla::dom::GlobalObject& aOwner) { + return mozilla::MakeUnique<nsDOMSerializer>(); + } + + void SerializeToString(nsINode& aRoot, nsAString& aStr, + mozilla::ErrorResult& rv); + + void SerializeToStream(nsINode& aRoot, nsIOutputStream* aStream, + const nsAString& aCharset, mozilla::ErrorResult& aRv); + + bool WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto, + JS::MutableHandle<JSObject*> aReflector) { + return mozilla::dom::XMLSerializer_Binding::Wrap(aCx, this, aGivenProto, + aReflector); + } +}; + +#endif diff --git a/dom/serializers/nsDocumentEncoder.cpp b/dom/serializers/nsDocumentEncoder.cpp new file mode 100644 index 0000000000..14120bae64 --- /dev/null +++ b/dom/serializers/nsDocumentEncoder.cpp @@ -0,0 +1,2109 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * Object that can be used to serialize selections, ranges, or nodes + * to strings in a gazillion different ways. 
+ */ + +#include <utility> + +#include "nscore.h" +#include "nsISupports.h" +#include "nsCOMPtr.h" +#include "nsCRT.h" +#include "nsIContentSerializer.h" +#include "nsIDocumentEncoder.h" +#include "nsINode.h" +#include "nsIContentInlines.h" +#include "nsComponentManagerUtils.h" +#include "nsIOutputStream.h" +#include "nsRange.h" +#include "nsGkAtoms.h" +#include "nsHTMLDocument.h" +#include "nsIContent.h" +#include "nsIScriptContext.h" +#include "nsIScriptGlobalObject.h" +#include "nsITransferable.h" +#include "mozilla/dom/Selection.h" +#include "nsContentUtils.h" +#include "nsElementTable.h" +#include "nsUnicharUtils.h" +#include "nsReadableUtils.h" +#include "nsTArray.h" +#include "nsIFrame.h" +#include "nsLayoutUtils.h" +#include "nsStringBuffer.h" +#include "mozilla/dom/Comment.h" +#include "mozilla/dom/Document.h" +#include "mozilla/dom/DocumentType.h" +#include "mozilla/dom/Element.h" +#include "mozilla/dom/HTMLBRElement.h" +#include "mozilla/dom/ProcessingInstruction.h" +#include "mozilla/dom/ShadowRoot.h" +#include "mozilla/dom/Text.h" +#include "mozilla/Encoding.h" +#include "mozilla/IntegerRange.h" +#include "mozilla/Maybe.h" +#include "mozilla/ScopeExit.h" +#include "mozilla/UniquePtr.h" + +using namespace mozilla; +using namespace mozilla::dom; + +enum nsRangeIterationDirection { kDirectionOut = -1, kDirectionIn = 1 }; + +/* Encodes the UTF-16 contents of a caller-owned output buffer with the given Encoder and writes the resulting bytes to an nsIOutputStream, truncating the buffer after each flush. */ class TextStreamer { + public: + /** + * @param aStream Will be kept alive by the TextStreamer. + * @param aUnicodeEncoder Needs to be non-nullptr. + */ + TextStreamer(nsIOutputStream& aStream, UniquePtr<Encoder> aUnicodeEncoder, + bool aIsPlainText, nsAString& aOutputBuffer); + + /** + * String will be truncated if it is written to stream. + */ + nsresult FlushIfStringLongEnough(); + + /** + * String will be truncated. 
+ */ + nsresult ForceFlush(); + + private: + const static uint32_t kMaxLengthBeforeFlush = 1024; + + const static uint32_t kEncoderBufferSizeInBytes = 4096; + + nsresult EncodeAndWrite(); + + nsresult EncodeAndWriteAndTruncate(); + + const nsCOMPtr<nsIOutputStream> mStream; + const UniquePtr<Encoder> mUnicodeEncoder; + const bool mIsPlainText; + nsAString& mOutputBuffer; +}; + +TextStreamer::TextStreamer(nsIOutputStream& aStream, + UniquePtr<Encoder> aUnicodeEncoder, + bool aIsPlainText, nsAString& aOutputBuffer) + : mStream{&aStream}, + mUnicodeEncoder(std::move(aUnicodeEncoder)), + mIsPlainText(aIsPlainText), + mOutputBuffer(aOutputBuffer) { + MOZ_ASSERT(mUnicodeEncoder); +} + +/* Flushes only when the buffer holds more than kMaxLengthBeforeFlush code units; otherwise returns NS_OK without writing. */ nsresult TextStreamer::FlushIfStringLongEnough() { + nsresult rv = NS_OK; + + if (mOutputBuffer.Length() > kMaxLengthBeforeFlush) { + rv = EncodeAndWriteAndTruncate(); + } + + return rv; +} + +nsresult TextStreamer::ForceFlush() { return EncodeAndWriteAndTruncate(); } + +/* Encodes mOutputBuffer piecewise through a stack buffer of kEncoderBufferSizeInBytes and writes each piece to mStream until the input is exhausted. In plain-text mode an unmappable character is written as '?'. Returns the first failing Write() result, NS_OK otherwise. NOTE(review): streamWritten is never inspected, so a short write by the stream would go unnoticed - confirm the streams used here always write everything. */ nsresult TextStreamer::EncodeAndWrite() { + if (mOutputBuffer.IsEmpty()) { + return NS_OK; + } + + uint8_t buffer[kEncoderBufferSizeInBytes]; + auto src = Span(mOutputBuffer); + auto bufferSpan = Span(buffer); + // Reserve space for terminator + auto dst = bufferSpan.To(bufferSpan.Length() - 1); + for (;;) { + uint32_t result; + size_t read; + size_t written; + if (mIsPlainText) { + std::tie(result, read, written) = + mUnicodeEncoder->EncodeFromUTF16WithoutReplacement(src, dst, false); + if (result != kInputEmpty && result != kOutputFull) { + // There's always room for one byte in the case of + // an unmappable character, because otherwise + // we'd have gotten `kOutputFull`. + dst[written++] = '?'; + } + } else { + std::tie(result, read, written, std::ignore) = + mUnicodeEncoder->EncodeFromUTF16(src, dst, false); + } + src = src.From(read); + // Sadly, we still have test cases that implement nsIOutputStream in JS, so + // the buffer needs to be zero-terminated for XPConnect to do its thing. + // See bug 170416. 
+ bufferSpan[written] = 0; + uint32_t streamWritten; + nsresult rv = mStream->Write(reinterpret_cast<char*>(dst.Elements()), + written, &streamWritten); + if (NS_FAILED(rv)) { + return rv; + } + if (result == kInputEmpty) { + return NS_OK; + } + } +} + +/* Truncates the output buffer regardless of whether EncodeAndWrite() succeeded, and returns EncodeAndWrite()'s result. */ nsresult TextStreamer::EncodeAndWriteAndTruncate() { + const nsresult rv = EncodeAndWrite(); + mOutputBuffer.Truncate(); + return rv; +} + +/** + * The scope may be limited to either a selection, range, or node. + */ +class EncodingScope { + public: + /** + * @return true, iff the scope is limited to a selection, range or node. + */ + bool IsLimited() const; + + RefPtr<Selection> mSelection; + RefPtr<nsRange> mRange; + nsCOMPtr<nsINode> mNode; + bool mNodeIsContainer = false; +}; + +bool EncodingScope::IsLimited() const { return mSelection || mRange || mNode; } + +struct RangeBoundariesInclusiveAncestorsAndOffsets { + /** + * https://dom.spec.whatwg.org/#concept-tree-inclusive-ancestor. + */ + using InclusiveAncestors = AutoTArray<nsIContent*, 8>; + + /** + * https://dom.spec.whatwg.org/#concept-tree-inclusive-ancestor. + */ + using InclusiveAncestorsOffsets = AutoTArray<Maybe<uint32_t>, 8>; + + // The first node is the range's boundary node, the following ones the + // ancestors. + InclusiveAncestors mInclusiveAncestorsOfStart; + // The first offset represents where at the boundary node the range starts. + // Each other offset is the index of the child relative to its parent. + InclusiveAncestorsOffsets mInclusiveAncestorsOffsetsOfStart; + + // The first node is the range's boundary node, the following ones the + // ancestors. + InclusiveAncestors mInclusiveAncestorsOfEnd; + // The first offset represents where at the boundary node the range ends. + // Each other offset is the index of the child relative to its parent. 
+ InclusiveAncestorsOffsets mInclusiveAncestorsOffsetsOfEnd; +}; + +struct ContextInfoDepth { + uint32_t mStart = 0; + uint32_t mEnd = 0; +}; + +/* nsIDocumentEncoder implementation. Serializes the current EncodingScope (selection, range, node, or the whole document) through an nsIContentSerializer, delegating to the nested NodeSerializer, RangeContextSerializer and RangeSerializer helpers. */ class nsDocumentEncoder : public nsIDocumentEncoder { + protected: + class RangeNodeContext { + public: + virtual ~RangeNodeContext() = default; + + virtual bool IncludeInContext(nsINode& aNode) const { return false; } + + virtual int32_t GetImmediateContextCount( + const nsTArray<nsINode*>& aAncestorArray) const { + return -1; + } + }; + + public: + nsDocumentEncoder(); + + protected: + /** + * @param aRangeNodeContext has to be non-null. + */ + explicit nsDocumentEncoder(UniquePtr<RangeNodeContext> aRangeNodeContext); + + public: + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS(nsDocumentEncoder) + NS_DECL_NSIDOCUMENTENCODER + + protected: + virtual ~nsDocumentEncoder(); + + void Initialize(bool aClearCachedSerializer = true); + + /** + * @param aMaxLength As described at + * `nsIDocumentEncoder.encodeToStringWithMaxLength`. + */ + nsresult SerializeDependingOnScope(uint32_t aMaxLength); + + nsresult SerializeSelection(); + + nsresult SerializeNode(); + + /** + * @param aMaxLength As described at + * `nsIDocumentEncoder.encodeToStringWithMaxLength`. + */ + nsresult SerializeWholeDocument(uint32_t aMaxLength); + + /** + * @param aFlags multiple of the flags defined in nsIDocumentEncoder.idl. + */ + static bool IsInvisibleNodeAndShouldBeSkipped(const nsINode& aNode, + const uint32_t aFlags) { + if (aFlags & SkipInvisibleContent) { + // Treat the visibility of the ShadowRoot as if it were + // the host content. + // + // FIXME(emilio): I suspect instead of this a bunch of the GetParent() + // calls here should be doing GetFlattenedTreeParent, then this condition + // should be unreachable... 
+ const nsINode* node{&aNode}; + if (const ShadowRoot* shadowRoot = ShadowRoot::FromNode(node)) { + node = shadowRoot->GetHost(); + } + + if (node->IsContent()) { + nsIFrame* frame = node->AsContent()->GetPrimaryFrame(); + if (!frame) { + if (node->IsElement() && node->AsElement()->IsDisplayContents()) { + return false; + } + if (node->IsText()) { + // We have already checked that our parent is visible. + // + // FIXME(emilio): Text not assigned to a <slot> in Shadow DOM should + // probably return false... + return false; + } + if (node->IsHTMLElement(nsGkAtoms::rp)) { + // Ruby parentheses are part of ruby structure, hence + // shouldn't be stripped out even if it is not displayed. + return false; + } + return true; + } + bool isVisible = frame->StyleVisibility()->IsVisible(); + if (!isVisible && node->IsText()) { + return true; + } + } + } + return false; + } + + void ReleaseDocumentReferenceAndInitialize(bool aClearCachedSerializer); + + /* Scope guard: on destruction, if the encoder's flags contain RequiresReinitAfterOutput, releases the document reference and re-initializes the encoder while keeping the cached serializer. */ class MOZ_STACK_CLASS AutoReleaseDocumentIfNeeded final { + public: + explicit AutoReleaseDocumentIfNeeded(nsDocumentEncoder* aEncoder) + : mEncoder(aEncoder) {} + + ~AutoReleaseDocumentIfNeeded() { + if (mEncoder->mFlags & RequiresReinitAfterOutput) { + const bool clearCachedSerializer = false; + mEncoder->ReleaseDocumentReferenceAndInitialize(clearCachedSerializer); + } + } + + private: + nsDocumentEncoder* mEncoder; + }; + + nsCOMPtr<Document> mDocument; + EncodingScope mEncodingScope; + nsCOMPtr<nsIContentSerializer> mSerializer; + + Maybe<TextStreamer> mTextStreamer; + nsCOMPtr<nsIDocumentEncoderNodeFixup> mNodeFixup; + + nsString mMimeType; + const Encoding* mEncoding; + // Multiple of the flags defined in nsIDocumentEncoder.idl. + uint32_t mFlags; + uint32_t mWrapColumn; + // Whether the serializer cares about being notified to scan elements to + // keep track of whether they are preformatted. This stores the out + // argument of nsIContentSerializer::Init(). 
+ bool mNeedsPreformatScanning; + bool mIsCopying; // Set to true only while copying + nsStringBuffer* mCachedBuffer; + + /* Serializes individual nodes and subtrees. Holds references (not copies) to the state passed to its constructor, so it observes later changes to that state. */ class NodeSerializer { + public: + /** + * @param aFlags multiple of the flags defined in nsIDocumentEncoder.idl. + */ + NodeSerializer(const bool& aNeedsPreformatScanning, + const nsCOMPtr<nsIContentSerializer>& aSerializer, + const uint32_t& aFlags, + const nsCOMPtr<nsIDocumentEncoderNodeFixup>& aNodeFixup, + Maybe<TextStreamer>& aTextStreamer) + : mNeedsPreformatScanning{aNeedsPreformatScanning}, + mSerializer{aSerializer}, + mFlags{aFlags}, + mNodeFixup{aNodeFixup}, + mTextStreamer{aTextStreamer} {} + + nsresult SerializeNodeStart(nsINode& aOriginalNode, int32_t aStartOffset, + int32_t aEndOffset, + nsINode* aFixupNode = nullptr) const; + + enum class SerializeRoot { eYes, eNo }; + + nsresult SerializeToStringRecursive(nsINode* aNode, + SerializeRoot aSerializeRoot, + uint32_t aMaxLength = 0) const; + + nsresult SerializeNodeEnd(nsINode& aOriginalNode, + nsINode* aFixupNode = nullptr) const; + + [[nodiscard]] nsresult SerializeTextNode(nsINode& aNode, + int32_t aStartOffset, + int32_t aEndOffset) const; + + nsresult SerializeToStringIterative(nsINode* aNode) const; + + private: + const bool& mNeedsPreformatScanning; + const nsCOMPtr<nsIContentSerializer>& mSerializer; + // Multiple of the flags defined in nsIDocumentEncoder.idl. 
+ const uint32_t& mFlags; + const nsCOMPtr<nsIDocumentEncoderNodeFixup>& mNodeFixup; + Maybe<TextStreamer>& mTextStreamer; + }; + + NodeSerializer mNodeSerializer; + + const UniquePtr<RangeNodeContext> mRangeNodeContext; + + struct RangeContextSerializer final { + RangeContextSerializer(const RangeNodeContext& aRangeNodeContext, + const NodeSerializer& aNodeSerializer) + : mDisableContextSerialize{false}, + mRangeNodeContext{aRangeNodeContext}, + mNodeSerializer{aNodeSerializer} {} + + nsresult SerializeRangeContextStart( + const nsTArray<nsINode*>& aAncestorArray); + nsresult SerializeRangeContextEnd(); + + // Used when context has already been serialized for + // table cell selections (where parent is <tr>) + bool mDisableContextSerialize; + AutoTArray<AutoTArray<nsINode*, 8>, 8> mRangeContexts; + + const RangeNodeContext& mRangeNodeContext; + + private: + const NodeSerializer& mNodeSerializer; + }; + + RangeContextSerializer mRangeContextSerializer; + + struct RangeSerializer { + // @param aFlags multiple of the flags defined in nsIDocumentEncoder.idl. + RangeSerializer(const uint32_t& aFlags, + const NodeSerializer& aNodeSerializer, + RangeContextSerializer& aRangeContextSerializer) + : mStartRootIndex{0}, + mEndRootIndex{0}, + mHaltRangeHint{false}, + mFlags{aFlags}, + mNodeSerializer{aNodeSerializer}, + mRangeContextSerializer{aRangeContextSerializer} {} + + void Initialize(); + + /** + * @param aDepth the distance (number of `GetParent` calls) from aNode to + * aRange's closest common inclusive ancestor. + */ + nsresult SerializeRangeNodes(const nsRange* aRange, nsINode* aNode, + int32_t aDepth); + + /** + * Serialize aContent's children from aStartOffset to aEndOffset. + * + * @param aDepth the distance (number of `GetParent` calls) from aContent to + * aRange's closest common inclusive ancestor. 
+ */ + [[nodiscard]] nsresult SerializeChildrenOfContent(nsIContent& aContent, + uint32_t aStartOffset, + uint32_t aEndOffset, + const nsRange* aRange, + int32_t aDepth); + + nsresult SerializeRangeToString(const nsRange* aRange); + + /** + * https://dom.spec.whatwg.org/#concept-tree-inclusive-ancestor. + */ + nsCOMPtr<nsINode> mClosestCommonInclusiveAncestorOfRange; + + /** + * https://dom.spec.whatwg.org/#concept-tree-inclusive-ancestor. + */ + AutoTArray<nsINode*, 8> mCommonInclusiveAncestors; + + ContextInfoDepth mContextInfoDepth; + + private: + struct StartAndEndContent { + nsCOMPtr<nsIContent> mStart; + nsCOMPtr<nsIContent> mEnd; + }; + + StartAndEndContent GetStartAndEndContentForRecursionLevel( + int32_t aDepth) const; + + bool HasInvisibleParentAndShouldBeSkipped(nsINode& aNode) const; + + nsresult SerializeNodePartiallyContainedInRange( + nsINode& aNode, nsIContent& aContent, + const StartAndEndContent& aStartAndEndContent, const nsRange& aRange, + int32_t aDepth); + + nsresult SerializeTextNode(nsINode& aNode, const nsIContent& aContent, + const StartAndEndContent& aStartAndEndContent, + const nsRange& aRange) const; + + RangeBoundariesInclusiveAncestorsAndOffsets + mRangeBoundariesInclusiveAncestorsAndOffsets; + int32_t mStartRootIndex; + int32_t mEndRootIndex; + bool mHaltRangeHint; + + // Multiple of the flags defined in nsIDocumentEncoder.idl. 
+ const uint32_t& mFlags; + + const NodeSerializer& mNodeSerializer; + RangeContextSerializer& mRangeContextSerializer; + }; + + RangeSerializer mRangeSerializer; +}; + +/* Resets the per-serialization range state; mCommonInclusiveAncestors is not cleared here. */ void nsDocumentEncoder::RangeSerializer::Initialize() { + mContextInfoDepth = {}; + mStartRootIndex = 0; + mEndRootIndex = 0; + mHaltRangeHint = false; + mClosestCommonInclusiveAncestorOfRange = nullptr; + mRangeBoundariesInclusiveAncestorsAndOffsets = {}; +} + +NS_IMPL_CYCLE_COLLECTING_ADDREF(nsDocumentEncoder) +NS_IMPL_CYCLE_COLLECTING_RELEASE_WITH_LAST_RELEASE( + nsDocumentEncoder, ReleaseDocumentReferenceAndInitialize(true)) + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsDocumentEncoder) + NS_INTERFACE_MAP_ENTRY(nsIDocumentEncoder) + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTION( + nsDocumentEncoder, mDocument, mEncodingScope.mSelection, + mEncodingScope.mRange, mEncodingScope.mNode, mSerializer, + mRangeSerializer.mClosestCommonInclusiveAncestorOfRange) + +/* The helper serializers are constructed with references to this encoder's own members; the MIME type defaults to "text/plain". */ nsDocumentEncoder::nsDocumentEncoder( + UniquePtr<RangeNodeContext> aRangeNodeContext) + : mEncoding(nullptr), + mIsCopying(false), + mCachedBuffer(nullptr), + mNodeSerializer(mNeedsPreformatScanning, mSerializer, mFlags, mNodeFixup, + mTextStreamer), + mRangeNodeContext(std::move(aRangeNodeContext)), + mRangeContextSerializer(*mRangeNodeContext, mNodeSerializer), + mRangeSerializer(mFlags, mNodeSerializer, mRangeContextSerializer) { + MOZ_ASSERT(mRangeNodeContext); + + Initialize(); + mMimeType.AssignLiteral("text/plain"); +} + +nsDocumentEncoder::nsDocumentEncoder() + : nsDocumentEncoder(MakeUnique<RangeNodeContext>()) {} + +/* Resets flags, wrap column (72), range state, preformat scanning, encoding scope and node fixup; the cached serializer is dropped only when aClearCachedSerializer is true. */ void nsDocumentEncoder::Initialize(bool aClearCachedSerializer) { + mFlags = 0; + mWrapColumn = 72; + mRangeSerializer.Initialize(); + mNeedsPreformatScanning = false; + mRangeContextSerializer.mDisableContextSerialize = false; + mEncodingScope = {}; + mNodeFixup = nullptr; + if (aClearCachedSerializer) { + mSerializer = nullptr; + } +} + +/* Returns true iff aContent has a parent element and that parent is an HTML <tr>. */ static bool ParentIsTR(nsIContent* 
aContent) { + mozilla::dom::Element* parent = aContent->GetParentElement(); + if (!parent) { + return false; + } + return parent->IsHTMLElement(nsGkAtoms::tr); +} + +/* Dispatches on mEncodingScope in priority order: selection, then range, then node, otherwise the whole document (the only path that uses aMaxLength). The scope is reset afterwards whether or not serialization succeeded. */ nsresult nsDocumentEncoder::SerializeDependingOnScope(uint32_t aMaxLength) { + nsresult rv = NS_OK; + if (mEncodingScope.mSelection) { + rv = SerializeSelection(); + } else if (nsRange* range = mEncodingScope.mRange) { + rv = mRangeSerializer.SerializeRangeToString(range); + } else if (mEncodingScope.mNode) { + rv = SerializeNode(); + } else { + rv = SerializeWholeDocument(aMaxLength); + } + + mEncodingScope = {}; + + return rv; +} + +/* Serializes every range of the scoped selection in order. Consecutive ranges that start in the same <tr> are wrapped in a single row start/end, with the range context emitted once around them; mContextInfoDepth.mStart is restored to the first range's value at the end. */ nsresult nsDocumentEncoder::SerializeSelection() { + NS_ENSURE_TRUE(mEncodingScope.mSelection, NS_ERROR_FAILURE); + + nsresult rv = NS_OK; + const Selection* selection = mEncodingScope.mSelection; + nsCOMPtr<nsINode> node; + nsCOMPtr<nsINode> prevNode; + uint32_t firstRangeStartDepth = 0; + const uint32_t rangeCount = selection->RangeCount(); + for (const uint32_t i : IntegerRange(rangeCount)) { + MOZ_ASSERT(selection->RangeCount() == rangeCount); + RefPtr<const nsRange> range = selection->GetRangeAt(i); + + // Bug 236546: newlines not added when copying table cells into clipboard + // Each selected cell shows up as a range containing a row with a single + // cell get the row, compare it to previous row and emit </tr><tr> as + // needed Bug 137450: Problem copying/pasting a table from a web page to + // Excel. Each separate block of <tr></tr> produced above will be wrapped + // by the immediate context. This assumes that you can't select cells that + // are multiple selections from two tables simultaneously. 
+ node = range->GetStartContainer(); + NS_ENSURE_TRUE(node, NS_ERROR_FAILURE); + if (node != prevNode) { + if (prevNode) { + rv = mNodeSerializer.SerializeNodeEnd(*prevNode); + NS_ENSURE_SUCCESS(rv, rv); + } + nsCOMPtr<nsIContent> content = nsIContent::FromNodeOrNull(node); + if (content && content->IsHTMLElement(nsGkAtoms::tr) && + !ParentIsTR(content)) { + if (!prevNode) { + // Went from a non-<tr> to a <tr> + mRangeSerializer.mCommonInclusiveAncestors.Clear(); + nsContentUtils::GetInclusiveAncestors( + node->GetParentNode(), + mRangeSerializer.mCommonInclusiveAncestors); + rv = mRangeContextSerializer.SerializeRangeContextStart( + mRangeSerializer.mCommonInclusiveAncestors); + NS_ENSURE_SUCCESS(rv, rv); + // Don't let SerializeRangeToString serialize the context again + mRangeContextSerializer.mDisableContextSerialize = true; + } + + rv = mNodeSerializer.SerializeNodeStart(*node, 0, -1); + NS_ENSURE_SUCCESS(rv, rv); + prevNode = node; + } else if (prevNode) { + // Went from a <tr> to a non-<tr> + mRangeContextSerializer.mDisableContextSerialize = false; + + // `mCommonInclusiveAncestors` is used in `EncodeToStringWithContext` + // too. Update it here to mimic the old behavior. + mRangeSerializer.mCommonInclusiveAncestors.Clear(); + nsContentUtils::GetInclusiveAncestors( + prevNode->GetParentNode(), + mRangeSerializer.mCommonInclusiveAncestors); + + rv = mRangeContextSerializer.SerializeRangeContextEnd(); + NS_ENSURE_SUCCESS(rv, rv); + prevNode = nullptr; + } + } + + rv = mRangeSerializer.SerializeRangeToString(range); + NS_ENSURE_SUCCESS(rv, rv); + if (i == 0) { + firstRangeStartDepth = mRangeSerializer.mContextInfoDepth.mStart; + } + } + mRangeSerializer.mContextInfoDepth.mStart = firstRangeStartDepth; + + if (prevNode) { + rv = mNodeSerializer.SerializeNodeEnd(*prevNode); + NS_ENSURE_SUCCESS(rv, rv); + mRangeContextSerializer.mDisableContextSerialize = false; + + // `mCommonInclusiveAncestors` is used in `EncodeToStringWithContext` + // too. 
Update it here to mimic the old behavior. + mRangeSerializer.mCommonInclusiveAncestors.Clear(); + nsContentUtils::GetInclusiveAncestors( + prevNode->GetParentNode(), mRangeSerializer.mCommonInclusiveAncestors); + + rv = mRangeContextSerializer.SerializeRangeContextEnd(); + NS_ENSURE_SUCCESS(rv, rv); + } + + // Just to be safe + mRangeContextSerializer.mDisableContextSerialize = false; + + return rv; +} + +/* Serializes the scoped node. The iterative serializer is used only for a container node when there is no node fixup, SkipInvisibleContent is not set and no text streamer exists; otherwise the recursive serializer is used, omitting the root itself for a container node. */ nsresult nsDocumentEncoder::SerializeNode() { + NS_ENSURE_TRUE(mEncodingScope.mNode, NS_ERROR_FAILURE); + + nsresult rv = NS_OK; + nsINode* node = mEncodingScope.mNode; + const bool nodeIsContainer = mEncodingScope.mNodeIsContainer; + if (!mNodeFixup && !(mFlags & SkipInvisibleContent) && !mTextStreamer && + nodeIsContainer) { + rv = mNodeSerializer.SerializeToStringIterative(node); + } else { + rv = mNodeSerializer.SerializeToStringRecursive( + node, nodeIsContainer ? NodeSerializer::SerializeRoot::eNo + : NodeSerializer::SerializeRoot::eYes); + } + + return rv; +} + +/* Requires an unlimited scope (no selection, range or node); emits the document start and then recursively serializes mDocument, honouring aMaxLength. */ nsresult nsDocumentEncoder::SerializeWholeDocument(uint32_t aMaxLength) { + NS_ENSURE_FALSE(mEncodingScope.mSelection, NS_ERROR_FAILURE); + NS_ENSURE_FALSE(mEncodingScope.mRange, NS_ERROR_FAILURE); + NS_ENSURE_FALSE(mEncodingScope.mNode, NS_ERROR_FAILURE); + + nsresult rv = mSerializer->AppendDocumentStart(mDocument); + NS_ENSURE_SUCCESS(rv, rv); + + rv = mNodeSerializer.SerializeToStringRecursive( + mDocument, NodeSerializer::SerializeRoot::eYes, aMaxLength); + return rv; +} + +nsDocumentEncoder::~nsDocumentEncoder() { + if (mCachedBuffer) { + mCachedBuffer->Release(); + } +} + +NS_IMETHODIMP +nsDocumentEncoder::Init(Document* aDocument, const nsAString& aMimeType, + uint32_t aFlags) { + return NativeInit(aDocument, aMimeType, aFlags); +} + +NS_IMETHODIMP +nsDocumentEncoder::NativeInit(Document* aDocument, const nsAString& aMimeType, + uint32_t aFlags) { + if (!aDocument) return NS_ERROR_INVALID_ARG; + + /* Keep the cached serializer only when the MIME type is unchanged. */ Initialize(!mMimeType.Equals(aMimeType)); + + mDocument = aDocument; + + mMimeType = aMimeType; 
+ + mFlags = aFlags; + mIsCopying = false; + + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetWrapColumn(uint32_t aWC) { + mWrapColumn = aWC; + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetSelection(Selection* aSelection) { + mEncodingScope.mSelection = aSelection; + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetRange(nsRange* aRange) { + mEncodingScope.mRange = aRange; + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetNode(nsINode* aNode) { + mEncodingScope.mNodeIsContainer = false; + mEncodingScope.mNode = aNode; + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetContainerNode(nsINode* aContainer) { + mEncodingScope.mNodeIsContainer = true; + mEncodingScope.mNode = aContainer; + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetCharset(const nsACString& aCharset) { + const Encoding* encoding = Encoding::ForLabel(aCharset); + if (!encoding) { + /* Unknown encoding label. */ return NS_ERROR_UCONV_NOCONV; + } + /* Store the label's output encoding rather than the labelled encoding itself. */ mEncoding = encoding->OutputEncoding(); + return NS_OK; +} + +NS_IMETHODIMP +nsDocumentEncoder::GetMimeType(nsAString& aMimeType) { + aMimeType = mMimeType; + return NS_OK; +} + +/* Decides which node is serialized in place of aOriginalNode. Without a node fixup the original node is used. With one, aFixupNode is used when given; otherwise mNodeFixup->FixupNode() is asked, which also reports whether the fixup node's children should be serialized. */ class FixupNodeDeterminer { + public: + FixupNodeDeterminer(nsIDocumentEncoderNodeFixup* aNodeFixup, + nsINode* aFixupNode, nsINode& aOriginalNode) + : mIsSerializationOfFixupChildrenNeeded{false}, + mNodeFixup(aNodeFixup), + mOriginalNode(aOriginalNode) { + if (mNodeFixup) { + if (aFixupNode) { + mFixupNode = aFixupNode; + } else { + mNodeFixup->FixupNode(&mOriginalNode, + &mIsSerializationOfFixupChildrenNeeded, + getter_AddRefs(mFixupNode)); + } + } + } + + bool IsSerializationOfFixupChildrenNeeded() const { + return mIsSerializationOfFixupChildrenNeeded; + } + + /** + * @return The fixup node, if available, otherwise the original node. The + * former is kept alive by this object. + */ + nsINode& GetFixupNodeFallBackToOriginalNode() const { + return mFixupNode ? 
*mFixupNode : mOriginalNode; + } + + private: + bool mIsSerializationOfFixupChildrenNeeded; + nsIDocumentEncoderNodeFixup* mNodeFixup; + nsCOMPtr<nsINode> mFixupNode; + nsINode& mOriginalNode; +}; + +/* Emits the opening part of aOriginalNode (or of its fixup node) through mSerializer. Preformat scanning, when enabled, runs before the invisibility check, so it also happens for nodes that are skipped. Invisible line breaks are dropped when OutputPreformatted or OutputDropInvisibleBreak is set. Node types not handled by the switch produce no output and return NS_OK. */ nsresult nsDocumentEncoder::NodeSerializer::SerializeNodeStart( + nsINode& aOriginalNode, int32_t aStartOffset, int32_t aEndOffset, + nsINode* aFixupNode) const { + if (mNeedsPreformatScanning) { + if (aOriginalNode.IsElement()) { + mSerializer->ScanElementForPreformat(aOriginalNode.AsElement()); + } else if (aOriginalNode.IsText()) { + const nsCOMPtr<nsINode> parent = aOriginalNode.GetParent(); + if (parent && parent->IsElement()) { + mSerializer->ScanElementForPreformat(parent->AsElement()); + } + } + } + + if (IsInvisibleNodeAndShouldBeSkipped(aOriginalNode, mFlags)) { + return NS_OK; + } + + FixupNodeDeterminer fixupNodeDeterminer{mNodeFixup, aFixupNode, + aOriginalNode}; + nsINode* node = &fixupNodeDeterminer.GetFixupNodeFallBackToOriginalNode(); + + nsresult rv = NS_OK; + + if (node->IsElement()) { + if ((mFlags & (nsIDocumentEncoder::OutputPreformatted | + nsIDocumentEncoder::OutputDropInvisibleBreak)) && + nsLayoutUtils::IsInvisibleBreak(node)) { + return rv; + } + rv = mSerializer->AppendElementStart(node->AsElement(), + aOriginalNode.AsElement()); + return rv; + } + + switch (node->NodeType()) { + case nsINode::TEXT_NODE: { + rv = mSerializer->AppendText(static_cast<nsIContent*>(node), aStartOffset, + aEndOffset); + break; + } + case nsINode::CDATA_SECTION_NODE: { + rv = mSerializer->AppendCDATASection(static_cast<nsIContent*>(node), + aStartOffset, aEndOffset); + break; + } + case nsINode::PROCESSING_INSTRUCTION_NODE: { + rv = mSerializer->AppendProcessingInstruction( + static_cast<ProcessingInstruction*>(node), aStartOffset, aEndOffset); + break; + } + case nsINode::COMMENT_NODE: { + rv = mSerializer->AppendComment(static_cast<Comment*>(node), aStartOffset, + aEndOffset); + break; + } + case nsINode::DOCUMENT_TYPE_NODE: { + rv = 
mSerializer->AppendDoctype(static_cast<DocumentType*>(node)); + break; + } + } + + return rv; +} + +nsresult nsDocumentEncoder::NodeSerializer::SerializeNodeEnd( + nsINode& aOriginalNode, nsINode* aFixupNode) const { + if (mNeedsPreformatScanning) { + if (aOriginalNode.IsElement()) { + mSerializer->ForgetElementForPreformat(aOriginalNode.AsElement()); + } else if (aOriginalNode.IsText()) { + const nsCOMPtr<nsINode> parent = aOriginalNode.GetParent(); + if (parent && parent->IsElement()) { + mSerializer->ForgetElementForPreformat(parent->AsElement()); + } + } + } + + if (IsInvisibleNodeAndShouldBeSkipped(aOriginalNode, mFlags)) { + return NS_OK; + } + + nsresult rv = NS_OK; + + FixupNodeDeterminer fixupNodeDeterminer{mNodeFixup, aFixupNode, + aOriginalNode}; + nsINode* node = &fixupNodeDeterminer.GetFixupNodeFallBackToOriginalNode(); + + if (node->IsElement()) { + rv = mSerializer->AppendElementEnd(node->AsElement(), + aOriginalNode.AsElement()); + } + + return rv; +} + +nsresult nsDocumentEncoder::NodeSerializer::SerializeToStringRecursive( + nsINode* aNode, SerializeRoot aSerializeRoot, uint32_t aMaxLength) const { + uint32_t outputLength{0}; + nsresult rv = mSerializer->GetOutputLength(outputLength); + NS_ENSURE_SUCCESS(rv, rv); + + if (aMaxLength > 0 && outputLength >= aMaxLength) { + return NS_OK; + } + + NS_ENSURE_TRUE(aNode, NS_ERROR_NULL_POINTER); + + if (IsInvisibleNodeAndShouldBeSkipped(*aNode, mFlags)) { + return NS_OK; + } + + FixupNodeDeterminer fixupNodeDeterminer{mNodeFixup, nullptr, *aNode}; + nsINode* maybeFixedNode = + &fixupNodeDeterminer.GetFixupNodeFallBackToOriginalNode(); + + if (mFlags & SkipInvisibleContent) { + if (aNode->IsContent()) { + if (nsIFrame* frame = aNode->AsContent()->GetPrimaryFrame()) { + if (!frame->IsSelectable(nullptr)) { + aSerializeRoot = SerializeRoot::eNo; + } + } + } + } + + if (aSerializeRoot == SerializeRoot::eYes) { + int32_t endOffset = -1; + if (aMaxLength > 0) { + MOZ_ASSERT(aMaxLength >= outputLength); + 
endOffset = aMaxLength - outputLength; + } + rv = SerializeNodeStart(*aNode, 0, endOffset, maybeFixedNode); + NS_ENSURE_SUCCESS(rv, rv); + } + + nsINode* node = fixupNodeDeterminer.IsSerializationOfFixupChildrenNeeded() + ? maybeFixedNode + : aNode; + + for (nsINode* child = node->GetFirstChildOfTemplateOrNode(); child; + child = child->GetNextSibling()) { + rv = SerializeToStringRecursive(child, SerializeRoot::eYes, aMaxLength); + NS_ENSURE_SUCCESS(rv, rv); + } + + if (aSerializeRoot == SerializeRoot::eYes) { + rv = SerializeNodeEnd(*aNode, maybeFixedNode); + NS_ENSURE_SUCCESS(rv, rv); + } + + if (mTextStreamer) { + rv = mTextStreamer->FlushIfStringLongEnough(); + } + + return rv; +} + +nsresult nsDocumentEncoder::NodeSerializer::SerializeToStringIterative( + nsINode* aNode) const { + nsresult rv; + + nsINode* node = aNode->GetFirstChildOfTemplateOrNode(); + while (node) { + nsINode* current = node; + rv = SerializeNodeStart(*current, 0, -1, current); + NS_ENSURE_SUCCESS(rv, rv); + node = current->GetFirstChildOfTemplateOrNode(); + while (!node && current && current != aNode) { + rv = SerializeNodeEnd(*current); + NS_ENSURE_SUCCESS(rv, rv); + // Check if we have siblings. + node = current->GetNextSibling(); + if (!node) { + // Perhaps parent node has siblings. + current = current->GetParentNode(); + + // Handle template element. If the parent is a template's content, + // then adjust the parent to be the template element. 
+ if (current && current != aNode && current->IsDocumentFragment()) { + nsIContent* host = current->AsDocumentFragment()->GetHost(); + if (host && host->IsHTMLElement(nsGkAtoms::_template)) { + current = host; + } + } + } + } + } + + return NS_OK; +} + +static bool IsTextNode(nsINode* aNode) { return aNode && aNode->IsText(); } + +nsresult nsDocumentEncoder::NodeSerializer::SerializeTextNode( + nsINode& aNode, int32_t aStartOffset, int32_t aEndOffset) const { + MOZ_ASSERT(IsTextNode(&aNode)); + + nsresult rv = SerializeNodeStart(aNode, aStartOffset, aEndOffset); + NS_ENSURE_SUCCESS(rv, rv); + rv = SerializeNodeEnd(aNode); + NS_ENSURE_SUCCESS(rv, rv); + return rv; +} + +nsDocumentEncoder::RangeSerializer::StartAndEndContent +nsDocumentEncoder::RangeSerializer::GetStartAndEndContentForRecursionLevel( + const int32_t aDepth) const { + StartAndEndContent result; + + const auto& inclusiveAncestorsOfStart = + mRangeBoundariesInclusiveAncestorsAndOffsets.mInclusiveAncestorsOfStart; + const auto& inclusiveAncestorsOfEnd = + mRangeBoundariesInclusiveAncestorsAndOffsets.mInclusiveAncestorsOfEnd; + int32_t start = mStartRootIndex - aDepth; + if (start >= 0 && (uint32_t)start <= inclusiveAncestorsOfStart.Length()) { + result.mStart = inclusiveAncestorsOfStart[start]; + } + + int32_t end = mEndRootIndex - aDepth; + if (end >= 0 && (uint32_t)end <= inclusiveAncestorsOfEnd.Length()) { + result.mEnd = inclusiveAncestorsOfEnd[end]; + } + + return result; +} + +nsresult nsDocumentEncoder::RangeSerializer::SerializeTextNode( + nsINode& aNode, const nsIContent& aContent, + const StartAndEndContent& aStartAndEndContent, + const nsRange& aRange) const { + const int32_t startOffset = + (aStartAndEndContent.mStart == &aContent) ? aRange.StartOffset() : 0; + const int32_t endOffset = + (aStartAndEndContent.mEnd == &aContent) ? 
aRange.EndOffset() : -1; + return mNodeSerializer.SerializeTextNode(aNode, startOffset, endOffset); +} + +nsresult nsDocumentEncoder::RangeSerializer::SerializeRangeNodes( + const nsRange* const aRange, nsINode* const aNode, const int32_t aDepth) { + MOZ_ASSERT(aDepth >= 0); + MOZ_ASSERT(aRange); + + nsCOMPtr<nsIContent> content = nsIContent::FromNodeOrNull(aNode); + NS_ENSURE_TRUE(content, NS_ERROR_FAILURE); + + if (nsDocumentEncoder::IsInvisibleNodeAndShouldBeSkipped(*aNode, mFlags)) { + return NS_OK; + } + + nsresult rv = NS_OK; + + StartAndEndContent startAndEndContent = + GetStartAndEndContentForRecursionLevel(aDepth); + + if (startAndEndContent.mStart != content && + startAndEndContent.mEnd != content) { + // node is completely contained in range. Serialize the whole subtree + // rooted by this node. + rv = mNodeSerializer.SerializeToStringRecursive( + aNode, NodeSerializer::SerializeRoot::eYes); + NS_ENSURE_SUCCESS(rv, rv); + } else { + rv = SerializeNodePartiallyContainedInRange( + *aNode, *content, startAndEndContent, *aRange, aDepth); + if (NS_WARN_IF(NS_FAILED(rv))) { + return rv; + } + } + return NS_OK; +} + +nsresult +nsDocumentEncoder::RangeSerializer::SerializeNodePartiallyContainedInRange( + nsINode& aNode, nsIContent& aContent, + const StartAndEndContent& aStartAndEndContent, const nsRange& aRange, + const int32_t aDepth) { + // due to implementation it is impossible for text node to be both start and + // end of range. We would have handled that case without getting here. + // XXXsmaug What does this all mean? + if (IsTextNode(&aNode)) { + nsresult rv = + SerializeTextNode(aNode, aContent, aStartAndEndContent, aRange); + NS_ENSURE_SUCCESS(rv, rv); + } else { + if (&aNode != mClosestCommonInclusiveAncestorOfRange) { + if (mRangeContextSerializer.mRangeNodeContext.IncludeInContext(aNode)) { + // halt the incrementing of mContextInfoDepth. This + // is so paste client will include this node in paste. 
+ mHaltRangeHint = true; + } + if ((aStartAndEndContent.mStart == &aContent) && !mHaltRangeHint) { + ++mContextInfoDepth.mStart; + } + if ((aStartAndEndContent.mEnd == &aContent) && !mHaltRangeHint) { + ++mContextInfoDepth.mEnd; + } + + // serialize the start of this node + nsresult rv = mNodeSerializer.SerializeNodeStart(aNode, 0, -1); + NS_ENSURE_SUCCESS(rv, rv); + } + + const auto& inclusiveAncestorsOffsetsOfStart = + mRangeBoundariesInclusiveAncestorsAndOffsets + .mInclusiveAncestorsOffsetsOfStart; + const auto& inclusiveAncestorsOffsetsOfEnd = + mRangeBoundariesInclusiveAncestorsAndOffsets + .mInclusiveAncestorsOffsetsOfEnd; + // do some calculations that will tell us which children of this + // node are in the range. + Maybe<uint32_t> startOffset = Some(0); + Maybe<uint32_t> endOffset; + if (aStartAndEndContent.mStart == &aContent && mStartRootIndex >= aDepth) { + startOffset = inclusiveAncestorsOffsetsOfStart[mStartRootIndex - aDepth]; + } + if (aStartAndEndContent.mEnd == &aContent && mEndRootIndex >= aDepth) { + endOffset = inclusiveAncestorsOffsetsOfEnd[mEndRootIndex - aDepth]; + } + // generated aContent will cause offset values of Nothing to be returned. + if (startOffset.isNothing()) { + startOffset = Some(0); + } + if (endOffset.isNothing()) { + endOffset = Some(aContent.GetChildCount()); + } else { + // if we are at the "tip" of the selection, endOffset is fine. + // otherwise, we need to add one. This is because of the semantics + // of the offset list created by GetInclusiveAncestorsAndOffsets(). The + // intermediate points on the list use the endOffset of the + // location of the ancestor, rather than just past it. So we need + // to add one here in order to include it in the children we serialize. 
+ if (&aNode != aRange.GetEndContainer()) { + MOZ_ASSERT(*endOffset != UINT32_MAX); + endOffset.ref()++; + } + } + + if (*endOffset) { + nsresult rv = SerializeChildrenOfContent(aContent, *startOffset, + *endOffset, &aRange, aDepth); + NS_ENSURE_SUCCESS(rv, rv); + } + // serialize the end of this node + if (&aNode != mClosestCommonInclusiveAncestorOfRange) { + nsresult rv = mNodeSerializer.SerializeNodeEnd(aNode); + NS_ENSURE_SUCCESS(rv, rv); + } + } + + return NS_OK; +} + +nsresult nsDocumentEncoder::RangeSerializer::SerializeChildrenOfContent( + nsIContent& aContent, uint32_t aStartOffset, uint32_t aEndOffset, + const nsRange* aRange, int32_t aDepth) { + // serialize the children of this node that are in the range + nsIContent* childAsNode = aContent.GetFirstChild(); + uint32_t j = 0; + + for (; j < aStartOffset && childAsNode; ++j) { + childAsNode = childAsNode->GetNextSibling(); + } + + MOZ_ASSERT(j == aStartOffset); + + for (; childAsNode && j < aEndOffset; ++j) { + nsresult rv{NS_OK}; + if ((j == aStartOffset) || (j == aEndOffset - 1)) { + rv = SerializeRangeNodes(aRange, childAsNode, aDepth + 1); + } else { + rv = mNodeSerializer.SerializeToStringRecursive( + childAsNode, NodeSerializer::SerializeRoot::eYes); + } + + if (NS_FAILED(rv)) { + return rv; + } + + childAsNode = childAsNode->GetNextSibling(); + } + + return NS_OK; +} + +nsresult nsDocumentEncoder::RangeContextSerializer::SerializeRangeContextStart( + const nsTArray<nsINode*>& aAncestorArray) { + if (mDisableContextSerialize) { + return NS_OK; + } + + AutoTArray<nsINode*, 8>* serializedContext = mRangeContexts.AppendElement(); + + int32_t i = aAncestorArray.Length(), j; + nsresult rv = NS_OK; + + // currently only for table-related elements; see Bug 137450 + j = mRangeNodeContext.GetImmediateContextCount(aAncestorArray); + + while (i > 0) { + nsINode* node = aAncestorArray.ElementAt(--i); + if (!node) break; + + // Either a general inclusion or as immediate context + if 
(mRangeNodeContext.IncludeInContext(*node) || i < j) { + rv = mNodeSerializer.SerializeNodeStart(*node, 0, -1); + serializedContext->AppendElement(node); + if (NS_FAILED(rv)) break; + } + } + + return rv; +} + +nsresult nsDocumentEncoder::RangeContextSerializer::SerializeRangeContextEnd() { + if (mDisableContextSerialize) { + return NS_OK; + } + + MOZ_RELEASE_ASSERT(!mRangeContexts.IsEmpty(), + "Tried to end context without starting one."); + AutoTArray<nsINode*, 8>& serializedContext = mRangeContexts.LastElement(); + + nsresult rv = NS_OK; + for (nsINode* node : Reversed(serializedContext)) { + rv = mNodeSerializer.SerializeNodeEnd(*node); + + if (NS_FAILED(rv)) break; + } + + mRangeContexts.RemoveLastElement(); + return rv; +} + +bool nsDocumentEncoder::RangeSerializer::HasInvisibleParentAndShouldBeSkipped( + nsINode& aNode) const { + if (!(mFlags & SkipInvisibleContent)) { + return false; + } + + // Check that the parent is visible if we don't have a frame. + // IsInvisibleNodeAndShouldBeSkipped() will do it when there's a frame. 
+ nsCOMPtr<nsIContent> content = nsIContent::FromNode(aNode); + if (content && !content->GetPrimaryFrame()) { + nsIContent* parent = content->GetParent(); + return !parent || IsInvisibleNodeAndShouldBeSkipped(*parent, mFlags); + } + + return false; +} + +nsresult nsDocumentEncoder::RangeSerializer::SerializeRangeToString( + const nsRange* aRange) { + if (!aRange || aRange->Collapsed()) return NS_OK; + + mClosestCommonInclusiveAncestorOfRange = + aRange->GetClosestCommonInclusiveAncestor(); + + if (!mClosestCommonInclusiveAncestorOfRange) { + return NS_OK; + } + + nsINode* startContainer = aRange->GetStartContainer(); + NS_ENSURE_TRUE(startContainer, NS_ERROR_FAILURE); + int32_t startOffset = aRange->StartOffset(); + + nsINode* endContainer = aRange->GetEndContainer(); + NS_ENSURE_TRUE(endContainer, NS_ERROR_FAILURE); + int32_t endOffset = aRange->EndOffset(); + + mContextInfoDepth = {}; + mCommonInclusiveAncestors.Clear(); + + mRangeBoundariesInclusiveAncestorsAndOffsets = {}; + auto& inclusiveAncestorsOfStart = + mRangeBoundariesInclusiveAncestorsAndOffsets.mInclusiveAncestorsOfStart; + auto& inclusiveAncestorsOffsetsOfStart = + mRangeBoundariesInclusiveAncestorsAndOffsets + .mInclusiveAncestorsOffsetsOfStart; + auto& inclusiveAncestorsOfEnd = + mRangeBoundariesInclusiveAncestorsAndOffsets.mInclusiveAncestorsOfEnd; + auto& inclusiveAncestorsOffsetsOfEnd = + mRangeBoundariesInclusiveAncestorsAndOffsets + .mInclusiveAncestorsOffsetsOfEnd; + + nsContentUtils::GetInclusiveAncestors(mClosestCommonInclusiveAncestorOfRange, + mCommonInclusiveAncestors); + nsContentUtils::GetInclusiveAncestorsAndOffsets( + startContainer, startOffset, &inclusiveAncestorsOfStart, + &inclusiveAncestorsOffsetsOfStart); + nsContentUtils::GetInclusiveAncestorsAndOffsets( + endContainer, endOffset, &inclusiveAncestorsOfEnd, + &inclusiveAncestorsOffsetsOfEnd); + + nsCOMPtr<nsIContent> commonContent = + nsIContent::FromNodeOrNull(mClosestCommonInclusiveAncestorOfRange); + mStartRootIndex = 
inclusiveAncestorsOfStart.IndexOf(commonContent); + mEndRootIndex = inclusiveAncestorsOfEnd.IndexOf(commonContent); + + nsresult rv = NS_OK; + + rv = mRangeContextSerializer.SerializeRangeContextStart( + mCommonInclusiveAncestors); + NS_ENSURE_SUCCESS(rv, rv); + + if (startContainer == endContainer && IsTextNode(startContainer)) { + if (HasInvisibleParentAndShouldBeSkipped(*startContainer)) { + return NS_OK; + } + rv = mNodeSerializer.SerializeTextNode(*startContainer, startOffset, + endOffset); + NS_ENSURE_SUCCESS(rv, rv); + } else { + rv = SerializeRangeNodes(aRange, mClosestCommonInclusiveAncestorOfRange, 0); + NS_ENSURE_SUCCESS(rv, rv); + } + rv = mRangeContextSerializer.SerializeRangeContextEnd(); + NS_ENSURE_SUCCESS(rv, rv); + + return rv; +} + +void nsDocumentEncoder::ReleaseDocumentReferenceAndInitialize( + bool aClearCachedSerializer) { + mDocument = nullptr; + + Initialize(aClearCachedSerializer); +} + +NS_IMETHODIMP +nsDocumentEncoder::EncodeToString(nsAString& aOutputString) { + return EncodeToStringWithMaxLength(0, aOutputString); +} + +NS_IMETHODIMP +nsDocumentEncoder::EncodeToStringWithMaxLength(uint32_t aMaxLength, + nsAString& aOutputString) { + MOZ_ASSERT(mRangeContextSerializer.mRangeContexts.IsEmpty(), + "Re-entrant call to nsDocumentEncoder."); + auto rangeContextGuard = + MakeScopeExit([&] { mRangeContextSerializer.mRangeContexts.Clear(); }); + + if (!mDocument) return NS_ERROR_NOT_INITIALIZED; + + AutoReleaseDocumentIfNeeded autoReleaseDocument(this); + + aOutputString.Truncate(); + + nsString output; + static const size_t kStringBufferSizeInBytes = 2048; + if (!mCachedBuffer) { + mCachedBuffer = nsStringBuffer::Alloc(kStringBufferSizeInBytes).take(); + if (NS_WARN_IF(!mCachedBuffer)) { + return NS_ERROR_OUT_OF_MEMORY; + } + } + NS_ASSERTION( + !mCachedBuffer->IsReadonly(), + "nsIDocumentEncoder shouldn't keep reference to non-readonly buffer!"); + static_cast<char16_t*>(mCachedBuffer->Data())[0] = char16_t(0); + mCachedBuffer->ToString(0, 
output, true); + // output owns the buffer now! + mCachedBuffer = nullptr; + + if (!mSerializer) { + nsAutoCString progId(NS_CONTENTSERIALIZER_CONTRACTID_PREFIX); + AppendUTF16toUTF8(mMimeType, progId); + + mSerializer = do_CreateInstance(progId.get()); + NS_ENSURE_TRUE(mSerializer, NS_ERROR_NOT_IMPLEMENTED); + } + + nsresult rv = NS_OK; + + bool rewriteEncodingDeclaration = + !mEncodingScope.IsLimited() && + !(mFlags & OutputDontRewriteEncodingDeclaration); + mSerializer->Init(mFlags, mWrapColumn, mEncoding, mIsCopying, + rewriteEncodingDeclaration, &mNeedsPreformatScanning, + output); + + rv = SerializeDependingOnScope(aMaxLength); + NS_ENSURE_SUCCESS(rv, rv); + + rv = mSerializer->FlushAndFinish(); + + mCachedBuffer = nsStringBuffer::FromString(output); + // We have to be careful how we set aOutputString, because we don't + // want it to end up sharing mCachedBuffer if we plan to reuse it. + bool setOutput = false; + // Try to cache the buffer. + if (mCachedBuffer) { + if ((mCachedBuffer->StorageSize() == kStringBufferSizeInBytes) && + !mCachedBuffer->IsReadonly()) { + mCachedBuffer->AddRef(); + } else { + if (NS_SUCCEEDED(rv)) { + mCachedBuffer->ToString(output.Length(), aOutputString); + setOutput = true; + } + mCachedBuffer = nullptr; + } + } + + if (!setOutput && NS_SUCCEEDED(rv)) { + aOutputString.Append(output.get(), output.Length()); + } + + return rv; +} + +NS_IMETHODIMP +nsDocumentEncoder::EncodeToStream(nsIOutputStream* aStream) { + MOZ_ASSERT(mRangeContextSerializer.mRangeContexts.IsEmpty(), + "Re-entrant call to nsDocumentEncoder."); + auto rangeContextGuard = + MakeScopeExit([&] { mRangeContextSerializer.mRangeContexts.Clear(); }); + NS_ENSURE_ARG_POINTER(aStream); + + nsresult rv = NS_OK; + + if (!mDocument) return NS_ERROR_NOT_INITIALIZED; + + if (!mEncoding) { + return NS_ERROR_UCONV_NOCONV; + } + + nsAutoString buf; + const bool isPlainText = mMimeType.LowerCaseEqualsLiteral(kTextMime); + mTextStreamer.emplace(*aStream, mEncoding->NewEncoder(), 
isPlainText, buf); + + rv = EncodeToString(buf); + + // Force a flush of the last chunk of data. + rv = mTextStreamer->ForceFlush(); + NS_ENSURE_SUCCESS(rv, rv); + + mTextStreamer.reset(); + + return rv; +} + +NS_IMETHODIMP +nsDocumentEncoder::EncodeToStringWithContext(nsAString& aContextString, + nsAString& aInfoString, + nsAString& aEncodedString) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +nsDocumentEncoder::SetNodeFixup(nsIDocumentEncoderNodeFixup* aFixup) { + mNodeFixup = aFixup; + return NS_OK; +} + +bool do_getDocumentTypeSupportedForEncoding(const char* aContentType) { + if (!nsCRT::strcmp(aContentType, "text/xml") || + !nsCRT::strcmp(aContentType, "application/xml") || + !nsCRT::strcmp(aContentType, "application/xhtml+xml") || + !nsCRT::strcmp(aContentType, "image/svg+xml") || + !nsCRT::strcmp(aContentType, "text/html") || + !nsCRT::strcmp(aContentType, "text/plain")) { + return true; + } + return false; +} + +already_AddRefed<nsIDocumentEncoder> do_createDocumentEncoder( + const char* aContentType) { + if (do_getDocumentTypeSupportedForEncoding(aContentType)) { + return do_AddRef(new nsDocumentEncoder); + } + return nullptr; +} + +class nsHTMLCopyEncoder : public nsDocumentEncoder { + private: + class RangeNodeContext final : public nsDocumentEncoder::RangeNodeContext { + bool IncludeInContext(nsINode& aNode) const final; + + int32_t GetImmediateContextCount( + const nsTArray<nsINode*>& aAncestorArray) const final; + }; + + public: + nsHTMLCopyEncoder(); + ~nsHTMLCopyEncoder(); + + NS_IMETHOD Init(Document* aDocument, const nsAString& aMimeType, + uint32_t aFlags) override; + + // overridden methods from nsDocumentEncoder + MOZ_CAN_RUN_SCRIPT_BOUNDARY + NS_IMETHOD SetSelection(Selection* aSelection) override; + NS_IMETHOD EncodeToStringWithContext(nsAString& aContextString, + nsAString& aInfoString, + nsAString& aEncodedString) override; + NS_IMETHOD EncodeToString(nsAString& aOutputString) override; + + protected: + enum Endpoint { kStart, 
kEnd }; + + nsresult PromoteRange(nsRange* inRange); + nsresult PromoteAncestorChain(nsCOMPtr<nsINode>* ioNode, + int32_t* ioStartOffset, int32_t* ioEndOffset); + nsresult GetPromotedPoint(Endpoint aWhere, nsINode* aNode, int32_t aOffset, + nsCOMPtr<nsINode>* outNode, int32_t* outOffset, + nsINode* aCommon); + static nsCOMPtr<nsINode> GetChildAt(nsINode* aParent, int32_t aOffset); + static bool IsMozBR(Element* aNode); + static nsresult GetNodeLocation(nsINode* inChild, + nsCOMPtr<nsINode>* outParent, + int32_t* outOffset); + bool IsRoot(nsINode* aNode); + static bool IsFirstNode(nsINode* aNode); + static bool IsLastNode(nsINode* aNode); + + bool mIsTextWidget; +}; + +nsHTMLCopyEncoder::nsHTMLCopyEncoder() + : nsDocumentEncoder{MakeUnique<nsHTMLCopyEncoder::RangeNodeContext>()} { + mIsTextWidget = false; +} + +nsHTMLCopyEncoder::~nsHTMLCopyEncoder() = default; + +NS_IMETHODIMP +nsHTMLCopyEncoder::Init(Document* aDocument, const nsAString& aMimeType, + uint32_t aFlags) { + if (!aDocument) return NS_ERROR_INVALID_ARG; + + mIsTextWidget = false; + Initialize(); + + mIsCopying = true; + mDocument = aDocument; + + // Hack, hack! Traditionally, the caller passes text/plain, which is + // treated as "guess text/html or text/plain" in this context. (It has a + // different meaning in other contexts. Sigh.) From now on, "text/plain" + // means forcing text/plain instead of guessing. 
+ if (aMimeType.EqualsLiteral("text/plain")) { + mMimeType.AssignLiteral("text/plain"); + } else { + mMimeType.AssignLiteral("text/html"); + } + + // Make all links absolute when copying + // (see related bugs #57296, #41924, #58646, #32768) + mFlags = aFlags | OutputAbsoluteLinks; + + if (!mDocument->IsScriptEnabled()) mFlags |= OutputNoScriptContent; + + return NS_OK; +} + +NS_IMETHODIMP +nsHTMLCopyEncoder::SetSelection(Selection* aSelection) { + // check for text widgets: we need to recognize these so that + // we don't tweak the selection to be outside of the magic + // div that ender-lite text widgets are embedded in. + + if (!aSelection) return NS_ERROR_NULL_POINTER; + + const uint32_t rangeCount = aSelection->RangeCount(); + + // if selection is uninitialized return + if (!rangeCount) { + return NS_ERROR_FAILURE; + } + + // we'll just use the common parent of the first range. Implicit assumption + // here that multi-range selections are table cell selections, in which case + // the common parent is somewhere in the table and we don't really care where. + // + // FIXME(emilio, bug 1455894): This assumption is already wrong, and will + // probably be more wrong in a Shadow DOM world... + // + // We should be able to write this as "Find the common ancestor of the + // selection, then go through the flattened tree and serialize the selected + // nodes", effectively serializing the composed tree. 
+ RefPtr<nsRange> range = aSelection->GetRangeAt(0); + nsINode* commonParent = range->GetClosestCommonInclusiveAncestor(); + + for (nsCOMPtr<nsIContent> selContent( + nsIContent::FromNodeOrNull(commonParent)); + selContent; selContent = selContent->GetParent()) { + // checking for selection inside a plaintext form widget + if (selContent->IsAnyOfHTMLElements(nsGkAtoms::input, + nsGkAtoms::textarea)) { + mIsTextWidget = true; + break; + } + } + + // normalize selection if we are not in a widget + if (mIsTextWidget) { + mEncodingScope.mSelection = aSelection; + mMimeType.AssignLiteral("text/plain"); + return NS_OK; + } + + // XXX We should try to get rid of the Selection object here. + // XXX bug 1245883 + + // also consider ourselves in a text widget if we can't find an html document + if (!(mDocument && mDocument->IsHTMLDocument())) { + mIsTextWidget = true; + mEncodingScope.mSelection = aSelection; + // mMimeType is set to text/plain when encoding starts. + return NS_OK; + } + + // there's no Clone() for selection! fix... 
+ // nsresult rv = aSelection->Clone(getter_AddRefs(mSelection); + // NS_ENSURE_SUCCESS(rv, rv); + mEncodingScope.mSelection = new Selection(SelectionType::eNormal, nullptr); + + // loop thru the ranges in the selection + for (const uint32_t rangeIdx : IntegerRange(rangeCount)) { + MOZ_ASSERT(aSelection->RangeCount() == rangeCount); + range = aSelection->GetRangeAt(rangeIdx); + NS_ENSURE_TRUE(range, NS_ERROR_FAILURE); + RefPtr<nsRange> myRange = range->CloneRange(); + MOZ_ASSERT(myRange); + + // adjust range to include any ancestors whose children are entirely + // selected + nsresult rv = PromoteRange(myRange); + NS_ENSURE_SUCCESS(rv, rv); + + ErrorResult result; + RefPtr<Selection> selection(mEncodingScope.mSelection); + RefPtr<Document> document(mDocument); + selection->AddRangeAndSelectFramesAndNotifyListenersInternal( + *myRange, document, result); + rv = result.StealNSResult(); + NS_ENSURE_SUCCESS(rv, rv); + } + + return NS_OK; +} + +NS_IMETHODIMP +nsHTMLCopyEncoder::EncodeToString(nsAString& aOutputString) { + if (mIsTextWidget) { + mMimeType.AssignLiteral("text/plain"); + } + return nsDocumentEncoder::EncodeToString(aOutputString); +} + +NS_IMETHODIMP +nsHTMLCopyEncoder::EncodeToStringWithContext(nsAString& aContextString, + nsAString& aInfoString, + nsAString& aEncodedString) { + nsresult rv = EncodeToString(aEncodedString); + NS_ENSURE_SUCCESS(rv, rv); + + // do not encode any context info or range hints if we are in a text widget. + if (mIsTextWidget) return NS_OK; + + // now encode common ancestors into aContextString. Note that the common + // ancestors will be for the last range in the selection in the case of + // multirange selections. Encoding ancestors for every range in a multirange + // selection in a way that could be understood by the paste code would be a + // lot more work to do. As a practical matter, selections are single range, + // and the ones that aren't are table cell selections where all the cells are + // in the same table. 
+ + mSerializer->Init(mFlags, mWrapColumn, mEncoding, mIsCopying, false, + &mNeedsPreformatScanning, aContextString); + + // leaf of ancestors might be text node. If so discard it. + int32_t count = mRangeSerializer.mCommonInclusiveAncestors.Length(); + int32_t i; + nsCOMPtr<nsINode> node; + if (count > 0) { + node = mRangeSerializer.mCommonInclusiveAncestors.ElementAt(0); + } + + if (node && IsTextNode(node)) { + mRangeSerializer.mCommonInclusiveAncestors.RemoveElementAt(0); + if (mRangeSerializer.mContextInfoDepth.mStart) { + --mRangeSerializer.mContextInfoDepth.mStart; + } + if (mRangeSerializer.mContextInfoDepth.mEnd) { + --mRangeSerializer.mContextInfoDepth.mEnd; + } + count--; + } + + i = count; + while (i > 0) { + node = mRangeSerializer.mCommonInclusiveAncestors.ElementAt(--i); + rv = mNodeSerializer.SerializeNodeStart(*node, 0, -1); + NS_ENSURE_SUCCESS(rv, rv); + } + // i = 0; guaranteed by above + while (i < count) { + node = mRangeSerializer.mCommonInclusiveAncestors.ElementAt(i++); + rv = mNodeSerializer.SerializeNodeEnd(*node); + NS_ENSURE_SUCCESS(rv, rv); + } + + mSerializer->Finish(); + + // encode range info : the start and end depth of the selection, where the + // depth is distance down in the parent hierarchy. Later we will need to add + // leading/trailing whitespace info to this. 
+ nsAutoString infoString; + infoString.AppendInt(mRangeSerializer.mContextInfoDepth.mStart); + infoString.Append(char16_t(',')); + infoString.AppendInt(mRangeSerializer.mContextInfoDepth.mEnd); + aInfoString = infoString; + + return rv; +} + +bool nsHTMLCopyEncoder::RangeNodeContext::IncludeInContext( + nsINode& aNode) const { + nsCOMPtr<nsIContent> content(nsIContent::FromNodeOrNull(&aNode)); + + if (!content) return false; + + return content->IsAnyOfHTMLElements( + nsGkAtoms::b, nsGkAtoms::i, nsGkAtoms::u, nsGkAtoms::a, nsGkAtoms::tt, + nsGkAtoms::s, nsGkAtoms::big, nsGkAtoms::small, nsGkAtoms::strike, + nsGkAtoms::em, nsGkAtoms::strong, nsGkAtoms::dfn, nsGkAtoms::code, + nsGkAtoms::cite, nsGkAtoms::var, nsGkAtoms::abbr, nsGkAtoms::font, + nsGkAtoms::script, nsGkAtoms::span, nsGkAtoms::pre, nsGkAtoms::h1, + nsGkAtoms::h2, nsGkAtoms::h3, nsGkAtoms::h4, nsGkAtoms::h5, + nsGkAtoms::h6); +} + +nsresult nsHTMLCopyEncoder::PromoteRange(nsRange* inRange) { + if (!inRange->IsPositioned()) { + return NS_ERROR_UNEXPECTED; + } + nsCOMPtr<nsINode> startNode = inRange->GetStartContainer(); + uint32_t startOffset = inRange->StartOffset(); + nsCOMPtr<nsINode> endNode = inRange->GetEndContainer(); + uint32_t endOffset = inRange->EndOffset(); + nsCOMPtr<nsINode> common = inRange->GetClosestCommonInclusiveAncestor(); + + nsCOMPtr<nsINode> opStartNode; + nsCOMPtr<nsINode> opEndNode; + int32_t opStartOffset, opEndOffset; + + // examine range endpoints. 
+ nsresult rv = + GetPromotedPoint(kStart, startNode, static_cast<int32_t>(startOffset), + address_of(opStartNode), &opStartOffset, common); + NS_ENSURE_SUCCESS(rv, rv); + rv = GetPromotedPoint(kEnd, endNode, static_cast<int32_t>(endOffset), + address_of(opEndNode), &opEndOffset, common); + NS_ENSURE_SUCCESS(rv, rv); + + // if both range endpoints are at the common ancestor, check for possible + // inclusion of ancestors + if (opStartNode == common && opEndNode == common) { + rv = PromoteAncestorChain(address_of(opStartNode), &opStartOffset, + &opEndOffset); + NS_ENSURE_SUCCESS(rv, rv); + opEndNode = opStartNode; + } + + // set the range to the new values + ErrorResult err; + inRange->SetStart(*opStartNode, static_cast<uint32_t>(opStartOffset), err); + if (NS_WARN_IF(err.Failed())) { + return err.StealNSResult(); + } + inRange->SetEnd(*opEndNode, static_cast<uint32_t>(opEndOffset), err); + if (NS_WARN_IF(err.Failed())) { + return err.StealNSResult(); + } + return NS_OK; +} + +// PromoteAncestorChain will promote a range represented by +// [{*ioNode,*ioStartOffset} , {*ioNode,*ioEndOffset}] The promotion is +// different from that found in getPromotedPoint: it will only promote one +// endpoint if it can promote the other. Thus, instead of having a +// startnode/endNode, there is just the one ioNode. 
+nsresult nsHTMLCopyEncoder::PromoteAncestorChain(nsCOMPtr<nsINode>* ioNode, + int32_t* ioStartOffset, + int32_t* ioEndOffset) { + if (!ioNode || !ioStartOffset || !ioEndOffset) return NS_ERROR_NULL_POINTER; + + nsresult rv = NS_OK; + bool done = false; + + nsCOMPtr<nsINode> frontNode, endNode, parent; + int32_t frontOffset, endOffset; + + // save the editable state of the ioNode, so we don't promote an ancestor if + // it has different editable state + nsCOMPtr<nsINode> node = *ioNode; + bool isEditable = node->IsEditable(); + + // loop for as long as we can promote both endpoints + while (!done) { + node = *ioNode; + parent = node->GetParentNode(); + if (!parent) { + done = true; + } else { + // passing parent as last param to GetPromotedPoint() allows it to promote + // only one level up the hierarchy. + rv = GetPromotedPoint(kStart, *ioNode, *ioStartOffset, + address_of(frontNode), &frontOffset, parent); + NS_ENSURE_SUCCESS(rv, rv); + // then we make the same attempt with the endpoint + rv = GetPromotedPoint(kEnd, *ioNode, *ioEndOffset, address_of(endNode), + &endOffset, parent); + NS_ENSURE_SUCCESS(rv, rv); + + // if both endpoints were promoted one level and isEditable is the same as + // the original node, keep looping - otherwise we are done. 
+ if ((frontNode != parent) || (endNode != parent) || + (frontNode->IsEditable() != isEditable)) + done = true; + else { + *ioNode = frontNode; + *ioStartOffset = frontOffset; + *ioEndOffset = endOffset; + } + } + } + return rv; +} + +nsresult nsHTMLCopyEncoder::GetPromotedPoint(Endpoint aWhere, nsINode* aNode, + int32_t aOffset, + nsCOMPtr<nsINode>* outNode, + int32_t* outOffset, + nsINode* common) { + nsresult rv = NS_OK; + nsCOMPtr<nsINode> node = aNode; + nsCOMPtr<nsINode> parent = aNode; + int32_t offset = aOffset; + bool bResetPromotion = false; + + // default values + *outNode = node; + *outOffset = offset; + + if (common == node) return NS_OK; + + if (aWhere == kStart) { + // some special casing for text nodes + if (auto nodeAsText = aNode->GetAsText()) { + // if not at beginning of text node, we are done + if (offset > 0) { + // unless everything before us is just whitespace. NOTE: we need a more + // general solution that truly detects all cases of non-significant + // whitespace with no false alarms. + nsAutoString text; + nodeAsText->SubstringData(0, offset, text, IgnoreErrors()); + text.CompressWhitespace(); + if (!text.IsEmpty()) return NS_OK; + bResetPromotion = true; + } + // else + rv = GetNodeLocation(aNode, address_of(parent), &offset); + NS_ENSURE_SUCCESS(rv, rv); + } else { + node = GetChildAt(parent, offset); + } + if (!node) node = parent; + + // finding the real start for this point. look up the tree for as long as + // we are the first node in the container, and as long as we haven't hit the + // body node. 
+ if (!IsRoot(node) && (parent != common)) { + rv = GetNodeLocation(node, address_of(parent), &offset); + NS_ENSURE_SUCCESS(rv, rv); + if (offset == -1) return NS_OK; // we hit generated content; STOP + while ((IsFirstNode(node)) && (!IsRoot(parent)) && (parent != common)) { + if (bResetPromotion) { + nsCOMPtr<nsIContent> content = nsIContent::FromNodeOrNull(parent); + if (content && content->IsHTMLElement()) { + if (nsHTMLElement::IsBlock( + nsHTMLTags::AtomTagToId(content->NodeInfo()->NameAtom()))) { + bResetPromotion = false; + } + } + } + + node = parent; + rv = GetNodeLocation(node, address_of(parent), &offset); + NS_ENSURE_SUCCESS(rv, rv); + if (offset == -1) // we hit generated content; STOP + { + // back up a bit + parent = node; + offset = 0; + break; + } + } + if (bResetPromotion) { + *outNode = aNode; + *outOffset = aOffset; + } else { + *outNode = parent; + *outOffset = offset; + } + return rv; + } + } + + if (aWhere == kEnd) { + // some special casing for text nodes + if (auto nodeAsText = aNode->GetAsText()) { + // if not at end of text node, we are done + uint32_t len = aNode->Length(); + if (offset < (int32_t)len) { + // unless everything after us in just whitespace. NOTE: we need a more + // general solution that truly detects all cases of non-significant + // whitespace with no false alarms. + nsAutoString text; + nodeAsText->SubstringData(offset, len - offset, text, IgnoreErrors()); + text.CompressWhitespace(); + if (!text.IsEmpty()) return NS_OK; + bResetPromotion = true; + } + rv = GetNodeLocation(aNode, address_of(parent), &offset); + NS_ENSURE_SUCCESS(rv, rv); + } else { + if (offset) offset--; // we want node _before_ offset + node = GetChildAt(parent, offset); + } + if (!node) node = parent; + + // finding the real end for this point. look up the tree for as long as we + // are the last node in the container, and as long as we haven't hit the + // body node. 
+ if (!IsRoot(node) && (parent != common)) { + rv = GetNodeLocation(node, address_of(parent), &offset); + NS_ENSURE_SUCCESS(rv, rv); + if (offset == -1) return NS_OK; // we hit generated content; STOP + while ((IsLastNode(node)) && (!IsRoot(parent)) && (parent != common)) { + if (bResetPromotion) { + nsCOMPtr<nsIContent> content = nsIContent::FromNodeOrNull(parent); + if (content && content->IsHTMLElement()) { + if (nsHTMLElement::IsBlock( + nsHTMLTags::AtomTagToId(content->NodeInfo()->NameAtom()))) { + bResetPromotion = false; + } + } + } + + node = parent; + rv = GetNodeLocation(node, address_of(parent), &offset); + NS_ENSURE_SUCCESS(rv, rv); + if (offset == -1) // we hit generated content; STOP + { + // back up a bit + parent = node; + offset = 0; + break; + } + } + if (bResetPromotion) { + *outNode = aNode; + *outOffset = aOffset; + } else { + *outNode = parent; + offset++; // add one since this in an endpoint - want to be AFTER node. + *outOffset = offset; + } + return rv; + } + } + + return rv; +} + +nsCOMPtr<nsINode> nsHTMLCopyEncoder::GetChildAt(nsINode* aParent, + int32_t aOffset) { + nsCOMPtr<nsINode> resultNode; + + if (!aParent) return resultNode; + + nsCOMPtr<nsIContent> content = nsIContent::FromNodeOrNull(aParent); + MOZ_ASSERT(content, "null content in nsHTMLCopyEncoder::GetChildAt"); + + resultNode = content->GetChildAt_Deprecated(aOffset); + + return resultNode; +} + +bool nsHTMLCopyEncoder::IsMozBR(Element* aElement) { + HTMLBRElement* brElement = HTMLBRElement::FromNodeOrNull(aElement); + return brElement && brElement->IsPaddingForEmptyLastLine(); +} + +nsresult nsHTMLCopyEncoder::GetNodeLocation(nsINode* inChild, + nsCOMPtr<nsINode>* outParent, + int32_t* outOffset) { + NS_ASSERTION((inChild && outParent && outOffset), "bad args"); + if (inChild && outParent && outOffset) { + nsCOMPtr<nsIContent> child = nsIContent::FromNodeOrNull(inChild); + if (!child) { + return NS_ERROR_NULL_POINTER; + } + + nsIContent* parent = child->GetParent(); + if 
(!parent) { + return NS_ERROR_NULL_POINTER; + } + + *outParent = parent; + *outOffset = parent->ComputeIndexOf_Deprecated(child); + return NS_OK; + } + return NS_ERROR_NULL_POINTER; +} + +bool nsHTMLCopyEncoder::IsRoot(nsINode* aNode) { + nsCOMPtr<nsIContent> content = nsIContent::FromNodeOrNull(aNode); + if (!content) { + return false; + } + + if (mIsTextWidget) { + return content->IsHTMLElement(nsGkAtoms::div); + } + + return content->IsAnyOfHTMLElements(nsGkAtoms::body, nsGkAtoms::td, + nsGkAtoms::th); +} + +bool nsHTMLCopyEncoder::IsFirstNode(nsINode* aNode) { + // need to check if any nodes before us are really visible. + // Mike wrote something for me along these lines in nsSelectionController, + // but I don't think it's ready for use yet - revisit. + // HACK: for now, simply consider all whitespace text nodes to be + // invisible formatting nodes. + for (nsIContent* sibling = aNode->GetPreviousSibling(); sibling; + sibling = sibling->GetPreviousSibling()) { + if (!sibling->TextIsOnlyWhitespace()) { + return false; + } + } + + return true; +} + +bool nsHTMLCopyEncoder::IsLastNode(nsINode* aNode) { + // need to check if any nodes after us are really visible. + // Mike wrote something for me along these lines in nsSelectionController, + // but I don't think it's ready for use yet - revisit. + // HACK: for now, simply consider all whitespace text nodes to be + // invisible formatting nodes. + for (nsIContent* sibling = aNode->GetNextSibling(); sibling; + sibling = sibling->GetNextSibling()) { + if (sibling->IsElement() && IsMozBR(sibling->AsElement())) { + // we ignore trailing moz BRs. 
+ continue; + } + if (!sibling->TextIsOnlyWhitespace()) { + return false; + } + } + + return true; +} + +already_AddRefed<nsIDocumentEncoder> do_createHTMLCopyEncoder() { + return do_AddRef(new nsHTMLCopyEncoder); +} + +int32_t nsHTMLCopyEncoder::RangeNodeContext::GetImmediateContextCount( + const nsTArray<nsINode*>& aAncestorArray) const { + int32_t i = aAncestorArray.Length(), j = 0; + while (j < i) { + nsINode* node = aAncestorArray.ElementAt(j); + if (!node) { + break; + } + nsCOMPtr<nsIContent> content(nsIContent::FromNodeOrNull(node)); + if (!content || !content->IsAnyOfHTMLElements( + nsGkAtoms::tr, nsGkAtoms::thead, nsGkAtoms::tbody, + nsGkAtoms::tfoot, nsGkAtoms::table)) { + break; + } + ++j; + } + return j; +} diff --git a/dom/serializers/nsHTMLContentSerializer.cpp b/dom/serializers/nsHTMLContentSerializer.cpp new file mode 100644 index 0000000000..a0b8c8882c --- /dev/null +++ b/dom/serializers/nsHTMLContentSerializer.cpp @@ -0,0 +1,445 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert an HTML (not XHTML!) DOM to an HTML + * string that could be parsed into more or less the original DOM. 
+ */ + +#include "nsHTMLContentSerializer.h" + +#include "nsIContent.h" +#include "mozilla/dom/Document.h" +#include "nsElementTable.h" +#include "nsNameSpaceManager.h" +#include "nsString.h" +#include "nsUnicharUtils.h" +#include "nsIDocumentEncoder.h" +#include "nsGkAtoms.h" +#include "nsIURI.h" +#include "nsNetUtil.h" +#include "nsEscape.h" +#include "nsCRT.h" +#include "nsContentUtils.h" +#include "nsIScriptElement.h" +#include "nsAttrName.h" +#include "mozilla/dom/Element.h" +#include "nsParserConstants.h" + +using namespace mozilla::dom; + +nsresult NS_NewHTMLContentSerializer(nsIContentSerializer** aSerializer) { + RefPtr<nsHTMLContentSerializer> it = new nsHTMLContentSerializer(); + it.forget(aSerializer); + return NS_OK; +} + +nsHTMLContentSerializer::nsHTMLContentSerializer() { mIsHTMLSerializer = true; } + +nsHTMLContentSerializer::~nsHTMLContentSerializer() = default; + +NS_IMETHODIMP +nsHTMLContentSerializer::AppendDocumentStart(Document* aDocument) { + return NS_OK; +} + +bool nsHTMLContentSerializer::SerializeHTMLAttributes( + Element* aElement, Element* aOriginalElement, nsAString& aTagPrefix, + const nsAString& aTagNamespaceURI, nsAtom* aTagName, int32_t aNamespace, + nsAString& aStr) { + MaybeSerializeIsValue(aElement, aStr); + + int32_t count = aElement->GetAttrCount(); + if (!count) return true; + + nsresult rv; + nsAutoString valueStr; + + for (int32_t index = 0; index < count; index++) { + const nsAttrName* name = aElement->GetAttrNameAt(index); + int32_t namespaceID = name->NamespaceID(); + nsAtom* attrName = name->LocalName(); + + // Filter out any attribute starting with [-|_]moz + nsDependentAtomString attrNameStr(attrName); + if (StringBeginsWith(attrNameStr, u"_moz"_ns) || + StringBeginsWith(attrNameStr, u"-moz"_ns)) { + continue; + } + aElement->GetAttr(namespaceID, attrName, valueStr); + + if (mIsCopying && mIsFirstChildOfOL && aTagName == nsGkAtoms::li && + aNamespace == kNameSpaceID_XHTML && attrName == nsGkAtoms::value && + 
namespaceID == kNameSpaceID_None) { + // This is handled separately in SerializeLIValueAttribute() + continue; + } + bool isJS = IsJavaScript(aElement, attrName, namespaceID, valueStr); + + if (((attrName == nsGkAtoms::href && (namespaceID == kNameSpaceID_None || + namespaceID == kNameSpaceID_XLink)) || + (attrName == nsGkAtoms::src && namespaceID == kNameSpaceID_None))) { + // Make all links absolute when converting only the selection: + if (mFlags & nsIDocumentEncoder::OutputAbsoluteLinks) { + // Would be nice to handle OBJECT tags, but that gets more complicated + // since we have to search the tag list for CODEBASE as well. For now, + // just leave them relative. + nsIURI* uri = aElement->GetBaseURI(); + if (uri) { + nsAutoString absURI; + rv = NS_MakeAbsoluteURI(absURI, valueStr, uri); + if (NS_SUCCEEDED(rv)) { + valueStr = absURI; + } + } + } + } + + if (mRewriteEncodingDeclaration && aTagName == nsGkAtoms::meta && + aNamespace == kNameSpaceID_XHTML && attrName == nsGkAtoms::content && + namespaceID == kNameSpaceID_None) { + // If we're serializing a <meta http-equiv="content-type">, + // use the proper value, rather than what's in the document. + nsAutoString header; + aElement->GetAttr(nsGkAtoms::httpEquiv, header); + if (header.LowerCaseEqualsLiteral("content-type")) { + valueStr = u"text/html; charset="_ns + NS_ConvertASCIItoUTF16(mCharset); + } + } + + nsDependentAtomString nameStr(attrName); + nsAutoString prefix; + if (namespaceID == kNameSpaceID_XML) { + prefix.AssignLiteral(u"xml"); + } else if (namespaceID == kNameSpaceID_XLink) { + prefix.AssignLiteral(u"xlink"); + } + + // Expand shorthand attribute. 
+ if (aNamespace == kNameSpaceID_XHTML && namespaceID == kNameSpaceID_None && + IsShorthandAttr(attrName, aTagName) && valueStr.IsEmpty()) { + valueStr = nameStr; + } + NS_ENSURE_TRUE(SerializeAttr(prefix, nameStr, valueStr, aStr, !isJS), + false); + } + + return true; +} + +NS_IMETHODIMP +nsHTMLContentSerializer::AppendElementStart(Element* aElement, + Element* aOriginalElement) { + NS_ENSURE_ARG(aElement); + NS_ENSURE_STATE(mOutput); + + bool forceFormat = false; + nsresult rv = NS_OK; + if (!CheckElementStart(aElement, forceFormat, *mOutput, rv)) { + // When we go to AppendElementEnd for this element, we're going to + // MaybeLeaveFromPreContent(). So make sure to MaybeEnterInPreContent() + // now, so our PreLevel() doesn't get confused. + MaybeEnterInPreContent(aElement); + return rv; + } + + NS_ENSURE_SUCCESS(rv, rv); + + nsAtom* name = aElement->NodeInfo()->NameAtom(); + int32_t ns = aElement->GetNameSpaceID(); + + bool lineBreakBeforeOpen = LineBreakBeforeOpen(ns, name); + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { + if (mColPos && lineBreakBeforeOpen) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + if (!mColPos) { + NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } else if (mAddSpace) { + bool result = AppendToString(char16_t(' '), *mOutput); + mAddSpace = false; + NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); + } + } else if (mAddSpace) { + bool result = AppendToString(char16_t(' '), *mOutput); + mAddSpace = false; + NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + // Always reset to avoid false newlines in case MaybeAddNewlineForRootNode + // wasn't called + mAddNewlineForRootNode = false; + + NS_ENSURE_TRUE(AppendToString(kLessThan, *mOutput), NS_ERROR_OUT_OF_MEMORY); + + 
NS_ENSURE_TRUE(AppendToString(nsDependentAtomString(name), *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + MaybeEnterInPreContent(aElement); + + // for block elements, we increase the indentation + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) + NS_ENSURE_TRUE(IncrIndentation(name), NS_ERROR_OUT_OF_MEMORY); + + // Need to keep track of OL and LI elements in order to get ordinal number + // for the LI. + if (mIsCopying && name == nsGkAtoms::ol && ns == kNameSpaceID_XHTML) { + // We are copying and current node is an OL; + // Store its start attribute value in olState->startVal. + nsAutoString start; + int32_t startAttrVal = 0; + + aElement->GetAttr(nsGkAtoms::start, start); + if (!start.IsEmpty()) { + nsresult rv = NS_OK; + startAttrVal = start.ToInteger(&rv); + // If OL has "start" attribute, first LI element has to start with that + // value Therefore subtracting 1 as all the LI elements are incrementing + // it before using it; In failure of ToInteger(), default StartAttrValue + // to 0. + if (NS_SUCCEEDED(rv)) + startAttrVal--; + else + startAttrVal = 0; + } + mOLStateStack.AppendElement(olState(startAttrVal, true)); + } + + if (mIsCopying && name == nsGkAtoms::li && ns == kNameSpaceID_XHTML) { + mIsFirstChildOfOL = IsFirstChildOfOL(aOriginalElement); + if (mIsFirstChildOfOL) { + // If OL is parent of this LI, serialize attributes in different manner. + NS_ENSURE_TRUE(SerializeLIValueAttribute(aElement, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + } + + // Even LI passed above have to go through this + // for serializing attributes other than "value". 
+ nsAutoString dummyPrefix; + NS_ENSURE_TRUE( + SerializeHTMLAttributes(aElement, aOriginalElement, dummyPrefix, u""_ns, + name, ns, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + if (ns == kNameSpaceID_XHTML && + (name == nsGkAtoms::script || name == nsGkAtoms::style || + (name == nsGkAtoms::noscript && + aElement->OwnerDoc()->IsScriptEnabled()) || + name == nsGkAtoms::noframes)) { + ++mDisableEntityEncoding; + } + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() && + LineBreakAfterOpen(ns, name)) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } + + NS_ENSURE_TRUE(AfterElementStart(aElement, aOriginalElement, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + return NS_OK; +} + +NS_IMETHODIMP +nsHTMLContentSerializer::AppendElementEnd(Element* aElement, + Element* aOriginalElement) { + NS_ENSURE_ARG(aElement); + NS_ENSURE_STATE(mOutput); + + nsAtom* name = aElement->NodeInfo()->NameAtom(); + int32_t ns = aElement->GetNameSpaceID(); + + if (ns == kNameSpaceID_XHTML && + (name == nsGkAtoms::script || name == nsGkAtoms::style || + (name == nsGkAtoms::noscript && + aElement->OwnerDoc()->IsScriptEnabled()) || + name == nsGkAtoms::noframes)) { + --mDisableEntityEncoding; + } + + bool forceFormat = !(mFlags & nsIDocumentEncoder::OutputIgnoreMozDirty) && + aElement->HasAttr(nsGkAtoms::mozdirty); + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { + DecrIndentation(name); + } + + if (name == nsGkAtoms::script) { + nsCOMPtr<nsIScriptElement> script = do_QueryInterface(aElement); + + if (ShouldMaintainPreLevel() && script && script->IsMalformed()) { + // We're looking at a malformed script tag. This means that the end tag + // was missing in the source. Imitate that here by not serializing the end + // tag. 
+ --PreLevel(); + return NS_OK; + } + } else if (mIsCopying && name == nsGkAtoms::ol && ns == kNameSpaceID_XHTML) { + NS_ASSERTION((!mOLStateStack.IsEmpty()), "Cannot have an empty OL Stack"); + /* Though at this point we must always have an state to be deleted as all + the OL opening tags are supposed to push an olState object to the stack*/ + if (!mOLStateStack.IsEmpty()) { + mOLStateStack.RemoveLastElement(); + } + } + + if (ns == kNameSpaceID_XHTML) { + bool isContainer = + nsHTMLElement::IsContainer(nsHTMLTags::CaseSensitiveAtomTagToId(name)); + if (!isContainer) { + // Keep this in sync with the cleanup at the end of this method. + MOZ_ASSERT(name != nsGkAtoms::body); + MaybeLeaveFromPreContent(aElement); + return NS_OK; + } + } + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { + bool lineBreakBeforeClose = LineBreakBeforeClose(ns, name); + + if (mColPos && lineBreakBeforeClose) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } + if (!mColPos) { + NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } else if (mAddSpace) { + bool result = AppendToString(char16_t(' '), *mOutput); + mAddSpace = false; + NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); + } + } else if (mAddSpace) { + bool result = AppendToString(char16_t(' '), *mOutput); + mAddSpace = false; + NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); + } + + NS_ENSURE_TRUE(AppendToString(kEndTag, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(nsDependentAtomString(name), *mOutput), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + // Keep this cleanup in sync with the IsContainer() early return above. 
+ MaybeLeaveFromPreContent(aElement); + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() && + LineBreakAfterClose(ns, name)) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } else { + MaybeFlagNewlineForRootNode(aElement); + } + + if (name == nsGkAtoms::body && ns == kNameSpaceID_XHTML) { + --mInBody; + } + + return NS_OK; +} + +static const uint16_t kValNBSP = 160; + +#define _ 0 + +// This table indexes into kEntityStrings[]. +const uint8_t nsHTMLContentSerializer::kEntities[] = { + // clang-format off + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, 2, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + 3, _, 4, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + 5 + // clang-format on +}; + +// This table indexes into kEntityStrings[]. 
+const uint8_t nsHTMLContentSerializer::kAttrEntities[] = { + // clang-format off + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, 1, _, _, _, 2, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + 3, _, 4, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + 5 + // clang-format on +}; + +#undef _ + +const char* const nsHTMLContentSerializer::kEntityStrings[] = { + /* 0 */ nullptr, + /* 1 */ """, + /* 2 */ "&", + /* 3 */ "<", + /* 4 */ ">", + /* 5 */ " "}; + +bool nsHTMLContentSerializer::AppendAndTranslateEntities( + const nsAString& aStr, nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + + if (mDisableEntityEncoding) { + return aOutputStr.Append(aStr, mozilla::fallible); + } + + if (mFlags & (nsIDocumentEncoder::OutputEncodeBasicEntities)) { + // Per the API documentation, encode , &, <, >, and " + if (mInAttribute) { + return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>( + aStr, aOutputStr, kAttrEntities, kEntityStrings); + } + + return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>( + aStr, aOutputStr, kEntities, kEntityStrings); + } + + // We don't want to call into our superclass 2-arg version of + // AppendAndTranslateEntities, because it wants to encode more characters + // than we do. Use our tables, but avoid encoding by passing in a + // smaller max index. This will only encode &, <, >, and ". 
+ if (mInAttribute) { + return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>( + aStr, aOutputStr, kAttrEntities, kEntityStrings); + } + + return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>( + aStr, aOutputStr, kEntities, kEntityStrings); +} diff --git a/dom/serializers/nsHTMLContentSerializer.h b/dom/serializers/nsHTMLContentSerializer.h new file mode 100644 index 0000000000..7307eb6e3c --- /dev/null +++ b/dom/serializers/nsHTMLContentSerializer.h @@ -0,0 +1,53 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert an HTML (not XHTML!) DOM to an HTML + * string that could be parsed into more or less the original DOM. 
+ */ + +#ifndef nsHTMLContentSerializer_h__ +#define nsHTMLContentSerializer_h__ + +#include "mozilla/Attributes.h" +#include "nsXHTMLContentSerializer.h" +#include "nsString.h" + +class nsAtom; + +class nsHTMLContentSerializer final : public nsXHTMLContentSerializer { + public: + nsHTMLContentSerializer(); + virtual ~nsHTMLContentSerializer(); + + NS_IMETHOD AppendElementStart( + mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) override; + + NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) override; + + NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override; + + protected: + [[nodiscard]] virtual bool SerializeHTMLAttributes( + mozilla::dom::Element* aContent, mozilla::dom::Element* aOriginalElement, + nsAString& aTagPrefix, const nsAString& aTagNamespaceURI, + nsAtom* aTagName, int32_t aNamespace, nsAString& aStr); + + [[nodiscard]] virtual bool AppendAndTranslateEntities( + const nsAString& aStr, nsAString& aOutputStr) override; + + private: + static const uint8_t kEntities[]; + static const uint8_t kAttrEntities[]; + static const char* const kEntityStrings[]; +}; + +nsresult NS_NewHTMLContentSerializer(nsIContentSerializer** aSerializer); + +#endif diff --git a/dom/serializers/nsIContentSerializer.h b/dom/serializers/nsIContentSerializer.h new file mode 100644 index 0000000000..18e9e5b4cd --- /dev/null +++ b/dom/serializers/nsIContentSerializer.h @@ -0,0 +1,97 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef nsIContentSerializer_h +#define nsIContentSerializer_h + +#include "nsISupports.h" +#include "nsStringFwd.h" + +class nsIContent; + +namespace mozilla { +class Encoding; +namespace dom { +class Comment; +class Document; +class DocumentType; +class Element; +class ProcessingInstruction; +} // namespace dom +} // namespace mozilla + +#define NS_ICONTENTSERIALIZER_IID \ + { \ + 0xb1ee32f2, 0xb8c4, 0x49b9, { \ + 0x93, 0xdf, 0xb6, 0xfa, 0xb5, 0xd5, 0x46, 0x88 \ + } \ + } + +class nsIContentSerializer : public nsISupports { + public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICONTENTSERIALIZER_IID) + + /** + * @param aOutput The `Append*` methods will append to this string. The + * reference to it will be dropped with `Finish`. + */ + NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn, + const mozilla::Encoding* aEncoding, bool aIsCopying, + bool aIsWholeDocument, bool* aNeedsPerformatScanning, + nsAString& aOutput) = 0; + + NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset, + int32_t aEndOffset) = 0; + + NS_IMETHOD AppendCDATASection(nsIContent* aCDATASection, int32_t aStartOffset, + int32_t aEndOffset) = 0; + + NS_IMETHOD AppendProcessingInstruction( + mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset, + int32_t aEndOffset) = 0; + + NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment, + int32_t aStartOffset, int32_t aEndOffset) = 0; + + NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) = 0; + + NS_IMETHOD AppendElementStart(mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) = 0; + + NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) = 0; + + NS_IMETHOD FlushAndFinish() = 0; + + /** + * Drops the reference to the output buffer. + */ + NS_IMETHOD Finish() = 0; + + NS_IMETHOD GetOutputLength(uint32_t& aLength) const = 0; + + /** + * Append any items in the beginning of the document that won't be + * serialized by other methods. 
XML declaration is the most likely + * thing this method can produce. + */ + NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) = 0; + + // If Init() sets *aNeedsPerformatScanning to true, then these methods are + // called when elements are started and ended, before AppendElementStart + // and AppendElementEnd, respectively. They are supposed to be used to + // allow the implementer to keep track of whether the element is + // preformatted. + NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) = 0; + NS_IMETHOD ForgetElementForPreformat(mozilla::dom::Element* aElement) = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIContentSerializer, NS_ICONTENTSERIALIZER_IID) + +#define NS_CONTENTSERIALIZER_CONTRACTID_PREFIX \ + "@mozilla.org/layout/contentserializer;1?mimetype=" + +#endif /* nsIContentSerializer_h */ diff --git a/dom/serializers/nsIDocumentEncoder.idl b/dom/serializers/nsIDocumentEncoder.idl new file mode 100644 index 0000000000..d909c3989a --- /dev/null +++ b/dom/serializers/nsIDocumentEncoder.idl @@ -0,0 +1,361 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.idl" + +interface nsIOutputStream; + +webidl Document; +webidl Node; +webidl Range; +webidl Selection; + +%{ C++ +class nsINode; + +%} +[ptr] native nsINodePtr(nsINode); + +[scriptable, uuid(3d9371d8-a2ad-403e-8b0e-8885ad3562e3)] +interface nsIDocumentEncoderNodeFixup : nsISupports +{ + /** + * Create a fixed up version of a node. This method is called before + * each node in a document is about to be persisted. The implementor + * may return a new node with fixed up attributes or null. If null is + * returned the node should be used as-is. + * @param aNode Node to fixup. 
+ * @param [OUT] aSerializeCloneKids True if the document encoder should + * apply recursive serialization to the children of the fixed up node + * instead of the children of the original node. + * @return The resulting fixed up node. + */ + Node fixupNode(in Node aNode, out boolean aSerializeCloneKids); +}; + +[scriptable, uuid(21f112df-d96f-47da-bfcb-5331273003d1)] +interface nsIDocumentEncoder : nsISupports +{ + // Output methods flag bits. There are a frightening number of these, + // because everyone wants something a little bit different + + + /** + * Output only the selection (as opposed to the whole document). + */ + const unsigned long OutputSelectionOnly = (1 << 0); + + /** + * Plaintext output: + * - Convert html to plaintext that looks like the html. + * - Can't be used in conjunction with `OutputPreformatted`. + * - Implies wrap (except inside <pre>), since html wraps. + * HTML and XHTML output: + * - Do prettyprinting, ignoring existing formatting. + * - Implies wrap (except in attribute values and inside <pre>). + * XML output: + * - Do prettyprinting, ignoring existing formatting. + * - Doesn't implicitly wrap + */ + const unsigned long OutputFormatted = (1 << 1); + + /** Don't do prettyprinting. Don't do any wrapping that's not in the existing + * HTML/XML source. This option overrides OutputFormatted if both are set. + * HTML/XHTML output: If neither are set, there won't be prettyprinting too, but + * long lines will be wrapped. + * Supported also in XML and Plaintext output. + * @note This option does not affect entity conversion. + */ + const unsigned long OutputRaw = (1 << 2); + + /** + * Do not print html head tags. + * XHTML/HTML output only. + */ + const unsigned long OutputBodyOnly = (1 << 3); + + /** + * Output as though the content is preformatted + * (e.g. maybe it's wrapped in a PRE or PRE_WRAP style tag) + * Plaintext output only. + * Can't be used together with `OutputFormatted`/`OutputFormatFlowed`. 
+ * XXXbz How does this interact with OutputRaw? + */ + const unsigned long OutputPreformatted = (1 << 4); + + /** + * Wrap even if we're not doing formatted output (e.g. for text fields). + * Supported in XML, XHTML, HTML and Plaintext output. + * Set implicitly in HTML/XHTML output when no OutputRaw. + * Ignored when OutputRaw. + * For XML, XHTML and HTML: does not wrap values in attributes. + * XXXLJ: set implicitly in HTML/XHTML output, to keep compatible behaviors + * for old callers of this interface + * XXXbz How does this interact with OutputFormatFlowed? + */ + const unsigned long OutputWrap = (1 << 5); + + /** + * Output for format flowed (RFC 2646). This is used when converting + * to text for mail sending. This differs just slightly + * but in an important way from normal formatted, and that is that + * lines are space stuffed. This can't (correctly) be done later. + * PlainText output only. + * If this flag is set, `OutputFormatted` has to be set too. + * XXXbz How does this interact with OutputRaw/OutputWrap? + */ + const unsigned long OutputFormatFlowed = (1 << 6); + + /** + * Convert links, image src, and script src to absolute URLs when possible. + * XHTML/HTML output only. + */ + const unsigned long OutputAbsoluteLinks = (1 << 7); + + /** + * LineBreak processing: if this flag is set then CR line breaks will + * be written. If neither this nor OutputLFLineBreak is set, then we + * will use platform line breaks. The combination of the two flags will + * cause CRLF line breaks to be written. + */ + const unsigned long OutputCRLineBreak = (1 << 9); + + /** + * LineBreak processing: if this flag is set then LF line breaks will + * be written. If neither this nor OutputCRLineBreak is set, then we + * will use platform line breaks. The combination of the two flags will + * cause CRLF line breaks to be written.
+ */ + const unsigned long OutputLFLineBreak = (1 << 10); + + /** + * Output the content of noscript elements (only for serializing + * to plaintext). + */ + const unsigned long OutputNoScriptContent = (1 << 11); + + /** + * Output the content of noframes elements (only for serializing + * to plaintext). (Used only internally in the plain text serializer; + * ignored if passed by the caller.) + */ + const unsigned long OutputNoFramesContent = (1 << 12); + + /** + * Don't allow any formatting nodes (e.g. <br>, <b>) inside a <pre>. + * This is used primarily by mail. XHTML/HTML output only. + */ + const unsigned long OutputNoFormattingInPre = (1 << 13); + + /** + * Encode entities when outputting to a string. + * E.g. If set, we'll output &nbsp; if clear, we'll output 0xa0. + * The basic set is just &nbsp; &amp; &lt; &gt; &quot; for interoperability + * with older products that don't support &alpha; and friends. + * HTML output only. + */ + const unsigned long OutputEncodeBasicEntities = (1 << 14); + + /** + * Normally &nbsp; is replaced with a space character when + * encoding data as plain text, set this flag if that's + * not desired. + * Plaintext output only. + */ + const unsigned long OutputPersistNBSP = (1 << 17); + + /** + * Normally when serializing the whole document using the HTML or + * XHTML serializer, the encoding declaration is rewritten to match. + * This flag suppresses that behavior. + */ + const unsigned long OutputDontRewriteEncodingDeclaration = (1 << 18); + + /** + * When using the HTML or XHTML serializer, skip elements that are not + * visible when this flag is set. Elements are not visible when they + * have CSS style display:none or visibility:collapse, for example. + */ + const unsigned long SkipInvisibleContent = (1 << 19); + + /** + * Output for delsp=yes (RFC 3676). This is used with OutputFormatFlowed + * when converting to text for mail sending. + * PlainText output only.
+ */ + const unsigned long OutputFormatDelSp = (1 << 20); + + /** + * Drop <br> elements considered "invisible" by the editor. OutputPreformatted + * implies this flag. + */ + const unsigned long OutputDropInvisibleBreak = (1 << 21); + + /** + * Don't check for _moz_dirty attributes when deciding whether to + * pretty-print if this flag is set (bug 599983). + */ + const unsigned long OutputIgnoreMozDirty = (1 << 22); + + /** + * Serialize in a way that is suitable for copying a plaintext version of the + * document to the clipboard. This can for example cause line endings to be + * injected at preformatted block element boundaries. + */ + const unsigned long OutputForPlainTextClipboardCopy = (1 << 25); + + /** + * Include ruby annotations and ruby parentheses in the output. + * PlainText output only. + */ + const unsigned long OutputRubyAnnotation = (1 << 26); + + /** + * Disallow breaking of long character strings. This is important + * for serializing e-mail which contains CJK strings. These must + * not be broken just as "normal" long strings aren't broken. + */ + const unsigned long OutputDisallowLineBreaking = (1 << 27); + + /** + * Release reference of Document after using encodeTo* method to recycle + * this encoder without holding Document. To use this encoder again, + * we have to call init again. + */ + const unsigned long RequiresReinitAfterOutput = (1 << 28); + + /** + * Initialize with a pointer to the document and the mime type. + * Resets wrap column to 72 and resets node fixup. + * @param aDocument Document to encode. + * @param aMimeType MimeType to use. May also be set by SetMimeType. + * @param aFlags Flags to use while encoding. May also be set by SetFlags. 
+ */ + void init(in Document aDocument, + in AString aMimeType, + in unsigned long aFlags); + [noscript] void nativeInit(in Document aDocument, + in AString aMimeType, + in unsigned long aFlags); + + /** + * If the selection is set to a non-null value, then the + * selection is used for encoding, otherwise the entire + * document is encoded. + * @param aSelection The selection to encode. + */ + void setSelection(in Selection aSelection); + + /** + * If the range is set to a non-null value, then the + * range is used for encoding, otherwise the entire + * document or selection is encoded. + * @param aRange The range to encode. + */ + void setRange(in Range aRange); + + /** + * If the node is set to a non-null value, then the + * node is used for encoding, otherwise the entire + * document or range or selection is encoded. + * @param aNode The node to encode. + */ + void setNode(in Node aNode); + + /** + * If the container is set to a non-null value, then its + * child nodes are used for encoding, otherwise the entire + * document or range or selection or node is encoded. + * @param aContainer The node whose child nodes will be encoded. + */ + void setContainerNode(in Node aContainer); + + /** + * Documents typically have an intrinsic character set, + * but if no intrinsic value is found, the platform character set + * is used. This function overrides both the intrinsic and platform + * charset. + * @param aCharset Overrides both the intrinsic and platform + * character set when encoding the document. + * + * Possible result codes: NS_ERROR_NO_CHARSET_CONVERTER + */ + void setCharset(in ACString aCharset); + + /** + * Set a wrap column. This may have no effect in some types of encoders. + * @param aWrapColumn Column to which to wrap. If 0, wrapping is disabled. + */ + void setWrapColumn(in unsigned long aWrapColumn); + + /** + * The mime type preferred by the encoder. 
This piece of api was + * added because the copy encoder may need to switch mime types on you + * if you ask it to copy html that really represents plaintext content. + * Call this AFTER Init() and SetSelection() have both been called. + */ + readonly attribute AString mimeType; + + /** + * Encode the document and send the result to the nsIOutputStream. + * + * Possible result codes are the stream errors which might have + * been encountered. + * @param aStream Stream into which to encode. + */ + void encodeToStream(in nsIOutputStream aStream); + + /** + * Encode the document into a string. + * + * @return The document encoded into a string. + */ + AString encodeToString(); + + /** + * Encode the document into a string. Stores the extra context information + * into the two arguments. + * @param [OUT] aContextString The string where the parent hierarchy + * information will be stored. + * @param [OUT] aInfoString The string where extra context info will + * be stored. + * @return The document encoded as a string. + * + */ + AString encodeToStringWithContext( out AString aContextString, + out AString aInfoString); + + /** + * Encode the document into a string of limited size. + * @param aMaxLength After aMaxLength characters, the encoder will stop + * encoding new data. + * Only values > 0 will be considered. + * The returned string may be slightly larger than + * aMaxLength because some serializers (eg. HTML) + * may need to close some tags after they stop + * encoding new data, or finish a line (72 columns + * by default for the plain text serializer). + * + * @return The document encoded into a string. + */ + AString encodeToStringWithMaxLength(in unsigned long aMaxLength); + + /** + * Set the fixup object associated with node persistence. + * @param aFixup The fixup object. 
+ */ + void setNodeFixup(in nsIDocumentEncoderNodeFixup aFixup); +}; + +%{ C++ +template<class T> struct already_AddRefed; + +bool +do_getDocumentTypeSupportedForEncoding(const char* aContentType); +already_AddRefed<nsIDocumentEncoder> +do_createDocumentEncoder(const char* aContentType); +already_AddRefed<nsIDocumentEncoder> +do_createHTMLCopyEncoder(); +%} diff --git a/dom/serializers/nsPlainTextSerializer.cpp b/dom/serializers/nsPlainTextSerializer.cpp new file mode 100644 index 0000000000..4ca5d10b3a --- /dev/null +++ b/dom/serializers/nsPlainTextSerializer.cpp @@ -0,0 +1,1830 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert a DOM into plaintext in a nice way + * (eg for copy/paste as plaintext). 
+ */ + +#include "nsPlainTextSerializer.h" + +#include <limits> + +#include "nsPrintfCString.h" +#include "nsDebug.h" +#include "nsGkAtoms.h" +#include "nsNameSpaceManager.h" +#include "nsTextFragment.h" +#include "nsContentUtils.h" +#include "nsReadableUtils.h" +#include "nsUnicharUtils.h" +#include "nsCRT.h" +#include "mozilla/Casting.h" +#include "mozilla/TextEditor.h" +#include "mozilla/dom/CharacterData.h" +#include "mozilla/dom/Element.h" +#include "mozilla/dom/HTMLBRElement.h" +#include "mozilla/dom/Text.h" +#include "mozilla/intl/Segmenter.h" +#include "mozilla/intl/UnicodeProperties.h" +#include "nsUnicodeProperties.h" +#include "mozilla/Span.h" +#include "mozilla/Preferences.h" +#include "mozilla/StaticPrefs_converter.h" +#include "nsComputedDOMStyle.h" + +namespace mozilla { +class Encoding; +} + +using namespace mozilla; +using namespace mozilla::dom; + +#define PREF_STRUCTS "converter.html2txt.structs" +#define PREF_HEADER_STRATEGY "converter.html2txt.header_strategy" + +static const int32_t kTabSize = 4; +static const int32_t kIndentSizeHeaders = + 2; /* Indention of h1, if + mHeaderStrategy = kIndentIncreasedWithHeaderLevel + or = kNumberHeadingsAndIndentSlightly. Indention of + other headers is derived from that. 
*/ +static const int32_t kIndentIncrementHeaders = + 2; /* If mHeaderStrategy = kIndentIncreasedWithHeaderLevel, + indent h(x+1) this many + columns more than h(x) */ +static const int32_t kIndentSizeList = kTabSize; +// Indention of non-first lines of ul and ol +static const int32_t kIndentSizeDD = kTabSize; // Indention of <dd> +static const char16_t kNBSP = 160; +static const char16_t kSPACE = ' '; + +static int32_t HeaderLevel(const nsAtom* aTag); +static int32_t GetUnicharWidth(char32_t ucs); +static int32_t GetUnicharStringWidth(Span<const char16_t> aString); + +// Someday may want to make this non-const: +static const uint32_t TagStackSize = 500; + +NS_IMPL_CYCLE_COLLECTING_ADDREF(nsPlainTextSerializer) +NS_IMPL_CYCLE_COLLECTING_RELEASE(nsPlainTextSerializer) + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsPlainTextSerializer) + NS_INTERFACE_MAP_ENTRY(nsIContentSerializer) + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTION(nsPlainTextSerializer, mElement) + +nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer) { + RefPtr<nsPlainTextSerializer> it = new nsPlainTextSerializer(); + it.forget(aSerializer); + return NS_OK; +} + +// @param aFlags As defined in nsIDocumentEncoder.idl. 
+static void DetermineLineBreak(const int32_t aFlags, nsAString& aLineBreak) { + // Set the line break character: + if ((aFlags & nsIDocumentEncoder::OutputCRLineBreak) && + (aFlags & nsIDocumentEncoder::OutputLFLineBreak)) { + // Windows + aLineBreak.AssignLiteral(u"\r\n"); + } else if (aFlags & nsIDocumentEncoder::OutputCRLineBreak) { + // Mac + aLineBreak.AssignLiteral(u"\r"); + } else if (aFlags & nsIDocumentEncoder::OutputLFLineBreak) { + // Unix/DOM + aLineBreak.AssignLiteral(u"\n"); + } else { + // Platform/default + aLineBreak.AssignLiteral(NS_ULINEBREAK); + } +} + +void nsPlainTextSerializer::CurrentLine::MaybeReplaceNbspsInContent( + const int32_t aFlags) { + if (!(aFlags & nsIDocumentEncoder::OutputPersistNBSP)) { + // First, replace all nbsp characters with spaces, + // which the unicode encoder won't do for us. + mContent.ReplaceChar(kNBSP, kSPACE); + } +} + +void nsPlainTextSerializer::CurrentLine::ResetContentAndIndentationHeader() { + mContent.Truncate(); + mIndentation.mHeader.Truncate(); +} + +int32_t nsPlainTextSerializer::CurrentLine::FindWrapIndexForContent( + const uint32_t aWrapColumn, bool aUseLineBreaker) const { + MOZ_ASSERT(!mContent.IsEmpty()); + + const uint32_t prefixwidth = DeterminePrefixWidth(); + int32_t goodSpace = 0; + + if (aUseLineBreaker) { + // We advance one line break point at a time from the beginning of the + // mContent until we find a width less than or equal to wrap column. + uint32_t width = 0; + intl::LineBreakIteratorUtf16 lineBreakIter(mContent); + while (Maybe<uint32_t> nextGoodSpace = lineBreakIter.Next()) { + // Trim space at the tail. UAX#14 doesn't have break opportunity for + // ASCII space at the tail. + const Maybe<uint32_t> originalNextGoodSpace = nextGoodSpace; + while (*nextGoodSpace > 0 && + mContent.CharAt(*nextGoodSpace - 1) == 0x20) { + nextGoodSpace = Some(*nextGoodSpace - 1); + } + if (*nextGoodSpace == 0) { + // Restore the original nextGoodSpace. 
+ nextGoodSpace = originalNextGoodSpace; + } + + width += GetUnicharStringWidth(Span<const char16_t>( + mContent.get() + goodSpace, *nextGoodSpace - goodSpace)); + if (prefixwidth + width > aWrapColumn) { + // The next break point makes the width exceeding the wrap column, so + // goodSpace is what we want. + break; + } + goodSpace = AssertedCast<int32_t>(*nextGoodSpace); + } + + return goodSpace; + } + + // In this case we don't want strings, especially CJK-ones, to be split. See + // bug 333064 for more information. We break only at ASCII spaces. + if (aWrapColumn >= prefixwidth) { + // Search backward from the adjusted wrap column or from the text end. + goodSpace = + std::min<int32_t>(aWrapColumn - prefixwidth, mContent.Length() - 1); + while (goodSpace >= 0) { + if (nsCRT::IsAsciiSpace(mContent.CharAt(goodSpace))) { + return goodSpace; + } + goodSpace--; + } + } + + // Search forward from the adjusted wrap column. + goodSpace = (prefixwidth > aWrapColumn) ? 1 : aWrapColumn - prefixwidth; + const int32_t contentLength = mContent.Length(); + while (goodSpace < contentLength && + !nsCRT::IsAsciiSpace(mContent.CharAt(goodSpace))) { + goodSpace++; + } + + return goodSpace; +} + +nsPlainTextSerializer::OutputManager::OutputManager(const int32_t aFlags, + nsAString& aOutput) + : mFlags{aFlags}, mOutput{aOutput}, mAtFirstColumn{true} { + MOZ_ASSERT(aOutput.IsEmpty()); + + DetermineLineBreak(mFlags, mLineBreak); +} + +void nsPlainTextSerializer::OutputManager::Append( + const CurrentLine& aCurrentLine, + const StripTrailingWhitespaces aStripTrailingWhitespaces) { + if (IsAtFirstColumn()) { + nsAutoString quotesAndIndent; + aCurrentLine.CreateQuotesAndIndent(quotesAndIndent); + + if ((aStripTrailingWhitespaces == StripTrailingWhitespaces::kMaybe)) { + const bool stripTrailingSpaces = aCurrentLine.mContent.IsEmpty(); + if (stripTrailingSpaces) { + quotesAndIndent.Trim(" ", false, true, false); + } + } + + Append(quotesAndIndent); + } + + Append(aCurrentLine.mContent); +} 
+ +void nsPlainTextSerializer::OutputManager::Append(const nsAString& aString) { + if (!aString.IsEmpty()) { + mOutput.Append(aString); + mAtFirstColumn = false; + } +} + +void nsPlainTextSerializer::OutputManager::AppendLineBreak() { + mOutput.Append(mLineBreak); + mAtFirstColumn = true; +} + +uint32_t nsPlainTextSerializer::OutputManager::GetOutputLength() const { + return mOutput.Length(); +} + +nsPlainTextSerializer::nsPlainTextSerializer() + : mFloatingLines(-1), + mLineBreakDue(false), + kSpace(u" "_ns) // Init of "constant" +{ + mHeadLevel = 0; + mHasWrittenCiteBlockquote = false; + mSpanLevel = 0; + for (int32_t i = 0; i <= 6; i++) { + mHeaderCounter[i] = 0; + } + + // Flow + mEmptyLines = 1; // The start of the document is an "empty line" in itself, + mInWhitespace = false; + mPreFormattedMail = false; + + mPreformattedBlockBoundary = false; + + // initialize the tag stack to zero: + // The stack only ever contains pointers to static atoms, so they don't + // need refcounting. + mTagStack = new const nsAtom*[TagStackSize]; + mTagStackIndex = 0; + mIgnoreAboveIndex = (uint32_t)kNotFound; + + mULCount = 0; + + mIgnoredChildNodeLevel = 0; +} + +nsPlainTextSerializer::~nsPlainTextSerializer() { + delete[] mTagStack; + NS_WARNING_ASSERTION(mHeadLevel == 0, "Wrong head level!"); +} + +nsPlainTextSerializer::Settings::HeaderStrategy +nsPlainTextSerializer::Settings::Convert(const int32_t aPrefHeaderStrategy) { + HeaderStrategy result{HeaderStrategy::kIndentIncreasedWithHeaderLevel}; + + switch (aPrefHeaderStrategy) { + case 0: { + result = HeaderStrategy::kNoIndentation; + break; + } + case 1: { + result = HeaderStrategy::kIndentIncreasedWithHeaderLevel; + break; + } + case 2: { + result = HeaderStrategy::kNumberHeadingsAndIndentSlightly; + break; + } + default: { + NS_WARNING( + nsPrintfCString("Header strategy pref contains undefined value: %i", + aPrefHeaderStrategy) + .get()); + } + } + + return result; +} + +const int32_t kDefaultHeaderStrategy = 1; + +void 
nsPlainTextSerializer::Settings::Init(const int32_t aFlags, + const uint32_t aWrapColumn) { + mFlags = aFlags; + + if (mFlags & nsIDocumentEncoder::OutputFormatted) { + // Get some prefs that controls how we do formatted output + mStructs = Preferences::GetBool(PREF_STRUCTS, mStructs); + + int32_t headerStrategy = + Preferences::GetInt(PREF_HEADER_STRATEGY, kDefaultHeaderStrategy); + mHeaderStrategy = Convert(headerStrategy); + } + + mWithRubyAnnotation = StaticPrefs::converter_html2txt_always_include_ruby() || + (mFlags & nsIDocumentEncoder::OutputRubyAnnotation); + + // XXX We should let the caller decide whether to do this or not + mFlags &= ~nsIDocumentEncoder::OutputNoFramesContent; + + mWrapColumn = aWrapColumn; +} + +NS_IMETHODIMP +nsPlainTextSerializer::Init(const uint32_t aFlags, uint32_t aWrapColumn, + const Encoding* aEncoding, bool aIsCopying, + bool aIsWholeDocument, + bool* aNeedsPreformatScanning, nsAString& aOutput) { +#ifdef DEBUG + // Check if the major control flags are set correctly. 
+ if (aFlags & nsIDocumentEncoder::OutputFormatFlowed) { + NS_ASSERTION(aFlags & nsIDocumentEncoder::OutputFormatted, + "If you want format=flowed, you must combine it with " + "nsIDocumentEncoder::OutputFormatted"); + } + + if (aFlags & nsIDocumentEncoder::OutputFormatted) { + NS_ASSERTION( + !(aFlags & nsIDocumentEncoder::OutputPreformatted), + "Can't do formatted and preformatted output at the same time!"); + } +#endif + MOZ_ASSERT(!(aFlags & nsIDocumentEncoder::OutputFormatDelSp) || + (aFlags & nsIDocumentEncoder::OutputFormatFlowed)); + + *aNeedsPreformatScanning = true; + mSettings.Init(aFlags, aWrapColumn); + mOutputManager.emplace(mSettings.GetFlags(), aOutput); + + mUseLineBreaker = mSettings.MayWrap() && mSettings.MayBreakLines(); + + mLineBreakDue = false; + mFloatingLines = -1; + + mPreformattedBlockBoundary = false; + + MOZ_ASSERT(mOLStack.IsEmpty()); + + return NS_OK; +} + +bool nsPlainTextSerializer::GetLastBool(const nsTArray<bool>& aStack) { + uint32_t size = aStack.Length(); + if (size == 0) { + return false; + } + return aStack.ElementAt(size - 1); +} + +void nsPlainTextSerializer::SetLastBool(nsTArray<bool>& aStack, bool aValue) { + uint32_t size = aStack.Length(); + if (size > 0) { + aStack.ElementAt(size - 1) = aValue; + } else { + NS_ERROR("There is no \"Last\" value"); + } +} + +void nsPlainTextSerializer::PushBool(nsTArray<bool>& aStack, bool aValue) { + aStack.AppendElement(bool(aValue)); +} + +bool nsPlainTextSerializer::PopBool(nsTArray<bool>& aStack) { + return aStack.Length() ? aStack.PopLastElement() : false; +} + +bool nsPlainTextSerializer::IsIgnorableRubyAnnotation( + const nsAtom* aTag) const { + if (mSettings.GetWithRubyAnnotation()) { + return false; + } + + return aTag == nsGkAtoms::rp || aTag == nsGkAtoms::rt || + aTag == nsGkAtoms::rtc; +} + +// Return true if aElement has 'display:none' or if we just don't know. 
+static bool IsDisplayNone(Element* aElement) { + RefPtr<const ComputedStyle> computedStyle = + nsComputedDOMStyle::GetComputedStyleNoFlush(aElement); + return !computedStyle || + computedStyle->StyleDisplay()->mDisplay == StyleDisplay::None; +} + +static bool IsIgnorableScriptOrStyle(Element* aElement) { + return aElement->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style) && + IsDisplayNone(aElement); +} + +NS_IMETHODIMP +nsPlainTextSerializer::AppendText(nsIContent* aText, int32_t aStartOffset, + int32_t aEndOffset) { + if (mIgnoreAboveIndex != (uint32_t)kNotFound) { + return NS_OK; + } + + NS_ASSERTION(aStartOffset >= 0, "Negative start offset for text fragment!"); + if (aStartOffset < 0) return NS_ERROR_INVALID_ARG; + + NS_ENSURE_ARG(aText); + + nsresult rv = NS_OK; + + nsIContent* content = aText; + const nsTextFragment* frag; + if (!content || !(frag = content->GetText())) { + return NS_ERROR_FAILURE; + } + + int32_t fragLength = frag->GetLength(); + int32_t endoffset = + (aEndOffset == -1) ? fragLength : std::min(aEndOffset, fragLength); + NS_ASSERTION(aStartOffset <= endoffset, + "A start offset is beyond the end of the text fragment!"); + + int32_t length = endoffset - aStartOffset; + if (length <= 0) { + return NS_OK; + } + + nsAutoString textstr; + if (frag->Is2b()) { + textstr.Assign(frag->Get2b() + aStartOffset, length); + } else { + // AssignASCII is for 7-bit character only, so don't use it + const char* data = frag->Get1b(); + CopyASCIItoUTF16(Substring(data + aStartOffset, data + endoffset), textstr); + } + + // Mask the text if the text node is in a password field. 
+ if (content->HasFlag(NS_MAYBE_MASKED)) { + TextEditor::MaskString(textstr, *content->AsText(), 0, aStartOffset); + } + + // We have to split the string across newlines + // to match parser behavior + int32_t start = 0; + int32_t offset = textstr.FindCharInSet(u"\n\r"); + while (offset != kNotFound) { + if (offset > start) { + // Pass in the line + DoAddText(false, Substring(textstr, start, offset - start)); + } + + // Pass in a newline + DoAddText(); + + start = offset + 1; + offset = textstr.FindCharInSet(u"\n\r", start); + } + + // Consume the last bit of the string if there's any left + if (start < length) { + if (start) { + DoAddText(false, Substring(textstr, start, length - start)); + } else { + DoAddText(false, textstr); + } + } + + return rv; +} + +NS_IMETHODIMP +nsPlainTextSerializer::AppendCDATASection(nsIContent* aCDATASection, + int32_t aStartOffset, + int32_t aEndOffset) { + return AppendText(aCDATASection, aStartOffset, aEndOffset); +} + +NS_IMETHODIMP +nsPlainTextSerializer::ScanElementForPreformat(Element* aElement) { + mPreformatStack.push(IsElementPreformatted(aElement)); + return NS_OK; +} + +NS_IMETHODIMP +nsPlainTextSerializer::ForgetElementForPreformat(Element* aElement) { + MOZ_RELEASE_ASSERT(!mPreformatStack.empty(), + "Tried to pop without previous push."); + mPreformatStack.pop(); + return NS_OK; +} + +NS_IMETHODIMP +nsPlainTextSerializer::AppendElementStart(Element* aElement, + Element* aOriginalElement) { + NS_ENSURE_ARG(aElement); + + mElement = aElement; + + nsresult rv; + nsAtom* id = GetIdForContent(mElement); + + bool isContainer = !FragmentOrElement::IsHTMLVoid(id); + + if (isContainer) { + rv = DoOpenContainer(id); + } else { + rv = DoAddLeaf(id); + } + + mElement = nullptr; + + if (id == nsGkAtoms::head) { + ++mHeadLevel; + } + + return rv; +} + +NS_IMETHODIMP +nsPlainTextSerializer::AppendElementEnd(Element* aElement, + Element* aOriginalElement) { + NS_ENSURE_ARG(aElement); + + mElement = aElement; + + nsresult rv; + nsAtom* 
id = GetIdForContent(mElement); + + bool isContainer = !FragmentOrElement::IsHTMLVoid(id); + + rv = NS_OK; + if (isContainer) { + rv = DoCloseContainer(id); + } + + mElement = nullptr; + + if (id == nsGkAtoms::head) { + NS_ASSERTION(mHeadLevel != 0, "mHeadLevel being decremented below 0"); + --mHeadLevel; + } + + return rv; +} + +NS_IMETHODIMP +nsPlainTextSerializer::FlushAndFinish() { + MOZ_ASSERT(mOutputManager); + + mOutputManager->Flush(mCurrentLine); + return Finish(); +} + +NS_IMETHODIMP +nsPlainTextSerializer::Finish() { + mOutputManager.reset(); + + return NS_OK; +} + +NS_IMETHODIMP +nsPlainTextSerializer::GetOutputLength(uint32_t& aLength) const { + MOZ_ASSERT(mOutputManager); + + aLength = mOutputManager->GetOutputLength(); + + return NS_OK; +} + +NS_IMETHODIMP +nsPlainTextSerializer::AppendDocumentStart(Document* aDocument) { + return NS_OK; +} + +constexpr int32_t kOlStackDummyValue = 0; + +nsresult nsPlainTextSerializer::DoOpenContainer(const nsAtom* aTag) { + if (IsIgnorableRubyAnnotation(aTag)) { + // Ignorable ruby annotation shouldn't be replaced by a placeholder + // character, neither any of its descendants. + mIgnoredChildNodeLevel++; + return NS_OK; + } + if (IsIgnorableScriptOrStyle(mElement)) { + mIgnoredChildNodeLevel++; + return NS_OK; + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputForPlainTextClipboardCopy)) { + if (mPreformattedBlockBoundary && DoOutput()) { + // Should always end a line, but get no more whitespace + if (mFloatingLines < 0) mFloatingLines = 0; + mLineBreakDue = true; + } + mPreformattedBlockBoundary = false; + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputRaw)) { + // Raw means raw. Don't even think about doing anything fancy + // here like indenting, adding line breaks or any other + // characters such as list item bullets, quote characters + // around <q>, etc. 
+ + return NS_OK; + } + + if (mTagStackIndex < TagStackSize) { + mTagStack[mTagStackIndex++] = aTag; + } + + if (mIgnoreAboveIndex != (uint32_t)kNotFound) { + return NS_OK; + } + + // Reset this so that <blockquote type=cite> doesn't affect the whitespace + // above random <pre>s below it. + mHasWrittenCiteBlockquote = + mHasWrittenCiteBlockquote && aTag == nsGkAtoms::pre; + + bool isInCiteBlockquote = false; + + // XXX special-case <blockquote type=cite> so that we don't add additional + // newlines before the text. + if (aTag == nsGkAtoms::blockquote) { + nsAutoString value; + nsresult rv = GetAttributeValue(nsGkAtoms::type, value); + isInCiteBlockquote = NS_SUCCEEDED(rv) && value.EqualsIgnoreCase("cite"); + } + + if (mLineBreakDue && !isInCiteBlockquote) EnsureVerticalSpace(mFloatingLines); + + // Check if this is a tag whose content should not be output + if ((aTag == nsGkAtoms::noscript && + !mSettings.HasFlag(nsIDocumentEncoder::OutputNoScriptContent)) || + ((aTag == nsGkAtoms::iframe || aTag == nsGkAtoms::noframes) && + !mSettings.HasFlag(nsIDocumentEncoder::OutputNoFramesContent))) { + // Ignore everything that follows the current tag in + // question until a matching end tag is encountered. + mIgnoreAboveIndex = mTagStackIndex - 1; + return NS_OK; + } + + if (aTag == nsGkAtoms::body) { + // Try to figure out here whether we have a + // preformatted style attribute set by Thunderbird. + // + // Trigger on the presence of a "pre-wrap" in the + // style attribute. That's a very simplistic way to do + // it, but better than nothing. 
+ nsAutoString style; + int32_t whitespace; + if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::style, style)) && + (kNotFound != (whitespace = style.Find(u"white-space:")))) { + if (kNotFound != style.LowerCaseFindASCII("pre-wrap", whitespace)) { +#ifdef DEBUG_preformatted + printf("Set mPreFormattedMail based on style pre-wrap\n"); +#endif + mPreFormattedMail = true; + } else if (kNotFound != style.LowerCaseFindASCII("pre", whitespace)) { +#ifdef DEBUG_preformatted + printf("Set mPreFormattedMail based on style pre\n"); +#endif + mPreFormattedMail = true; + } + } else { + /* See comment at end of function. */ + mInWhitespace = true; + mPreFormattedMail = false; + } + + return NS_OK; + } + + // Keep this in sync with DoCloseContainer! + if (!DoOutput()) { + return NS_OK; + } + + if (aTag == nsGkAtoms::p) + EnsureVerticalSpace(1); + else if (aTag == nsGkAtoms::pre) { + if (GetLastBool(mIsInCiteBlockquote)) + EnsureVerticalSpace(0); + else if (mHasWrittenCiteBlockquote) { + EnsureVerticalSpace(0); + mHasWrittenCiteBlockquote = false; + } else + EnsureVerticalSpace(1); + } else if (aTag == nsGkAtoms::tr) { + PushBool(mHasWrittenCellsForRow, false); + } else if (aTag == nsGkAtoms::td || aTag == nsGkAtoms::th) { + // We must make sure that the content of two table cells get a + // space between them. + + // To make the separation between cells most obvious and + // importable, we use a TAB. + if (mHasWrittenCellsForRow.IsEmpty()) { + // We don't always see a <tr> (nor a <table>) before the <td> if we're + // copying part of a table + PushBool(mHasWrittenCellsForRow, true); // will never be popped + } else if (GetLastBool(mHasWrittenCellsForRow)) { + // Bypass |Write| so that the TAB isn't compressed away. + AddToLine(u"\t", 1); + mInWhitespace = true; + } else { + SetLastBool(mHasWrittenCellsForRow, true); + } + } else if (aTag == nsGkAtoms::ul) { + // Indent here to support nested lists, which aren't included in li :-( + EnsureVerticalSpace(IsInOlOrUl() ? 
0 : 1); + // Must end the current line before we change indention + mCurrentLine.mIndentation.mLength += kIndentSizeList; + mULCount++; + } else if (aTag == nsGkAtoms::ol) { + EnsureVerticalSpace(IsInOlOrUl() ? 0 : 1); + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + // Must end the current line before we change indention + nsAutoString startAttr; + int32_t startVal = 1; + if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::start, startAttr))) { + nsresult rv = NS_OK; + startVal = startAttr.ToInteger(&rv); + if (NS_FAILED(rv)) { + startVal = 1; + } + } + mOLStack.AppendElement(startVal); + } else { + mOLStack.AppendElement(kOlStackDummyValue); + } + mCurrentLine.mIndentation.mLength += kIndentSizeList; // see ul + } else if (aTag == nsGkAtoms::li && + mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + if (mTagStackIndex > 1 && IsInOL()) { + if (!mOLStack.IsEmpty()) { + nsAutoString valueAttr; + if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::value, valueAttr))) { + nsresult rv = NS_OK; + int32_t valueAttrVal = valueAttr.ToInteger(&rv); + if (NS_SUCCEEDED(rv)) { + mOLStack.LastElement() = valueAttrVal; + } + } + // This is what nsBulletFrame does for OLs: + mCurrentLine.mIndentation.mHeader.AppendInt(mOLStack.LastElement(), 10); + mOLStack.LastElement()++; + } else { + mCurrentLine.mIndentation.mHeader.Append(char16_t('#')); + } + + mCurrentLine.mIndentation.mHeader.Append(char16_t('.')); + + } else { + static const char bulletCharArray[] = "*o+#"; + uint32_t index = mULCount > 0 ? 
(mULCount - 1) : 3; + char bulletChar = bulletCharArray[index % 4]; + mCurrentLine.mIndentation.mHeader.Append(char16_t(bulletChar)); + } + + mCurrentLine.mIndentation.mHeader.Append(char16_t(' ')); + } else if (aTag == nsGkAtoms::dl) { + EnsureVerticalSpace(1); + } else if (aTag == nsGkAtoms::dt) { + EnsureVerticalSpace(0); + } else if (aTag == nsGkAtoms::dd) { + EnsureVerticalSpace(0); + mCurrentLine.mIndentation.mLength += kIndentSizeDD; + } else if (aTag == nsGkAtoms::span) { + ++mSpanLevel; + } else if (aTag == nsGkAtoms::blockquote) { + // Push + PushBool(mIsInCiteBlockquote, isInCiteBlockquote); + if (isInCiteBlockquote) { + EnsureVerticalSpace(0); + mCurrentLine.mCiteQuoteLevel++; + } else { + EnsureVerticalSpace(1); + mCurrentLine.mIndentation.mLength += + kTabSize; // Check for some maximum value? + } + } else if (aTag == nsGkAtoms::q) { + Write(u"\""_ns); + } + + // Else make sure we'll separate block level tags, + // even if we're about to leave, before doing any other formatting. 
+ else if (IsCssBlockLevelElement(mElement)) { + EnsureVerticalSpace(0); + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + OpenContainerForOutputFormatted(aTag); + } + return NS_OK; +} + +void nsPlainTextSerializer::OpenContainerForOutputFormatted( + const nsAtom* aTag) { + const bool currentNodeIsConverted = IsCurrentNodeConverted(); + + if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 || + aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) { + EnsureVerticalSpace(2); + if (mSettings.GetHeaderStrategy() == + Settings::HeaderStrategy::kNumberHeadingsAndIndentSlightly) { + mCurrentLine.mIndentation.mLength += kIndentSizeHeaders; + // Caching + int32_t level = HeaderLevel(aTag); + // Increase counter for current level + mHeaderCounter[level]++; + // Reset all lower levels + int32_t i; + + for (i = level + 1; i <= 6; i++) { + mHeaderCounter[i] = 0; + } + + // Construct numbers + nsAutoString leadup; + for (i = 1; i <= level; i++) { + leadup.AppendInt(mHeaderCounter[i]); + leadup.Append(char16_t('.')); + } + leadup.Append(char16_t(' ')); + Write(leadup); + } else if (mSettings.GetHeaderStrategy() == + Settings::HeaderStrategy::kIndentIncreasedWithHeaderLevel) { + mCurrentLine.mIndentation.mLength += kIndentSizeHeaders; + for (int32_t i = HeaderLevel(aTag); i > 1; i--) { + // for h(x), run x-1 times + mCurrentLine.mIndentation.mLength += kIndentIncrementHeaders; + } + } + } else if (aTag == nsGkAtoms::sup && mSettings.GetStructs() && + !currentNodeIsConverted) { + Write(u"^"_ns); + } else if (aTag == nsGkAtoms::sub && mSettings.GetStructs() && + !currentNodeIsConverted) { + Write(u"_"_ns); + } else if (aTag == nsGkAtoms::code && mSettings.GetStructs() && + !currentNodeIsConverted) { + Write(u"|"_ns); + } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) && + mSettings.GetStructs() && !currentNodeIsConverted) { + Write(u"*"_ns); + } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) 
&& + mSettings.GetStructs() && !currentNodeIsConverted) { + Write(u"/"_ns); + } else if (aTag == nsGkAtoms::u && mSettings.GetStructs() && + !currentNodeIsConverted) { + Write(u"_"_ns); + } + + /* Container elements are always block elements, so we shouldn't + output any whitespace immediately after the container tag even if + there's extra whitespace there because the HTML is pretty-printed + or something. To ensure that happens, tell the serializer we're + already in whitespace so it won't output more. */ + mInWhitespace = true; +} + +nsresult nsPlainTextSerializer::DoCloseContainer(const nsAtom* aTag) { + if (IsIgnorableRubyAnnotation(aTag)) { + mIgnoredChildNodeLevel--; + return NS_OK; + } + if (IsIgnorableScriptOrStyle(mElement)) { + mIgnoredChildNodeLevel--; + return NS_OK; + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputForPlainTextClipboardCopy)) { + if (DoOutput() && IsElementPreformatted() && + IsCssBlockLevelElement(mElement)) { + // If we're closing a preformatted block element, output a line break + // when we find a new container. + mPreformattedBlockBoundary = true; + } + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputRaw)) { + // Raw means raw. Don't even think about doing anything fancy + // here like indenting, adding line breaks or any other + // characters such as list item bullets, quote characters + // around <q>, etc. + + return NS_OK; + } + + if (mTagStackIndex > 0) { + --mTagStackIndex; + } + + if (mTagStackIndex >= mIgnoreAboveIndex) { + if (mTagStackIndex == mIgnoreAboveIndex) { + // We're dealing with the close tag whose matching + // open tag had set the mIgnoreAboveIndex value. + // Reset mIgnoreAboveIndex before discarding this tag. 
+ mIgnoreAboveIndex = (uint32_t)kNotFound; + } + return NS_OK; + } + + MOZ_ASSERT(mOutputManager); + + // End current line if we're ending a block level tag + if ((aTag == nsGkAtoms::body) || (aTag == nsGkAtoms::html)) { + // We want the output to end with a new line, + // but in preformatted areas like text fields, + // we can't emit newlines that weren't there. + // So add the newline only in the case of formatted output. + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + EnsureVerticalSpace(0); + } else { + mOutputManager->Flush(mCurrentLine); + } + // We won't want to do anything with these in formatted mode either, + // so just return now: + return NS_OK; + } + + // Keep this in sync with DoOpenContainer! + if (!DoOutput()) { + return NS_OK; + } + + if (aTag == nsGkAtoms::tr) { + PopBool(mHasWrittenCellsForRow); + // Should always end a line, but get no more whitespace + if (mFloatingLines < 0) mFloatingLines = 0; + mLineBreakDue = true; + } else if (((aTag == nsGkAtoms::li) || (aTag == nsGkAtoms::dt)) && + mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + // Items that should always end a line, but get no more whitespace + if (mFloatingLines < 0) mFloatingLines = 0; + mLineBreakDue = true; + } else if (aTag == nsGkAtoms::pre) { + mFloatingLines = GetLastBool(mIsInCiteBlockquote) ? 0 : 1; + mLineBreakDue = true; + } else if (aTag == nsGkAtoms::ul) { + mOutputManager->Flush(mCurrentLine); + mCurrentLine.mIndentation.mLength -= kIndentSizeList; + --mULCount; + if (!IsInOlOrUl()) { + mFloatingLines = 1; + mLineBreakDue = true; + } + } else if (aTag == nsGkAtoms::ol) { + mOutputManager->Flush(mCurrentLine); // Doing this after decreasing + // OLStackIndex would be wrong. 
+ mCurrentLine.mIndentation.mLength -= kIndentSizeList; + MOZ_ASSERT(!mOLStack.IsEmpty(), "Wrong OLStack level!"); + mOLStack.RemoveLastElement(); + if (!IsInOlOrUl()) { + mFloatingLines = 1; + mLineBreakDue = true; + } + } else if (aTag == nsGkAtoms::dl) { + mFloatingLines = 1; + mLineBreakDue = true; + } else if (aTag == nsGkAtoms::dd) { + mOutputManager->Flush(mCurrentLine); + mCurrentLine.mIndentation.mLength -= kIndentSizeDD; + } else if (aTag == nsGkAtoms::span) { + NS_ASSERTION(mSpanLevel, "Span level will be negative!"); + --mSpanLevel; + } else if (aTag == nsGkAtoms::div) { + if (mFloatingLines < 0) mFloatingLines = 0; + mLineBreakDue = true; + } else if (aTag == nsGkAtoms::blockquote) { + mOutputManager->Flush(mCurrentLine); // Is this needed? + + // Pop + bool isInCiteBlockquote = PopBool(mIsInCiteBlockquote); + + if (isInCiteBlockquote) { + NS_ASSERTION(mCurrentLine.mCiteQuoteLevel, + "CiteQuote level will be negative!"); + mCurrentLine.mCiteQuoteLevel--; + mFloatingLines = 0; + mHasWrittenCiteBlockquote = true; + } else { + mCurrentLine.mIndentation.mLength -= kTabSize; + mFloatingLines = 1; + } + mLineBreakDue = true; + } else if (aTag == nsGkAtoms::q) { + Write(u"\""_ns); + } else if (IsCssBlockLevelElement(mElement)) { + // All other blocks get 1 vertical space after them + // in formatted mode, otherwise 0. + // This is hard. Sometimes 0 is a better number, but + // how to know? 
+ if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + EnsureVerticalSpace(1); + } else { + if (mFloatingLines < 0) mFloatingLines = 0; + mLineBreakDue = true; + } + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + CloseContainerForOutputFormatted(aTag); + } + + return NS_OK; +} + +void nsPlainTextSerializer::CloseContainerForOutputFormatted( + const nsAtom* aTag) { + const bool currentNodeIsConverted = IsCurrentNodeConverted(); + + if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 || + aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) { + using HeaderStrategy = Settings::HeaderStrategy; + if ((mSettings.GetHeaderStrategy() == + HeaderStrategy::kIndentIncreasedWithHeaderLevel) || + (mSettings.GetHeaderStrategy() == + HeaderStrategy::kNumberHeadingsAndIndentSlightly)) { + mCurrentLine.mIndentation.mLength -= kIndentSizeHeaders; + } + if (mSettings.GetHeaderStrategy() == + HeaderStrategy::kIndentIncreasedWithHeaderLevel) { + for (int32_t i = HeaderLevel(aTag); i > 1; i--) { + // for h(x), run x-1 times + mCurrentLine.mIndentation.mLength -= kIndentIncrementHeaders; + } + } + EnsureVerticalSpace(1); + } else if (aTag == nsGkAtoms::a && !currentNodeIsConverted) { + nsAutoString url; + if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::href, url)) && + !url.IsEmpty()) { + nsAutoString temp; + temp.AssignLiteral(" <"); + temp += url; + temp.Append(char16_t('>')); + Write(temp); + } + } else if ((aTag == nsGkAtoms::sup || aTag == nsGkAtoms::sub) && + mSettings.GetStructs() && !currentNodeIsConverted) { + Write(kSpace); + } else if (aTag == nsGkAtoms::code && mSettings.GetStructs() && + !currentNodeIsConverted) { + Write(u"|"_ns); + } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) && + mSettings.GetStructs() && !currentNodeIsConverted) { + Write(u"*"_ns); + } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) && + mSettings.GetStructs() && !currentNodeIsConverted) { + 
Write(u"/"_ns); + } else if (aTag == nsGkAtoms::u && mSettings.GetStructs() && + !currentNodeIsConverted) { + Write(u"_"_ns); + } +} + +bool nsPlainTextSerializer::MustSuppressLeaf() const { + if (mIgnoredChildNodeLevel > 0) { + return true; + } + + if ((mTagStackIndex > 1 && + mTagStack[mTagStackIndex - 2] == nsGkAtoms::select) || + (mTagStackIndex > 0 && + mTagStack[mTagStackIndex - 1] == nsGkAtoms::select)) { + // Don't output the contents of SELECT elements; + // Might be nice, eventually, to output just the selected element. + // Read more in bug 31994. + return true; + } + + return false; +} + +void nsPlainTextSerializer::DoAddText() { DoAddText(true, u""_ns); } + +void nsPlainTextSerializer::DoAddText(bool aIsLineBreak, + const nsAString& aText) { + // If we don't want any output, just return + if (!DoOutput()) { + return; + } + + if (!aIsLineBreak) { + // Make sure to reset this, since it's no longer true. + mHasWrittenCiteBlockquote = false; + } + + if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines); + + if (MustSuppressLeaf()) { + return; + } + + if (aIsLineBreak) { + // The only times we want to pass along whitespace from the original + // html source are if we're forced into preformatted mode via flags, + // or if we're prettyprinting and we're inside a <pre>. + // Otherwise, either we're collapsing to minimal text, or we're + // prettyprinting to mimic the html format, and in neither case + // does the formatting of the html source help us. + if (mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted) || + (mPreFormattedMail && !mSettings.GetWrapColumn()) || + IsElementPreformatted()) { + EnsureVerticalSpace(mEmptyLines + 1); + } else if (!mInWhitespace) { + Write(kSpace); + mInWhitespace = true; + } + return; + } + + Write(aText); +} + +void CreateLineOfDashes(nsAString& aResult, const uint32_t aWrapColumn) { + MOZ_ASSERT(aResult.IsEmpty()); + + const uint32_t width = (aWrapColumn > 0 ? 
aWrapColumn : 25); + while (aResult.Length() < width) { + aResult.Append(char16_t('-')); + } +} + +nsresult nsPlainTextSerializer::DoAddLeaf(const nsAtom* aTag) { + mPreformattedBlockBoundary = false; + + if (!DoOutput()) { + return NS_OK; + } + + if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines); + + if (MustSuppressLeaf()) { + return NS_OK; + } + + if (aTag == nsGkAtoms::br) { + // Another egregious editor workaround, see bug 38194: + // ignore the bogus br tags that the editor sticks here and there. + // FYI: `brElement` may be `nullptr` if the element is <br> element + // of non-HTML element. + // XXX Do we need to call `EnsureVerticalSpace()` when the <br> element + // is not an HTML element? + HTMLBRElement* brElement = HTMLBRElement::FromNodeOrNull(mElement); + if (!brElement || !brElement->IsPaddingForEmptyLastLine()) { + EnsureVerticalSpace(mEmptyLines + 1); + } + } else if (aTag == nsGkAtoms::hr && + mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) { + EnsureVerticalSpace(0); + + // Make a line of dashes as wide as the wrap width + // XXX honoring percentage would be nice + nsAutoString line; + CreateLineOfDashes(line, mSettings.GetWrapColumn()); + Write(line); + + EnsureVerticalSpace(0); + } else if (aTag == nsGkAtoms::img) { + /* Output (in decreasing order of preference) + alt, title or nothing */ + // See <http://www.w3.org/TR/REC-html40/struct/objects.html#edef-IMG> + nsAutoString imageDescription; + if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::alt, imageDescription))) { + // If the alt attribute has an empty value (|alt=""|), output nothing + } else if (NS_SUCCEEDED( + GetAttributeValue(nsGkAtoms::title, imageDescription)) && + !imageDescription.IsEmpty()) { + imageDescription = u" ["_ns + imageDescription + u"] "_ns; + } + + Write(imageDescription); + } + + return NS_OK; +} + +/** + * Adds as many newline as necessary to get |aNumberOfRows| empty lines + * + * aNumberOfRows = -1 : Being in the middle of some line of text + * 
aNumberOfRows = 0 : Being at the start of a line + * aNumberOfRows = n>0 : Having n empty lines before the current line. + */ +void nsPlainTextSerializer::EnsureVerticalSpace(const int32_t aNumberOfRows) { + // If we have something in the indent we probably want to output + // it and it's not included in the count for empty lines so we don't + // realize that we should start a new line. + if (aNumberOfRows >= 0 && !mCurrentLine.mIndentation.mHeader.IsEmpty()) { + EndLine(false); + mInWhitespace = true; + } + + while (mEmptyLines < aNumberOfRows) { + EndLine(false); + mInWhitespace = true; + } + mLineBreakDue = false; + mFloatingLines = -1; +} + +void nsPlainTextSerializer::OutputManager::Flush(CurrentLine& aCurrentLine) { + if (!aCurrentLine.mContent.IsEmpty()) { + aCurrentLine.MaybeReplaceNbspsInContent(mFlags); + + Append(aCurrentLine, StripTrailingWhitespaces::kNo); + + aCurrentLine.ResetContentAndIndentationHeader(); + } +} + +static bool IsSpaceStuffable(const char16_t* s) { + return (s[0] == '>' || s[0] == ' ' || s[0] == kNBSP || + NS_strncmp(s, u"From ", 5) == 0); +} + +void nsPlainTextSerializer::MaybeWrapAndOutputCompleteLines() { + if (!mSettings.MayWrap()) { + return; + } + + // Yes, wrap! + // The "+4" is to avoid wrap lines that only would be a couple + // of letters too long. We give this bonus only if the + // wrapcolumn is more than 20. + const uint32_t wrapColumn = mSettings.GetWrapColumn(); + uint32_t bonuswidth = (wrapColumn > 20) ? 4 : 0; + while (!mCurrentLine.mContent.IsEmpty()) { + const uint32_t prefixwidth = mCurrentLine.DeterminePrefixWidth(); + // The width of the line as it will appear on the screen (approx.). 
+ const uint32_t currentLineContentWidth = + GetUnicharStringWidth(mCurrentLine.mContent); + if (currentLineContentWidth + prefixwidth <= wrapColumn + bonuswidth) { + break; + } + + const int32_t goodSpace = + mCurrentLine.FindWrapIndexForContent(wrapColumn, mUseLineBreaker); + + const int32_t contentLength = mCurrentLine.mContent.Length(); + if (goodSpace <= 0 || goodSpace >= contentLength) { + // Nothing to do. Hopefully we get more data later to use for a place to + // break line. + break; + } + // Found a place to break + // -1 (trim a char at the break position) only if the line break was a + // space. + nsAutoString restOfContent; + if (nsCRT::IsAsciiSpace(mCurrentLine.mContent.CharAt(goodSpace))) { + mCurrentLine.mContent.Right(restOfContent, contentLength - goodSpace - 1); + } else { + mCurrentLine.mContent.Right(restOfContent, contentLength - goodSpace); + } + // if breaker was U+0020, it has to consider for delsp=yes support + const bool breakBySpace = mCurrentLine.mContent.CharAt(goodSpace) == ' '; + mCurrentLine.mContent.Truncate(goodSpace); + EndLine(true, breakBySpace); + mCurrentLine.mContent.Truncate(); + // Space stuffing a la RFC 2646 (format=flowed) + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + mCurrentLine.mSpaceStuffed = !restOfContent.IsEmpty() && + IsSpaceStuffable(restOfContent.get()) && + // We space-stuff quoted lines anyway + mCurrentLine.mCiteQuoteLevel == 0; + } + mCurrentLine.mContent.Append(restOfContent); + mEmptyLines = -1; + } +} + +/** + * This function adds a piece of text to the current stored line. If we are + * wrapping text and the stored line will become too long, a suitable + * location to wrap will be found and the line that's complete will be + * output. 
+ */ +void nsPlainTextSerializer::AddToLine(const char16_t* aLineFragment, + int32_t aLineFragmentLength) { + if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines); + + if (mCurrentLine.mContent.IsEmpty()) { + if (0 == aLineFragmentLength) { + return; + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + // Space stuffing a la RFC 2646 (format=flowed). + // We space-stuff quoted lines anyway + mCurrentLine.mSpaceStuffed = + IsSpaceStuffable(aLineFragment) && mCurrentLine.mCiteQuoteLevel == 0; + } + mEmptyLines = -1; + } + + mCurrentLine.mContent.Append(aLineFragment, aLineFragmentLength); + + MaybeWrapAndOutputCompleteLines(); +} + +// The signature separator (RFC 2646). +const char kSignatureSeparator[] = "-- "; + +// The OpenPGP dash-escaped signature separator in inline +// signed messages according to the OpenPGP standard (RFC 2440). +const char kDashEscapedSignatureSeparator[] = "- -- "; + +static bool IsSignatureSeparator(const nsAString& aString) { + return aString.EqualsLiteral(kSignatureSeparator) || + aString.EqualsLiteral(kDashEscapedSignatureSeparator); +} + +/** + * Outputs the contents of mCurrentLine.mContent, and resets line + * specific variables. Also adds an indentation and prefix if there is one + * specified. Strips ending spaces from the line if it isn't preformatted. + */ +void nsPlainTextSerializer::EndLine(bool aSoftLineBreak, bool aBreakBySpace) { + if (aSoftLineBreak && mCurrentLine.mContent.IsEmpty()) { + // No meaning + return; + } + + /* In non-preformatted mode, remove spaces from the end of the line for + * format=flowed compatibility. Don't do this for these special cases: + * "-- ", the signature separator (RFC 2646) shouldn't be touched and + * "- -- ", the OpenPGP dash-escaped signature separator in inline + * signed messages according to the OpenPGP standard (RFC 2440). 
+ */ + if (!mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted) && + (aSoftLineBreak || !IsSignatureSeparator(mCurrentLine.mContent))) { + mCurrentLine.mContent.Trim(" ", false, true, false); + } + + if (aSoftLineBreak && + mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed) && + !mCurrentLine.mIndentation.mLength) { + // Add the soft part of the soft linebreak (RFC 2646 4.1) + // We only do this when there is no indentation since format=flowed + // lines and indentation doesn't work well together. + + // If breaker character is ASCII space with RFC 3676 support (delsp=yes), + // add twice space. + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatDelSp) && + aBreakBySpace) { + mCurrentLine.mContent.AppendLiteral(" "); + } else { + mCurrentLine.mContent.Append(char16_t(' ')); + } + } + + if (aSoftLineBreak) { + mEmptyLines = 0; + } else { + // Hard break + if (mCurrentLine.HasContentOrIndentationHeader()) { + mEmptyLines = 0; + } else { + mEmptyLines++; + } + } + + MOZ_ASSERT(mOutputManager); + + mCurrentLine.MaybeReplaceNbspsInContent(mSettings.GetFlags()); + + // If we don't have anything "real" to output we have to + // make sure the indent doesn't end in a space since that + // would trick a format=flowed-aware receiver. + mOutputManager->Append(mCurrentLine, + OutputManager::StripTrailingWhitespaces::kMaybe); + mOutputManager->AppendLineBreak(); + mCurrentLine.ResetContentAndIndentationHeader(); + mInWhitespace = true; + mLineBreakDue = false; + mFloatingLines = -1; +} + +/** + * Creates the calculated and stored indent and text in the indentation. That is + * quote chars and numbers for numbered lists and such. 
+ */ +void nsPlainTextSerializer::CurrentLine::CreateQuotesAndIndent( + nsAString& aResult) const { + // Put the mail quote "> " chars in, if appropriate: + if (mCiteQuoteLevel > 0) { + nsAutoString quotes; + for (int i = 0; i < mCiteQuoteLevel; i++) { + quotes.Append(char16_t('>')); + } + if (!mContent.IsEmpty()) { + /* Better don't output a space here, if the line is empty, + in case a receiving format=flowed-aware UA thinks, this were a flowed + line, which it isn't - it's just empty. (Flowed lines may be joined + with the following one, so the empty line may be lost completely.) */ + quotes.Append(char16_t(' ')); + } + aResult = quotes; + } + + // Indent if necessary + int32_t indentwidth = mIndentation.mLength - mIndentation.mHeader.Length(); + if (mSpaceStuffed) { + indentwidth += 1; + } + + // Don't make empty lines look flowed + if (indentwidth > 0 && HasContentOrIndentationHeader()) { + nsAutoString spaces; + for (int i = 0; i < indentwidth; ++i) { + spaces.Append(char16_t(' ')); + } + aResult += spaces; + } + + if (!mIndentation.mHeader.IsEmpty()) { + aResult += mIndentation.mHeader; + } +} + +static bool IsLineFeedCarriageReturnBlankOrTab(char16_t c) { + return ('\n' == c || '\r' == c || ' ' == c || '\t' == c); +} + +static void ReplaceVisiblyTrailingNbsps(nsAString& aString) { + const int32_t totLen = aString.Length(); + for (int32_t i = totLen - 1; i >= 0; i--) { + char16_t c = aString[i]; + if (IsLineFeedCarriageReturnBlankOrTab(c)) { + continue; + } + if (kNBSP == c) { + aString.Replace(i, 1, ' '); + } else { + break; + } + } +} + +void nsPlainTextSerializer::ConvertToLinesAndOutput(const nsAString& aString) { + const int32_t totLen = aString.Length(); + int32_t newline{0}; + + // Put the mail quote "> " chars in, if appropriate. + // Have to put it in before every line. 
+ int32_t bol = 0; + while (bol < totLen) { + bool outputLineBreak = false; + bool spacesOnly = true; + + // Find one of '\n' or '\r' using iterators since nsAString + // doesn't have the old FindCharInSet function. + nsAString::const_iterator iter; + aString.BeginReading(iter); + nsAString::const_iterator done_searching; + aString.EndReading(done_searching); + iter.advance(bol); + int32_t new_newline = bol; + newline = kNotFound; + while (iter != done_searching) { + if ('\n' == *iter || '\r' == *iter) { + newline = new_newline; + break; + } + if (' ' != *iter) { + spacesOnly = false; + } + ++new_newline; + ++iter; + } + + // Done searching + nsAutoString stringpart; + if (newline == kNotFound) { + // No new lines. + stringpart.Assign(Substring(aString, bol, totLen - bol)); + if (!stringpart.IsEmpty()) { + char16_t lastchar = stringpart.Last(); + mInWhitespace = IsLineFeedCarriageReturnBlankOrTab(lastchar); + } + mEmptyLines = -1; + bol = totLen; + } else { + // There is a newline + stringpart.Assign(Substring(aString, bol, newline - bol)); + mInWhitespace = true; + outputLineBreak = true; + mEmptyLines = 0; + bol = newline + 1; + if ('\r' == *iter && bol < totLen && '\n' == *++iter) { + // There was a CRLF in the input. This used to be illegal and + // stripped by the parser. Apparently not anymore. Let's skip + // over the LF. 
+ bol++; + } + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + if ((outputLineBreak || !spacesOnly) && // bugs 261467,125928 + !IsQuotedLine(stringpart) && !IsSignatureSeparator(stringpart)) { + stringpart.Trim(" ", false, true, true); + } + mCurrentLine.mSpaceStuffed = + IsSpaceStuffable(stringpart.get()) && !IsQuotedLine(stringpart); + } + mCurrentLine.mContent.Append(stringpart); + + mCurrentLine.MaybeReplaceNbspsInContent(mSettings.GetFlags()); + + mOutputManager->Append(mCurrentLine, + OutputManager::StripTrailingWhitespaces::kNo); + if (outputLineBreak) { + mOutputManager->AppendLineBreak(); + } + + mCurrentLine.ResetContentAndIndentationHeader(); + } + +#ifdef DEBUG_wrapping + printf("No wrapping: newline is %d, totLen is %d\n", newline, totLen); +#endif +} + +/** + * Write a string. This is the highlevel function to use to get text output. + * By using AddToLine, Output, EndLine and other functions it handles quotation, + * line wrapping, indentation, whitespace compression and other things. + */ +void nsPlainTextSerializer::Write(const nsAString& aStr) { + // XXX Copy necessary to use nsString methods and gain + // access to underlying buffer + nsAutoString str(aStr); + +#ifdef DEBUG_wrapping + printf("Write(%s): wrap col = %d\n", NS_ConvertUTF16toUTF8(str).get(), + mSettings.GetWrapColumn()); +#endif + + const int32_t totLen = str.Length(); + + // If the string is empty, do nothing: + if (totLen <= 0) return; + + // For Flowed text change nbsp-ses to spaces at end of lines to allow them + // to be cut off along with usual spaces if required. (bug #125928) + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + ReplaceVisiblyTrailingNbsps(str); + } + + // We have two major codepaths here. One that does preformatted text and one + // that does normal formatted text. The one for preformatted text calls + // Output directly while the other code path goes through AddToLine. 
+ if ((mPreFormattedMail && !mSettings.GetWrapColumn()) || + (IsElementPreformatted() && !mPreFormattedMail) || + (mSpanLevel > 0 && mEmptyLines >= 0 && IsQuotedLine(str))) { + // No intelligent wrapping. + + // This mustn't be mixed with intelligent wrapping without clearing + // the mCurrentLine.mContent buffer before!!! + NS_ASSERTION(mCurrentLine.mContent.IsEmpty() || + (IsElementPreformatted() && !mPreFormattedMail), + "Mixed wrapping data and nonwrapping data on the same line"); + MOZ_ASSERT(mOutputManager); + + if (!mCurrentLine.mContent.IsEmpty()) { + mOutputManager->Flush(mCurrentLine); + } + + ConvertToLinesAndOutput(str); + return; + } + + // Intelligent handling of text + // If needed, strip out all "end of lines" + // and multiple whitespace between words + int32_t nextpos; + const char16_t* offsetIntoBuffer = nullptr; + + int32_t bol = 0; + while (bol < totLen) { // Loop over lines + // Find a place where we may have to do whitespace compression + nextpos = str.FindCharInSet(u" \t\n\r", bol); +#ifdef DEBUG_wrapping + nsAutoString remaining; + str.Right(remaining, totLen - bol); + foo = ToNewCString(remaining); + // printf("Next line: bol = %d, newlinepos = %d, totLen = %d, " + // "string = '%s'\n", bol, nextpos, totLen, foo); + free(foo); +#endif + + if (nextpos == kNotFound) { + // The rest of the string + offsetIntoBuffer = str.get() + bol; + AddToLine(offsetIntoBuffer, totLen - bol); + bol = totLen; + mInWhitespace = false; + } else { + // There's still whitespace left in the string + if (nextpos != 0 && (nextpos + 1) < totLen) { + offsetIntoBuffer = str.get() + nextpos; + // skip '\n' if it is between CJ chars + if (offsetIntoBuffer[0] == '\n' && IS_CJ_CHAR(offsetIntoBuffer[-1]) && + IS_CJ_CHAR(offsetIntoBuffer[1])) { + offsetIntoBuffer = str.get() + bol; + AddToLine(offsetIntoBuffer, nextpos - bol); + bol = nextpos + 1; + continue; + } + } + // If we're already in whitespace and not preformatted, just skip it: + if (mInWhitespace && (nextpos == 
bol) && !mPreFormattedMail && + !mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted)) { + // Skip whitespace + bol++; + continue; + } + + if (nextpos == bol) { + // Note that we are in whitespace. + mInWhitespace = true; + offsetIntoBuffer = str.get() + nextpos; + AddToLine(offsetIntoBuffer, 1); + bol++; + continue; + } + + mInWhitespace = true; + + offsetIntoBuffer = str.get() + bol; + if (mPreFormattedMail || + mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted)) { + // Preserve the real whitespace character + nextpos++; + AddToLine(offsetIntoBuffer, nextpos - bol); + bol = nextpos; + } else { + // Replace the whitespace with a space + AddToLine(offsetIntoBuffer, nextpos - bol); + AddToLine(kSpace.get(), 1); + bol = nextpos + 1; // Let's eat the whitespace + } + } + } // Continue looping over the string +} + +/** + * Gets the value of an attribute in a string. If the function returns + * NS_ERROR_NOT_AVAILABLE, there was none such attribute specified. + */ +nsresult nsPlainTextSerializer::GetAttributeValue(const nsAtom* aName, + nsString& aValueRet) const { + if (mElement) { + if (mElement->GetAttr(aName, aValueRet)) { + return NS_OK; + } + } + + return NS_ERROR_NOT_AVAILABLE; +} + +/** + * Returns true, if the element was inserted by Moz' TXT->HTML converter. + * In this case, we should ignore it. + */ +bool nsPlainTextSerializer::IsCurrentNodeConverted() const { + nsAutoString value; + nsresult rv = GetAttributeValue(nsGkAtoms::_class, value); + return (NS_SUCCEEDED(rv) && + (StringBeginsWith(value, u"moz-txt"_ns, + nsASCIICaseInsensitiveStringComparator) || + StringBeginsWith(value, u"\"moz-txt"_ns, + nsASCIICaseInsensitiveStringComparator))); +} + +// static +nsAtom* nsPlainTextSerializer::GetIdForContent(nsIContent* aContent) { + if (!aContent->IsHTMLElement()) { + return nullptr; + } + + nsAtom* localName = aContent->NodeInfo()->NameAtom(); + return localName->IsStatic() ? 
localName : nullptr; +} + +bool nsPlainTextSerializer::IsElementPreformatted() const { + return !mPreformatStack.empty() && mPreformatStack.top(); +} + +bool nsPlainTextSerializer::IsElementPreformatted(Element* aElement) { + RefPtr<const ComputedStyle> computedStyle = + nsComputedDOMStyle::GetComputedStyleNoFlush(aElement); + if (computedStyle) { + const nsStyleText* textStyle = computedStyle->StyleText(); + return textStyle->WhiteSpaceOrNewlineIsSignificant(); + } + // Fall back to looking at the tag, in case there is no style information. + return GetIdForContent(aElement) == nsGkAtoms::pre; +} + +bool nsPlainTextSerializer::IsCssBlockLevelElement(Element* aElement) { + RefPtr<const ComputedStyle> computedStyle = + nsComputedDOMStyle::GetComputedStyleNoFlush(aElement); + if (computedStyle) { + const nsStyleDisplay* displayStyle = computedStyle->StyleDisplay(); + return displayStyle->IsBlockOutsideStyle(); + } + // Fall back to looking at the tag, in case there is no style information. + return nsContentUtils::IsHTMLBlockLevelElement(aElement); +} + +/** + * This method is required only to identify LI's inside OL. + * Returns TRUE if we are inside an OL tag and FALSE otherwise. + */ +bool nsPlainTextSerializer::IsInOL() const { + int32_t i = mTagStackIndex; + while (--i >= 0) { + if (mTagStack[i] == nsGkAtoms::ol) return true; + if (mTagStack[i] == nsGkAtoms::ul) { + // If a UL is reached first, LI belongs the UL nested in OL. + return false; + } + } + // We may reach here for orphan LI's. 
+ return false; +} + +bool nsPlainTextSerializer::IsInOlOrUl() const { + return (mULCount > 0) || !mOLStack.IsEmpty(); +} + +/* + @return 0 = no header, 1 = h1, ..., 6 = h6 +*/ +int32_t HeaderLevel(const nsAtom* aTag) { + if (aTag == nsGkAtoms::h1) { + return 1; + } + if (aTag == nsGkAtoms::h2) { + return 2; + } + if (aTag == nsGkAtoms::h3) { + return 3; + } + if (aTag == nsGkAtoms::h4) { + return 4; + } + if (aTag == nsGkAtoms::h5) { + return 5; + } + if (aTag == nsGkAtoms::h6) { + return 6; + } + return 0; +} + +/* These functions define the column width of an ISO 10646 character + * as follows: + * + * - The null character (U+0000) has a column width of 0. + * + * - Other C0/C1 control characters and DEL will lead to a return + * value of -1. + * + * - Non-spacing and enclosing combining characters (general + * category code Mn or Me in the Unicode database) have a + * column width of 0. + * + * - Spacing characters in the East Asian Wide (W) or East Asian + * FullWidth (F) category as defined in Unicode Technical + * Report #11 have a column width of 2. + * + * - All remaining characters (including all printable + * ISO 8859-1 and WGL4 characters, Unicode control characters, + * etc.) have a column width of 1. + */ + +int32_t GetUnicharWidth(char32_t aCh) { + /* test for 8-bit control characters */ + if (aCh == 0) { + return 0; + } + if (aCh < 32 || (aCh >= 0x7f && aCh < 0xa0)) { + return -1; + } + + /* The first combining char in Unicode is U+0300 */ + if (aCh < 0x0300) { + return 1; + } + + auto gc = unicode::GetGeneralCategory(aCh); + if (gc == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK || + gc == HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) { + return 0; + } + + /* if we arrive here, ucs is not a combining or C0/C1 control character */ + + /* fast test for majority of non-wide scripts */ + if (aCh < 0x1100) { + return 1; + } + + return intl::UnicodeProperties::IsEastAsianWidthFW(aCh) ? 
2 : 1; +} + +int32_t GetUnicharStringWidth(Span<const char16_t> aString) { + int32_t width = 0; + for (auto iter = aString.begin(); iter != aString.end(); ++iter) { + char32_t c = *iter; + if (NS_IS_HIGH_SURROGATE(c) && (iter + 1) != aString.end() && + NS_IS_LOW_SURROGATE(*(iter + 1))) { + c = SURROGATE_TO_UCS4(c, *++iter); + } + const int32_t w = GetUnicharWidth(c); + // Taking 1 as the width of non-printable character, for bug 94475. + width += (w < 0 ? 1 : w); + } + return width; +} diff --git a/dom/serializers/nsPlainTextSerializer.h b/dom/serializers/nsPlainTextSerializer.h new file mode 100644 index 0000000000..4afd83f1a0 --- /dev/null +++ b/dom/serializers/nsPlainTextSerializer.h @@ -0,0 +1,386 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ + +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert a DOM into plaintext in a nice way + * (eg for copy/paste as plaintext). 
/**
 * nsIContentSerializer implementation. Judging by its name and the
 * `converter.html2txt.*` prefs referenced in Settings below, it turns the
 * content handed to it into plain text (NOTE(review): confirm against the
 * implementation file, which is not part of this header).
 *
 * Processing instructions, comments and doctypes are accepted but produce no
 * output: the corresponding Append* overrides simply return NS_OK.
 */
class nsPlainTextSerializer final : public nsIContentSerializer {
 public:
  nsPlainTextSerializer();

  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
  NS_DECL_CYCLE_COLLECTION_CLASS(nsPlainTextSerializer)

  // nsIContentSerializer
  NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn,
                  const mozilla::Encoding* aEncoding, bool aIsCopying,
                  bool aIsWholeDocument, bool* aNeedsPreformatScanning,
                  nsAString& aOutput) override;

  NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset,
                        int32_t aEndOffset) override;
  NS_IMETHOD AppendCDATASection(nsIContent* aCDATASection, int32_t aStartOffset,
                                int32_t aEndOffset) override;
  // Intentionally a no-op: nothing is emitted for processing instructions.
  NS_IMETHOD AppendProcessingInstruction(
      mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset,
      int32_t aEndOffset) override {
    return NS_OK;
  }
  // Intentionally a no-op: nothing is emitted for comments.
  NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment,
                           int32_t aStartOffset, int32_t aEndOffset) override {
    return NS_OK;
  }
  // Intentionally a no-op: nothing is emitted for the doctype.
  NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override {
    return NS_OK;
  }
  NS_IMETHOD AppendElementStart(
      mozilla::dom::Element* aElement,
      mozilla::dom::Element* aOriginalElement) override;
  NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement,
                              mozilla::dom::Element* aOriginalElement) override;

  NS_IMETHOD FlushAndFinish() override;

  NS_IMETHOD Finish() override;

  NS_IMETHOD GetOutputLength(uint32_t& aLength) const override;

  NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override;

  NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) override;
  NS_IMETHOD ForgetElementForPreformat(
      mozilla::dom::Element* aElement) override;

 private:
  ~nsPlainTextSerializer();

  nsresult GetAttributeValue(const nsAtom* aName, nsString& aValueRet) const;
  void AddToLine(const char16_t* aStringToAdd, int32_t aLength);

  void MaybeWrapAndOutputCompleteLines();

  // @param aSoftLineBreak A soft line break is a space followed by a linebreak
  // (cf. https://www.ietf.org/rfc/rfc3676.txt, section 4.2).
  void EndLine(bool aSoftLineBreak, bool aBreakBySpace = false);

  void EnsureVerticalSpace(int32_t noOfRows);

  void ConvertToLinesAndOutput(const nsAString& aString);

  void Write(const nsAString& aString);

  // @return true, iff the elements' whitespace and newline characters have to
  //         be preserved according to its style or because it's a `<pre>`
  //         element.
  bool IsElementPreformatted() const;
  bool IsInOL() const;
  bool IsInOlOrUl() const;
  bool IsCurrentNodeConverted() const;
  bool MustSuppressLeaf() const;

  /**
   * Returns the local name of the element as an atom if the element is an
   * HTML element and the atom is a static atom. Otherwise, nullptr is returned.
   */
  static nsAtom* GetIdForContent(nsIContent* aContent);
  nsresult DoOpenContainer(const nsAtom* aTag);
  void OpenContainerForOutputFormatted(const nsAtom* aTag);
  nsresult DoCloseContainer(const nsAtom* aTag);
  void CloseContainerForOutputFormatted(const nsAtom* aTag);
  nsresult DoAddLeaf(const nsAtom* aTag);

  void DoAddText();
  // @param aText Ignored if aIsLineBreak is true.
  void DoAddText(bool aIsLineBreak, const nsAString& aText);

  // Output is produced only while mHeadLevel is zero.
  inline bool DoOutput() const { return mHeadLevel == 0; }

  // @return true iff aLine is non-empty and its first character is '>'.
  static inline bool IsQuotedLine(const nsAString& aLine) {
    return !aLine.IsEmpty() && aLine.First() == char16_t('>');
  }

  // Stack handling functions
  bool GetLastBool(const nsTArray<bool>& aStack);
  void SetLastBool(nsTArray<bool>& aStack, bool aValue);
  void PushBool(nsTArray<bool>& aStack, bool aValue);
  bool PopBool(nsTArray<bool>& aStack);

  bool IsIgnorableRubyAnnotation(const nsAtom* aTag) const;

  // @return true, iff the elements' whitespace and newline characters have to
  //         be preserved according to its style or because it's a `<pre>`
  //         element.
  static bool IsElementPreformatted(mozilla::dom::Element* aElement);

  // https://drafts.csswg.org/css-display/#block-level
  static bool IsCssBlockLevelElement(mozilla::dom::Element* aElement);

 private:
  // Output is suppressed while this is non-zero (see DoOutput()).
  uint32_t mHeadLevel;

  // Bundles the encoder flags, the wrap column and the pref-derived options.
  class Settings {
   public:
    enum class HeaderStrategy {
      kNoIndentation,
      kIndentIncreasedWithHeaderLevel,
      kNumberHeadingsAndIndentSlightly
    };

    // May adapt the flags.
    //
    // @param aFlags As defined in nsIDocumentEncoder.idl.
    void Init(int32_t aFlags, uint32_t aWrapColumn);

    // Pref: converter.html2txt.structs.
    bool GetStructs() const { return mStructs; }

    // Pref: converter.html2txt.header_strategy.
    HeaderStrategy GetHeaderStrategy() const { return mHeaderStrategy; }

    // @return As defined in nsIDocumentEncoder.idl.
    int32_t GetFlags() const { return mFlags; }

    // @param aFlag As defined in nsIDocumentEncoder.idl. May consist of
    //              multiple bitwise or'd flags.
    // @return true if at least one of the bits in aFlag is set in the flags.
    bool HasFlag(int32_t aFlag) const { return mFlags & aFlag; }

    // Whether the output should include ruby annotations.
    bool GetWithRubyAnnotation() const { return mWithRubyAnnotation; }

    uint32_t GetWrapColumn() const { return mWrapColumn; }

    // Wrapping requires a non-zero wrap column and at least one of the
    // OutputFormatted / OutputWrap flags.
    bool MayWrap() const {
      return GetWrapColumn() && HasFlag(nsIDocumentEncoder::OutputFormatted |
                                        nsIDocumentEncoder::OutputWrap);
    }

    bool MayBreakLines() const {
      return !HasFlag(nsIDocumentEncoder::OutputDisallowLineBreaking);
    }

   private:
    // @param aPrefHeaderStrategy Pref: converter.html2txt.header_strategy.
    static HeaderStrategy Convert(int32_t aPrefHeaderStrategy);

    // Pref: converter.html2txt.structs.
    bool mStructs = true;

    // Pref: converter.html2txt.header_strategy.
    HeaderStrategy mHeaderStrategy =
        HeaderStrategy::kIndentIncreasedWithHeaderLevel;

    // Flags defined in nsIDocumentEncoder.idl.
    int32_t mFlags = 0;

    // Whether the output should include ruby annotations.
    bool mWithRubyAnnotation = false;

    // The wrap column is how many fixed-pitch narrow
    // (https://unicode.org/reports/tr11/) (e.g. Latin) characters
    // should be allowed on a line. There could be fewer chars if the chars
    // are wider than Latin chars, or more if the chars are narrower.
    uint32_t mWrapColumn = 0;
  };

  Settings mSettings;

  struct Indentation {
    // The number of space characters to be inserted including the length of
    // mHeader.
    int32_t mLength = 0;

    // The header that has to be written in the indent.
    // That could be, for instance, the bullet in a bulleted list.
    nsString mHeader;
  };

  // State of the line that is currently being assembled.
  class CurrentLine {
   public:
    void ResetContentAndIndentationHeader();

    // @param aFlags As defined in nsIDocumentEncoder.idl.
    void MaybeReplaceNbspsInContent(int32_t aFlags);

    void CreateQuotesAndIndent(nsAString& aResult) const;

    bool HasContentOrIndentationHeader() const {
      return !mContent.IsEmpty() || !mIndentation.mHeader.IsEmpty();
    }

    // NOTE(review): this comment used to document a nullable `aLineBreaker`
    // parameter, which is not part of the signature below. Presumably
    // aUseLineBreaker selects whether a line breaker is consulted when
    // looking for the wrap position -- confirm against the definition.
    int32_t FindWrapIndexForContent(uint32_t aWrapColumn,
                                    bool aUseLineBreaker) const;

    // @return Combined width of cite quote level and indentation.
    uint32_t DeterminePrefixWidth() const {
      // XXX: Should calculate prefixwidth with GetUnicharStringWidth
      return (mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1 : 0) +
             mIndentation.mLength + uint32_t(mSpaceStuffed);
    }

    Indentation mIndentation;

    // The number of '>' characters.
    int32_t mCiteQuoteLevel = 0;

    // Whether this line is getting space-stuffed, see
    // https://datatracker.ietf.org/doc/html/rfc2646#section-4.4
    bool mSpaceStuffed = false;

    // Excludes indentation and quotes.
    nsString mContent;
  };

  CurrentLine mCurrentLine;

  class OutputManager {
   public:
    /**
     *  @param aFlags As defined in nsIDocumentEncoder.idl.
     *  @param aOutput An empty string.
     */
    OutputManager(int32_t aFlags, nsAString& aOutput);

    enum class StripTrailingWhitespaces { kMaybe, kNo };

    void Append(const CurrentLine& aCurrentLine,
                StripTrailingWhitespaces aStripTrailingWhitespaces);

    void AppendLineBreak();

    /**
     * This empties the current line cache without adding a NEWLINE.
     * Should not be used if line wrapping is of importance since
     * this function destroys the cache information.
     *
     * It will also write indentation and quotes if we believe us to be
     * at the start of the line.
     */
    void Flush(CurrentLine& aCurrentLine);

    bool IsAtFirstColumn() const { return mAtFirstColumn; }

    uint32_t GetOutputLength() const;

   private:
    /**
     * @param aString Last character is expected to not be a line break.
     */
    void Append(const nsAString& aString);

    // As defined in nsIDocumentEncoder.idl.
    const int32_t mFlags;

    // Held by reference: the string passed to the constructor must outlive
    // this object.
    nsAString& mOutput;

    bool mAtFirstColumn;

    nsString mLineBreak;
  };

  mozilla::Maybe<OutputManager> mOutputManager;

  // If we've just written out a cite blockquote, we need to remember it
  // so we don't duplicate spaces before a <pre wrap> (which mail uses to quote
  // old messages).
  bool mHasWrittenCiteBlockquote;

  int32_t mFloatingLines;  // To store the number of lazy line breaks

  // Treat quoted text as though it's preformatted -- don't wrap it.
  // Having it on a pref is a temporary measure, See bug 69638.
  // NOTE(review): the two lines above do not appear to describe mSpanLevel;
  // they look like a leftover from a member that no longer exists. Confirm
  // and replace with a description of mSpanLevel.
  int32_t mSpanLevel;

  int32_t mEmptyLines;  // Will be the number of empty lines before
                        // the current. 0 if we are starting a new
                        // line and -1 if we are in a line.

  bool mInWhitespace;
  bool mPreFormattedMail;  // we're dealing with special DOM
                           // used by Thunderbird code.

  // While handling a new tag, this variable should remind if any line break
  // is due because of a closing tag. Setting it to "TRUE" while closing the
  // tags. Hence opening tags are guaranteed to start with appropriate line
  // breaks.
  bool mLineBreakDue;

  bool mPreformattedBlockBoundary;

  int32_t mHeaderCounter[7]; /* For header-numbering:
                                Number of previous headers of
                                the same depth and in the same
                                section.
                                mHeaderCounter[1] for <h1> etc. */

  RefPtr<mozilla::dom::Element> mElement;

  // For handling table rows
  AutoTArray<bool, 8> mHasWrittenCellsForRow;

  // Values gotten in OpenContainer that is (also) needed in CloseContainer
  AutoTArray<bool, 8> mIsInCiteBlockquote;

  // The tag stack: the stack of tags we're operating on, so we can nest.
  // The stack only ever points to static atoms, so they don't need to be
  // refcounted.
  const nsAtom** mTagStack;
  uint32_t mTagStackIndex;

  // The stack indicating whether the elements we've been operating on are
  // CSS preformatted elements, so that we can tell if the text inside them
  // should be formatted.
  std::stack<bool> mPreformatStack;

  // Content in the stack above this index should be ignored:
  uint32_t mIgnoreAboveIndex;

  // The stack for ordered lists
  AutoTArray<int32_t, 100> mOLStack;

  uint32_t mULCount;

  bool mUseLineBreaker = false;

  // Convenience constant. It would be nice to have it as a const static
  // variable, but that causes issues with OpenBSD and module unloading.
  const nsString kSpace;

  // mIgnoredChildNodeLevel is used to tell if current node is an ignorable
  // child node. The initial value of mIgnoredChildNodeLevel is 0. When
  // serializer enters those specific nodes, mIgnoredChildNodeLevel increases
  // and is greater than 0. Otherwise when serializer leaves those nodes,
  // mIgnoredChildNodeLevel decreases.
  uint32_t mIgnoredChildNodeLevel;
};
+ */ + +#include "nsXHTMLContentSerializer.h" + +#include "mozilla/dom/Element.h" +#include "nsIContent.h" +#include "mozilla/dom/Document.h" +#include "nsElementTable.h" +#include "nsNameSpaceManager.h" +#include "nsString.h" +#include "nsUnicharUtils.h" +#include "nsIDocumentEncoder.h" +#include "nsGkAtoms.h" +#include "nsIURI.h" +#include "nsNetUtil.h" +#include "nsEscape.h" +#include "nsCRT.h" +#include "nsContentUtils.h" +#include "nsIScriptElement.h" +#include "nsStubMutationObserver.h" +#include "nsAttrName.h" +#include "nsComputedDOMStyle.h" + +using namespace mozilla; +using namespace mozilla::dom; + +static const int32_t kLongLineLen = 128; + +#define kXMLNS "xmlns" + +nsresult NS_NewXHTMLContentSerializer(nsIContentSerializer** aSerializer) { + RefPtr<nsXHTMLContentSerializer> it = new nsXHTMLContentSerializer(); + it.forget(aSerializer); + return NS_OK; +} + +nsXHTMLContentSerializer::nsXHTMLContentSerializer() + : mIsHTMLSerializer(false), + mIsCopying(false), + mDisableEntityEncoding(0), + mRewriteEncodingDeclaration(false), + mIsFirstChildOfOL(false) {} + +nsXHTMLContentSerializer::~nsXHTMLContentSerializer() { + NS_ASSERTION(mOLStateStack.IsEmpty(), "Expected OL State stack to be empty"); +} + +NS_IMETHODIMP +nsXHTMLContentSerializer::Init(uint32_t aFlags, uint32_t aWrapColumn, + const Encoding* aEncoding, bool aIsCopying, + bool aRewriteEncodingDeclaration, + bool* aNeedsPreformatScanning, + nsAString& aOutput) { + // The previous version of the HTML serializer did implicit wrapping + // when there is no flags, so we keep wrapping in order to keep + // compatibility with the existing calling code + // XXXLJ perhaps should we remove this default settings later ? 
+ if (aFlags & nsIDocumentEncoder::OutputFormatted) { + aFlags = aFlags | nsIDocumentEncoder::OutputWrap; + } + + nsresult rv; + rv = nsXMLContentSerializer::Init(aFlags, aWrapColumn, aEncoding, aIsCopying, + aRewriteEncodingDeclaration, + aNeedsPreformatScanning, aOutput); + NS_ENSURE_SUCCESS(rv, rv); + + mRewriteEncodingDeclaration = aRewriteEncodingDeclaration; + mIsCopying = aIsCopying; + mIsFirstChildOfOL = false; + mInBody = 0; + mDisableEntityEncoding = 0; + mBodyOnly = (mFlags & nsIDocumentEncoder::OutputBodyOnly); + + return NS_OK; +} + +// See if the string has any lines longer than longLineLen: +// if so, we presume formatting is wonky (e.g. the node has been edited) +// and we'd better rewrap the whole text node. +bool nsXHTMLContentSerializer::HasLongLines(const nsString& text, + int32_t& aLastNewlineOffset) { + uint32_t start = 0; + uint32_t theLen = text.Length(); + bool rv = false; + aLastNewlineOffset = kNotFound; + for (start = 0; start < theLen;) { + int32_t eol = text.FindChar('\n', start); + if (eol < 0) { + eol = text.Length(); + } else { + aLastNewlineOffset = eol; + } + if (int32_t(eol - start) > kLongLineLen) rv = true; + start = eol + 1; + } + return rv; +} + +NS_IMETHODIMP +nsXHTMLContentSerializer::AppendText(nsIContent* aText, int32_t aStartOffset, + int32_t aEndOffset) { + NS_ENSURE_ARG(aText); + NS_ENSURE_STATE(mOutput); + + nsAutoString data; + nsresult rv; + + rv = AppendTextData(aText, aStartOffset, aEndOffset, data, true); + if (NS_FAILED(rv)) return NS_ERROR_FAILURE; + + if (mDoRaw || PreLevel() > 0) { + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoFormat) { + NS_ENSURE_TRUE(AppendToStringFormatedWrapped(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoWrap) { + NS_ENSURE_TRUE(AppendToStringWrapped(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else { + int32_t lastNewlineOffset = kNotFound; + if (HasLongLines(data, lastNewlineOffset)) { + // We have long lines, 
rewrap + mDoWrap = true; + bool result = AppendToStringWrapped(data, *mOutput); + mDoWrap = false; + NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + } + + return NS_OK; +} + +bool nsXHTMLContentSerializer::SerializeAttributes( + Element* aElement, Element* aOriginalElement, nsAString& aTagPrefix, + const nsAString& aTagNamespaceURI, nsAtom* aTagName, nsAString& aStr, + uint32_t aSkipAttr, bool aAddNSAttr) { + nsresult rv; + uint32_t index, count; + nsAutoString prefixStr, uriStr, valueStr; + nsAutoString xmlnsStr; + xmlnsStr.AssignLiteral(kXMLNS); + + int32_t contentNamespaceID = aElement->GetNameSpaceID(); + + MaybeSerializeIsValue(aElement, aStr); + + // this method is not called by nsHTMLContentSerializer + // so we don't have to check HTML element, just XHTML + + if (mIsCopying && kNameSpaceID_XHTML == contentNamespaceID) { + // Need to keep track of OL and LI elements in order to get ordinal number + // for the LI. + if (aTagName == nsGkAtoms::ol) { + // We are copying and current node is an OL; + // Store its start attribute value in olState->startVal. + nsAutoString start; + int32_t startAttrVal = 0; + aElement->GetAttr(nsGkAtoms::start, start); + if (!start.IsEmpty()) { + nsresult rv = NS_OK; + startAttrVal = start.ToInteger(&rv); + // If OL has "start" attribute, first LI element has to start with that + // value Therefore subtracting 1 as all the LI elements are incrementing + // it before using it; In failure of ToInteger(), default StartAttrValue + // to 0. + if (NS_SUCCEEDED(rv)) + --startAttrVal; + else + startAttrVal = 0; + } + olState state(startAttrVal, true); + mOLStateStack.AppendElement(state); + } else if (aTagName == nsGkAtoms::li) { + mIsFirstChildOfOL = IsFirstChildOfOL(aOriginalElement); + if (mIsFirstChildOfOL) { + // If OL is parent of this LI, serialize attributes in different manner. 
+ NS_ENSURE_TRUE(SerializeLIValueAttribute(aElement, aStr), false); + } + } + } + + // If we had to add a new namespace declaration, serialize + // and push it on the namespace stack + if (aAddNSAttr) { + if (aTagPrefix.IsEmpty()) { + // Serialize default namespace decl + NS_ENSURE_TRUE( + SerializeAttr(u""_ns, xmlnsStr, aTagNamespaceURI, aStr, true), false); + } else { + // Serialize namespace decl + NS_ENSURE_TRUE( + SerializeAttr(xmlnsStr, aTagPrefix, aTagNamespaceURI, aStr, true), + false); + } + PushNameSpaceDecl(aTagPrefix, aTagNamespaceURI, aOriginalElement); + } + + count = aElement->GetAttrCount(); + + // Now serialize each of the attributes + // XXX Unfortunately we need a namespace manager to get + // attribute URIs. + for (index = 0; index < count; index++) { + if (aSkipAttr == index) { + continue; + } + + dom::BorrowedAttrInfo info = aElement->GetAttrInfoAt(index); + const nsAttrName* name = info.mName; + + int32_t namespaceID = name->NamespaceID(); + nsAtom* attrName = name->LocalName(); + nsAtom* attrPrefix = name->GetPrefix(); + + // Filter out any attribute starting with [-|_]moz + nsDependentAtomString attrNameStr(attrName); + if (StringBeginsWith(attrNameStr, u"_moz"_ns) || + StringBeginsWith(attrNameStr, u"-moz"_ns)) { + continue; + } + + if (attrPrefix) { + attrPrefix->ToString(prefixStr); + } else { + prefixStr.Truncate(); + } + + bool addNSAttr = false; + if (kNameSpaceID_XMLNS != namespaceID) { + nsNameSpaceManager::GetInstance()->GetNameSpaceURI(namespaceID, uriStr); + addNSAttr = ConfirmPrefix(prefixStr, uriStr, aOriginalElement, true); + } + + info.mValue->ToString(valueStr); + + nsDependentAtomString nameStr(attrName); + bool isJS = false; + + if (kNameSpaceID_XHTML == contentNamespaceID) { + if (mIsCopying && mIsFirstChildOfOL && (aTagName == nsGkAtoms::li) && + (attrName == nsGkAtoms::value)) { + // This is handled separately in SerializeLIValueAttribute() + continue; + } + + isJS = IsJavaScript(aElement, attrName, namespaceID, 
valueStr); + + if (namespaceID == kNameSpaceID_None && + ((attrName == nsGkAtoms::href) || (attrName == nsGkAtoms::src))) { + // Make all links absolute when converting only the selection: + if (mFlags & nsIDocumentEncoder::OutputAbsoluteLinks) { + // Would be nice to handle OBJECT tags, + // but that gets more complicated since we have to + // search the tag list for CODEBASE as well. + // For now, just leave them relative. + nsIURI* uri = aElement->GetBaseURI(); + if (uri) { + nsAutoString absURI; + rv = NS_MakeAbsoluteURI(absURI, valueStr, uri); + if (NS_SUCCEEDED(rv)) { + valueStr = absURI; + } + } + } + } + + if (mRewriteEncodingDeclaration && aTagName == nsGkAtoms::meta && + attrName == nsGkAtoms::content) { + // If we're serializing a <meta http-equiv="content-type">, + // use the proper value, rather than what's in the document. + nsAutoString header; + aElement->GetAttr(nsGkAtoms::httpEquiv, header); + if (header.LowerCaseEqualsLiteral("content-type")) { + valueStr = + u"text/html; charset="_ns + NS_ConvertASCIItoUTF16(mCharset); + } + } + + // Expand shorthand attribute. + if (namespaceID == kNameSpaceID_None && + IsShorthandAttr(attrName, aTagName) && valueStr.IsEmpty()) { + valueStr = nameStr; + } + } else { + isJS = IsJavaScript(aElement, attrName, namespaceID, valueStr); + } + + NS_ENSURE_TRUE(SerializeAttr(prefixStr, nameStr, valueStr, aStr, !isJS), + false); + + if (addNSAttr) { + NS_ASSERTION(!prefixStr.IsEmpty(), + "Namespaced attributes must have a prefix"); + NS_ENSURE_TRUE(SerializeAttr(xmlnsStr, prefixStr, uriStr, aStr, true), + false); + PushNameSpaceDecl(prefixStr, uriStr, aOriginalElement); + } + } + + return true; +} + +bool nsXHTMLContentSerializer::AfterElementStart(nsIContent* aContent, + nsIContent* aOriginalElement, + nsAString& aStr) { + if (mRewriteEncodingDeclaration && aContent->IsHTMLElement(nsGkAtoms::head)) { + // Check if there already are any content-type meta children. 
+ // If there are, they will be modified to use the correct charset. + // If there aren't, we'll insert one here. + bool hasMeta = false; + for (nsIContent* child = aContent->GetFirstChild(); child; + child = child->GetNextSibling()) { + if (child->IsHTMLElement(nsGkAtoms::meta) && + child->AsElement()->HasAttr(nsGkAtoms::content)) { + nsAutoString header; + child->AsElement()->GetAttr(nsGkAtoms::httpEquiv, header); + + if (header.LowerCaseEqualsLiteral("content-type")) { + hasMeta = true; + break; + } + } + } + + if (!hasMeta) { + NS_ENSURE_TRUE(AppendNewLineToString(aStr), false); + if (mDoFormat) { + NS_ENSURE_TRUE(AppendIndentation(aStr), false); + } + NS_ENSURE_TRUE( + AppendToString(u"<meta http-equiv=\"content-type\""_ns, aStr), false); + NS_ENSURE_TRUE(AppendToString(u" content=\"text/html; charset="_ns, aStr), + false); + NS_ENSURE_TRUE(AppendToString(NS_ConvertASCIItoUTF16(mCharset), aStr), + false); + if (mIsHTMLSerializer) { + NS_ENSURE_TRUE(AppendToString(u"\">"_ns, aStr), false); + } else { + NS_ENSURE_TRUE(AppendToString(u"\" />"_ns, aStr), false); + } + } + } + + return true; +} + +void nsXHTMLContentSerializer::AfterElementEnd(nsIContent* aContent, + nsAString& aStr) { + NS_ASSERTION(!mIsHTMLSerializer, + "nsHTMLContentSerializer shouldn't call this method !"); + + // this method is not called by nsHTMLContentSerializer + // so we don't have to check HTML element, just XHTML + if (aContent->IsHTMLElement(nsGkAtoms::body)) { + --mInBody; + } +} + +NS_IMETHODIMP +nsXHTMLContentSerializer::AppendDocumentStart(Document* aDocument) { + if (!mBodyOnly) { + return nsXMLContentSerializer::AppendDocumentStart(aDocument); + } + + return NS_OK; +} + +bool nsXHTMLContentSerializer::CheckElementStart(Element* aElement, + bool& aForceFormat, + nsAString& aStr, + nsresult& aResult) { + aResult = NS_OK; + + // The _moz_dirty attribute is emitted by the editor to + // indicate that this element should be pretty printed + // even if we're not in pretty printing mode 
+ aForceFormat = !(mFlags & nsIDocumentEncoder::OutputIgnoreMozDirty) && + aElement->HasAttr(nsGkAtoms::mozdirty); + + if (aElement->IsHTMLElement(nsGkAtoms::br) && + (mFlags & nsIDocumentEncoder::OutputNoFormattingInPre) && + PreLevel() > 0) { + aResult = AppendNewLineToString(aStr) ? NS_OK : NS_ERROR_OUT_OF_MEMORY; + return false; + } + + if (aElement->IsHTMLElement(nsGkAtoms::body)) { + ++mInBody; + } + + return true; +} + +bool nsXHTMLContentSerializer::CheckElementEnd(Element* aElement, + Element* aOriginalElement, + bool& aForceFormat, + nsAString& aStr) { + NS_ASSERTION(!mIsHTMLSerializer, + "nsHTMLContentSerializer shouldn't call this method !"); + + aForceFormat = !(mFlags & nsIDocumentEncoder::OutputIgnoreMozDirty) && + aElement->HasAttr(nsGkAtoms::mozdirty); + + if (mIsCopying && aElement->IsHTMLElement(nsGkAtoms::ol)) { + NS_ASSERTION((!mOLStateStack.IsEmpty()), "Cannot have an empty OL Stack"); + /* Though at this point we must always have an state to be deleted as all + the OL opening tags are supposed to push an olState object to the stack*/ + if (!mOLStateStack.IsEmpty()) { + mOLStateStack.RemoveLastElement(); + } + } + + bool dummyFormat; + return nsXMLContentSerializer::CheckElementEnd(aElement, aOriginalElement, + dummyFormat, aStr); +} + +bool nsXHTMLContentSerializer::AppendAndTranslateEntities( + const nsAString& aStr, nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + + if (mDisableEntityEncoding) { + return aOutputStr.Append(aStr, fallible); + } + + return nsXMLContentSerializer::AppendAndTranslateEntities(aStr, aOutputStr); +} + +bool nsXHTMLContentSerializer::IsShorthandAttr(const nsAtom* aAttrName, + const nsAtom* aElementName) { + // checked + if ((aAttrName == nsGkAtoms::checked) && (aElementName == nsGkAtoms::input)) { + return true; + } + + // compact + if ((aAttrName == nsGkAtoms::compact) && + (aElementName == nsGkAtoms::dir || aElementName == nsGkAtoms::dl || + aElementName == nsGkAtoms::menu || 
aElementName == nsGkAtoms::ol || + aElementName == nsGkAtoms::ul)) { + return true; + } + + // declare + if ((aAttrName == nsGkAtoms::declare) && + (aElementName == nsGkAtoms::object)) { + return true; + } + + // defer + if ((aAttrName == nsGkAtoms::defer) && (aElementName == nsGkAtoms::script)) { + return true; + } + + // disabled + if ((aAttrName == nsGkAtoms::disabled) && + (aElementName == nsGkAtoms::button || aElementName == nsGkAtoms::input || + aElementName == nsGkAtoms::optgroup || + aElementName == nsGkAtoms::option || aElementName == nsGkAtoms::select || + aElementName == nsGkAtoms::textarea)) { + return true; + } + + // ismap + if ((aAttrName == nsGkAtoms::ismap) && + (aElementName == nsGkAtoms::img || aElementName == nsGkAtoms::input)) { + return true; + } + + // multiple + if ((aAttrName == nsGkAtoms::multiple) && + (aElementName == nsGkAtoms::select)) { + return true; + } + + // noresize + if ((aAttrName == nsGkAtoms::noresize) && + (aElementName == nsGkAtoms::frame)) { + return true; + } + + // noshade + if ((aAttrName == nsGkAtoms::noshade) && (aElementName == nsGkAtoms::hr)) { + return true; + } + + // nowrap + if ((aAttrName == nsGkAtoms::nowrap) && + (aElementName == nsGkAtoms::td || aElementName == nsGkAtoms::th)) { + return true; + } + + // readonly + if ((aAttrName == nsGkAtoms::readonly) && + (aElementName == nsGkAtoms::input || + aElementName == nsGkAtoms::textarea)) { + return true; + } + + // selected + if ((aAttrName == nsGkAtoms::selected) && + (aElementName == nsGkAtoms::option)) { + return true; + } + + // autoplay and controls + if ((aElementName == nsGkAtoms::video || aElementName == nsGkAtoms::audio) && + (aAttrName == nsGkAtoms::autoplay || aAttrName == nsGkAtoms::muted || + aAttrName == nsGkAtoms::controls)) { + return true; + } + + return false; +} + +bool nsXHTMLContentSerializer::LineBreakBeforeOpen(int32_t aNamespaceID, + nsAtom* aName) { + if (aNamespaceID != kNameSpaceID_XHTML) { + return mAddSpace; + } + + if (aName == 
nsGkAtoms::title || aName == nsGkAtoms::meta || + aName == nsGkAtoms::link || aName == nsGkAtoms::style || + aName == nsGkAtoms::select || aName == nsGkAtoms::option || + aName == nsGkAtoms::script || aName == nsGkAtoms::html) { + return true; + } + + return nsHTMLElement::IsBlock(nsHTMLTags::CaseSensitiveAtomTagToId(aName)); +} + +bool nsXHTMLContentSerializer::LineBreakAfterOpen(int32_t aNamespaceID, + nsAtom* aName) { + if (aNamespaceID != kNameSpaceID_XHTML) { + return false; + } + + if ((aName == nsGkAtoms::html) || (aName == nsGkAtoms::head) || + (aName == nsGkAtoms::body) || (aName == nsGkAtoms::ul) || + (aName == nsGkAtoms::ol) || (aName == nsGkAtoms::dl) || + (aName == nsGkAtoms::table) || (aName == nsGkAtoms::tbody) || + (aName == nsGkAtoms::tr) || (aName == nsGkAtoms::br) || + (aName == nsGkAtoms::meta) || (aName == nsGkAtoms::link) || + (aName == nsGkAtoms::script) || (aName == nsGkAtoms::select) || + (aName == nsGkAtoms::map) || (aName == nsGkAtoms::area) || + (aName == nsGkAtoms::style)) { + return true; + } + + return false; +} + +bool nsXHTMLContentSerializer::LineBreakBeforeClose(int32_t aNamespaceID, + nsAtom* aName) { + if (aNamespaceID != kNameSpaceID_XHTML) { + return false; + } + + if ((aName == nsGkAtoms::html) || (aName == nsGkAtoms::head) || + (aName == nsGkAtoms::body) || (aName == nsGkAtoms::ul) || + (aName == nsGkAtoms::ol) || (aName == nsGkAtoms::dl) || + (aName == nsGkAtoms::select) || (aName == nsGkAtoms::table) || + (aName == nsGkAtoms::tbody)) { + return true; + } + return false; +} + +bool nsXHTMLContentSerializer::LineBreakAfterClose(int32_t aNamespaceID, + nsAtom* aName) { + if (aNamespaceID != kNameSpaceID_XHTML) { + return false; + } + + if ((aName == nsGkAtoms::html) || (aName == nsGkAtoms::head) || + (aName == nsGkAtoms::body) || (aName == nsGkAtoms::tr) || + (aName == nsGkAtoms::th) || (aName == nsGkAtoms::td) || + (aName == nsGkAtoms::title) || (aName == nsGkAtoms::dt) || + (aName == nsGkAtoms::dd) || (aName == 
nsGkAtoms::select) || + (aName == nsGkAtoms::option) || (aName == nsGkAtoms::map)) { + return true; + } + + return nsHTMLElement::IsBlock(nsHTMLTags::CaseSensitiveAtomTagToId(aName)); +} + +void nsXHTMLContentSerializer::MaybeEnterInPreContent(nsIContent* aNode) { + if (!ShouldMaintainPreLevel() || !aNode->IsHTMLElement()) { + return; + } + + if (IsElementPreformatted(aNode) || + aNode->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style, + nsGkAtoms::noscript, nsGkAtoms::noframes)) { + PreLevel()++; + } +} + +void nsXHTMLContentSerializer::MaybeLeaveFromPreContent(nsIContent* aNode) { + if (!ShouldMaintainPreLevel() || !aNode->IsHTMLElement()) { + return; + } + + if (IsElementPreformatted(aNode) || + aNode->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style, + nsGkAtoms::noscript, nsGkAtoms::noframes)) { + --PreLevel(); + } +} + +bool nsXHTMLContentSerializer::IsElementPreformatted(nsIContent* aNode) { + MOZ_ASSERT(ShouldMaintainPreLevel(), + "We should not be calling this needlessly"); + + if (!aNode->IsElement()) { + return false; + } + RefPtr<const ComputedStyle> computedStyle = + nsComputedDOMStyle::GetComputedStyleNoFlush(aNode->AsElement()); + if (computedStyle) { + const nsStyleText* textStyle = computedStyle->StyleText(); + return textStyle->WhiteSpaceOrNewlineIsSignificant(); + } + return false; +} + +bool nsXHTMLContentSerializer::SerializeLIValueAttribute(nsIContent* aElement, + nsAString& aStr) { + // We are copying and we are at the "first" LI node of OL in selected range. + // It may not be the first LI child of OL but it's first in the selected + // range. Note that we get into this condition only once per a OL. + bool found = false; + nsAutoString valueStr; + + olState state(0, false); + + if (!mOLStateStack.IsEmpty()) { + state = mOLStateStack[mOLStateStack.Length() - 1]; + // isFirstListItem should be true only before the serialization of the + // first item in the list. 
+ state.isFirstListItem = false; + mOLStateStack[mOLStateStack.Length() - 1] = state; + } + + int32_t startVal = state.startVal; + int32_t offset = 0; + + // Traverse previous siblings until we find one with "value" attribute. + // offset keeps track of how many previous siblings we had to traverse. + nsIContent* currNode = aElement; + while (currNode && !found) { + if (currNode->IsHTMLElement(nsGkAtoms::li)) { + currNode->AsElement()->GetAttr(nsGkAtoms::value, valueStr); + if (valueStr.IsEmpty()) { + offset++; + } else { + found = true; + nsresult rv = NS_OK; + startVal = valueStr.ToInteger(&rv); + } + } + currNode = currNode->GetPreviousSibling(); + } + // If LI was not having "value", Set the "value" attribute for it. + // Note that We are at the first LI in the selected range of OL. + if (offset == 0 && found) { + // offset = 0 => LI itself has the value attribute and we did not need to + // traverse back. Just serialize value attribute like other tags. + NS_ENSURE_TRUE(SerializeAttr(u""_ns, u"value"_ns, valueStr, aStr, false), + false); + } else if (offset == 1 && !found) { + /*(offset = 1 && !found) means either LI is the first child node of OL + and LI is not having "value" attribute. + In that case we would not like to set "value" attribute to reduce the + changes. + */ + // do nothing... + } else if (offset > 0) { + // Set value attribute. 
+ nsAutoString valueStr; + + // As serializer needs to use this valueAttr we are creating here, + valueStr.AppendInt(startVal + offset); + NS_ENSURE_TRUE(SerializeAttr(u""_ns, u"value"_ns, valueStr, aStr, false), + false); + } + + return true; +} + +bool nsXHTMLContentSerializer::IsFirstChildOfOL(nsIContent* aElement) { + nsIContent* parent = aElement->GetParent(); + if (parent && parent->NodeName().LowerCaseEqualsLiteral("ol")) { + if (!mOLStateStack.IsEmpty()) { + olState state = mOLStateStack[mOLStateStack.Length() - 1]; + if (state.isFirstListItem) return true; + } + } + + return false; +} + +bool nsXHTMLContentSerializer::HasNoChildren(nsIContent* aContent) { + for (nsIContent* child = aContent->GetFirstChild(); child; + child = child->GetNextSibling()) { + if (!child->IsText()) return false; + + if (child->TextLength()) return false; + } + + return true; +} diff --git a/dom/serializers/nsXHTMLContentSerializer.h b/dom/serializers/nsXHTMLContentSerializer.h new file mode 100644 index 0000000000..ea4c83840b --- /dev/null +++ b/dom/serializers/nsXHTMLContentSerializer.h @@ -0,0 +1,143 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert an XHTML (not HTML!) DOM to an XHTML + * string that could be parsed into more or less the original DOM. 
/**
 * nsIContentSerializer implementation, built on nsXMLContentSerializer, that
 * can be used with an nsIDocumentEncoder to convert an XHTML (not HTML!) DOM
 * to an XHTML string that could be parsed into more or less the original DOM.
 */
class nsXHTMLContentSerializer : public nsXMLContentSerializer {
 public:
  nsXHTMLContentSerializer();
  virtual ~nsXHTMLContentSerializer();

  NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn,
                  const mozilla::Encoding* aEncoding, bool aIsCopying,
                  bool aRewriteEncodingDeclaration,
                  bool* aNeedsPreformatScanning, nsAString& aOutput) override;

  NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset,
                        int32_t aEndOffset) override;

  NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override;

 protected:
  virtual bool CheckElementStart(mozilla::dom::Element* aElement,
                                 bool& aForceFormat, nsAString& aStr,
                                 nsresult& aResult) override;

  [[nodiscard]] virtual bool AfterElementStart(nsIContent* aContent,
                                               nsIContent* aOriginalElement,
                                               nsAString& aStr) override;

  virtual bool CheckElementEnd(mozilla::dom::Element* aContent,
                               mozilla::dom::Element* aOriginalElement,
                               bool& aForceFormat, nsAString& aStr) override;

  virtual void AfterElementEnd(nsIContent* aContent, nsAString& aStr) override;

  virtual bool LineBreakBeforeOpen(int32_t aNamespaceID,
                                   nsAtom* aName) override;
  virtual bool LineBreakAfterOpen(int32_t aNamespaceID, nsAtom* aName) override;
  virtual bool LineBreakBeforeClose(int32_t aNamespaceID,
                                    nsAtom* aName) override;
  virtual bool LineBreakAfterClose(int32_t aNamespaceID,
                                   nsAtom* aName) override;

  bool HasLongLines(const nsString& text, int32_t& aLastNewlineOffset);

  // Functions to check whether we are entering or leaving preformatted
  // content.
  virtual void MaybeEnterInPreContent(nsIContent* aNode) override;
  virtual void MaybeLeaveFromPreContent(nsIContent* aNode) override;

  [[nodiscard]] virtual bool SerializeAttributes(
      mozilla::dom::Element* aContent, mozilla::dom::Element* aOriginalElement,
      nsAString& aTagPrefix, const nsAString& aTagNamespaceURI,
      nsAtom* aTagName, nsAString& aStr, uint32_t aSkipAttr,
      bool aAddNSAttr) override;

  // True when aElement's parent is an "ol" and the top entry of
  // mOLStateStack still has isFirstListItem set.
  bool IsFirstChildOfOL(nsIContent* aElement);

  [[nodiscard]] bool SerializeLIValueAttribute(nsIContent* aElement,
                                               nsAString& aStr);
  bool IsShorthandAttr(const nsAtom* aAttrName, const nsAtom* aElementName);

  [[nodiscard]] virtual bool AppendAndTranslateEntities(
      const nsAString& aStr, nsAString& aOutputStr) override;

 private:
  bool IsElementPreformatted(nsIContent* aNode);

 protected:
  /*
   * mIsHTMLSerializer should be set to true by the HTML serializer which
   * inherits from this class. It avoids redefining methods for just a few
   * changes.
   * NOTE(review): the original comment called this flag "isHTMLParser" and
   * spoke of "the HTML parser"; presumably the subclass meant is
   * nsHTMLContentSerializer -- confirm.
   */
  bool mIsHTMLSerializer;

  bool mIsCopying;  // Set to true only while copying

  /*
   * mDisableEntityEncoding is higher than 0 while the serializer is serializing
   * the content of an element whose content is considered CDATA by the
   * serializer (such elements are 'script', 'style', 'noscript' and
   * possibly others in XHTML). This doesn't have anything to do with whether
   * the element is defined as CDATA in the DTD; it simply means we'll
   * output the content of the element without doing any entity encoding
   * whatsoever.
   */
  int32_t mDisableEntityEncoding;

  // This is to ensure that we only do meta tag fixups when dealing with
  // whole documents.
  bool mRewriteEncodingDeclaration;

  // To keep track of First LI child of OL in selected range
  bool mIsFirstChildOfOL;

  // To keep track of the start value of an OL and of its first list item, for
  // nested lists
  struct olState {
    olState(int32_t aStart, bool aIsFirst)
        : startVal(aStart), isFirstListItem(aIsFirst) {}

    // Member-wise copy of both fields.
    olState(const olState& aOlState) {
      startVal = aOlState.startVal;
      isFirstListItem = aOlState.isFirstListItem;
    }

    // the value of the start attribute in the OL
    int32_t startVal;

    // is true only before the serialization of the first li of an ol
    // should be false for other li in the list
    bool isFirstListItem;
  };

  // Stack to store one olState struct per <OL>.
  AutoTArray<olState, 8> mOLStateStack;

  // True when every child of aContent is an empty text node (or there are no
  // children at all).
  bool HasNoChildren(nsIContent* aContent);
};
+ */ + +#include "nsXMLContentSerializer.h" + +#include "nsGkAtoms.h" +#include "nsIContent.h" +#include "nsIContentInlines.h" +#include "mozilla/dom/Document.h" +#include "nsIDocumentEncoder.h" +#include "nsElementTable.h" +#include "nsNameSpaceManager.h" +#include "nsTextFragment.h" +#include "nsString.h" +#include "mozilla/Sprintf.h" +#include "nsUnicharUtils.h" +#include "nsCRT.h" +#include "nsContentUtils.h" +#include "nsAttrName.h" +#include "mozilla/dom/Comment.h" +#include "mozilla/dom/CustomElementRegistry.h" +#include "mozilla/dom/DocumentType.h" +#include "mozilla/dom/Element.h" +#include "mozilla/dom/ProcessingInstruction.h" +#include "mozilla/intl/Segmenter.h" +#include "nsParserConstants.h" +#include "mozilla/Encoding.h" + +using namespace mozilla; +using namespace mozilla::dom; + +#define kXMLNS "xmlns" + +// to be readable, we assume that an indented line contains +// at least this number of characters (arbitrary value here). +// This is a limit for the indentation. +#define MIN_INDENTED_LINE_LENGTH 15 + +// the string used to indent. 
+#define INDENT_STRING " " +#define INDENT_STRING_LENGTH 2 + +nsresult NS_NewXMLContentSerializer(nsIContentSerializer** aSerializer) { + RefPtr<nsXMLContentSerializer> it = new nsXMLContentSerializer(); + it.forget(aSerializer); + return NS_OK; +} + +nsXMLContentSerializer::nsXMLContentSerializer() + : mPrefixIndex(0), + mColPos(0), + mIndentOverflow(0), + mIsIndentationAddedOnCurrentLine(false), + mInAttribute(false), + mAddNewlineForRootNode(false), + mAddSpace(false), + mMayIgnoreLineBreakSequence(false), + mBodyOnly(false), + mInBody(0) {} + +nsXMLContentSerializer::~nsXMLContentSerializer() = default; + +NS_IMPL_ISUPPORTS(nsXMLContentSerializer, nsIContentSerializer) + +NS_IMETHODIMP +nsXMLContentSerializer::Init(uint32_t aFlags, uint32_t aWrapColumn, + const Encoding* aEncoding, bool aIsCopying, + bool aRewriteEncodingDeclaration, + bool* aNeedsPreformatScanning, + nsAString& aOutput) { + *aNeedsPreformatScanning = false; + mPrefixIndex = 0; + mColPos = 0; + mIndentOverflow = 0; + mIsIndentationAddedOnCurrentLine = false; + mInAttribute = false; + mAddNewlineForRootNode = false; + mAddSpace = false; + mMayIgnoreLineBreakSequence = false; + mBodyOnly = false; + mInBody = 0; + + if (aEncoding) { + aEncoding->Name(mCharset); + } + mFlags = aFlags; + + // Set the line break character: + if ((mFlags & nsIDocumentEncoder::OutputCRLineBreak) && + (mFlags & nsIDocumentEncoder::OutputLFLineBreak)) { // Windows + mLineBreak.AssignLiteral("\r\n"); + } else if (mFlags & nsIDocumentEncoder::OutputCRLineBreak) { // Mac + mLineBreak.Assign('\r'); + } else if (mFlags & nsIDocumentEncoder::OutputLFLineBreak) { // Unix/DOM + mLineBreak.Assign('\n'); + } else { + mLineBreak.AssignLiteral(NS_LINEBREAK); // Platform/default + } + + mDoRaw = !!(mFlags & nsIDocumentEncoder::OutputRaw); + + mDoFormat = (mFlags & nsIDocumentEncoder::OutputFormatted && !mDoRaw); + + mDoWrap = (mFlags & nsIDocumentEncoder::OutputWrap && !mDoRaw); + + mAllowLineBreaking = + !(mFlags & 
nsIDocumentEncoder::OutputDisallowLineBreaking); + + if (!aWrapColumn) { + mMaxColumn = 72; + } else { + mMaxColumn = aWrapColumn; + } + + mOutput = &aOutput; + mPreLevel = 0; + mIsIndentationAddedOnCurrentLine = false; + return NS_OK; +} + +nsresult nsXMLContentSerializer::AppendTextData(nsIContent* aNode, + int32_t aStartOffset, + int32_t aEndOffset, + nsAString& aStr, + bool aTranslateEntities) { + nsIContent* content = aNode; + const nsTextFragment* frag; + if (!content || !(frag = content->GetText())) { + return NS_ERROR_FAILURE; + } + + int32_t fragLength = frag->GetLength(); + int32_t endoffset = + (aEndOffset == -1) ? fragLength : std::min(aEndOffset, fragLength); + int32_t length = endoffset - aStartOffset; + + NS_ASSERTION(aStartOffset >= 0, "Negative start offset for text fragment!"); + NS_ASSERTION(aStartOffset <= endoffset, + "A start offset is beyond the end of the text fragment!"); + + if (length <= 0) { + // XXX Zero is a legal value, maybe non-zero values should be an + // error. 
+ return NS_OK; + } + + if (frag->Is2b()) { + const char16_t* strStart = frag->Get2b() + aStartOffset; + if (aTranslateEntities) { + NS_ENSURE_TRUE(AppendAndTranslateEntities( + Substring(strStart, strStart + length), aStr), + NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(aStr.Append(Substring(strStart, strStart + length), + mozilla::fallible), + NS_ERROR_OUT_OF_MEMORY); + } + } else { + nsAutoString utf16; + if (!CopyASCIItoUTF16(Span(frag->Get1b() + aStartOffset, length), utf16, + mozilla::fallible_t())) { + return NS_ERROR_OUT_OF_MEMORY; + } + if (aTranslateEntities) { + NS_ENSURE_TRUE(AppendAndTranslateEntities(utf16, aStr), + NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(aStr.Append(utf16, mozilla::fallible), + NS_ERROR_OUT_OF_MEMORY); + } + } + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendText(nsIContent* aText, int32_t aStartOffset, + int32_t aEndOffset) { + NS_ENSURE_ARG(aText); + NS_ENSURE_STATE(mOutput); + + nsAutoString data; + nsresult rv; + + rv = AppendTextData(aText, aStartOffset, aEndOffset, data, true); + if (NS_FAILED(rv)) return NS_ERROR_FAILURE; + + if (mDoRaw || PreLevel() > 0) { + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoFormat) { + NS_ENSURE_TRUE(AppendToStringFormatedWrapped(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoWrap) { + NS_ENSURE_TRUE(AppendToStringWrapped(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendCDATASection(nsIContent* aCDATASection, + int32_t aStartOffset, + int32_t aEndOffset) { + NS_ENSURE_ARG(aCDATASection); + NS_ENSURE_STATE(mOutput); + + nsresult rv; + + constexpr auto cdata = u"<![CDATA["_ns; + + if (mDoRaw || PreLevel() > 0) { + NS_ENSURE_TRUE(AppendToString(cdata, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } else if (mDoFormat) { + 
NS_ENSURE_TRUE(AppendToStringFormatedWrapped(cdata, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoWrap) { + NS_ENSURE_TRUE(AppendToStringWrapped(cdata, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(AppendToString(cdata, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } + + nsAutoString data; + rv = AppendTextData(aCDATASection, aStartOffset, aEndOffset, data, false); + if (NS_FAILED(rv)) return NS_ERROR_FAILURE; + + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + NS_ENSURE_TRUE(AppendToString(u"]]>"_ns, *mOutput), NS_ERROR_OUT_OF_MEMORY); + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendProcessingInstruction(ProcessingInstruction* aPI, + int32_t aStartOffset, + int32_t aEndOffset) { + NS_ENSURE_STATE(mOutput); + + nsAutoString target, data, start; + + NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), NS_ERROR_OUT_OF_MEMORY); + + aPI->GetTarget(target); + + aPI->GetData(data); + + NS_ENSURE_TRUE(start.AppendLiteral("<?", mozilla::fallible), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(start.Append(target, mozilla::fallible), + NS_ERROR_OUT_OF_MEMORY); + + if (mDoRaw || PreLevel() > 0) { + NS_ENSURE_TRUE(AppendToString(start, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } else if (mDoFormat) { + if (mAddSpace) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } + NS_ENSURE_TRUE(AppendToStringFormatedWrapped(start, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoWrap) { + NS_ENSURE_TRUE(AppendToStringWrapped(start, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(AppendToString(start, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } + + if (!data.IsEmpty()) { + NS_ENSURE_TRUE(AppendToString(char16_t(' '), *mOutput), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + NS_ENSURE_TRUE(AppendToString(u"?>"_ns, *mOutput), NS_ERROR_OUT_OF_MEMORY); + + MaybeFlagNewlineForRootNode(aPI); + + return 
NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendComment(Comment* aComment, int32_t aStartOffset, + int32_t aEndOffset) { + NS_ENSURE_STATE(mOutput); + + nsAutoString data; + aComment->GetData(data); + + int32_t dataLength = data.Length(); + if (aStartOffset || (aEndOffset != -1 && aEndOffset < dataLength)) { + int32_t length = + (aEndOffset == -1) ? dataLength : std::min(aEndOffset, dataLength); + length -= aStartOffset; + + nsAutoString frag; + if (length > 0) { + data.Mid(frag, aStartOffset, length); + } + data.Assign(frag); + } + + NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), NS_ERROR_OUT_OF_MEMORY); + + constexpr auto startComment = u"<!--"_ns; + + if (mDoRaw || PreLevel() > 0) { + NS_ENSURE_TRUE(AppendToString(startComment, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoFormat) { + if (mAddSpace) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } + NS_ENSURE_TRUE(AppendToStringFormatedWrapped(startComment, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else if (mDoWrap) { + NS_ENSURE_TRUE(AppendToStringWrapped(startComment, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } else { + NS_ENSURE_TRUE(AppendToString(startComment, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + } + + // Even if mDoformat, we don't format the content because it + // could have been preformated by the author + NS_ENSURE_TRUE(AppendToStringConvertLF(data, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(u"-->"_ns, *mOutput), NS_ERROR_OUT_OF_MEMORY); + + MaybeFlagNewlineForRootNode(aComment); + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendDoctype(DocumentType* aDocType) { + NS_ENSURE_STATE(mOutput); + + nsAutoString name, publicId, systemId; + aDocType->GetName(name); + aDocType->GetPublicId(publicId); + aDocType->GetSystemId(systemId); + + NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), NS_ERROR_OUT_OF_MEMORY); + + NS_ENSURE_TRUE(AppendToString(u"<!DOCTYPE "_ns, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + 
NS_ENSURE_TRUE(AppendToString(name, *mOutput), NS_ERROR_OUT_OF_MEMORY); + + char16_t quote; + if (!publicId.IsEmpty()) { + NS_ENSURE_TRUE(AppendToString(u" PUBLIC "_ns, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + if (publicId.FindChar(char16_t('"')) == -1) { + quote = char16_t('"'); + } else { + quote = char16_t('\''); + } + NS_ENSURE_TRUE(AppendToString(quote, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(publicId, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(quote, *mOutput), NS_ERROR_OUT_OF_MEMORY); + + if (!systemId.IsEmpty()) { + NS_ENSURE_TRUE(AppendToString(char16_t(' '), *mOutput), + NS_ERROR_OUT_OF_MEMORY); + if (systemId.FindChar(char16_t('"')) == -1) { + quote = char16_t('"'); + } else { + quote = char16_t('\''); + } + NS_ENSURE_TRUE(AppendToString(quote, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(systemId, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(quote, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } + } else if (!systemId.IsEmpty()) { + if (systemId.FindChar(char16_t('"')) == -1) { + quote = char16_t('"'); + } else { + quote = char16_t('\''); + } + NS_ENSURE_TRUE(AppendToString(u" SYSTEM "_ns, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(quote, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(systemId, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(quote, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } + + NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + MaybeFlagNewlineForRootNode(aDocType); + + return NS_OK; +} + +nsresult nsXMLContentSerializer::PushNameSpaceDecl(const nsAString& aPrefix, + const nsAString& aURI, + nsIContent* aOwner) { + NameSpaceDecl* decl = mNameSpaceStack.AppendElement(); + if (!decl) return NS_ERROR_OUT_OF_MEMORY; + + decl->mPrefix.Assign(aPrefix); + decl->mURI.Assign(aURI); + // Don't addref - this weak reference will be removed when + // we pop the 
stack + decl->mOwner = aOwner; + return NS_OK; +} + +void nsXMLContentSerializer::PopNameSpaceDeclsFor(nsIContent* aOwner) { + int32_t index, count; + + count = mNameSpaceStack.Length(); + for (index = count - 1; index >= 0; index--) { + if (mNameSpaceStack[index].mOwner != aOwner) { + break; + } + mNameSpaceStack.RemoveLastElement(); + } +} + +bool nsXMLContentSerializer::ConfirmPrefix(nsAString& aPrefix, + const nsAString& aURI, + nsIContent* aElement, + bool aIsAttribute) { + if (aPrefix.EqualsLiteral(kXMLNS)) { + return false; + } + + if (aURI.EqualsLiteral("http://www.w3.org/XML/1998/namespace")) { + // The prefix must be xml for this namespace. We don't need to declare it, + // so always just set the prefix to xml. + aPrefix.AssignLiteral("xml"); + + return false; + } + + bool mustHavePrefix; + if (aIsAttribute) { + if (aURI.IsEmpty()) { + // Attribute in the null namespace. This just shouldn't have a prefix. + // And there's no need to push any namespace decls + aPrefix.Truncate(); + return false; + } + + // Attribute not in the null namespace -- must have a prefix + mustHavePrefix = true; + } else { + // Not an attribute, so doesn't _have_ to have a prefix + mustHavePrefix = false; + } + + // Keep track of the closest prefix that's bound to aURI and whether we've + // found such a thing. closestURIMatch holds the prefix, and uriMatch + // indicates whether we actually have one. + nsAutoString closestURIMatch; + bool uriMatch = false; + + // Also keep track of whether we've seen aPrefix already. If we have, that + // means that it's already bound to a URI different from aURI, so even if we + // later (so in a more outer scope) see it bound to aURI we can't reuse it. 
+ bool haveSeenOurPrefix = false; + + int32_t count = mNameSpaceStack.Length(); + int32_t index = count - 1; + while (index >= 0) { + NameSpaceDecl& decl = mNameSpaceStack.ElementAt(index); + // Check if we've found a prefix match + if (aPrefix.Equals(decl.mPrefix)) { + // If the URIs match and aPrefix is not bound to any other URI, we can + // use aPrefix + if (!haveSeenOurPrefix && aURI.Equals(decl.mURI)) { + // Just use our uriMatch stuff. That will deal with an empty aPrefix + // the right way. We can break out of the loop now, though. + uriMatch = true; + closestURIMatch = aPrefix; + break; + } + + haveSeenOurPrefix = true; + + // If they don't, and either: + // 1) We have a prefix (so we'd be redeclaring this prefix to point to a + // different namespace) or + // 2) We're looking at an existing default namespace decl on aElement (so + // we can't create a new default namespace decl for this URI) + // then generate a new prefix. Note that we do NOT generate new prefixes + // if we happen to have aPrefix == decl->mPrefix == "" and mismatching + // URIs when |decl| doesn't have aElement as its owner. In that case we + // can simply push the new namespace URI as the default namespace for + // aElement. + if (!aPrefix.IsEmpty() || decl.mOwner == aElement) { + NS_ASSERTION(!aURI.IsEmpty(), + "Not allowed to add a xmlns attribute with an empty " + "namespace name unless it declares the default " + "namespace."); + + GenerateNewPrefix(aPrefix); + // Now we need to validate our new prefix/uri combination; check it + // against the full namespace stack again. Note that just restarting + // the while loop is ok, since we haven't changed aURI, so the + // closestURIMatch and uriMatch state is not affected. + index = count - 1; + haveSeenOurPrefix = false; + continue; + } + } + + // If we've found a URI match, then record the first one + if (!uriMatch && aURI.Equals(decl.mURI)) { + // Need to check that decl->mPrefix is not declared anywhere closer to + // us. 
If it is, we can't use it. + bool prefixOK = true; + int32_t index2; + for (index2 = count - 1; index2 > index && prefixOK; --index2) { + prefixOK = (mNameSpaceStack[index2].mPrefix != decl.mPrefix); + } + + if (prefixOK) { + uriMatch = true; + closestURIMatch.Assign(decl.mPrefix); + } + } + + --index; + } + + // At this point the following invariants hold: + // 1) The prefix in closestURIMatch is mapped to aURI in our scope if + // uriMatch is set. + // 2) There is nothing on the namespace stack that has aPrefix as the prefix + // and a _different_ URI, except for the case aPrefix.IsEmpty (and + // possible default namespaces on ancestors) + + // So if uriMatch is set it's OK to use the closestURIMatch prefix. The one + // exception is when closestURIMatch is actually empty (default namespace + // decl) and we must have a prefix. + if (uriMatch && (!mustHavePrefix || !closestURIMatch.IsEmpty())) { + aPrefix.Assign(closestURIMatch); + return false; + } + + if (aPrefix.IsEmpty()) { + // At this point, aPrefix is empty (which means we never had a prefix to + // start with). If we must have a prefix, just generate a new prefix and + // then send it back through the namespace stack checks to make sure it's + // OK. + if (mustHavePrefix) { + GenerateNewPrefix(aPrefix); + return ConfirmPrefix(aPrefix, aURI, aElement, aIsAttribute); + } + + // One final special case. If aPrefix is empty and we never saw an empty + // prefix (default namespace decl) on the namespace stack and we're in the + // null namespace there is no reason to output an |xmlns=""| here. It just + // makes the output less readable. + if (!haveSeenOurPrefix && aURI.IsEmpty()) { + return false; + } + } + + // Now just set aURI as the new default namespace URI. 
Indicate that we need + // to create a namespace decl for the final prefix + return true; +} + +void nsXMLContentSerializer::GenerateNewPrefix(nsAString& aPrefix) { + aPrefix.Assign('a'); + aPrefix.AppendInt(mPrefixIndex++); +} + +bool nsXMLContentSerializer::SerializeAttr(const nsAString& aPrefix, + const nsAString& aName, + const nsAString& aValue, + nsAString& aStr, + bool aDoEscapeEntities) { + // Because this method can short-circuit AppendToString for raw output, we + // need to make sure that we're not inappropriately serializing attributes + // from outside the body + if (mBodyOnly && !mInBody) { + return true; + } + + nsAutoString attrString_; + // For innerHTML we can do faster appending without + // temporary strings. + bool rawAppend = mDoRaw && aDoEscapeEntities; + nsAString& attrString = (rawAppend) ? aStr : attrString_; + + NS_ENSURE_TRUE(attrString.Append(char16_t(' '), mozilla::fallible), false); + if (!aPrefix.IsEmpty()) { + NS_ENSURE_TRUE(attrString.Append(aPrefix, mozilla::fallible), false); + NS_ENSURE_TRUE(attrString.Append(char16_t(':'), mozilla::fallible), false); + } + NS_ENSURE_TRUE(attrString.Append(aName, mozilla::fallible), false); + + if (aDoEscapeEntities) { + // if problem characters are turned into character entity references + // then there will be no problem with the value delimiter characters + NS_ENSURE_TRUE(attrString.AppendLiteral("=\"", mozilla::fallible), false); + + mInAttribute = true; + bool result = AppendAndTranslateEntities(aValue, attrString); + mInAttribute = false; + NS_ENSURE_TRUE(result, false); + + NS_ENSURE_TRUE(attrString.Append(char16_t('"'), mozilla::fallible), false); + if (rawAppend) { + return true; + } + } else { + // Depending on whether the attribute value contains quotes or apostrophes + // we need to select the delimiter character and escape characters using + // character entity references, ignoring the value of aDoEscapeEntities. 
+ // See http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.3.2.2 for + // the standard on character entity references in values. We also have to + // make sure to escape any '&' characters. + + bool bIncludesSingle = false; + bool bIncludesDouble = false; + nsAString::const_iterator iCurr, iEnd; + aValue.BeginReading(iCurr); + aValue.EndReading(iEnd); + for (; iCurr != iEnd; ++iCurr) { + if (*iCurr == char16_t('\'')) { + bIncludesSingle = true; + if (bIncludesDouble) { + break; + } + } else if (*iCurr == char16_t('"')) { + bIncludesDouble = true; + if (bIncludesSingle) { + break; + } + } + } + + // Delimiter and escaping is according to the following table + // bIncludesDouble bIncludesSingle Delimiter Escape Double Quote + // FALSE FALSE " FALSE + // FALSE TRUE " FALSE + // TRUE FALSE ' FALSE + // TRUE TRUE " TRUE + char16_t cDelimiter = + (bIncludesDouble && !bIncludesSingle) ? char16_t('\'') : char16_t('"'); + NS_ENSURE_TRUE(attrString.Append(char16_t('='), mozilla::fallible), false); + NS_ENSURE_TRUE(attrString.Append(cDelimiter, mozilla::fallible), false); + nsAutoString sValue(aValue); + NS_ENSURE_TRUE( + sValue.ReplaceSubstring(u"&"_ns, u"&"_ns, mozilla::fallible), + false); + if (bIncludesDouble && bIncludesSingle) { + NS_ENSURE_TRUE( + sValue.ReplaceSubstring(u"\""_ns, u"""_ns, mozilla::fallible), + false); + } + NS_ENSURE_TRUE(attrString.Append(sValue, mozilla::fallible), false); + NS_ENSURE_TRUE(attrString.Append(cDelimiter, mozilla::fallible), false); + } + + if (mDoWrap && mColPos + attrString.Length() > mMaxColumn) { + // Attr would cause us to overrun the max width, so begin a new line. + NS_ENSURE_TRUE(AppendNewLineToString(aStr), false); + + // Chomp the leading space. 
+ nsDependentSubstring chomped(attrString, 1); + if (mDoFormat && mIndent.Length() + chomped.Length() <= mMaxColumn) { + NS_ENSURE_TRUE(AppendIndentation(aStr), false); + } + NS_ENSURE_TRUE(AppendToStringConvertLF(chomped, aStr), false); + } else { + NS_ENSURE_TRUE(AppendToStringConvertLF(attrString, aStr), false); + } + + return true; +} + +uint32_t nsXMLContentSerializer::ScanNamespaceDeclarations( + Element* aElement, Element* aOriginalElement, + const nsAString& aTagNamespaceURI) { + uint32_t index, count; + nsAutoString uriStr, valueStr; + + count = aElement->GetAttrCount(); + + // First scan for namespace declarations, pushing each on the stack + uint32_t skipAttr = count; + for (index = 0; index < count; index++) { + const BorrowedAttrInfo info = aElement->GetAttrInfoAt(index); + const nsAttrName* name = info.mName; + + int32_t namespaceID = name->NamespaceID(); + nsAtom* attrName = name->LocalName(); + + if (namespaceID == kNameSpaceID_XMLNS || + // Also push on the stack attrs named "xmlns" in the null + // namespace... because once we serialize those out they'll look like + // namespace decls. :( + // XXXbz what if we have both "xmlns" in the null namespace and "xmlns" + // in the xmlns namespace? + (namespaceID == kNameSpaceID_None && attrName == nsGkAtoms::xmlns)) { + info.mValue->ToString(uriStr); + + if (!name->GetPrefix()) { + if (aTagNamespaceURI.IsEmpty() && !uriStr.IsEmpty()) { + // If the element is in no namespace we need to add a xmlns + // attribute to declare that. That xmlns attribute must not have a + // prefix (see http://www.w3.org/TR/REC-xml-names/#dt-prefix), ie it + // must declare the default namespace. We just found an xmlns + // attribute that declares the default namespace to something + // non-empty. We're going to ignore this attribute, for children we + // will detect that we need to add it again and attributes aren't + // affected by the default namespace. 
+ skipAttr = index; + } else { + // Default NS attribute does not have prefix (and the name is "xmlns") + PushNameSpaceDecl(u""_ns, uriStr, aOriginalElement); + } + } else { + PushNameSpaceDecl(nsDependentAtomString(attrName), uriStr, + aOriginalElement); + } + } + } + return skipAttr; +} + +bool nsXMLContentSerializer::IsJavaScript(nsIContent* aContent, + nsAtom* aAttrNameAtom, + int32_t aAttrNamespaceID, + const nsAString& aValueString) { + bool isHtml = aContent->IsHTMLElement(); + bool isXul = aContent->IsXULElement(); + bool isSvg = aContent->IsSVGElement(); + + if (aAttrNamespaceID == kNameSpaceID_None && (isHtml || isXul || isSvg) && + (aAttrNameAtom == nsGkAtoms::href || aAttrNameAtom == nsGkAtoms::src)) { + static const char kJavaScript[] = "javascript"; + int32_t pos = aValueString.FindChar(':'); + if (pos < (int32_t)(sizeof kJavaScript - 1)) return false; + nsAutoString scheme(Substring(aValueString, 0, pos)); + scheme.StripWhitespace(); + if ((scheme.Length() == (sizeof kJavaScript - 1)) && + scheme.EqualsIgnoreCase(kJavaScript)) + return true; + else + return false; + } + + return aContent->IsEventAttributeName(aAttrNameAtom); +} + +bool nsXMLContentSerializer::SerializeAttributes( + Element* aElement, Element* aOriginalElement, nsAString& aTagPrefix, + const nsAString& aTagNamespaceURI, nsAtom* aTagName, nsAString& aStr, + uint32_t aSkipAttr, bool aAddNSAttr) { + nsAutoString prefixStr, uriStr, valueStr; + nsAutoString xmlnsStr; + xmlnsStr.AssignLiteral(kXMLNS); + uint32_t index, count; + + MaybeSerializeIsValue(aElement, aStr); + + // If we had to add a new namespace declaration, serialize + // and push it on the namespace stack + if (aAddNSAttr) { + if (aTagPrefix.IsEmpty()) { + // Serialize default namespace decl + NS_ENSURE_TRUE( + SerializeAttr(u""_ns, xmlnsStr, aTagNamespaceURI, aStr, true), false); + } else { + // Serialize namespace decl + NS_ENSURE_TRUE( + SerializeAttr(xmlnsStr, aTagPrefix, aTagNamespaceURI, aStr, true), + false); + } + 
// Serializes the start tag of aElement into mOutput: optional leading
// newline/indentation/space, "<", the (possibly prefixed) qualified name, the
// attributes, and the end of the start tag.
//
// aElement          the element being output.
// aOriginalElement  passed through to the namespace, attribute and
//                   end-of-start-tag helpers alongside aElement.
//
// Returns the rv produced by CheckElementStart() when that hook declines or
// fails, NS_ERROR_OUT_OF_MEMORY when any append step fails, NS_OK otherwise.
NS_IMETHODIMP
nsXMLContentSerializer::AppendElementStart(Element* aElement,
                                           Element* aOriginalElement) {
  NS_ENSURE_ARG(aElement);
  NS_ENSURE_STATE(mOutput);

  bool forceFormat = false;
  nsresult rv = NS_OK;
  if (!CheckElementStart(aElement, forceFormat, *mOutput, rv)) {
    // When we go to AppendElementEnd for this element, we're going to
    // MaybeLeaveFromPreContent().  So make sure to MaybeEnterInPreContent()
    // now, so our PreLevel() doesn't get confused.
    MaybeEnterInPreContent(aElement);
    return rv;
  }

  NS_ENSURE_SUCCESS(rv, rv);

  nsAutoString tagPrefix, tagLocalName, tagNamespaceURI;
  aElement->NodeInfo()->GetPrefix(tagPrefix);
  aElement->NodeInfo()->GetName(tagLocalName);
  aElement->NodeInfo()->GetNamespaceURI(tagNamespaceURI);

  // Push the element's own namespace declarations first; skipAttr is the
  // index of an attribute SerializeAttributes() must skip (or the attribute
  // count when there is none).
  uint32_t skipAttr =
      ScanNamespaceDeclarations(aElement, aOriginalElement, tagNamespaceURI);

  nsAtom* name = aElement->NodeInfo()->NameAtom();
  bool lineBreakBeforeOpen =
      LineBreakBeforeOpen(aElement->GetNameSpaceID(), name);

  // Whitespace emitted before the tag depends on whether we are formatting
  // (and not in raw mode or inside preformatted content).
  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) {
    if (mColPos && lineBreakBeforeOpen) {
      NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    } else {
      NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput),
                     NS_ERROR_OUT_OF_MEMORY);
    }
    if (!mColPos) {
      // At the start of a line: indent.
      NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    } else if (mAddSpace) {
      NS_ENSURE_TRUE(AppendToString(char16_t(' '), *mOutput),
                     NS_ERROR_OUT_OF_MEMORY);
      mAddSpace = false;
    }
  } else if (mAddSpace) {
    NS_ENSURE_TRUE(AppendToString(char16_t(' '), *mOutput),
                   NS_ERROR_OUT_OF_MEMORY);
    mAddSpace = false;
  } else {
    NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput),
                   NS_ERROR_OUT_OF_MEMORY);
  }

  // Always reset to avoid false newlines in case MaybeAddNewlineForRootNode
  // wasn't called
  mAddNewlineForRootNode = false;

  // ConfirmPrefix() may rewrite tagPrefix; a true result means a namespace
  // declaration has to be serialized for the final prefix.
  bool addNSAttr;
  addNSAttr =
      ConfirmPrefix(tagPrefix, tagNamespaceURI, aOriginalElement, false);

  // Serialize the qualified name of the element
  NS_ENSURE_TRUE(AppendToString(kLessThan, *mOutput), NS_ERROR_OUT_OF_MEMORY);
  if (!tagPrefix.IsEmpty()) {
    NS_ENSURE_TRUE(AppendToString(tagPrefix, *mOutput), NS_ERROR_OUT_OF_MEMORY);
    NS_ENSURE_TRUE(AppendToString(u":"_ns, *mOutput), NS_ERROR_OUT_OF_MEMORY);
  }
  NS_ENSURE_TRUE(AppendToString(tagLocalName, *mOutput),
                 NS_ERROR_OUT_OF_MEMORY);

  // From here on PreLevel() reflects this element as well.
  MaybeEnterInPreContent(aElement);

  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) {
    NS_ENSURE_TRUE(IncrIndentation(name), NS_ERROR_OUT_OF_MEMORY);
  }

  NS_ENSURE_TRUE(
      SerializeAttributes(aElement, aOriginalElement, tagPrefix,
                          tagNamespaceURI, name, *mOutput, skipAttr, addNSAttr),
      NS_ERROR_OUT_OF_MEMORY);

  NS_ENSURE_TRUE(AppendEndOfElementStart(aElement, aOriginalElement, *mOutput),
                 NS_ERROR_OUT_OF_MEMORY);

  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() &&
      LineBreakAfterOpen(aElement->GetNameSpaceID(), name)) {
    NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY);
  }

  NS_ENSURE_TRUE(AfterElementStart(aElement, aOriginalElement, *mOutput),
                 NS_ERROR_OUT_OF_MEMORY);

  return NS_OK;
}
+ // See + // https://w3c.github.io/DOM-Parsing/#dfn-concept-xml-serialization-algorithm + nsAtom* localName = aElement->NodeInfo()->NameAtom(); + bool isHTMLContainer = nsHTMLElement::IsContainer( + nsHTMLTags::CaseSensitiveAtomTagToId(localName)); + return isHTMLContainer; +} + +bool nsXMLContentSerializer::AppendEndOfElementStart(Element* aElement, + Element* aOriginalElement, + nsAString& aStr) { + if (ElementNeedsSeparateEndTag(aElement, aOriginalElement)) { + return AppendToString(kGreaterThan, aStr); + } + + // We don't need a separate end tag. For HTML elements (which at this point + // must be non-containers), append a space before the '/', per spec. See + // https://w3c.github.io/DOM-Parsing/#dfn-concept-xml-serialization-algorithm + if (aOriginalElement->IsHTMLElement()) { + if (!AppendToString(kSpace, aStr)) { + return false; + } + } + + return AppendToString(u"/>"_ns, aStr); +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendElementEnd(Element* aElement, + Element* aOriginalElement) { + NS_ENSURE_ARG(aElement); + NS_ENSURE_STATE(mOutput); + + nsIContent* content = aElement; + + bool forceFormat = false, outputElementEnd; + outputElementEnd = + CheckElementEnd(aElement, aOriginalElement, forceFormat, *mOutput); + + nsAtom* name = content->NodeInfo()->NameAtom(); + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { + DecrIndentation(name); + } + + if (!outputElementEnd) { + // Keep this in sync with the cleanup at the end of this method. 
+ PopNameSpaceDeclsFor(aElement); + MaybeLeaveFromPreContent(content); + MaybeFlagNewlineForRootNode(aElement); + AfterElementEnd(content, *mOutput); + return NS_OK; + } + + nsAutoString tagPrefix, tagLocalName, tagNamespaceURI; + + aElement->NodeInfo()->GetPrefix(tagPrefix); + aElement->NodeInfo()->GetName(tagLocalName); + aElement->NodeInfo()->GetNamespaceURI(tagNamespaceURI); + +#ifdef DEBUG + bool debugNeedToPushNamespace = +#endif + ConfirmPrefix(tagPrefix, tagNamespaceURI, aElement, false); + NS_ASSERTION(!debugNeedToPushNamespace, + "Can't push namespaces in closing tag!"); + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { + bool lineBreakBeforeClose = + LineBreakBeforeClose(content->GetNameSpaceID(), name); + + if (mColPos && lineBreakBeforeClose) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } + if (!mColPos) { + NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } else if (mAddSpace) { + NS_ENSURE_TRUE(AppendToString(char16_t(' '), *mOutput), + NS_ERROR_OUT_OF_MEMORY); + mAddSpace = false; + } + } else if (mAddSpace) { + NS_ENSURE_TRUE(AppendToString(char16_t(' '), *mOutput), + NS_ERROR_OUT_OF_MEMORY); + mAddSpace = false; + } + + NS_ENSURE_TRUE(AppendToString(kEndTag, *mOutput), NS_ERROR_OUT_OF_MEMORY); + if (!tagPrefix.IsEmpty()) { + NS_ENSURE_TRUE(AppendToString(tagPrefix, *mOutput), NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(u":"_ns, *mOutput), NS_ERROR_OUT_OF_MEMORY); + } + NS_ENSURE_TRUE(AppendToString(tagLocalName, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput), + NS_ERROR_OUT_OF_MEMORY); + + // Keep what follows in sync with the cleanup in the !outputElementEnd case. 
+ PopNameSpaceDeclsFor(aElement); + + MaybeLeaveFromPreContent(content); + + if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() && + LineBreakAfterClose(content->GetNameSpaceID(), name)) { + NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); + } else { + MaybeFlagNewlineForRootNode(aElement); + } + + AfterElementEnd(content, *mOutput); + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::Finish() { + NS_ENSURE_STATE(mOutput); + + mOutput = nullptr; + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::GetOutputLength(uint32_t& aLength) const { + NS_ENSURE_STATE(mOutput); + + aLength = mOutput->Length(); + + return NS_OK; +} + +NS_IMETHODIMP +nsXMLContentSerializer::AppendDocumentStart(Document* aDocument) { + NS_ENSURE_ARG_POINTER(aDocument); + NS_ENSURE_STATE(mOutput); + + nsAutoString version, encoding, standalone; + aDocument->GetXMLDeclaration(version, encoding, standalone); + + if (version.IsEmpty()) + return NS_OK; // A declaration must have version, or there is no decl + + constexpr auto endQuote = u"\""_ns; + + *mOutput += u"<?xml version=\""_ns + version + endQuote; + + if (!mCharset.IsEmpty()) { + *mOutput += + u" encoding=\""_ns + NS_ConvertASCIItoUTF16(mCharset) + endQuote; + } + // Otherwise just don't output an encoding attr. Not that we expect + // mCharset to ever be empty. +#ifdef DEBUG + else { + NS_WARNING("Empty mCharset? 
How come?"); + } +#endif + + if (!standalone.IsEmpty()) { + *mOutput += u" standalone=\""_ns + standalone + endQuote; + } + + NS_ENSURE_TRUE(mOutput->AppendLiteral("?>", mozilla::fallible), + NS_ERROR_OUT_OF_MEMORY); + mAddNewlineForRootNode = true; + + return NS_OK; +} + +bool nsXMLContentSerializer::CheckElementStart(Element*, bool& aForceFormat, + nsAString& aStr, + nsresult& aResult) { + aResult = NS_OK; + aForceFormat = false; + return true; +} + +bool nsXMLContentSerializer::CheckElementEnd(Element* aElement, + Element* aOriginalElement, + bool& aForceFormat, + nsAString& aStr) { + // We don't output a separate end tag for empty element + aForceFormat = false; + return ElementNeedsSeparateEndTag(aElement, aOriginalElement); +} + +bool nsXMLContentSerializer::AppendToString(const char16_t aChar, + nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + mColPos += 1; + return aOutputStr.Append(aChar, mozilla::fallible); +} + +bool nsXMLContentSerializer::AppendToString(const nsAString& aStr, + nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + mColPos += aStr.Length(); + return aOutputStr.Append(aStr, mozilla::fallible); +} + +#define _ 0 + +// This table indexes into kEntityStrings[]. +const uint8_t nsXMLContentSerializer::kEntities[] = { + // clang-format off + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, 2, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + 3, _, 4 + // clang-format on +}; + +// This table indexes into kEntityStrings[]. 
+const uint8_t nsXMLContentSerializer::kAttrEntities[] = { + // clang-format off + _, _, _, _, _, _, _, _, _, 5, + 6, _, _, 7, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, 1, _, _, _, 2, _, + _, _, _, _, _, _, _, _, _, _, + _, _, _, _, _, _, _, _, _, _, + 3, _, 4 + // clang-format on +}; + +#undef _ + +const char* const nsXMLContentSerializer::kEntityStrings[] = { + /* 0 */ nullptr, + /* 1 */ "&quot;", + /* 2 */ "&amp;", + /* 3 */ "&lt;", + /* 4 */ "&gt;", + /* 5 */ "&#9;", + /* 6 */ "&#xA;", + /* 7 */ "&#xD;", +}; + +bool nsXMLContentSerializer::AppendAndTranslateEntities(const nsAString& aStr, + nsAString& aOutputStr) { + if (mInAttribute) { + return AppendAndTranslateEntities<kGTVal>(aStr, aOutputStr, kAttrEntities, + kEntityStrings); + } + + return AppendAndTranslateEntities<kGTVal>(aStr, aOutputStr, kEntities, + kEntityStrings); +} + +/* static */ +bool nsXMLContentSerializer::AppendAndTranslateEntities( + const nsAString& aStr, nsAString& aOutputStr, const uint8_t aEntityTable[], + uint16_t aMaxTableIndex, const char* const aStringTable[]) { + nsReadingIterator<char16_t> done_reading; + aStr.EndReading(done_reading); + + // for each chunk of |aString|... + uint32_t advanceLength = 0; + nsReadingIterator<char16_t> iter; + + for (aStr.BeginReading(iter); iter != done_reading; + iter.advance(int32_t(advanceLength))) { + uint32_t fragmentLength = done_reading - iter; + const char16_t* c = iter.get(); + const char16_t* fragmentStart = c; + const char16_t* fragmentEnd = c + fragmentLength; + const char* entityText = nullptr; + + advanceLength = 0; + // for each character in this chunk, check if it + // needs to be replaced + for (; c < fragmentEnd; c++, advanceLength++) { + char16_t val = *c; + if ((val <= aMaxTableIndex) && aEntityTable[val]) { + entityText = aStringTable[aEntityTable[val]]; + break; + } + } + + NS_ENSURE_TRUE( + aOutputStr.Append(fragmentStart, advanceLength, mozilla::fallible), + false); + if (entityText) { + NS_ENSURE_TRUE(AppendASCIItoUTF16(mozilla::MakeStringSpan(entityText), + aOutputStr, mozilla::fallible), + false); + advanceLength++; + } + } + + return true; +} + +bool nsXMLContentSerializer::MaybeAddNewlineForRootNode(nsAString& aStr) { + if (mAddNewlineForRootNode) { + return AppendNewLineToString(aStr); + } + + return true; +} + +void nsXMLContentSerializer::MaybeFlagNewlineForRootNode(nsINode* aNode) { + nsINode* parent = aNode->GetParentNode(); + if (parent) { + mAddNewlineForRootNode = parent->IsDocument(); + } +} + +void 
nsXMLContentSerializer::MaybeEnterInPreContent(nsIContent* aNode) { + // support of the xml:space attribute + nsAutoString space; + if (ShouldMaintainPreLevel() && aNode->IsElement() && + aNode->AsElement()->GetAttr(kNameSpaceID_XML, nsGkAtoms::space, space) && + space.EqualsLiteral("preserve")) { + ++PreLevel(); + } +} + +void nsXMLContentSerializer::MaybeLeaveFromPreContent(nsIContent* aNode) { + // support of the xml:space attribute + nsAutoString space; + if (ShouldMaintainPreLevel() && aNode->IsElement() && + aNode->AsElement()->GetAttr(kNameSpaceID_XML, nsGkAtoms::space, space) && + space.EqualsLiteral("preserve")) { + --PreLevel(); + } +} + +bool nsXMLContentSerializer::AppendNewLineToString(nsAString& aStr) { + bool result = AppendToString(mLineBreak, aStr); + mMayIgnoreLineBreakSequence = true; + mColPos = 0; + mAddSpace = false; + mIsIndentationAddedOnCurrentLine = false; + return result; +} + +bool nsXMLContentSerializer::AppendIndentation(nsAString& aStr) { + mIsIndentationAddedOnCurrentLine = true; + bool result = AppendToString(mIndent, aStr); + mAddSpace = false; + mMayIgnoreLineBreakSequence = false; + return result; +} + +bool nsXMLContentSerializer::IncrIndentation(nsAtom* aName) { + // we want to keep the source readable + if (mDoWrap && + mIndent.Length() >= uint32_t(mMaxColumn) - MIN_INDENTED_LINE_LENGTH) { + ++mIndentOverflow; + } else { + return mIndent.AppendLiteral(INDENT_STRING, mozilla::fallible); + } + + return true; +} + +void nsXMLContentSerializer::DecrIndentation(nsAtom* aName) { + if (mIndentOverflow) + --mIndentOverflow; + else + mIndent.Cut(0, INDENT_STRING_LENGTH); +} + +bool nsXMLContentSerializer::LineBreakBeforeOpen(int32_t aNamespaceID, + nsAtom* aName) { + return mAddSpace; +} + +bool nsXMLContentSerializer::LineBreakAfterOpen(int32_t aNamespaceID, + nsAtom* aName) { + return false; +} + +bool nsXMLContentSerializer::LineBreakBeforeClose(int32_t aNamespaceID, + nsAtom* aName) { + return mAddSpace; +} + +bool 
nsXMLContentSerializer::LineBreakAfterClose(int32_t aNamespaceID, + nsAtom* aName) { + return false; +} + +bool nsXMLContentSerializer::AppendToStringConvertLF(const nsAString& aStr, + nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + + if (mDoRaw) { + NS_ENSURE_TRUE(AppendToString(aStr, aOutputStr), false); + } else { + // Convert line-endings to mLineBreak + uint32_t start = 0; + uint32_t theLen = aStr.Length(); + while (start < theLen) { + int32_t eol = aStr.FindChar('\n', start); + if (eol == kNotFound) { + nsDependentSubstring dataSubstring(aStr, start, theLen - start); + NS_ENSURE_TRUE(AppendToString(dataSubstring, aOutputStr), false); + start = theLen; + // if there was a line break before this substring + // AppendNewLineToString was called, so we should reverse + // this flag + mMayIgnoreLineBreakSequence = false; + } else { + nsDependentSubstring dataSubstring(aStr, start, eol - start); + NS_ENSURE_TRUE(AppendToString(dataSubstring, aOutputStr), false); + NS_ENSURE_TRUE(AppendNewLineToString(aOutputStr), false); + start = eol + 1; + } + } + } + + return true; +} + +bool nsXMLContentSerializer::AppendFormatedWrapped_WhitespaceSequence( + nsAString::const_char_iterator& aPos, + const nsAString::const_char_iterator aEnd, + const nsAString::const_char_iterator aSequenceStart, + bool& aMayIgnoreStartOfLineWhitespaceSequence, nsAString& aOutputStr) { + // Handle the complete sequence of whitespace. + // Continue to iterate until we find the first non-whitespace char. + // Updates "aPos" to point to the first unhandled char. + // Also updates the aMayIgnoreStartOfLineWhitespaceSequence flag, + // as well as the other "global" state flags. 
+ + bool sawBlankOrTab = false; + bool leaveLoop = false; + + do { + switch (*aPos) { + case ' ': + case '\t': + sawBlankOrTab = true; + [[fallthrough]]; + case '\n': + ++aPos; + // do not increase mColPos, + // because we will reduce the whitespace to a single char + break; + default: + leaveLoop = true; + break; + } + } while (!leaveLoop && aPos < aEnd); + + if (mAddSpace) { + // if we had previously been asked to add space, + // our situation has not changed + } else if (!sawBlankOrTab && mMayIgnoreLineBreakSequence) { + // nothing to do in the case where line breaks have already been added + // before the call of AppendToStringWrapped + // and only if we found line break in the sequence + mMayIgnoreLineBreakSequence = false; + } else if (aMayIgnoreStartOfLineWhitespaceSequence) { + // nothing to do + aMayIgnoreStartOfLineWhitespaceSequence = false; + } else { + if (sawBlankOrTab) { + if (mDoWrap && mColPos + 1 >= mMaxColumn) { + // no much sense in delaying, we only have one slot left, + // let's write a break now + bool result = aOutputStr.Append(mLineBreak, mozilla::fallible); + mColPos = 0; + mIsIndentationAddedOnCurrentLine = false; + mMayIgnoreLineBreakSequence = true; + NS_ENSURE_TRUE(result, false); + } else { + // do not write out yet, we may write out either a space or a linebreak + // let's delay writing it out until we know more + mAddSpace = true; + ++mColPos; // eat a slot of available space + } + } else { + // Asian text usually does not contain spaces, therefore we should not + // transform a linebreak into a space. + // Since we only saw linebreaks, but no spaces or tabs, + // let's write a linebreak now. 
+ NS_ENSURE_TRUE(AppendNewLineToString(aOutputStr), false); + } + } + + return true; +} + +bool nsXMLContentSerializer::AppendWrapped_NonWhitespaceSequence( + nsAString::const_char_iterator& aPos, + const nsAString::const_char_iterator aEnd, + const nsAString::const_char_iterator aSequenceStart, + bool& aMayIgnoreStartOfLineWhitespaceSequence, + bool& aSequenceStartAfterAWhiteSpace, nsAString& aOutputStr) { + mMayIgnoreLineBreakSequence = false; + aMayIgnoreStartOfLineWhitespaceSequence = false; + + // Handle the complete sequence of non-whitespace in this block + // Iterate until we find the first whitespace char or an aEnd condition + // Updates "aPos" to point to the first unhandled char. + // Also updates the aMayIgnoreStartOfLineWhitespaceSequence flag, + // as well as the other "global" state flags. + + bool thisSequenceStartsAtBeginningOfLine = !mColPos; + bool onceAgainBecauseWeAddedBreakInFront = false; + bool foundWhitespaceInLoop; + uint32_t length, colPos; + + do { + if (mColPos) { + colPos = mColPos; + } else { + if (mDoFormat && !mDoRaw && !PreLevel() && + !onceAgainBecauseWeAddedBreakInFront) { + colPos = mIndent.Length(); + } else + colPos = 0; + } + foundWhitespaceInLoop = false; + length = 0; + // we iterate until the next whitespace character + // or until we reach the maximum of character per line + // or until the end of the string to add. + do { + if (*aPos == ' ' || *aPos == '\t' || *aPos == '\n') { + foundWhitespaceInLoop = true; + break; + } + + ++aPos; + ++length; + } while ((!mDoWrap || colPos + length < mMaxColumn) && aPos < aEnd); + + // in the case we don't reached the end of the string, but we reached the + // maxcolumn, we see if there is a whitespace after the maxcolumn if yes, + // then we can append directly the string instead of appending a new line + // etc. 
+ if (*aPos == ' ' || *aPos == '\t' || *aPos == '\n') { + foundWhitespaceInLoop = true; + } + + if (aPos == aEnd || foundWhitespaceInLoop) { + // there is enough room for the complete block we found + if (mDoFormat && !mColPos) { + NS_ENSURE_TRUE(AppendIndentation(aOutputStr), false); + } else if (mAddSpace) { + bool result = aOutputStr.Append(char16_t(' '), mozilla::fallible); + mAddSpace = false; + NS_ENSURE_TRUE(result, false); + } + + mColPos += length; + NS_ENSURE_TRUE(aOutputStr.Append(aSequenceStart, aPos - aSequenceStart, + mozilla::fallible), + false); + + // We have not yet reached the max column, we will continue to + // fill the current line in the next outer loop iteration + // (this one in AppendToStringWrapped) + // make sure we return in this outer loop + onceAgainBecauseWeAddedBreakInFront = false; + } else { // we reach the max column + if (!thisSequenceStartsAtBeginningOfLine && + (mAddSpace || (!mDoFormat && aSequenceStartAfterAWhiteSpace))) { + // when !mDoFormat, mAddSpace is not used, mAddSpace is always false + // so, in the case where mDoWrap && !mDoFormat, if we want to enter in + // this condition... + + // We can avoid to wrap. We try to add the whole block + // in an empty new line + + NS_ENSURE_TRUE(AppendNewLineToString(aOutputStr), false); + aPos = aSequenceStart; + thisSequenceStartsAtBeginningOfLine = true; + onceAgainBecauseWeAddedBreakInFront = true; + } else { + // we must wrap + onceAgainBecauseWeAddedBreakInFront = false; + Maybe<uint32_t> wrapPosition; + + if (mAllowLineBreaking) { + MOZ_ASSERT(aPos < aEnd, + "We shouldn't be here if aPos reaches the end of text!"); + + // Search forward from aSequenceStart until we find the largest + // wrap position less than or equal to aPos. 
+ Maybe<uint32_t> nextWrapPosition; + Span<const char16_t> subSeq(aSequenceStart, aEnd); + intl::LineBreakIteratorUtf16 lineBreakIter(subSeq); + while (true) { + nextWrapPosition = lineBreakIter.Next(); + MOZ_ASSERT(nextWrapPosition.isSome(), + "We should've exited the loop when reaching the end of " + "text in the previous iteration!"); + + // Trim space at the tail. UAX#14 doesn't have break opportunity + // for ASCII space at the tail. + const Maybe<uint32_t> originalNextWrapPosition = nextWrapPosition; + while (*nextWrapPosition > 0 && + subSeq.at(*nextWrapPosition - 1) == 0x20) { + nextWrapPosition = Some(*nextWrapPosition - 1); + } + if (*nextWrapPosition == 0) { + // Restore the original nextWrapPosition. + nextWrapPosition = originalNextWrapPosition; + } + + if (aSequenceStart + *nextWrapPosition > aPos) { + break; + } + wrapPosition = nextWrapPosition; + } + + if (!wrapPosition) { + // The wrap position found in the first iteration of the above loop + // already exceeds aPos. We accept it as valid a wrap position only + // if it is not end-of-text. If the line-breaker returned + // end-of-text, we don't know that it is actually a good wrap + // position, so ignore it and continue to use the fallback code + // below. 
+ if (*nextWrapPosition < subSeq.Length()) { + wrapPosition = nextWrapPosition; + } + } + } + + if (wrapPosition) { + if (!mColPos && mDoFormat) { + NS_ENSURE_TRUE(AppendIndentation(aOutputStr), false); + } else if (mAddSpace) { + bool result = aOutputStr.Append(char16_t(' '), mozilla::fallible); + mAddSpace = false; + NS_ENSURE_TRUE(result, false); + } + NS_ENSURE_TRUE(aOutputStr.Append(aSequenceStart, *wrapPosition, + mozilla::fallible), + false); + + NS_ENSURE_TRUE(AppendNewLineToString(aOutputStr), false); + aPos = aSequenceStart + *wrapPosition; + aMayIgnoreStartOfLineWhitespaceSequence = true; + } else { + // try some simple fallback logic + // go forward up to the next whitespace position, + // in the worst case this will be all the rest of the data + + // XXX(jfkthame) Should we (conditionally) output indentation here? + // It makes for tidier-looking formatted output, at the cost of + // exceeding the target width by a greater amount on such lines. + // if (!mColPos && mDoFormat) { + // NS_ENSURE_TRUE(AppendIndentation(aOutputStr), false); + // mAddSpace = false; + // } + + // we update the mColPos variable with the length of + // the part already parsed. 
+ mColPos += length; + + // now try to find the next whitespace + do { + if (*aPos == ' ' || *aPos == '\t' || *aPos == '\n') { + break; + } + + ++aPos; + ++mColPos; + } while (aPos < aEnd); + + if (mAddSpace) { + bool result = aOutputStr.Append(char16_t(' '), mozilla::fallible); + mAddSpace = false; + NS_ENSURE_TRUE(result, false); + } + NS_ENSURE_TRUE( + aOutputStr.Append(aSequenceStart, aPos - aSequenceStart, + mozilla::fallible), + false); + } + } + aSequenceStartAfterAWhiteSpace = false; + } + } while (onceAgainBecauseWeAddedBreakInFront); + + return true; +} + +bool nsXMLContentSerializer::AppendToStringFormatedWrapped( + const nsAString& aStr, nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + + nsAString::const_char_iterator pos, end, sequenceStart; + + aStr.BeginReading(pos); + aStr.EndReading(end); + + bool sequenceStartAfterAWhitespace = false; + if (pos < end) { + nsAString::const_char_iterator end2; + aOutputStr.EndReading(end2); + --end2; + if (*end2 == ' ' || *end2 == '\n' || *end2 == '\t') { + sequenceStartAfterAWhitespace = true; + } + } + + // if the current line already has text on it, such as a tag, + // leading whitespace is significant + bool mayIgnoreStartOfLineWhitespaceSequence = + (!mColPos || + (mIsIndentationAddedOnCurrentLine && sequenceStartAfterAWhitespace && + uint32_t(mColPos) == mIndent.Length())); + + while (pos < end) { + sequenceStart = pos; + + // if beginning of a whitespace sequence + if (*pos == ' ' || *pos == '\n' || *pos == '\t') { + NS_ENSURE_TRUE(AppendFormatedWrapped_WhitespaceSequence( + pos, end, sequenceStart, + mayIgnoreStartOfLineWhitespaceSequence, aOutputStr), + false); + } else { // any other non-whitespace char + NS_ENSURE_TRUE( + AppendWrapped_NonWhitespaceSequence( + pos, end, sequenceStart, mayIgnoreStartOfLineWhitespaceSequence, + sequenceStartAfterAWhitespace, aOutputStr), + false); + } + } + + return true; +} + +bool nsXMLContentSerializer::AppendWrapped_WhitespaceSequence( + 
nsAString::const_char_iterator& aPos, + const nsAString::const_char_iterator aEnd, + const nsAString::const_char_iterator aSequenceStart, + nsAString& aOutputStr) { + // Handle the complete sequence of whitespace. + // Continue to iterate until we find the first non-whitespace char. + // Updates "aPos" to point to the first unhandled char. + mAddSpace = false; + mIsIndentationAddedOnCurrentLine = false; + + bool leaveLoop = false; + nsAString::const_char_iterator lastPos = aPos; + + do { + switch (*aPos) { + case ' ': + case '\t': + // if there are too many spaces on a line, we wrap + if (mColPos >= mMaxColumn) { + if (lastPos != aPos) { + NS_ENSURE_TRUE( + aOutputStr.Append(lastPos, aPos - lastPos, mozilla::fallible), + false); + } + NS_ENSURE_TRUE(AppendToString(mLineBreak, aOutputStr), false); + mColPos = 0; + lastPos = aPos; + } + + ++mColPos; + ++aPos; + break; + case '\n': + if (lastPos != aPos) { + NS_ENSURE_TRUE( + aOutputStr.Append(lastPos, aPos - lastPos, mozilla::fallible), + false); + } + NS_ENSURE_TRUE(AppendToString(mLineBreak, aOutputStr), false); + mColPos = 0; + ++aPos; + lastPos = aPos; + break; + default: + leaveLoop = true; + break; + } + } while (!leaveLoop && aPos < aEnd); + + if (lastPos != aPos) { + NS_ENSURE_TRUE( + aOutputStr.Append(lastPos, aPos - lastPos, mozilla::fallible), false); + } + + return true; +} + +bool nsXMLContentSerializer::AppendToStringWrapped(const nsAString& aStr, + nsAString& aOutputStr) { + if (mBodyOnly && !mInBody) { + return true; + } + + nsAString::const_char_iterator pos, end, sequenceStart; + + aStr.BeginReading(pos); + aStr.EndReading(end); + + // not used in this case, but needed by AppendWrapped_NonWhitespaceSequence + bool mayIgnoreStartOfLineWhitespaceSequence = false; + mMayIgnoreLineBreakSequence = false; + + bool sequenceStartAfterAWhitespace = false; + if (pos < end && !aOutputStr.IsEmpty()) { + nsAString::const_char_iterator end2; + aOutputStr.EndReading(end2); + --end2; + if (*end2 == ' ' || *end2 == 
'\n' || *end2 == '\t') { + sequenceStartAfterAWhitespace = true; + } + } + + while (pos < end) { + sequenceStart = pos; + + // if beginning of a whitespace sequence + if (*pos == ' ' || *pos == '\n' || *pos == '\t') { + sequenceStartAfterAWhitespace = true; + NS_ENSURE_TRUE( + AppendWrapped_WhitespaceSequence(pos, end, sequenceStart, aOutputStr), + false); + } else { // any other non-whitespace char + NS_ENSURE_TRUE( + AppendWrapped_NonWhitespaceSequence( + pos, end, sequenceStart, mayIgnoreStartOfLineWhitespaceSequence, + sequenceStartAfterAWhitespace, aOutputStr), + false); + } + } + + return true; +} + +bool nsXMLContentSerializer::ShouldMaintainPreLevel() const { + // Only attempt to maintain the pre level for consumers who care about it. + return !mDoRaw || (mFlags & nsIDocumentEncoder::OutputNoFormattingInPre); +} + +bool nsXMLContentSerializer::MaybeSerializeIsValue(Element* aElement, + nsAString& aStr) { + CustomElementData* ceData = aElement->GetCustomElementData(); + if (ceData) { + nsAtom* isAttr = ceData->GetIs(aElement); + if (isAttr && !aElement->HasAttr(nsGkAtoms::is)) { + NS_ENSURE_TRUE(aStr.AppendLiteral(" is=\"", mozilla::fallible), false); + NS_ENSURE_TRUE( + aStr.Append(nsDependentAtomString(isAttr), mozilla::fallible), false); + NS_ENSURE_TRUE(aStr.AppendLiteral("\"", mozilla::fallible), false); + } + } + + return true; +} diff --git a/dom/serializers/nsXMLContentSerializer.h b/dom/serializers/nsXMLContentSerializer.h new file mode 100644 index 0000000000..167255fe09 --- /dev/null +++ b/dom/serializers/nsXMLContentSerializer.h @@ -0,0 +1,440 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert an XML DOM to an XML string that + * could be parsed into more or less the original DOM. + */ + +#ifndef nsXMLContentSerializer_h__ +#define nsXMLContentSerializer_h__ + +#include "mozilla/Attributes.h" +#include "nsIContentSerializer.h" +#include "nsISupportsUtils.h" +#include "nsCOMPtr.h" +#include "nsTArray.h" +#include "nsString.h" + +#define kIndentStr u" "_ns +#define kEndTag u"</"_ns + +class nsAtom; +class nsINode; + +namespace mozilla { +class Encoding; +} + +class nsXMLContentSerializer : public nsIContentSerializer { + public: + nsXMLContentSerializer(); + + NS_DECL_ISUPPORTS + + NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn, + const mozilla::Encoding* aEncoding, bool aIsCopying, + bool aRewriteEncodingDeclaration, + bool* aNeedsPreformatScanning, nsAString& aOutput) override; + + NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset, + int32_t aEndOffset) override; + + NS_IMETHOD AppendCDATASection(nsIContent* aCDATASection, int32_t aStartOffset, + int32_t aEndOffset) override; + + NS_IMETHOD AppendProcessingInstruction( + mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset, + int32_t aEndOffset) override; + + NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment, + int32_t aStartOffset, int32_t aEndOffset) override; + + NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override; + + NS_IMETHOD AppendElementStart( + mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) override; + + NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) override; + + NS_IMETHOD FlushAndFinish() override { return NS_OK; } + + NS_IMETHOD Finish() override; + + NS_IMETHOD GetOutputLength(uint32_t& aLength) const override; + + NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override; + + NS_IMETHOD 
ScanElementForPreformat(mozilla::dom::Element* aElement) override { + return NS_OK; + } + NS_IMETHOD ForgetElementForPreformat( + mozilla::dom::Element* aElement) override { + return NS_OK; + } + + protected: + virtual ~nsXMLContentSerializer(); + + /** + * Appends a char16_t character and increments the column position + */ + [[nodiscard]] bool AppendToString(const char16_t aChar, + nsAString& aOutputStr); + + /** + * Appends a nsAString string and increments the column position + */ + [[nodiscard]] bool AppendToString(const nsAString& aStr, + nsAString& aOutputStr); + + /** + * Appends a string by replacing all line-endings + * by mLineBreak, except in the case of raw output. + * It increments the column position. + */ + [[nodiscard]] bool AppendToStringConvertLF(const nsAString& aStr, + nsAString& aOutputStr); + + /** + * Appends a string by wrapping it when necessary. + * It updates the column position. + */ + [[nodiscard]] bool AppendToStringWrapped(const nsAString& aStr, + nsAString& aOutputStr); + + /** + * Appends a string by formating and wrapping it when necessary + * It updates the column position. 
+ */ + [[nodiscard]] bool AppendToStringFormatedWrapped(const nsAString& aStr, + nsAString& aOutputStr); + + // used by AppendToStringWrapped + [[nodiscard]] bool AppendWrapped_WhitespaceSequence( + nsAString::const_char_iterator& aPos, + const nsAString::const_char_iterator aEnd, + const nsAString::const_char_iterator aSequenceStart, + nsAString& aOutputStr); + + // used by AppendToStringFormatedWrapped + [[nodiscard]] bool AppendFormatedWrapped_WhitespaceSequence( + nsAString::const_char_iterator& aPos, + const nsAString::const_char_iterator aEnd, + const nsAString::const_char_iterator aSequenceStart, + bool& aMayIgnoreStartOfLineWhitespaceSequence, nsAString& aOutputStr); + + // used by AppendToStringWrapped and AppendToStringFormatedWrapped + [[nodiscard]] bool AppendWrapped_NonWhitespaceSequence( + nsAString::const_char_iterator& aPos, + const nsAString::const_char_iterator aEnd, + const nsAString::const_char_iterator aSequenceStart, + bool& aMayIgnoreStartOfLineWhitespaceSequence, + bool& aSequenceStartAfterAWhiteSpace, nsAString& aOutputStr); + + /** + * add mLineBreak to the string + * It updates the column position and other flags. + */ + [[nodiscard]] bool AppendNewLineToString(nsAString& aOutputStr); + + /** + * Appends a string by translating entities + * It doesn't increment the column position + */ + [[nodiscard]] virtual bool AppendAndTranslateEntities(const nsAString& aStr, + nsAString& aOutputStr); + + /** + * Helper for virtual AppendAndTranslateEntities that does the actual work. + * + * Do not call this directly. Call it via the template helper below. + */ + private: + [[nodiscard]] static bool AppendAndTranslateEntities( + const nsAString& aStr, nsAString& aOutputStr, + const uint8_t aEntityTable[], uint16_t aMaxTableIndex, + const char* const aStringTable[]); + + protected: + /** + * Helper for calling AppendAndTranslateEntities in a way that guarantees we + * don't mess up our aEntityTable sizing. 
This is a bit more complicated than + * it could be, because sometimes we don't want to use all of aEntityTable, so + * we have to allow passing the amount to use independently. But we can + * statically ensure it's not too big. + * + * The first integer template argument, which callers need to specify + * explicitly, is the index of the last entry in aEntityTable that should be + * considered for encoding as an entity reference. The second integer + * argument will be deduced from the actual table passed in. + * + * aEntityTable contains as values indices into aStringTable. Those represent + * the strings that should be used to replace the characters that are used to + * index into aEntityTable. aStringTable[0] should be nullptr, and characters + * that do not need replacement should map to 0 in aEntityTable. + */ + template <uint16_t LargestIndex, uint16_t TableLength> + [[nodiscard]] bool AppendAndTranslateEntities( + const nsAString& aStr, nsAString& aOutputStr, + const uint8_t (&aEntityTable)[TableLength], + const char* const aStringTable[]) { + static_assert(LargestIndex < TableLength, + "Largest allowed index must be smaller than table length"); + return AppendAndTranslateEntities(aStr, aOutputStr, aEntityTable, + LargestIndex, aStringTable); + } + + /** + * Max index that can be used with some of our entity tables. 
+ */ + static const uint16_t kGTVal = 62; + + /** + * Retrieves the text content of the node and appends it to the given string. + * It doesn't increment the column position. + */ + nsresult AppendTextData(nsIContent* aNode, int32_t aStartOffset, + int32_t aEndOffset, nsAString& aStr, + bool aTranslateEntities); + + virtual nsresult PushNameSpaceDecl(const nsAString& aPrefix, + const nsAString& aURI, nsIContent* aOwner); + void PopNameSpaceDeclsFor(nsIContent* aOwner); + + /** + * The problem that ConfirmPrefix fixes is that anyone can insert nodes + * through the DOM that have a namespace URI and a random or empty or + * previously existing prefix that's totally unrelated to the prefixes + * declared at that point through xmlns attributes. So what ConfirmPrefix + * does is ensure that we can map aPrefix to the namespace URI aURI (for + * example, that the prefix is not already mapped to some other namespace). + * aPrefix will be adjusted, if necessary, so the value of the prefix + * _after_ this call is what should be serialized. + * @param aPrefix the prefix that may need adjusting + * @param aURI the namespace URI we want aPrefix to point to + * @param aElement the element we're working with (needed for proper default + * namespace handling) + * @param aIsAttribute true if we're confirming a prefix for an attribute. + * @return true if we need to push the (prefix, uri) pair on the namespace + * stack (note that this can happen even if the prefix is + * empty). 
+ */ + bool ConfirmPrefix(nsAString& aPrefix, const nsAString& aURI, + nsIContent* aElement, bool aIsAttribute); + /** + * GenerateNewPrefix generates a new prefix and writes it to aPrefix + */ + void GenerateNewPrefix(nsAString& aPrefix); + + uint32_t ScanNamespaceDeclarations(mozilla::dom::Element* aContent, + mozilla::dom::Element* aOriginalElement, + const nsAString& aTagNamespaceURI); + + [[nodiscard]] virtual bool SerializeAttributes( + mozilla::dom::Element* aContent, mozilla::dom::Element* aOriginalElement, + nsAString& aTagPrefix, const nsAString& aTagNamespaceURI, + nsAtom* aTagName, nsAString& aStr, uint32_t aSkipAttr, bool aAddNSAttr); + + [[nodiscard]] bool SerializeAttr(const nsAString& aPrefix, + const nsAString& aName, + const nsAString& aValue, nsAString& aStr, + bool aDoEscapeEntities); + + bool IsJavaScript(nsIContent* aContent, nsAtom* aAttrNameAtom, + int32_t aAttrNamespaceID, const nsAString& aValueString); + + /** + * This method can be redefined to check if the element can be serialized. + * It is called when the serialization of the start tag is asked + * (AppendElementStart) + * In this method you can also force the formating + * by setting aForceFormat to true. + * @return boolean true if the element can be output + */ + virtual bool CheckElementStart(mozilla::dom::Element* aElement, + bool& aForceFormat, nsAString& aStr, + nsresult& aResult); + + /** + * This method is responsible for appending the '>' at the end of the start + * tag, possibly preceded by '/' and maybe a ' ' before that too. + * + * aElement and aOriginalElement are the same as the corresponding arguments + * to AppendElementStart. + */ + [[nodiscard]] bool AppendEndOfElementStart( + mozilla::dom::Element* aEleemnt, mozilla::dom::Element* aOriginalElement, + nsAString& aStr); + + /** + * This method can be redefine to serialize additional things just after + * the serialization of the start tag. 
+ * (called at the end of AppendElementStart) + */ + [[nodiscard]] virtual bool AfterElementStart(nsIContent* aContent, + nsIContent* aOriginalElement, + nsAString& aStr) { + return true; + }; + + /** + * This method can be redefined to check if the element can be serialized. + * It is called when the serialization of the end tag is requested + * (AppendElementEnd) + * In this method you can also force the formatting + * by setting aForceFormat to true. + * @return boolean true if the element can be output + */ + virtual bool CheckElementEnd(mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement, + bool& aForceFormat, nsAString& aStr); + + /** + * This method can be redefined to serialize additional things just after + * the serialization of the end tag. + * (called at the end of AppendElementEnd) + */ + virtual void AfterElementEnd(nsIContent* aContent, nsAString& aStr){}; + + /** + * Returns true if a line break should be inserted before an element open tag + */ + virtual bool LineBreakBeforeOpen(int32_t aNamespaceID, nsAtom* aName); + + /** + * Returns true if a line break should be inserted after an element open tag + */ + virtual bool LineBreakAfterOpen(int32_t aNamespaceID, nsAtom* aName); + + /** + * Returns true if a line break should be inserted before an element close tag + */ + virtual bool LineBreakBeforeClose(int32_t aNamespaceID, nsAtom* aName); + + /** + * Returns true if a line break should be inserted after an element close tag + */ + virtual bool LineBreakAfterClose(int32_t aNamespaceID, nsAtom* aName); + + /** + * Adds indentation. Call only in the case of formatting and if the current + * position is at 0. It updates the column position. + */ + [[nodiscard]] bool AppendIndentation(nsAString& aStr); + + [[nodiscard]] bool IncrIndentation(nsAtom* aName); + void DecrIndentation(nsAtom* aName); + + // Functions to check for newlines that need to be added between nodes in + // the root of a document. 
See mAddNewlineForRootNode + [[nodiscard]] bool MaybeAddNewlineForRootNode(nsAString& aStr); + void MaybeFlagNewlineForRootNode(nsINode* aNode); + + // Functions to check whether we enter or leave preformatted content + virtual void MaybeEnterInPreContent(nsIContent* aNode); + virtual void MaybeLeaveFromPreContent(nsIContent* aNode); + + bool ShouldMaintainPreLevel() const; + int32_t PreLevel() const { + MOZ_ASSERT(ShouldMaintainPreLevel()); + return mPreLevel; + } + int32_t& PreLevel() { + MOZ_ASSERT(ShouldMaintainPreLevel()); + return mPreLevel; + } + + bool MaybeSerializeIsValue(mozilla::dom::Element* aElement, nsAString& aStr); + + int32_t mPrefixIndex; + + struct NameSpaceDecl { + nsString mPrefix; + nsString mURI; + nsIContent* mOwner; + }; + + nsTArray<NameSpaceDecl> mNameSpaceStack; + + // nsIDocumentEncoder flags + MOZ_INIT_OUTSIDE_CTOR uint32_t mFlags; + + // characters to use for line break + nsString mLineBreak; + + // The charset that was passed to Init() + nsCString mCharset; + + // current column position on the current line + uint32_t mColPos; + + // true = pretty formatting should be done (OutputFormated flag) + MOZ_INIT_OUTSIDE_CTOR bool mDoFormat; + + // true = no formatting (OutputRaw flag): + // no newline conversion and no rewrapping of long lines even if OutputWrap is set. 
+ MOZ_INIT_OUTSIDE_CTOR bool mDoRaw; + + // true = wrapping should be done (OutputWrap flag) + MOZ_INIT_OUTSIDE_CTOR bool mDoWrap; + + // true = we can break lines (OutputDisallowLineBreaking flag) + MOZ_INIT_OUTSIDE_CTOR bool mAllowLineBreaking; + + // maximum number of columns in a line, in wrap mode + MOZ_INIT_OUTSIDE_CTOR uint32_t mMaxColumn; + + // current indent value + nsString mIndent; + + // this is the indentation level after the indentation reached + // the maximum length of indentation + int32_t mIndentOverflow; + + // says whether the indentation has already been added on the current line + bool mIsIndentationAddedOnCurrentLine; + + // the string which is currently added is in an attribute + bool mInAttribute; + + // true = a newline character should be added. It's only + // useful when serializing root nodes. See MaybeAddNewlineForRootNode and + // MaybeFlagNewlineForRootNode + bool mAddNewlineForRootNode; + + // Indicates that a space will be added if and only if content is + // continued on the same line while serializing source. Otherwise, + // the newline character acts as the whitespace and no space is needed. + // Used when mDoFormat = true + bool mAddSpace; + + // says that if the next string to add contains a newline character at the + // beginning, then this newline character should be ignored, because such + // a character has already been added into the output string + bool mMayIgnoreLineBreakSequence; + + bool mBodyOnly; + int32_t mInBody; + + // Non-owning. + nsAString* mOutput; + + private: + // number of nested elements which have preformatted content + MOZ_INIT_OUTSIDE_CTOR int32_t mPreLevel; + + static const uint8_t kEntities[]; + static const uint8_t kAttrEntities[]; + static const char* const kEntityStrings[]; +}; + +nsresult NS_NewXMLContentSerializer(nsIContentSerializer** aSerializer); + +#endif |