From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- dom/serializers/nsPlainTextSerializer.h | 386 ++++++++++++++++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 dom/serializers/nsPlainTextSerializer.h (limited to 'dom/serializers/nsPlainTextSerializer.h') diff --git a/dom/serializers/nsPlainTextSerializer.h b/dom/serializers/nsPlainTextSerializer.h new file mode 100644 index 0000000000..4afd83f1a0 --- /dev/null +++ b/dom/serializers/nsPlainTextSerializer.h @@ -0,0 +1,386 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ + +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert a DOM into plaintext in a nice way + * (eg for copy/paste as plaintext). 
+ */ + +#ifndef nsPlainTextSerializer_h__ +#define nsPlainTextSerializer_h__ + +#include "mozilla/Maybe.h" +#include "nsAtom.h" +#include "nsCycleCollectionParticipant.h" +#include "nsIContentSerializer.h" +#include "nsIDocumentEncoder.h" +#include "nsString.h" +#include "nsTArray.h" + +#include + +class nsIContent; + +namespace mozilla::dom { +class DocumentType; +class Element; +} // namespace mozilla::dom + +class nsPlainTextSerializer final : public nsIContentSerializer { + public: + nsPlainTextSerializer(); + + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_CYCLE_COLLECTION_CLASS(nsPlainTextSerializer) + + // nsIContentSerializer + NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn, + const mozilla::Encoding* aEncoding, bool aIsCopying, + bool aIsWholeDocument, bool* aNeedsPreformatScanning, + nsAString& aOutput) override; + + NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset, + int32_t aEndOffset) override; + NS_IMETHOD AppendCDATASection(nsIContent* aCDATASection, int32_t aStartOffset, + int32_t aEndOffset) override; + NS_IMETHOD AppendProcessingInstruction( + mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset, + int32_t aEndOffset) override { + return NS_OK; + } + NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment, + int32_t aStartOffset, int32_t aEndOffset) override { + return NS_OK; + } + NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override { + return NS_OK; + } + NS_IMETHOD AppendElementStart( + mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) override; + NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement, + mozilla::dom::Element* aOriginalElement) override; + + NS_IMETHOD FlushAndFinish() override; + + NS_IMETHOD Finish() override; + + NS_IMETHOD GetOutputLength(uint32_t& aLength) const override; + + NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override; + + NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) override; + NS_IMETHOD 
ForgetElementForPreformat( + mozilla::dom::Element* aElement) override; + + private: + ~nsPlainTextSerializer(); + + nsresult GetAttributeValue(const nsAtom* aName, nsString& aValueRet) const; + void AddToLine(const char16_t* aStringToAdd, int32_t aLength); + + void MaybeWrapAndOutputCompleteLines(); + + // @param aSoftLineBreak A soft line break is a space followed by a linebreak + // (cf. https://www.ietf.org/rfc/rfc3676.txt, section 4.2). + void EndLine(bool aSoftLineBreak, bool aBreakBySpace = false); + + void EnsureVerticalSpace(int32_t noOfRows); + + void ConvertToLinesAndOutput(const nsAString& aString); + + void Write(const nsAString& aString); + + // @return true, iff the elements' whitespace and newline characters have to + // be preserved according to its style or because it's a `
`
+  //         element.
+  bool IsElementPreformatted() const;
+  bool IsInOL() const;  // NOTE(review): presumably consults mOLStack -- confirm
+  bool IsInOlOrUl() const;  // NOTE(review): presumably also mULCount -- confirm
+  bool IsCurrentNodeConverted() const;  // defined out of line, not in this header
+  bool MustSuppressLeaf() const;  // defined out of line, not in this header
+
+  /**
+   * Returns the local name of the element as an atom if the element is an
+   * HTML element and the atom is a static atom. Otherwise, nullptr is returned.
+   *
+   * @param aContent The content node whose local name is looked up.
+   * @return A static atom, or nullptr as described above.
+   *
+   * NOTE(review): the aTag arguments of the handlers declared below are
+   * presumably atoms obtained from this function -- confirm in the .cpp.
+   */
+  static nsAtom* GetIdForContent(nsIContent* aContent);
+  nsresult DoOpenContainer(const nsAtom* aTag);
+  void OpenContainerForOutputFormatted(const nsAtom* aTag);
+  nsresult DoCloseContainer(const nsAtom* aTag);
+  void CloseContainerForOutputFormatted(const nsAtom* aTag);
+  nsresult DoAddLeaf(const nsAtom* aTag);
+
+  void DoAddText();  // NOTE(review): argument-less overload; semantics in .cpp
+  // @param aText The text to add. Ignored if aIsLineBreak is true.
+  void DoAddText(bool aIsLineBreak, const nsAString& aText);
+
+  // @return true exactly when mHeadLevel is 0.
+  bool DoOutput() const { return !mHeadLevel; }
+
+  // @return true, iff aLine is non-empty and its first character is '>'.
+  static inline bool IsQuotedLine(const nsAString& aLine) {
+    if (aLine.IsEmpty()) {
+      return false;
+    }
+    return aLine.First() == char16_t('>');
+  }
+
+  // Stack handling functions.
+  //
+  // These helpers treat an nsTArray<bool> as a stack of booleans. The element
+  // type is spelled out because nsTArray is a class template and cannot be
+  // named without its argument in a parameter declaration.
+  //
+  // NOTE(review): how GetLastBool/SetLastBool/PopBool behave on an empty stack
+  // is defined in the .cpp and is not visible in this header -- confirm there.
+  bool GetLastBool(const nsTArray<bool>& aStack);
+  void SetLastBool(nsTArray<bool>& aStack, bool aValue);
+  void PushBool(nsTArray<bool>& aStack, bool aValue);
+  bool PopBool(nsTArray<bool>& aStack);
+
+  // NOTE(review): presumably related to Settings::GetWithRubyAnnotation();
+  // the definition is not part of this header -- confirm in the .cpp.
+  bool IsIgnorableRubyAnnotation(const nsAtom* aTag) const;
+
+  // @return true, iff the element's whitespace and newline characters have to
+  //         be preserved according to its style or because it's a `<pre>`
+  //         element.
+  static bool IsElementPreformatted(mozilla::dom::Element* aElement);
+
+  // See https://drafts.csswg.org/css-display/#block-level for the definition
+  // of "block-level".
+  static bool IsCssBlockLevelElement(mozilla::dom::Element* aElement);
+
+ private:
+  uint32_t mHeadLevel;  // DoOutput() returns true only while this is 0.
+
+  // Holds the serializer's configuration: the nsIDocumentEncoder flags, the
+  // wrap column and the pref-backed options named on the members below.
+  class Settings {
+   public:
+    enum class HeaderStrategy {
+      kNoIndentation,
+      kIndentIncreasedWithHeaderLevel,
+      kNumberHeadingsAndIndentSlightly
+    };
+
+    // May adapt the flags.
+    //
+    // @param aFlags As defined in nsIDocumentEncoder.idl.
+    void Init(int32_t aFlags, uint32_t aWrapColumn);
+
+    // Pref: converter.html2txt.structs.
+    bool GetStructs() const { return mStructs; }
+
+    // Pref: converter.html2txt.header_strategy.
+    HeaderStrategy GetHeaderStrategy() const { return mHeaderStrategy; }
+
+    // @return The flags, as defined in nsIDocumentEncoder.idl.
+    int32_t GetFlags() const { return mFlags; }
+
+    // @param aFlag As defined in nsIDocumentEncoder.idl. May consist of
+    //              multiple bitwise or'd flags.
+    // @return true, iff mFlags and aFlag have at least one bit in common.
+    bool HasFlag(int32_t aFlag) const { return (mFlags & aFlag) != 0; }
+
+    // Whether the output should include ruby annotations.
+    bool GetWithRubyAnnotation() const { return mWithRubyAnnotation; }
+
+    uint32_t GetWrapColumn() const { return mWrapColumn; }
+
+    // @return true, iff the wrap column is non-zero and HasFlag() holds for
+    //         OutputFormatted | OutputWrap.
+    bool MayWrap() const {
+      if (GetWrapColumn() == 0) {
+        return false;
+      }
+      return HasFlag(nsIDocumentEncoder::OutputFormatted |
+                     nsIDocumentEncoder::OutputWrap);
+    }
+
+    // @return true, iff OutputDisallowLineBreaking is not among the flags.
+    bool MayBreakLines() const {
+      const bool lineBreakingDisallowed =
+          HasFlag(nsIDocumentEncoder::OutputDisallowLineBreaking);
+      return !lineBreakingDisallowed;
+    }
+
+   private:
+    // @param aPrefHeaderStrategy Pref: converter.html2txt.header_strategy.
+    static HeaderStrategy Convert(int32_t aPrefHeaderStrategy);
+
+    // Pref: converter.html2txt.structs.
+    bool mStructs{true};
+
+    // Pref: converter.html2txt.header_strategy.
+    HeaderStrategy mHeaderStrategy{
+        HeaderStrategy::kIndentIncreasedWithHeaderLevel};
+
+    // Flags defined in nsIDocumentEncoder.idl.
+    int32_t mFlags{0};
+
+    // Whether the output should include ruby annotations.
+    bool mWithRubyAnnotation{false};
+
+    // The wrap column is how many fixed-pitch narrow
+    // (https://unicode.org/reports/tr11/) (e.g. Latin) characters
+    // should be allowed on a line. There could be fewer chars if the chars
+    // are wider than Latin chars, or more if the chars are narrower.
+    uint32_t mWrapColumn{0};
+  };
+
+  Settings mSettings;
+
+  // Describes the indentation of a line (see CurrentLine::mIndentation).
+  struct Indentation {
+    // How many space characters are to be inserted; the length of mHeader is
+    // included in this number.
+    int32_t mLength{0};
+
+    // Text that has to be written inside the indent, for instance the bullet
+    // of a bulleted list.
+    nsString mHeader;
+  };
+
+  // State of the line that is currently being built: its content plus the
+  // indentation, cite-quote level and space-stuffing that form its prefix.
+  class CurrentLine {
+   public:
+    void ResetContentAndIndentationHeader();
+
+    // @param aFlags As defined in nsIDocumentEncoder.idl.
+    void MaybeReplaceNbspsInContent(int32_t aFlags);
+
+    void CreateQuotesAndIndent(nsAString& aResult) const;
+
+    // @return true, iff mContent or the indentation's header is non-empty.
+    bool HasContentOrIndentationHeader() const {
+      const bool nothingToWrite =
+          mContent.IsEmpty() && mIndentation.mHeader.IsEmpty();
+      return !nothingToWrite;
+    }
+
+    // @param aUseLineBreaker NOTE(review): presumably selects whether a line
+    //                        breaker is consulted when looking for the wrap
+    //                        position -- confirm in the .cpp.
+    int32_t FindWrapIndexForContent(uint32_t aWrapColumn,
+                                    bool aUseLineBreaker) const;
+
+    // @return Combined width of cite quote level, indentation and, if the
+    //         line is space-stuffed, one extra column. A positive cite quote
+    //         level counts as that level plus one.
+    uint32_t DeterminePrefixWidth() const {
+      // XXX: Should calculate prefixwidth with GetUnicharStringWidth
+      const int32_t quoteWidth = mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1 : 0;
+      return quoteWidth + mIndentation.mLength + uint32_t(mSpaceStuffed);
+    }
+
+    Indentation mIndentation;
+
+    // The number of '>' characters.
+    int32_t mCiteQuoteLevel{0};
+
+    // Whether this line is getting space-stuffed, see
+    // https://datatracker.ietf.org/doc/html/rfc2646#section-4.4
+    bool mSpaceStuffed{false};
+
+    // Excludes indentation and quotes.
+    nsString mContent;
+  };
+
+  CurrentLine mCurrentLine;
+
+  class OutputManager {
+   public:
+    /**
+     *  @param aFlags As defined in nsIDocumentEncoder.idl.
+     *  @param aOutput An empty string.
+     *                 NOTE(review): mOutput is a reference member, so aOutput
+     *                 presumably has to outlive this object -- confirm.
+     */
+    OutputManager(int32_t aFlags, nsAString& aOutput);
+
+    enum class StripTrailingWhitespaces { kMaybe, kNo };
+
+    void Append(const CurrentLine& aCurrentLine,
+                StripTrailingWhitespaces aStripTrailingWhitespaces);
+
+    void AppendLineBreak();
+
+    /**
+     * This empties the current line cache without adding a NEWLINE.
+     * Should not be used if line wrapping is of importance since
+     * this function destroys the cache information.
+     *
+     * It will also write indentation and quotes if we believe ourselves to
+     * be at the start of the line.
+     */
+    void Flush(CurrentLine& aCurrentLine);
+
+    bool IsAtFirstColumn() const { return mAtFirstColumn; }
+
+    uint32_t GetOutputLength() const;
+
+   private:
+    /**
+     * @param aString Last character is expected to not be a line break.
+     */
+    void Append(const nsAString& aString);
+
+    // As defined in nsIDocumentEncoder.idl.
+    const int32_t mFlags;
+
+    nsAString& mOutput;
+
+    bool mAtFirstColumn;  // Returned by IsAtFirstColumn().
+
+    nsString mLineBreak;  // NOTE(review): presumably chosen from mFlags -- confirm
+  };
+
+  // Wrapped in Maybe because OutputManager's only visible constructor needs
+  // the flags and the output string.
+  // NOTE(review): presumably emplaced in Init(), which receives both --
+  // confirm in the .cpp.
+  mozilla::Maybe<OutputManager> mOutputManager;
+
+  // If we've just written out a cite blockquote, we need to remember it
+  // so we don't duplicate spaces before a <pre> (which mail uses to quote
+  // old messages).
+  bool mHasWrittenCiteBlockquote;
+
+  int32_t mFloatingLines;  // Stores the number of lazy line breaks.
+
+  // Treat quoted text as though it's preformatted -- don't wrap it.
+  // Having it on a pref is a temporary measure, See bug 69638.
+  int32_t mSpanLevel;  // NOTE(review): comment above may be stale -- confirm
+
+  int32_t mEmptyLines;  // The number of empty lines before the current
+                        // one: 0 if we are starting a new line and -1
+                        // if we are in a line.
+
+  bool mInWhitespace;
+  bool mPreFormattedMail;  // True if we're dealing with the special DOM
+                           // used by Thunderbird code.
+
+  // While handling a new tag, this variable records whether a line break is
+  // due because of a closing tag. It is set to true while closing tags, so
+  // that opening tags are guaranteed to start with appropriate line
+  // breaks.
+  bool mLineBreakDue;
+
+  bool mPreformattedBlockBoundary;
+
+  int32_t mHeaderCounter[7]; /* For header-numbering:
+                                Number of previous headers of
+                                the same depth and in the same
+                                section.
+                                mHeaderCounter[1] for 

etc. */ + + RefPtr mElement; + + // For handling table rows + AutoTArray mHasWrittenCellsForRow; + + // Values gotten in OpenContainer that is (also) needed in CloseContainer + AutoTArray mIsInCiteBlockquote; + + // The tag stack: the stack of tags we're operating on, so we can nest. + // The stack only ever points to static atoms, so they don't need to be + // refcounted. + const nsAtom** mTagStack; + uint32_t mTagStackIndex; + + // The stack indicating whether the elements we've been operating on are + // CSS preformatted elements, so that we can tell if the text inside them + // should be formatted. + std::stack mPreformatStack; + + // Content in the stack above this index should be ignored: + uint32_t mIgnoreAboveIndex; + + // The stack for ordered lists + AutoTArray mOLStack; + + uint32_t mULCount; + + bool mUseLineBreaker = false; + + // Conveniance constant. It would be nice to have it as a const static + // variable, but that causes issues with OpenBSD and module unloading. + const nsString kSpace; + + // mIgnoredChildNodeLevel is used to tell if current node is an ignorable + // child node. The initial value of mIgnoredChildNodeLevel is 0. When + // serializer enters those specific nodes, mIgnoredChildNodeLevel increases + // and is greater than 0. Otherwise when serializer leaves those nodes, + // mIgnoredChildNodeLevel decreases. + uint32_t mIgnoredChildNodeLevel; +}; + +nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer); + +#endif -- cgit v1.2.3