/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* * nsIContentSerializer implementation that can be used with an * nsIDocumentEncoder to convert a DOM into plaintext in a nice way * (eg for copy/paste as plaintext). */ #ifndef nsPlainTextSerializer_h__ #define nsPlainTextSerializer_h__ #include "mozilla/Attributes.h" #include "mozilla/Maybe.h" #include "nsCOMPtr.h" #include "nsAtom.h" #include "nsCycleCollectionParticipant.h" #include "nsIContentSerializer.h" #include "nsIDocumentEncoder.h" #include "nsString.h" #include "nsTArray.h" #include class nsIContent; namespace mozilla::dom { class DocumentType; class Element; } // namespace mozilla::dom class nsPlainTextSerializer final : public nsIContentSerializer { public: nsPlainTextSerializer(); NS_DECL_CYCLE_COLLECTING_ISUPPORTS NS_DECL_CYCLE_COLLECTION_CLASS(nsPlainTextSerializer) // nsIContentSerializer NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn, const mozilla::Encoding* aEncoding, bool aIsCopying, bool aIsWholeDocument, bool* aNeedsPreformatScanning, nsAString& aOutput) override; NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset, int32_t aEndOffset) override; NS_IMETHOD AppendCDATASection(nsIContent* aCDATASection, int32_t aStartOffset, int32_t aEndOffset) override; NS_IMETHOD AppendProcessingInstruction( mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset, int32_t aEndOffset) override { return NS_OK; } NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment, int32_t aStartOffset, int32_t aEndOffset) override { return NS_OK; } NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override { return NS_OK; } NS_IMETHOD AppendElementStart( mozilla::dom::Element* aElement, mozilla::dom::Element* aOriginalElement) override; NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement, mozilla::dom::Element* aOriginalElement) override; NS_IMETHOD FlushAndFinish() override; NS_IMETHOD Finish() override; NS_IMETHOD GetOutputLength(uint32_t& aLength) const override; NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override; NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) override; NS_IMETHOD ForgetElementForPreformat( mozilla::dom::Element* aElement) override; private: ~nsPlainTextSerializer(); nsresult GetAttributeValue(const nsAtom* aName, nsString& aValueRet) const; void AddToLine(const char16_t* aStringToAdd, int32_t aLength); void MaybeWrapAndOutputCompleteLines(); // @param aSoftLineBreak A soft line break is a space followed by a linebreak // (cf. https://www.ietf.org/rfc/rfc3676.txt, section 4.2). void EndLine(bool aSoftLineBreak, bool aBreakBySpace = false); void EnsureVerticalSpace(int32_t noOfRows); void ConvertToLinesAndOutput(const nsAString& aString); void Write(const nsAString& aString); // @return true, iff the elements' whitespace and newline characters have to // be preserved according to its style or because it's a `
`
  //         element.
  bool IsElementPreformatted() const;
  bool IsInOL() const;
  bool IsInOlOrUl() const;
  bool IsCurrentNodeConverted() const;
  bool MustSuppressLeaf() const;

  /**
   * Returns the local name of the element as an atom if the element is an
   * HTML element and the atom is a static atom. Otherwise, nullptr is returned.
   */
  static nsAtom* GetIdForContent(nsIContent* aContent);
  nsresult DoOpenContainer(const nsAtom* aTag);
  void OpenContainerForOutputFormatted(const nsAtom* aTag);
  nsresult DoCloseContainer(const nsAtom* aTag);
  void CloseContainerForOutputFormatted(const nsAtom* aTag);
  nsresult DoAddLeaf(const nsAtom* aTag);

  void DoAddText();
  // @param aText Ignored if aIsLineBreak is true.
  void DoAddText(bool aIsLineBreak, const nsAString& aText);

  inline bool DoOutput() const { return mHeadLevel == 0; }

  static inline bool IsQuotedLine(const nsAString& aLine) {
    return !aLine.IsEmpty() && aLine.First() == char16_t('>');
  }

  // Stack handling functions
  bool GetLastBool(const nsTArray& aStack);
  void SetLastBool(nsTArray& aStack, bool aValue);
  void PushBool(nsTArray& aStack, bool aValue);
  bool PopBool(nsTArray& aStack);

  bool IsIgnorableRubyAnnotation(const nsAtom* aTag) const;

  // @return true, iff the elements' whitespace and newline characters have to
  //         be preserved according to its style or because it's a `
`
  //         element.
  static bool IsElementPreformatted(mozilla::dom::Element* aElement);

  // https://drafts.csswg.org/css-display/#block-level
  static bool IsCssBlockLevelElement(mozilla::dom::Element* aElement);

 private:
  uint32_t mHeadLevel;

  class Settings {
   public:
    enum class HeaderStrategy {
      kNoIndentation,
      kIndentIncreasedWithHeaderLevel,
      kNumberHeadingsAndIndentSlightly
    };

    // May adapt the flags.
    //
    // @param aFlags As defined in nsIDocumentEncoder.idl.
    void Init(int32_t aFlags, uint32_t aWrapColumn);

    // Pref: converter.html2txt.structs.
    bool GetStructs() const { return mStructs; }

    // Pref: converter.html2txt.header_strategy.
    HeaderStrategy GetHeaderStrategy() const { return mHeaderStrategy; }

    // @return As defined in nsIDocumentEncoder.idl.
    int32_t GetFlags() const { return mFlags; }

    // @param aFlag As defined in nsIDocumentEncoder.idl. May consist of
    // multiple bitwise or'd flags.
    bool HasFlag(int32_t aFlag) const { return mFlags & aFlag; }

    // Whether the output should include ruby annotations.
    bool GetWithRubyAnnotation() const { return mWithRubyAnnotation; }

    uint32_t GetWrapColumn() const { return mWrapColumn; }

    bool MayWrap() const {
      return GetWrapColumn() && HasFlag(nsIDocumentEncoder::OutputFormatted |
                                        nsIDocumentEncoder::OutputWrap);
    }

    bool MayBreakLines() const {
      return !HasFlag(nsIDocumentEncoder::OutputDisallowLineBreaking);
    }

   private:
    // @param aPrefHeaderStrategy Pref: converter.html2txt.header_strategy.
    static HeaderStrategy Convert(int32_t aPrefHeaderStrategy);

    // Pref: converter.html2txt.structs.
    bool mStructs = true;

    // Pref: converter.html2txt.header_strategy.
    HeaderStrategy mHeaderStrategy =
        HeaderStrategy::kIndentIncreasedWithHeaderLevel;

    // Flags defined in nsIDocumentEncoder.idl.
    int32_t mFlags = 0;

    // Whether the output should include ruby annotations.
    bool mWithRubyAnnotation = false;

    // The wrap column is how many fixed-pitch narrow
    // (https://unicode.org/reports/tr11/) (e.g. Latin) characters
    // should be allowed on a line. There could be less chars if the chars
    // are wider than latin chars of more if the chars are more narrow.
    uint32_t mWrapColumn = 0;
  };

  Settings mSettings;

  struct Indentation {
    // The number of space characters to be inserted including the length of
    // mHeader.
    int32_t mLength = 0;

    // The header that has to be written in the indent.
    // That could be, for instance, the bullet in a bulleted list.
    nsString mHeader;
  };

  class CurrentLine {
   public:
    void ResetContentAndIndentationHeader();

    // @param aFlags As defined in nsIDocumentEncoder.idl.
    void MaybeReplaceNbspsInContent(int32_t aFlags);

    void CreateQuotesAndIndent(nsAString& aResult) const;

    bool HasContentOrIndentationHeader() const {
      return !mContent.IsEmpty() || !mIndentation.mHeader.IsEmpty();
    }

    // @param aLineBreaker May be nullptr.
    int32_t FindWrapIndexForContent(uint32_t aWrapColumn,
                                    bool aUseLineBreaker) const;

    // @return Combined width of cite quote level and indentation.
    uint32_t DeterminePrefixWidth() const {
      // XXX: Should calculate prefixwidth with GetUnicharStringWidth
      return (mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1 : 0) +
             mIndentation.mLength;
    }

    Indentation mIndentation;

    // The number of '>' characters.
    int32_t mCiteQuoteLevel = 0;

    // Excludes indentation and quotes.
    nsString mContent;
  };

  CurrentLine mCurrentLine;

  class OutputManager {
   public:
    /**
     *  @param aFlags As defined in nsIDocumentEncoder.idl.
     *  @param aOutput An empty string.
     */
    OutputManager(int32_t aFlags, nsAString& aOutput);

    enum class StripTrailingWhitespaces { kMaybe, kNo };

    void Append(const CurrentLine& aCurrentLine,
                StripTrailingWhitespaces aStripTrailingWhitespaces);

    void AppendLineBreak();

    /**
     * This empties the current line cache without adding a NEWLINE.
     * Should not be used if line wrapping is of importance since
     * this function destroys the cache information.
     *
     * It will also write indentation and quotes if we believe us to be
     * at the start of the line.
     */
    void Flush(CurrentLine& aCurrentLine);

    bool IsAtFirstColumn() const { return mAtFirstColumn; }

    uint32_t GetOutputLength() const;

   private:
    /**
     * @param aString Last character is expected to not be a line break.
     */
    void Append(const nsAString& aString);

    // As defined in nsIDocumentEncoder.idl.
    const int32_t mFlags;

    nsAString& mOutput;

    bool mAtFirstColumn;

    nsString mLineBreak;
  };

  mozilla::Maybe mOutputManager;

  // If we've just written out a cite blockquote, we need to remember it
  // so we don't duplicate spaces before a 
 (which mail uses to quote
  // old messages).
  bool mHasWrittenCiteBlockquote;

  int32_t mFloatingLines;  // To store the number of lazy line breaks

  // Treat quoted text as though it's preformatted -- don't wrap it.
  // Having it on a pref is a temporary measure, See bug 69638.
  int32_t mSpanLevel;

  int32_t mEmptyLines;  // Will be the number of empty lines before
                        // the current. 0 if we are starting a new
                        // line and -1 if we are in a line.

  bool mInWhitespace;
  bool mPreFormattedMail;  // we're dealing with special DOM
                           // used by Thunderbird code.

  // While handling a new tag, this variable should remind if any line break
  // is due because of a closing tag. Setting it to "TRUE" while closing the
  // tags. Hence opening tags are guaranteed to start with appropriate line
  // breaks.
  bool mLineBreakDue;

  bool mPreformattedBlockBoundary;

  int32_t mHeaderCounter[7]; /* For header-numbering:
                                Number of previous headers of
                                the same depth and in the same
                                section.
                                mHeaderCounter[1] for 

etc. */ RefPtr mElement; // For handling table rows AutoTArray mHasWrittenCellsForRow; // Values gotten in OpenContainer that is (also) needed in CloseContainer AutoTArray mIsInCiteBlockquote; // The tag stack: the stack of tags we're operating on, so we can nest. // The stack only ever points to static atoms, so they don't need to be // refcounted. const nsAtom** mTagStack; uint32_t mTagStackIndex; // The stack indicating whether the elements we've been operating on are // CSS preformatted elements, so that we can tell if the text inside them // should be formatted. std::stack mPreformatStack; // Content in the stack above this index should be ignored: uint32_t mIgnoreAboveIndex; // The stack for ordered lists AutoTArray mOLStack; uint32_t mULCount; bool mUseLineBreaker = false; // Conveniance constant. It would be nice to have it as a const static // variable, but that causes issues with OpenBSD and module unloading. const nsString kSpace; // mIgnoredChildNodeLevel is used to tell if current node is an ignorable // child node. The initial value of mIgnoredChildNodeLevel is 0. When // serializer enters those specific nodes, mIgnoredChildNodeLevel increases // and is greater than 0. Otherwise when serializer leaves those nodes, // mIgnoredChildNodeLevel decreases. uint32_t mIgnoredChildNodeLevel; }; nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer); #endif