summaryrefslogtreecommitdiffstats
path: root/dom/serializers/nsPlainTextSerializer.h
blob: 4afd83f1a0052094c5fddc6556046d6ec53bff99 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * nsIContentSerializer implementation that can be used with an
 * nsIDocumentEncoder to convert a DOM into plaintext in a nice way
 * (eg for copy/paste as plaintext).
 */

#ifndef nsPlainTextSerializer_h__
#define nsPlainTextSerializer_h__

#include "mozilla/Maybe.h"
#include "nsAtom.h"
#include "nsCycleCollectionParticipant.h"
#include "nsIContentSerializer.h"
#include "nsIDocumentEncoder.h"
#include "nsString.h"
#include "nsTArray.h"

#include <stack>

class nsIContent;

namespace mozilla::dom {
class DocumentType;
class Element;
}  // namespace mozilla::dom

class nsPlainTextSerializer final : public nsIContentSerializer {
 public:
  nsPlainTextSerializer();

  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
  NS_DECL_CYCLE_COLLECTION_CLASS(nsPlainTextSerializer)

  // nsIContentSerializer
  NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn,
                  const mozilla::Encoding* aEncoding, bool aIsCopying,
                  bool aIsWholeDocument, bool* aNeedsPreformatScanning,
                  nsAString& aOutput) override;

  NS_IMETHOD AppendText(nsIContent* aText, int32_t aStartOffset,
                        int32_t aEndOffset) override;
  NS_IMETHOD AppendCDATASection(nsIContent* aCDATASection, int32_t aStartOffset,
                                int32_t aEndOffset) override;
  // No-op: ignores its arguments and returns NS_OK without appending
  // anything.
  NS_IMETHOD AppendProcessingInstruction(
      mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset,
      int32_t aEndOffset) override {
    return NS_OK;
  }
  // No-op: ignores its arguments and returns NS_OK without appending
  // anything.
  NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment,
                           int32_t aStartOffset, int32_t aEndOffset) override {
    return NS_OK;
  }
  // No-op: ignores aDoctype and returns NS_OK without appending anything.
  NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override {
    return NS_OK;
  }
  NS_IMETHOD AppendElementStart(
      mozilla::dom::Element* aElement,
      mozilla::dom::Element* aOriginalElement) override;
  NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement,
                              mozilla::dom::Element* aOriginalElement) override;

  NS_IMETHOD FlushAndFinish() override;

  NS_IMETHOD Finish() override;

  NS_IMETHOD GetOutputLength(uint32_t& aLength) const override;

  NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override;

  NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) override;
  NS_IMETHOD ForgetElementForPreformat(
      mozilla::dom::Element* aElement) override;

 private:
  ~nsPlainTextSerializer();

  nsresult GetAttributeValue(const nsAtom* aName, nsString& aValueRet) const;
  void AddToLine(const char16_t* aStringToAdd, int32_t aLength);

  void MaybeWrapAndOutputCompleteLines();

  // @param aSoftLineBreak A soft line break is a space followed by a linebreak
  // (cf. https://www.ietf.org/rfc/rfc3676.txt, section 4.2).
  void EndLine(bool aSoftLineBreak, bool aBreakBySpace = false);

  void EnsureVerticalSpace(int32_t noOfRows);

  void ConvertToLinesAndOutput(const nsAString& aString);

  void Write(const nsAString& aString);

  // @return true, iff the elements' whitespace and newline characters have to
  //         be preserved according to its style or because it's a `<pre>`
  //         element.
  bool IsElementPreformatted() const;
  bool IsInOL() const;
  bool IsInOlOrUl() const;
  bool IsCurrentNodeConverted() const;
  bool MustSuppressLeaf() const;

  /**
   * Returns the local name of the element as an atom if the element is an
   * HTML element and the atom is a static atom. Otherwise, nullptr is returned.
   */
  static nsAtom* GetIdForContent(nsIContent* aContent);
  nsresult DoOpenContainer(const nsAtom* aTag);
  void OpenContainerForOutputFormatted(const nsAtom* aTag);
  nsresult DoCloseContainer(const nsAtom* aTag);
  void CloseContainerForOutputFormatted(const nsAtom* aTag);
  nsresult DoAddLeaf(const nsAtom* aTag);

  void DoAddText();
  // @param aText Ignored if aIsLineBreak is true.
  void DoAddText(bool aIsLineBreak, const nsAString& aText);

  // @return true iff mHeadLevel is zero.
  // NOTE(review): presumably mHeadLevel counts the enclosing elements whose
  // content must not be output -- confirm against the implementation file.
  inline bool DoOutput() const { return mHeadLevel == 0; }

  // @return true iff aLine is non-empty and its first character is '>'.
  static inline bool IsQuotedLine(const nsAString& aLine) {
    if (aLine.IsEmpty()) {
      return false;
    }
    return aLine.First() == char16_t('>');
  }

  // Stack handling functions
  bool GetLastBool(const nsTArray<bool>& aStack);
  void SetLastBool(nsTArray<bool>& aStack, bool aValue);
  void PushBool(nsTArray<bool>& aStack, bool aValue);
  bool PopBool(nsTArray<bool>& aStack);

  bool IsIgnorableRubyAnnotation(const nsAtom* aTag) const;

  // @return true, iff the elements' whitespace and newline characters have to
  //         be preserved according to its style or because it's a `<pre>`
  //         element.
  static bool IsElementPreformatted(mozilla::dom::Element* aElement);

  // https://drafts.csswg.org/css-display/#block-level
  static bool IsCssBlockLevelElement(mozilla::dom::Element* aElement);

 private:
  uint32_t mHeadLevel;

  // Holds the serializer's configuration: the encoder flags, the wrap column
  // and the options derived from the converter.html2txt.* prefs.
  class Settings {
   public:
    enum class HeaderStrategy {
      kNoIndentation,
      kIndentIncreasedWithHeaderLevel,
      kNumberHeadingsAndIndentSlightly
    };

    // Initializes the settings. The flags may be adapted in the process.
    //
    // @param aFlags Flags as defined in nsIDocumentEncoder.idl.
    // @param aWrapColumn The requested wrap column (see mWrapColumn).
    void Init(int32_t aFlags, uint32_t aWrapColumn);

    // @return The value of the pref converter.html2txt.structs.
    bool GetStructs() const { return mStructs; }

    // @return The strategy selected by the pref
    //         converter.html2txt.header_strategy.
    HeaderStrategy GetHeaderStrategy() const { return mHeaderStrategy; }

    // @return The flags, as defined in nsIDocumentEncoder.idl.
    int32_t GetFlags() const { return mFlags; }

    // @param aFlag One flag, or several flags combined with bitwise or, as
    //              defined in nsIDocumentEncoder.idl.
    // @return true iff at least one of the bits in aFlag is set in the
    //         stored flags.
    bool HasFlag(int32_t aFlag) const { return (mFlags & aFlag) != 0; }

    // @return Whether ruby annotations should be part of the output.
    bool GetWithRubyAnnotation() const { return mWithRubyAnnotation; }

    // @return The wrap column (see mWrapColumn).
    uint32_t GetWrapColumn() const { return mWrapColumn; }

    // @return true iff the wrap column is non-zero and at least one of
    //         OutputFormatted and OutputWrap is set.
    bool MayWrap() const {
      if (GetWrapColumn() == 0) {
        return false;
      }
      return HasFlag(nsIDocumentEncoder::OutputFormatted |
                     nsIDocumentEncoder::OutputWrap);
    }

    // @return true iff OutputDisallowLineBreaking is not set.
    bool MayBreakLines() const {
      const bool lineBreakingDisallowed =
          HasFlag(nsIDocumentEncoder::OutputDisallowLineBreaking);
      return !lineBreakingDisallowed;
    }

   private:
    // Maps the integer pref value to a HeaderStrategy.
    //
    // @param aPrefHeaderStrategy Pref: converter.html2txt.header_strategy.
    static HeaderStrategy Convert(int32_t aPrefHeaderStrategy);

    // Mirrors the pref converter.html2txt.structs.
    bool mStructs = true;

    // Mirrors the pref converter.html2txt.header_strategy.
    HeaderStrategy mHeaderStrategy =
        HeaderStrategy::kIndentIncreasedWithHeaderLevel;

    // The flags, as defined in nsIDocumentEncoder.idl.
    int32_t mFlags = 0;

    // Whether ruby annotations should be part of the output.
    bool mWithRubyAnnotation = false;

    // The wrap column is the number of fixed-pitch narrow
    // (https://unicode.org/reports/tr11/) characters (e.g. Latin ones) that
    // are allowed on a line. A line may hold fewer characters if they are
    // wider than Latin characters, or more if they are narrower.
    uint32_t mWrapColumn = 0;
  };

  Settings mSettings;

  // Describes the indentation of a line: its total width and an optional
  // header string that is written inside that width.
  struct Indentation {
    // The number of space characters to be inserted including the length of
    // mHeader.
    int32_t mLength = 0;

    // The header that has to be written in the indent.
    // That could be, for instance, the bullet in a bulleted list.
    nsString mHeader;
  };

  // The line that is currently being assembled: its content together with
  // the quote level, indentation and space-stuffing state of its prefix.
  class CurrentLine {
   public:
    void ResetContentAndIndentationHeader();

    // @param aFlags Flags as defined in nsIDocumentEncoder.idl.
    void MaybeReplaceNbspsInContent(int32_t aFlags);

    void CreateQuotesAndIndent(nsAString& aResult) const;

    // @return true unless both the content and the indentation header are
    //         empty.
    bool HasContentOrIndentationHeader() const {
      return !(mContent.IsEmpty() && mIndentation.mHeader.IsEmpty());
    }

    // @param aWrapColumn The wrap column to find the wrap index for.
    // @param aUseLineBreaker NOTE(review): presumably selects whether a line
    //                        breaker is consulted to find the wrap position
    //                        -- confirm against the implementation file.
    int32_t FindWrapIndexForContent(uint32_t aWrapColumn,
                                    bool aUseLineBreaker) const;

    // @return Combined width of the cite quote prefix and the indentation,
    //         plus one if the line is space-stuffed.
    uint32_t DeterminePrefixWidth() const {
      // XXX: Should calculate prefixwidth with GetUnicharStringWidth
      // A quoted line contributes its quote level plus one extra column.
      const int32_t quoteWidth = mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1 : 0;
      return quoteWidth + mIndentation.mLength + uint32_t(mSpaceStuffed);
    }

    Indentation mIndentation;

    // How many '>' characters prefix the line.
    int32_t mCiteQuoteLevel = 0;

    // Whether this line is space-stuffed, see
    // https://datatracker.ietf.org/doc/html/rfc2646#section-4.4
    bool mSpaceStuffed = false;

    // The line's text, without indentation and quotes.
    nsString mContent;
  };

  CurrentLine mCurrentLine;

  // Appends lines and line breaks to the output string and tracks whether
  // the output is currently at the first column of a line.
  class OutputManager {
   public:
    /**
     *  @param aFlags As defined in nsIDocumentEncoder.idl.
     *  @param aOutput An empty string.
     */
    OutputManager(int32_t aFlags, nsAString& aOutput);

    enum class StripTrailingWhitespaces { kMaybe, kNo };

    /**
     * @param aCurrentLine The line to append.
     * @param aStripTrailingWhitespaces Whether trailing whitespace may be
     *        stripped (kMaybe) or must be kept (kNo).
     */
    void Append(const CurrentLine& aCurrentLine,
                StripTrailingWhitespaces aStripTrailingWhitespaces);

    void AppendLineBreak();

    /**
     * This empties the current line cache without adding a NEWLINE.
     * Should not be used if line wrapping is of importance since
     * this function destroys the cache information.
     *
     * It will also write indentation and quotes if we believe we are
     * at the start of the line.
     */
    void Flush(CurrentLine& aCurrentLine);

    // @return The current value of mAtFirstColumn.
    bool IsAtFirstColumn() const { return mAtFirstColumn; }

    uint32_t GetOutputLength() const;

   private:
    /**
     * @param aString Last character is expected to not be a line break.
     */
    void Append(const nsAString& aString);

    // As defined in nsIDocumentEncoder.idl.
    const int32_t mFlags;

    nsAString& mOutput;

    bool mAtFirstColumn;

    nsString mLineBreak;
  };

  mozilla::Maybe<OutputManager> mOutputManager;

  // If we've just written out a cite blockquote, we need to remember it
  // so we don't duplicate spaces before a <pre wrap> (which mail uses to quote
  // old messages).
  bool mHasWrittenCiteBlockquote;

  int32_t mFloatingLines;  // To store the number of lazy line breaks

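  // Treat quoted text as though it's preformatted -- don't wrap it.
  // Having it on a pref is a temporary measure, see bug 69638.
  // NOTE(review): this comment does not appear to describe mSpanLevel below
  // (a level counter, not a pref); it may be a leftover from a removed
  // member -- confirm, then move or remove it.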
  int32_t mSpanLevel;

  int32_t mEmptyLines;  // Will be the number of empty lines before
                        // the current. 0 if we are starting a new
                        // line and -1 if we are in a line.

  bool mInWhitespace;
  bool mPreFormattedMail;  // we're dealing with special DOM
                           // used by Thunderbird code.

  // While handling a new tag, this variable records whether a line break is
  // due because of a closing tag. It is set to true while closing tags, so
  // that opening tags are guaranteed to start with the appropriate line
  // breaks.
  bool mLineBreakDue;

  bool mPreformattedBlockBoundary;

  int32_t mHeaderCounter[7]; /* For header-numbering:
                                Number of previous headers of
                                the same depth and in the same
                                section.
                                mHeaderCounter[1] for <h1> etc. */

  RefPtr<mozilla::dom::Element> mElement;

  // For handling table rows
  AutoTArray<bool, 8> mHasWrittenCellsForRow;

  // Values gotten in OpenContainer that is (also) needed in CloseContainer
  AutoTArray<bool, 8> mIsInCiteBlockquote;

  // The tag stack: the stack of tags we're operating on, so we can nest.
  // The stack only ever points to static atoms, so they don't need to be
  // refcounted.
  const nsAtom** mTagStack;
  uint32_t mTagStackIndex;

  // The stack indicating whether the elements we've been operating on are
  // CSS preformatted elements, so that we can tell if the text inside them
  // should be formatted.
  std::stack<bool> mPreformatStack;

  // Content in the stack above this index should be ignored:
  uint32_t mIgnoreAboveIndex;

  // The stack for ordered lists
  AutoTArray<int32_t, 100> mOLStack;

  uint32_t mULCount;

  bool mUseLineBreaker = false;

  // Convenience constant. It would be nice to have it as a const static
  // variable, but that causes issues with OpenBSD and module unloading.
  const nsString kSpace;

  // mIgnoredChildNodeLevel is used to tell if current node is an ignorable
  // child node. The initial value of mIgnoredChildNodeLevel is 0. When
  // serializer enters those specific nodes, mIgnoredChildNodeLevel increases
  // and is greater than 0. Otherwise when serializer leaves those nodes,
  // mIgnoredChildNodeLevel decreases.
  uint32_t mIgnoredChildNodeLevel;
};

nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer);

#endif