diff options
Diffstat (limited to 'dom/serializers/nsPlainTextSerializer.cpp')
-rw-r--r-- | dom/serializers/nsPlainTextSerializer.cpp | 1826 |
1 files changed, 1826 insertions, 0 deletions
diff --git a/dom/serializers/nsPlainTextSerializer.cpp b/dom/serializers/nsPlainTextSerializer.cpp new file mode 100644 index 0000000000..952ed39942 --- /dev/null +++ b/dom/serializers/nsPlainTextSerializer.cpp @@ -0,0 +1,1826 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * nsIContentSerializer implementation that can be used with an + * nsIDocumentEncoder to convert a DOM into plaintext in a nice way + * (eg for copy/paste as plaintext). + */ + +#include "nsPlainTextSerializer.h" + +#include <limits> + +#include "nsPrintfCString.h" +#include "nsDebug.h" +#include "nsGkAtoms.h" +#include "nsNameSpaceManager.h" +#include "nsTextFragment.h" +#include "nsContentUtils.h" +#include "nsReadableUtils.h" +#include "nsUnicharUtils.h" +#include "nsCRT.h" +#include "mozilla/Casting.h" +#include "mozilla/TextEditor.h" +#include "mozilla/dom/CharacterData.h" +#include "mozilla/dom/Element.h" +#include "mozilla/dom/HTMLBRElement.h" +#include "mozilla/dom/Text.h" +#include "mozilla/intl/Segmenter.h" +#include "mozilla/intl/UnicodeProperties.h" +#include "nsUnicodeProperties.h" +#include "mozilla/Span.h" +#include "mozilla/Preferences.h" +#include "mozilla/StaticPrefs_converter.h" +#include "nsComputedDOMStyle.h" + +namespace mozilla { +class Encoding; +} + +using namespace mozilla; +using namespace mozilla::dom; + +#define PREF_STRUCTS "converter.html2txt.structs" +#define PREF_HEADER_STRATEGY "converter.html2txt.header_strategy" + +static const int32_t kTabSize = 4; +static const int32_t kIndentSizeHeaders = + 2; /* Indention of h1, if + mHeaderStrategy = kIndentIncreasedWithHeaderLevel + or = kNumberHeadingsAndIndentSlightly. Indention of + other headers is derived from that. 
*/ +static const int32_t kIndentIncrementHeaders = + 2; /* If mHeaderStrategy = kIndentIncreasedWithHeaderLevel, + indent h(x+1) this many + columns more than h(x) */ +static const int32_t kIndentSizeList = kTabSize; +// Indention of non-first lines of ul and ol +static const int32_t kIndentSizeDD = kTabSize; // Indention of <dd> +static const char16_t kNBSP = 160; +static const char16_t kSPACE = ' '; + +static int32_t HeaderLevel(const nsAtom* aTag); +static int32_t GetUnicharWidth(char32_t ucs); +static int32_t GetUnicharStringWidth(Span<const char16_t> aString); + +// Someday may want to make this non-const: +static const uint32_t TagStackSize = 500; + +NS_IMPL_CYCLE_COLLECTING_ADDREF(nsPlainTextSerializer) +NS_IMPL_CYCLE_COLLECTING_RELEASE(nsPlainTextSerializer) + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsPlainTextSerializer) + NS_INTERFACE_MAP_ENTRY(nsIContentSerializer) + NS_INTERFACE_MAP_ENTRY(nsISupports) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTION(nsPlainTextSerializer, mElement) + +nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer) { + RefPtr<nsPlainTextSerializer> it = new nsPlainTextSerializer(); + it.forget(aSerializer); + return NS_OK; +} + +// @param aFlags As defined in nsIDocumentEncoder.idl. 
// Stores in aLineBreak the line-break sequence selected by the
// OutputCRLineBreak / OutputLFLineBreak bits of aFlags: CRLF when both are
// set, CR or LF when only one is set, and NS_ULINEBREAK when neither is set.
static void DetermineLineBreak(const int32_t aFlags, nsAString& aLineBreak) {
  // Set the line break character:
  if ((aFlags & nsIDocumentEncoder::OutputCRLineBreak) &&
      (aFlags & nsIDocumentEncoder::OutputLFLineBreak)) {
    // Windows
    aLineBreak.AssignLiteral(u"\r\n");
  } else if (aFlags & nsIDocumentEncoder::OutputCRLineBreak) {
    // Mac
    aLineBreak.AssignLiteral(u"\r");
  } else if (aFlags & nsIDocumentEncoder::OutputLFLineBreak) {
    // Unix/DOM
    aLineBreak.AssignLiteral(u"\n");
  } else {
    // Platform/default
    aLineBreak.AssignLiteral(NS_ULINEBREAK);
  }
}

// Replaces every NBSP (U+00A0) in mContent with an ASCII space, unless
// aFlags contains OutputPersistNBSP, in which case mContent is left alone.
void nsPlainTextSerializer::CurrentLine::MaybeReplaceNbspsInContent(
    const int32_t aFlags) {
  if (!(aFlags & nsIDocumentEncoder::OutputPersistNBSP)) {
    // First, replace all nbsp characters with spaces,
    // which the unicode encoder won't do for us.
    mContent.ReplaceChar(kNBSP, kSPACE);
  }
}

// Empties both the line content and the indentation header (the list
// bullet / number text). Other indentation state is not touched here.
void nsPlainTextSerializer::CurrentLine::ResetContentAndIndentationHeader() {
  mContent.Truncate();
  mIndentation.mHeader.Truncate();
}

// Returns the index into mContent at which the line should be wrapped so
// that prefix plus content fits into aWrapColumn. mContent must not be empty.
//
// With aUseLineBreaker, the result is the last line-break opportunity whose
// accumulated display width (plus the prefix width) does not exceed
// aWrapColumn; it is 0 when even the first opportunity is too wide.
//
// Without aUseLineBreaker, only ASCII spaces are considered: first searching
// backward from the wrap column, then forward. If no space exists at all the
// forward search ends at mContent.Length(), and that value is returned.
int32_t nsPlainTextSerializer::CurrentLine::FindWrapIndexForContent(
    const uint32_t aWrapColumn, bool aUseLineBreaker) const {
  MOZ_ASSERT(!mContent.IsEmpty());

  const uint32_t prefixwidth = DeterminePrefixWidth();
  int32_t goodSpace = 0;

  if (aUseLineBreaker) {
    // We advance one line break point at a time from the beginning of the
    // mContent until we find a width less than or equal to wrap column.
    uint32_t width = 0;
    intl::LineBreakIteratorUtf16 lineBreakIter(mContent);
    while (const Maybe<uint32_t> nextGoodSpace = lineBreakIter.Next()) {
      // Only the segment between the previous accepted break (goodSpace) and
      // the candidate is measured; `width` accumulates the earlier segments.
      width += GetUnicharStringWidth(Span<const char16_t>(
          mContent.get() + goodSpace, *nextGoodSpace - goodSpace));
      if (prefixwidth + width > aWrapColumn) {
        // The next break point makes the width exceeding the wrap column, so
        // goodSpace is what we want.
        break;
      }
      goodSpace = AssertedCast<int32_t>(*nextGoodSpace);
    }

    return goodSpace;
  }

  // In this case we don't want strings, especially CJK-ones, to be split. See
  // bug 333064 for more information. We break only at ASCII spaces.
  if (aWrapColumn >= prefixwidth) {
    // Search backward from the adjusted wrap column or from the text end.
    goodSpace =
        std::min<int32_t>(aWrapColumn - prefixwidth, mContent.Length() - 1);
    while (goodSpace >= 0) {
      if (nsCRT::IsAsciiSpace(mContent.CharAt(goodSpace))) {
        return goodSpace;
      }
      goodSpace--;
    }
  }

  // Search forward from the adjusted wrap column.
  // When the prefix alone is wider than the wrap column, start at index 1.
  goodSpace = (prefixwidth > aWrapColumn) ? 1 : aWrapColumn - prefixwidth;
  const int32_t contentLength = mContent.Length();
  while (goodSpace < contentLength &&
         !nsCRT::IsAsciiSpace(mContent.CharAt(goodSpace))) {
    goodSpace++;
  }

  return goodSpace;
}

// Binds the manager to aOutput (asserted to be empty), remembers aFlags and
// derives the line-break sequence from them. Output starts at first column.
nsPlainTextSerializer::OutputManager::OutputManager(const int32_t aFlags,
                                                    nsAString& aOutput)
    : mFlags{aFlags}, mOutput{aOutput}, mAtFirstColumn{true} {
  MOZ_ASSERT(aOutput.IsEmpty());

  DetermineLineBreak(mFlags, mLineBreak);
}

// Appends aCurrentLine to the output. When the output is at the first column
// the quote/indentation prefix built by CreateQuotesAndIndent() is written
// first. With StripTrailingWhitespaces::kMaybe and an empty line content,
// that prefix is trimmed with Trim(" ", false, true, false) before writing
// (presumably trailing spaces only, going by the local's name).
void nsPlainTextSerializer::OutputManager::Append(
    const CurrentLine& aCurrentLine,
    const StripTrailingWhitespaces aStripTrailingWhitespaces) {
  if (IsAtFirstColumn()) {
    nsAutoString quotesAndIndent;
    aCurrentLine.CreateQuotesAndIndent(quotesAndIndent);

    if ((aStripTrailingWhitespaces == StripTrailingWhitespaces::kMaybe)) {
      const bool stripTrailingSpaces = aCurrentLine.mContent.IsEmpty();
      if (stripTrailingSpaces) {
        quotesAndIndent.Trim(" ", false, true, false);
      }
    }

    Append(quotesAndIndent);
  }

  Append(aCurrentLine.mContent);
}

// Appends aString to the output. A non-empty string moves the output away
// from the first column; an empty string is a no-op and leaves the
// first-column state unchanged.
void nsPlainTextSerializer::OutputManager::Append(const nsAString& aString) {
  if (!aString.IsEmpty()) {
    mOutput.Append(aString);
    mAtFirstColumn = false;
  }
}

// Appends the configured line-break sequence and marks the output as being
// at the first column again.
void nsPlainTextSerializer::OutputManager::AppendLineBreak() {
  mOutput.Append(mLineBreak);
  mAtFirstColumn = true;
}

// Returns the current length of the output string.
uint32_t nsPlainTextSerializer::OutputManager::GetOutputLength() const {
  return mOutput.Length();
}

// Puts the serializer into its initial state and allocates the tag stack
// (TagStackSize entries), which the destructor releases.
nsPlainTextSerializer::nsPlainTextSerializer()
    : mFloatingLines(-1),
      mLineBreakDue(false),
      kSpace(u" "_ns)  // Init of "constant"
{
  mHeadLevel = 0;
  mHasWrittenCiteBlockquote = false;
  mSpanLevel = 0;
  // Indices 0..6 are cleared; the heading numbering code in
  // OpenContainerForOutputFormatted reads indices 1..level.
  for (int32_t i = 0; i <= 6; i++) {
    mHeaderCounter[i] = 0;
  }

  // Flow
  mEmptyLines = 1;  // The start of the document is an "empty line" in itself,
  mInWhitespace = false;
  mPreFormattedMail = false;

  mPreformattedBlockBoundary = false;

  // initialize the tag stack to zero:
  // The stack only ever contains pointers to static atoms, so they don't
  // need refcounting.
  mTagStack = new const nsAtom*[TagStackSize];
  mTagStackIndex = 0;
  // kNotFound (cast to unsigned) is the "nothing is being ignored" sentinel.
  mIgnoreAboveIndex = (uint32_t)kNotFound;

  mULCount = 0;

  mIgnoredChildNodeLevel = 0;
}

// Frees the tag stack and warns if <head> open/close calls were unbalanced.
nsPlainTextSerializer::~nsPlainTextSerializer() {
  delete[] mTagStack;
  NS_WARNING_ASSERTION(mHeadLevel == 0, "Wrong head level!");
}

// Maps the integer value of the header-strategy pref to a HeaderStrategy:
// 0 -> kNoIndentation, 1 -> kIndentIncreasedWithHeaderLevel,
// 2 -> kNumberHeadingsAndIndentSlightly. Any other value emits a warning and
// yields kIndentIncreasedWithHeaderLevel.
nsPlainTextSerializer::Settings::HeaderStrategy
nsPlainTextSerializer::Settings::Convert(const int32_t aPrefHeaderStrategy) {
  HeaderStrategy result{HeaderStrategy::kIndentIncreasedWithHeaderLevel};

  switch (aPrefHeaderStrategy) {
    case 0: {
      result = HeaderStrategy::kNoIndentation;
      break;
    }
    case 1: {
      result = HeaderStrategy::kIndentIncreasedWithHeaderLevel;
      break;
    }
    case 2: {
      result = HeaderStrategy::kNumberHeadingsAndIndentSlightly;
      break;
    }
    default: {
      NS_WARNING(
          nsPrintfCString("Header strategy pref contains undefined value: %i",
                          aPrefHeaderStrategy)
              .get());
    }
  }

  return result;
}

// Fallback passed to Preferences::GetInt for PREF_HEADER_STRATEGY.
const int32_t kDefaultHeaderStrategy = 1;

// Stores the encoder flags and wrap column. The "structs" and header-strategy
// prefs are only consulted when OutputFormatted is set. OutputNoFramesContent
// is always cleared from the stored flags (see the XXX note below).
void nsPlainTextSerializer::Settings::Init(const int32_t aFlags,
                                           const uint32_t aWrapColumn) {
  mFlags = aFlags;

  if (mFlags & nsIDocumentEncoder::OutputFormatted) {
    // Get some prefs that controls how we do formatted output
    mStructs = Preferences::GetBool(PREF_STRUCTS, mStructs);

    int32_t headerStrategy =
        Preferences::GetInt(PREF_HEADER_STRATEGY, kDefaultHeaderStrategy);
    mHeaderStrategy = Convert(headerStrategy);
  }

  // Ruby annotations are kept when either the pref or the encoder flag asks
  // for them.
  mWithRubyAnnotation = StaticPrefs::converter_html2txt_always_include_ruby() ||
                        (mFlags & nsIDocumentEncoder::OutputRubyAnnotation);

  // XXX We should let the caller decide whether to do this or not
  mFlags &= ~nsIDocumentEncoder::OutputNoFramesContent;

  mWrapColumn = aWrapColumn;
}

// Prepares the serializer for a new run writing into aOutput.
// Always sets *aNeedsPreformatScanning to true and returns NS_OK.
// aEncoding, aIsCopying and aIsWholeDocument are not used in this function.
NS_IMETHODIMP
nsPlainTextSerializer::Init(const uint32_t aFlags, uint32_t aWrapColumn,
                            const Encoding* aEncoding, bool aIsCopying,
                            bool aIsWholeDocument,
                            bool* aNeedsPreformatScanning, nsAString& aOutput) {
#ifdef DEBUG
  // Check if the major control flags are set correctly.
  if (aFlags & nsIDocumentEncoder::OutputFormatFlowed) {
    NS_ASSERTION(aFlags & nsIDocumentEncoder::OutputFormatted,
                 "If you want format=flowed, you must combine it with "
                 "nsIDocumentEncoder::OutputFormatted");
  }

  if (aFlags & nsIDocumentEncoder::OutputFormatted) {
    NS_ASSERTION(
        !(aFlags & nsIDocumentEncoder::OutputPreformatted),
        "Can't do formatted and preformatted output at the same time!");
  }
#endif
  // OutputFormatDelSp is only meaningful together with OutputFormatFlowed.
  MOZ_ASSERT(!(aFlags & nsIDocumentEncoder::OutputFormatDelSp) ||
             (aFlags & nsIDocumentEncoder::OutputFormatFlowed));

  *aNeedsPreformatScanning = true;
  mSettings.Init(aFlags, aWrapColumn);
  // Use the flags as adjusted by Settings::Init, not the raw aFlags.
  mOutputManager.emplace(mSettings.GetFlags(), aOutput);

  mUseLineBreaker = mSettings.MayWrap() && mSettings.MayBreakLines();

  mLineBreakDue = false;
  mFloatingLines = -1;

  mPreformattedBlockBoundary = false;

  MOZ_ASSERT(mOLStack.IsEmpty());

  return NS_OK;
}

// Returns the top of the bool stack, or false when the stack is empty.
bool nsPlainTextSerializer::GetLastBool(const nsTArray<bool>& aStack) {
  uint32_t size = aStack.Length();
  if (size == 0) {
    return false;
  }
  return aStack.ElementAt(size - 1);
}

// Overwrites the top of the bool stack with aValue. Calling this on an empty
// stack is reported with NS_ERROR and otherwise ignored.
void nsPlainTextSerializer::SetLastBool(nsTArray<bool>& aStack, bool aValue) {
  uint32_t size = aStack.Length();
  if (size > 0) {
    aStack.ElementAt(size - 1) = aValue;
  } else {
    NS_ERROR("There is no \"Last\" value");
  }
}

// Pushes aValue onto the bool stack.
void nsPlainTextSerializer::PushBool(nsTArray<bool>& aStack, bool aValue) {
  aStack.AppendElement(bool(aValue));
}

// Pops and returns the top of the bool stack, or returns false (without
// modifying anything) when the stack is empty.
bool nsPlainTextSerializer::PopBool(nsTArray<bool>& aStack) {
  return aStack.Length() ? aStack.PopLastElement() : false;
}

// Returns true when aTag is <rp>, <rt> or <rtc> and the settings say that
// ruby annotations are not to be included in the output.
bool nsPlainTextSerializer::IsIgnorableRubyAnnotation(
    const nsAtom* aTag) const {
  if (mSettings.GetWithRubyAnnotation()) {
    return false;
  }

  return aTag == nsGkAtoms::rp || aTag == nsGkAtoms::rt ||
         aTag == nsGkAtoms::rtc;
}

// Return true if aElement has 'display:none' or if we just don't know.
// ("Don't know" means no computed style could be obtained; the style is
// queried without flushing.)
static bool IsDisplayNone(Element* aElement) {
  RefPtr<const ComputedStyle> computedStyle =
      nsComputedDOMStyle::GetComputedStyleNoFlush(aElement);
  return !computedStyle ||
         computedStyle->StyleDisplay()->mDisplay == StyleDisplay::None;
}

// Returns true for HTML <script> / <style> elements that are display:none
// (or whose display value cannot be determined, see IsDisplayNone).
static bool IsIgnorableScriptOrStyle(Element* aElement) {
  return aElement->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style) &&
         IsDisplayNone(aElement);
}

// Serializes the characters [aStartOffset, aEndOffset) of the text node
// aText; aEndOffset == -1 means "to the end of the fragment", and larger end
// offsets are clamped to the fragment length.
//
// Returns NS_OK without output while content is being ignored
// (mIgnoreAboveIndex is set) or when the range is empty,
// NS_ERROR_INVALID_ARG for a negative start offset, and NS_ERROR_FAILURE when
// the node has no text fragment.
NS_IMETHODIMP
nsPlainTextSerializer::AppendText(nsIContent* aText, int32_t aStartOffset,
                                  int32_t aEndOffset) {
  if (mIgnoreAboveIndex != (uint32_t)kNotFound) {
    return NS_OK;
  }

  NS_ASSERTION(aStartOffset >= 0, "Negative start offset for text fragment!");
  if (aStartOffset < 0) return NS_ERROR_INVALID_ARG;

  NS_ENSURE_ARG(aText);

  nsresult rv = NS_OK;

  nsIContent* content = aText;
  const nsTextFragment* frag;
  if (!content || !(frag = content->GetText())) {
    return NS_ERROR_FAILURE;
  }

  int32_t fragLength = frag->GetLength();
  int32_t endoffset =
      (aEndOffset == -1) ? fragLength : std::min(aEndOffset, fragLength);
  NS_ASSERTION(aStartOffset <= endoffset,
               "A start offset is beyond the end of the text fragment!");

  int32_t length = endoffset - aStartOffset;
  if (length <= 0) {
    return NS_OK;
  }

  // Copy the requested range into a UTF-16 string, whichever way the
  // fragment stores its characters (2-byte or 1-byte).
  nsAutoString textstr;
  if (frag->Is2b()) {
    textstr.Assign(frag->Get2b() + aStartOffset, length);
  } else {
    // AssignASCII is for 7-bit character only, so don't use it
    const char* data = frag->Get1b();
    CopyASCIItoUTF16(Substring(data + aStartOffset, data + endoffset), textstr);
  }

  // Mask the text if the text node is in a password field.
  if (content->HasFlag(NS_MAYBE_MASKED)) {
    TextEditor::MaskString(textstr, *content->AsText(), 0, aStartOffset);
  }

  // We have to split the string across newlines
  // to match parser behavior
  // Each '\n' and each '\r' is reported as its own line break, so a "\r\n"
  // pair results in two DoAddText() line-break calls.
  int32_t start = 0;
  int32_t offset = textstr.FindCharInSet(u"\n\r");
  while (offset != kNotFound) {
    if (offset > start) {
      // Pass in the line
      DoAddText(false, Substring(textstr, start, offset - start));
    }

    // Pass in a newline
    DoAddText();

    start = offset + 1;
    offset = textstr.FindCharInSet(u"\n\r", start);
  }

  // Consume the last bit of the string if there's any left
  if (start < length) {
    if (start) {
      DoAddText(false, Substring(textstr, start, length - start));
    } else {
      // No line break was found at all: pass the whole string through.
      DoAddText(false, textstr);
    }
  }

  return rv;
}

// CDATA sections are serialized exactly like ordinary text.
NS_IMETHODIMP
nsPlainTextSerializer::AppendCDATASection(nsIContent* aCDATASection,
                                          int32_t aStartOffset,
                                          int32_t aEndOffset) {
  return AppendText(aCDATASection, aStartOffset, aEndOffset);
}

// Pushes onto mPreformatStack whether aElement is preformatted. Must be
// balanced by a later ForgetElementForPreformat() call.
NS_IMETHODIMP
nsPlainTextSerializer::ScanElementForPreformat(Element* aElement) {
  mPreformatStack.push(IsElementPreformatted(aElement));
  return NS_OK;
}

// Pops the entry pushed by ScanElementForPreformat(). Popping an empty stack
// is a release-mode assertion failure. aElement itself is not inspected.
NS_IMETHODIMP
nsPlainTextSerializer::ForgetElementForPreformat(Element* aElement) {
  MOZ_RELEASE_ASSERT(!mPreformatStack.empty(),
                     "Tried to pop without previous push.");
  mPreformatStack.pop();
  return NS_OK;
}

// Handles the start of aElement: void HTML elements are treated as leaves
// (DoAddLeaf), everything else as a container (DoOpenContainer). mElement is
// only set for the duration of that call. aOriginalElement is not used here.
NS_IMETHODIMP
nsPlainTextSerializer::AppendElementStart(Element* aElement,
                                          Element* aOriginalElement) {
  NS_ENSURE_ARG(aElement);

  mElement = aElement;

  nsresult rv;
  nsAtom* id = GetIdForContent(mElement);

  bool isContainer = !FragmentOrElement::IsHTMLVoid(id);

  if (isContainer) {
    rv = DoOpenContainer(id);
  } else {
    rv = DoAddLeaf(id);
  }

  mElement = nullptr;

  // Track <head> nesting; the counter is bumped after the element has been
  // processed above.
  if (id == nsGkAtoms::head) {
    ++mHeadLevel;
  }

  return rv;
}

// Handles the end of aElement. Only containers need closing; for void
// elements this is a no-op apart from the <head> bookkeeping.
// aOriginalElement is not used here.
NS_IMETHODIMP
nsPlainTextSerializer::AppendElementEnd(Element* aElement,
                                        Element* aOriginalElement) {
  NS_ENSURE_ARG(aElement);

  mElement = aElement;

  nsresult rv;
  nsAtom* id = GetIdForContent(mElement);

  bool isContainer = !FragmentOrElement::IsHTMLVoid(id);

  rv = NS_OK;
  if (isContainer) {
    rv = DoCloseContainer(id);
  }

  mElement = nullptr;

  if (id == nsGkAtoms::head) {
    NS_ASSERTION(mHeadLevel != 0, "mHeadLevel being decremented below 0");
    --mHeadLevel;
  }

  return rv;
}

// Flushes the pending current line to the output, then finishes.
// Requires Init() to have been called (mOutputManager must exist).
NS_IMETHODIMP
nsPlainTextSerializer::FlushAndFinish() {
  MOZ_ASSERT(mOutputManager);

  mOutputManager->Flush(mCurrentLine);
  return Finish();
}

// Destroys the output manager. The current line is NOT flushed here; use
// FlushAndFinish() for that.
NS_IMETHODIMP
nsPlainTextSerializer::Finish() {
  mOutputManager.reset();

  return NS_OK;
}

// Reports the number of code units written to the output so far.
// Requires Init() to have been called (mOutputManager must exist).
NS_IMETHODIMP
nsPlainTextSerializer::GetOutputLength(uint32_t& aLength) const {
  MOZ_ASSERT(mOutputManager);

  aLength = mOutputManager->GetOutputLength();

  return NS_OK;
}

// Nothing is emitted at the start of a document.
NS_IMETHODIMP
nsPlainTextSerializer::AppendDocumentStart(Document* aDocument) {
  return NS_OK;
}

// Pushed onto mOLStack for <ol> when the output is not formatted, so that
// the stack stays balanced with the pop in DoCloseContainer.
constexpr int32_t kOlStackDummyValue = 0;

// Handles the opening of the container element aTag (mElement is the element
// being opened). In order, this:
//  - starts ignoring content of ignorable ruby annotations and of hidden
//    <script>/<style> elements,
//  - schedules a line break after a preformatted block (clipboard copy only),
//  - returns early for OutputRaw,
//  - pushes aTag on the tag stack,
//  - starts ignoring <noscript>/<iframe>/<noframes> content per the flags,
//  - detects preformatted mail on <body>,
//  - and finally emits the per-tag vertical space, indentation, bullets and
//    quoting.
// Always returns NS_OK.
nsresult nsPlainTextSerializer::DoOpenContainer(const nsAtom* aTag) {
  if (IsIgnorableRubyAnnotation(aTag)) {
    // Ignorable ruby annotation shouldn't be replaced by a placeholder
    // character, neither any of its descendants.
    mIgnoredChildNodeLevel++;
    return NS_OK;
  }
  if (IsIgnorableScriptOrStyle(mElement)) {
    mIgnoredChildNodeLevel++;
    return NS_OK;
  }

  if (mSettings.HasFlag(nsIDocumentEncoder::OutputForPlainTextClipboardCopy)) {
    if (mPreformattedBlockBoundary && DoOutput()) {
      // Should always end a line, but get no more whitespace
      if (mFloatingLines < 0) mFloatingLines = 0;
      mLineBreakDue = true;
    }
    mPreformattedBlockBoundary = false;
  }

  if (mSettings.HasFlag(nsIDocumentEncoder::OutputRaw)) {
    // Raw means raw.  Don't even think about doing anything fancy
    // here like indenting, adding line breaks or any other
    // characters such as list item bullets, quote characters
    // around <q>, etc.

    return NS_OK;
  }

  // NOTE(review): once the stack holds TagStackSize entries, deeper tags are
  // not recorded, yet DoCloseContainer still decrements mTagStackIndex for
  // them - confirm that this imbalance is acceptable for very deep nesting.
  if (mTagStackIndex < TagStackSize) {
    mTagStack[mTagStackIndex++] = aTag;
  }

  if (mIgnoreAboveIndex != (uint32_t)kNotFound) {
    return NS_OK;
  }

  // Reset this so that <blockquote type=cite> doesn't affect the whitespace
  // above random <pre>s below it.
  mHasWrittenCiteBlockquote =
      mHasWrittenCiteBlockquote && aTag == nsGkAtoms::pre;

  bool isInCiteBlockquote = false;

  // XXX special-case <blockquote type=cite> so that we don't add additional
  // newlines before the text.
  if (aTag == nsGkAtoms::blockquote) {
    nsAutoString value;
    nsresult rv = GetAttributeValue(nsGkAtoms::type, value);
    isInCiteBlockquote = NS_SUCCEEDED(rv) && value.EqualsIgnoreCase("cite");
  }

  if (mLineBreakDue && !isInCiteBlockquote) EnsureVerticalSpace(mFloatingLines);

  // Check if this tag's content that should not be output
  if ((aTag == nsGkAtoms::noscript &&
       !mSettings.HasFlag(nsIDocumentEncoder::OutputNoScriptContent)) ||
      ((aTag == nsGkAtoms::iframe || aTag == nsGkAtoms::noframes) &&
       !mSettings.HasFlag(nsIDocumentEncoder::OutputNoFramesContent))) {
    // Ignore everything that follows the current tag in
    // question until a matching end tag is encountered.
    mIgnoreAboveIndex = mTagStackIndex - 1;
    return NS_OK;
  }

  if (aTag == nsGkAtoms::body) {
    // Try to figure out here whether we have a
    // preformatted style attribute set by Thunderbird.
    //
    // Trigger on the presence of a "pre-wrap" in the
    // style attribute. That's a very simplistic way to do
    // it, but better than nothing.
    nsAutoString style;
    int32_t whitespace;
    if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::style, style)) &&
        (kNotFound != (whitespace = style.Find(u"white-space:")))) {
      // Both branches below set the same flag; they differ only in the
      // debug message.
      if (kNotFound != style.LowerCaseFindASCII("pre-wrap", whitespace)) {
#ifdef DEBUG_preformatted
        printf("Set mPreFormattedMail based on style pre-wrap\n");
#endif
        mPreFormattedMail = true;
      } else if (kNotFound != style.LowerCaseFindASCII("pre", whitespace)) {
#ifdef DEBUG_preformatted
        printf("Set mPreFormattedMail based on style pre\n");
#endif
        mPreFormattedMail = true;
      }
    } else {
      /* See comment at end of function. (That explanation of why
         mInWhitespace is set is found at the end of
         OpenContainerForOutputFormatted.) */
      mInWhitespace = true;
      mPreFormattedMail = false;
    }

    return NS_OK;
  }

  // Keep this in sync with DoCloseContainer!
  if (!DoOutput()) {
    return NS_OK;
  }

  if (aTag == nsGkAtoms::p)
    EnsureVerticalSpace(1);
  else if (aTag == nsGkAtoms::pre) {
    if (GetLastBool(mIsInCiteBlockquote))
      EnsureVerticalSpace(0);
    else if (mHasWrittenCiteBlockquote) {
      EnsureVerticalSpace(0);
      mHasWrittenCiteBlockquote = false;
    } else
      EnsureVerticalSpace(1);
  } else if (aTag == nsGkAtoms::tr) {
    PushBool(mHasWrittenCellsForRow, false);
  } else if (aTag == nsGkAtoms::td || aTag == nsGkAtoms::th) {
    // We must make sure that the content of two table cells get a
    // space between them.

    // To make the separation between cells most obvious and
    // importable, we use a TAB.
    if (mHasWrittenCellsForRow.IsEmpty()) {
      // We don't always see a <tr> (nor a <table>) before the <td> if we're
      // copying part of a table
      PushBool(mHasWrittenCellsForRow, true);  // will never be popped
    } else if (GetLastBool(mHasWrittenCellsForRow)) {
      // Bypass |Write| so that the TAB isn't compressed away.
      AddToLine(u"\t", 1);
      mInWhitespace = true;
    } else {
      // First cell of this row: no separator needed yet.
      SetLastBool(mHasWrittenCellsForRow, true);
    }
  } else if (aTag == nsGkAtoms::ul) {
    // Indent here to support nested lists, which aren't included in li :-(
    EnsureVerticalSpace(IsInOlOrUl() ? 0 : 1);
    // Must end the current line before we change indention
    mCurrentLine.mIndentation.mLength += kIndentSizeList;
    mULCount++;
  } else if (aTag == nsGkAtoms::ol) {
    EnsureVerticalSpace(IsInOlOrUl() ? 0 : 1);
    if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
      // Must end the current line before we change indention
      // Numbering starts at the "start" attribute when it parses as an
      // integer, and at 1 otherwise.
      nsAutoString startAttr;
      int32_t startVal = 1;
      if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::start, startAttr))) {
        nsresult rv = NS_OK;
        startVal = startAttr.ToInteger(&rv);
        if (NS_FAILED(rv)) {
          startVal = 1;
        }
      }
      mOLStack.AppendElement(startVal);
    } else {
      mOLStack.AppendElement(kOlStackDummyValue);
    }
    mCurrentLine.mIndentation.mLength += kIndentSizeList;  // see ul
  } else if (aTag == nsGkAtoms::li &&
             mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
    if (mTagStackIndex > 1 && IsInOL()) {
      if (!mOLStack.IsEmpty()) {
        // A parsable "value" attribute overrides the running item number.
        nsAutoString valueAttr;
        if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::value, valueAttr))) {
          nsresult rv = NS_OK;
          int32_t valueAttrVal = valueAttr.ToInteger(&rv);
          if (NS_SUCCEEDED(rv)) {
            mOLStack.LastElement() = valueAttrVal;
          }
        }
        // This is what nsBulletFrame does for OLs:
        mCurrentLine.mIndentation.mHeader.AppendInt(mOLStack.LastElement(), 10);
        mOLStack.LastElement()++;
      } else {
        mCurrentLine.mIndentation.mHeader.Append(char16_t('#'));
      }

      mCurrentLine.mIndentation.mHeader.Append(char16_t('.'));

    } else {
      // Unordered item: the bullet cycles through "*o+#" with the <ul>
      // nesting depth; outside of any <ul> the bullet is '#'.
      static const char bulletCharArray[] = "*o+#";
      uint32_t index = mULCount > 0 ? (mULCount - 1) : 3;
      char bulletChar = bulletCharArray[index % 4];
      mCurrentLine.mIndentation.mHeader.Append(char16_t(bulletChar));
    }

    mCurrentLine.mIndentation.mHeader.Append(char16_t(' '));
  } else if (aTag == nsGkAtoms::dl) {
    EnsureVerticalSpace(1);
  } else if (aTag == nsGkAtoms::dt) {
    EnsureVerticalSpace(0);
  } else if (aTag == nsGkAtoms::dd) {
    EnsureVerticalSpace(0);
    mCurrentLine.mIndentation.mLength += kIndentSizeDD;
  } else if (aTag == nsGkAtoms::span) {
    ++mSpanLevel;
  } else if (aTag == nsGkAtoms::blockquote) {
    // Push
    PushBool(mIsInCiteBlockquote, isInCiteBlockquote);
    if (isInCiteBlockquote) {
      EnsureVerticalSpace(0);
      mCurrentLine.mCiteQuoteLevel++;
    } else {
      EnsureVerticalSpace(1);
      mCurrentLine.mIndentation.mLength +=
          kTabSize;  // Check for some maximum value?
    }
  } else if (aTag == nsGkAtoms::q) {
    Write(u"\""_ns);
  }

  // Else make sure we'll separate block level tags,
  // even if we're about to leave, before doing any other formatting.
  else if (IsCssBlockLevelElement(mElement)) {
    EnsureVerticalSpace(0);
  }

  if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
    OpenContainerForOutputFormatted(aTag);
  }
  return NS_OK;
}

// Formatted-output part of opening a container (called from DoOpenContainer
// only when OutputFormatted is set): indents and optionally numbers headings
// according to the header strategy, and writes the opening structure marker
// for <sup>, <sub>, <code>, <strong>/<b>, <em>/<i> and <u> when "structs"
// are enabled and the current node is not a converted one.
void nsPlainTextSerializer::OpenContainerForOutputFormatted(
    const nsAtom* aTag) {
  const bool currentNodeIsConverted = IsCurrentNodeConverted();

  if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 ||
      aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) {
    EnsureVerticalSpace(2);
    if (mSettings.GetHeaderStrategy() ==
        Settings::HeaderStrategy::kNumberHeadingsAndIndentSlightly) {
      mCurrentLine.mIndentation.mLength += kIndentSizeHeaders;
      // Caching
      int32_t level = HeaderLevel(aTag);
      // Increase counter for current level
      mHeaderCounter[level]++;
      // Reset all lower levels
      int32_t i;

      for (i = level + 1; i <= 6; i++) {
        mHeaderCounter[i] = 0;
      }

      // Construct numbers
      // One "<counter>." per level from 1 up to this heading's level,
      // followed by a single space.
      nsAutoString leadup;
      for (i = 1; i <= level; i++) {
        leadup.AppendInt(mHeaderCounter[i]);
        leadup.Append(char16_t('.'));
      }
      leadup.Append(char16_t(' '));
      Write(leadup);
    } else if (mSettings.GetHeaderStrategy() ==
               Settings::HeaderStrategy::kIndentIncreasedWithHeaderLevel) {
      mCurrentLine.mIndentation.mLength += kIndentSizeHeaders;
      for (int32_t i = HeaderLevel(aTag); i > 1; i--) {
        // for h(x), run x-1 times
        mCurrentLine.mIndentation.mLength += kIndentIncrementHeaders;
      }
    }
  } else if (aTag == nsGkAtoms::sup && mSettings.GetStructs() &&
             !currentNodeIsConverted) {
    Write(u"^"_ns);
  } else if (aTag == nsGkAtoms::sub && mSettings.GetStructs() &&
             !currentNodeIsConverted) {
    Write(u"_"_ns);
  } else if (aTag == nsGkAtoms::code && mSettings.GetStructs() &&
             !currentNodeIsConverted) {
    Write(u"|"_ns);
  } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) &&
             mSettings.GetStructs() && !currentNodeIsConverted) {
    Write(u"*"_ns);
  } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) &&
             mSettings.GetStructs() && !currentNodeIsConverted) {
    Write(u"/"_ns);
  } else if (aTag == nsGkAtoms::u && mSettings.GetStructs() &&
             !currentNodeIsConverted) {
    Write(u"_"_ns);
  }

  /* Container elements are always block elements, so we shouldn't
     output any whitespace immediately after the container tag even if
     there's extra whitespace there because the HTML is pretty-printed
     or something. To ensure that happens, tell the serializer we're
     already in whitespace so it won't output more. */
  mInWhitespace = true;
}

// Handles the closing of the container element aTag; the counterpart of
// DoOpenContainer, whose early returns it mirrors in the same order. Undoes
// indentation and stack state set up on open and schedules (mLineBreakDue /
// mFloatingLines) or emits the vertical space that follows the element.
// Always returns NS_OK.
nsresult nsPlainTextSerializer::DoCloseContainer(const nsAtom* aTag) {
  if (IsIgnorableRubyAnnotation(aTag)) {
    mIgnoredChildNodeLevel--;
    return NS_OK;
  }
  if (IsIgnorableScriptOrStyle(mElement)) {
    mIgnoredChildNodeLevel--;
    return NS_OK;
  }

  if (mSettings.HasFlag(nsIDocumentEncoder::OutputForPlainTextClipboardCopy)) {
    if (DoOutput() && IsElementPreformatted() &&
        IsCssBlockLevelElement(mElement)) {
      // If we're closing a preformatted block element, output a line break
      // when we find a new container.
      mPreformattedBlockBoundary = true;
    }
  }

  if (mSettings.HasFlag(nsIDocumentEncoder::OutputRaw)) {
    // Raw means raw.  Don't even think about doing anything fancy
    // here like indenting, adding line breaks or any other
    // characters such as list item bullets, quote characters
    // around <q>, etc.

    return NS_OK;
  }

  if (mTagStackIndex > 0) {
    --mTagStackIndex;
  }

  // While nothing is ignored, mIgnoreAboveIndex is (uint32_t)kNotFound, so
  // this comparison is false for any index the stack can actually reach.
  if (mTagStackIndex >= mIgnoreAboveIndex) {
    if (mTagStackIndex == mIgnoreAboveIndex) {
      // We're dealing with the close tag whose matching
      // open tag had set the mIgnoreAboveIndex value.
      // Reset mIgnoreAboveIndex before discarding this tag.
      mIgnoreAboveIndex = (uint32_t)kNotFound;
    }
    return NS_OK;
  }

  MOZ_ASSERT(mOutputManager);

  // End current line if we're ending a block level tag
  if ((aTag == nsGkAtoms::body) || (aTag == nsGkAtoms::html)) {
    // We want the output to end with a new line,
    // but in preformatted areas like text fields,
    // we can't emit newlines that weren't there.
    // So add the newline only in the case of formatted output.
    if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
      EnsureVerticalSpace(0);
    } else {
      mOutputManager->Flush(mCurrentLine);
    }
    // We won't want to do anything with these in formatted mode either,
    // so just return now:
    return NS_OK;
  }

  // Keep this in sync with DoOpenContainer!
  if (!DoOutput()) {
    return NS_OK;
  }

  if (aTag == nsGkAtoms::tr) {
    PopBool(mHasWrittenCellsForRow);
    // Should always end a line, but get no more whitespace
    if (mFloatingLines < 0) mFloatingLines = 0;
    mLineBreakDue = true;
  } else if (((aTag == nsGkAtoms::li) || (aTag == nsGkAtoms::dt)) &&
             mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
    // Items that should always end a line, but get no more whitespace
    if (mFloatingLines < 0) mFloatingLines = 0;
    mLineBreakDue = true;
  } else if (aTag == nsGkAtoms::pre) {
    mFloatingLines = GetLastBool(mIsInCiteBlockquote) ? 0 : 1;
    mLineBreakDue = true;
  } else if (aTag == nsGkAtoms::ul) {
    // Flush before reducing the indentation so that the pending line is
    // still written with the list's own indentation.
    mOutputManager->Flush(mCurrentLine);
    mCurrentLine.mIndentation.mLength -= kIndentSizeList;
    --mULCount;
    if (!IsInOlOrUl()) {
      mFloatingLines = 1;
      mLineBreakDue = true;
    }
  } else if (aTag == nsGkAtoms::ol) {
    mOutputManager->Flush(mCurrentLine);  // Doing this after decreasing
                                          // OLStackIndex would be wrong.
    mCurrentLine.mIndentation.mLength -= kIndentSizeList;
    MOZ_ASSERT(!mOLStack.IsEmpty(), "Wrong OLStack level!");
    mOLStack.RemoveLastElement();
    if (!IsInOlOrUl()) {
      mFloatingLines = 1;
      mLineBreakDue = true;
    }
  } else if (aTag == nsGkAtoms::dl) {
    mFloatingLines = 1;
    mLineBreakDue = true;
  } else if (aTag == nsGkAtoms::dd) {
    mOutputManager->Flush(mCurrentLine);
    mCurrentLine.mIndentation.mLength -= kIndentSizeDD;
  } else if (aTag == nsGkAtoms::span) {
    NS_ASSERTION(mSpanLevel, "Span level will be negative!");
    --mSpanLevel;
  } else if (aTag == nsGkAtoms::div) {
    if (mFloatingLines < 0) mFloatingLines = 0;
    mLineBreakDue = true;
  } else if (aTag == nsGkAtoms::blockquote) {
    mOutputManager->Flush(mCurrentLine);  // Is this needed?

    // Pop
    bool isInCiteBlockquote = PopBool(mIsInCiteBlockquote);

    if (isInCiteBlockquote) {
      NS_ASSERTION(mCurrentLine.mCiteQuoteLevel,
                   "CiteQuote level will be negative!");
      mCurrentLine.mCiteQuoteLevel--;
      mFloatingLines = 0;
      mHasWrittenCiteBlockquote = true;
    } else {
      mCurrentLine.mIndentation.mLength -= kTabSize;
      mFloatingLines = 1;
    }
    mLineBreakDue = true;
  } else if (aTag == nsGkAtoms::q) {
    Write(u"\""_ns);
  } else if (IsCssBlockLevelElement(mElement)) {
    // All other blocks get 1 vertical space after them
    // in formatted mode, otherwise 0.
    // This is hard. Sometimes 0 is a better number, but
    // how to know?
    if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
      EnsureVerticalSpace(1);
    } else {
      if (mFloatingLines < 0) mFloatingLines = 0;
      mLineBreakDue = true;
    }
  }

  if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
    CloseContainerForOutputFormatted(aTag);
  }

  return NS_OK;
}

// Formatted-output part of closing a container (called from DoCloseContainer
// only when OutputFormatted is set): removes the heading indentation added
// by OpenContainerForOutputFormatted, appends " <href>" after links that
// have a non-empty href, and writes the closing structure markers.
void nsPlainTextSerializer::CloseContainerForOutputFormatted(
    const nsAtom* aTag) {
  const bool currentNodeIsConverted = IsCurrentNodeConverted();

  if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 ||
      aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) {
    using HeaderStrategy = Settings::HeaderStrategy;
    // Both indenting strategies added kIndentSizeHeaders on open ...
    if ((mSettings.GetHeaderStrategy() ==
         HeaderStrategy::kIndentIncreasedWithHeaderLevel) ||
        (mSettings.GetHeaderStrategy() ==
         HeaderStrategy::kNumberHeadingsAndIndentSlightly)) {
      mCurrentLine.mIndentation.mLength -= kIndentSizeHeaders;
    }
    // ... but only this one added the per-level increment as well.
    if (mSettings.GetHeaderStrategy() ==
        HeaderStrategy::kIndentIncreasedWithHeaderLevel) {
      for (int32_t i = HeaderLevel(aTag); i > 1; i--) {
        // for h(x), run x-1 times
        mCurrentLine.mIndentation.mLength -= kIndentIncrementHeaders;
      }
    }
    EnsureVerticalSpace(1);
  } else if (aTag == nsGkAtoms::a && !currentNodeIsConverted) {
    nsAutoString url;
    if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::href, url)) &&
        !url.IsEmpty()) {
      nsAutoString temp;
      temp.AssignLiteral(" <");
      temp += url;
      temp.Append(char16_t('>'));
      Write(temp);
    }
  } else if ((aTag == nsGkAtoms::sup || aTag == nsGkAtoms::sub) &&
             mSettings.GetStructs() && !currentNodeIsConverted) {
    // Unlike the other markers, "^" and "_" are closed with a plain space.
    Write(kSpace);
  } else if (aTag == nsGkAtoms::code && mSettings.GetStructs() &&
             !currentNodeIsConverted) {
    Write(u"|"_ns);
  } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) &&
             mSettings.GetStructs() && !currentNodeIsConverted) {
    Write(u"*"_ns);
  } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) &&
             mSettings.GetStructs() && !currentNodeIsConverted) {
    Write(u"/"_ns);
  } else if (aTag == nsGkAtoms::u && mSettings.GetStructs() &&
             !currentNodeIsConverted) {
    Write(u"_"_ns);
  }
}

// Returns true when text and leaf elements must not be output at the current
// position: inside an ignored subtree (mIgnoredChildNodeLevel > 0), or when
// the innermost or second-innermost open tag is <select>.
bool nsPlainTextSerializer::MustSuppressLeaf() const {
  if (mIgnoredChildNodeLevel > 0) {
    return true;
  }

  if ((mTagStackIndex > 1 &&
       mTagStack[mTagStackIndex - 2] == nsGkAtoms::select) ||
      (mTagStackIndex > 0 &&
       mTagStack[mTagStackIndex - 1] == nsGkAtoms::select)) {
    // Don't output the contents of SELECT elements;
    // Might be nice, eventually, to output just the selected element.
    // Read more in bug 31994.
    return true;
  }

  return false;
}

// Convenience overload: reports a line break found in the source text.
void nsPlainTextSerializer::DoAddText() { DoAddText(true, u""_ns); }

// Adds either a source line break (aIsLineBreak == true, aText unused) or a
// run of text without line breaks (aIsLineBreak == false) to the output.
// Nothing is written when output is disabled or the position is suppressed;
// a pending line break is still emitted before the suppression check.
void nsPlainTextSerializer::DoAddText(bool aIsLineBreak,
                                      const nsAString& aText) {
  // If we don't want any output, just return
  if (!DoOutput()) {
    return;
  }

  if (!aIsLineBreak) {
    // Make sure to reset this, since it's no longer true.
    mHasWrittenCiteBlockquote = false;
  }

  if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines);

  if (MustSuppressLeaf()) {
    return;
  }

  if (aIsLineBreak) {
    // The only times we want to pass along whitespace from the original
    // html source are if we're forced into preformatted mode via flags,
    // or if we're prettyprinting and we're inside a <pre>.
    // Otherwise, either we're collapsing to minimal text, or we're
    // prettyprinting to mimic the html format, and in neither case
    // does the formatting of the html source help us.
    if (mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted) ||
        (mPreFormattedMail && !mSettings.GetWrapColumn()) ||
        IsElementPreformatted()) {
      EnsureVerticalSpace(mEmptyLines + 1);
    } else if (!mInWhitespace) {
      // Collapse the source line break into a single space.
      Write(kSpace);
      mInWhitespace = true;
    }
    return;
  }

  Write(aText);
}

// Fills aResult (asserted to be empty on entry) with '-' characters: as many
// as aWrapColumn, or 25 when aWrapColumn is 0.
void CreateLineOfDashes(nsAString& aResult, const uint32_t aWrapColumn) {
  MOZ_ASSERT(aResult.IsEmpty());

  const uint32_t width = (aWrapColumn > 0 ? aWrapColumn : 25);
  while (aResult.Length() < width) {
    aResult.Append(char16_t('-'));
  }
}

// Handles a void (leaf) element: <br> produces a line break, <hr> a line of
// dashes (formatted output only) and <img> its alt text or, failing that,
// its bracketed title. Other leaves produce no output. Always returns NS_OK.
nsresult nsPlainTextSerializer::DoAddLeaf(const nsAtom* aTag) {
  mPreformattedBlockBoundary = false;

  if (!DoOutput()) {
    return NS_OK;
  }

  if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines);

  if (MustSuppressLeaf()) {
    return NS_OK;
  }

  if (aTag == nsGkAtoms::br) {
    // Another egregious editor workaround, see bug 38194:
    // ignore the bogus br tags that the editor sticks here and there.
    // FYI: `brElement` may be `nullptr` if the element is <br> element
    //      of non-HTML element.
    // XXX Do we need to call `EnsureVerticalSpace()` when the <br> element
    //     is not an HTML element?
    HTMLBRElement* brElement = HTMLBRElement::FromNodeOrNull(mElement);
    if (!brElement || !brElement->IsPaddingForEmptyLastLine()) {
      EnsureVerticalSpace(mEmptyLines + 1);
    }
  } else if (aTag == nsGkAtoms::hr &&
             mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
    EnsureVerticalSpace(0);

    // Make a line of dashes as wide as the wrap width
    // XXX honoring percentage would be nice
    nsAutoString line;
    CreateLineOfDashes(line, mSettings.GetWrapColumn());
    Write(line);

    EnsureVerticalSpace(0);
  } else if (aTag == nsGkAtoms::img) {
    /* Output (in decreasing order of preference)
       alt, title or nothing */
    // See <http://www.w3.org/TR/REC-html40/struct/objects.html#edef-IMG>
    nsAutoString imageDescription;
    if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::alt, imageDescription))) {
      // If the alt attribute has an empty value (|alt=""|), output nothing
      // (the alt text, empty or not, is written unchanged below).
    } else if (NS_SUCCEEDED(
                   GetAttributeValue(nsGkAtoms::title, imageDescription)) &&
               !imageDescription.IsEmpty()) {
      imageDescription = u" ["_ns + imageDescription + u"] "_ns;
    }

    Write(imageDescription);
  }

  return NS_OK;
}

/**
 * Adds as many newline as necessary to get |aNumberOfRows| empty lines
 *
 * aNumberOfRows = -1    :  Being in the middle of some line of text
 *
aNumberOfRows = 0 : Being at the start of a line + * aNumberOfRows = n>0 : Having n empty lines before the current line. + */ +void nsPlainTextSerializer::EnsureVerticalSpace(const int32_t aNumberOfRows) { + // If we have something in the indent we probably want to output + // it and it's not included in the count for empty lines so we don't + // realize that we should start a new line. + if (aNumberOfRows >= 0 && !mCurrentLine.mIndentation.mHeader.IsEmpty()) { + EndLine(false); + mInWhitespace = true; + } + + while (mEmptyLines < aNumberOfRows) { + EndLine(false); + mInWhitespace = true; + } + mLineBreakDue = false; + mFloatingLines = -1; +} + +void nsPlainTextSerializer::OutputManager::Flush(CurrentLine& aCurrentLine) { + if (!aCurrentLine.mContent.IsEmpty()) { + aCurrentLine.MaybeReplaceNbspsInContent(mFlags); + + Append(aCurrentLine, StripTrailingWhitespaces::kNo); + + aCurrentLine.ResetContentAndIndentationHeader(); + } +} + +static bool IsSpaceStuffable(const char16_t* s) { + return (s[0] == '>' || s[0] == ' ' || s[0] == kNBSP || + NS_strncmp(s, u"From ", 5) == 0); +} + +void nsPlainTextSerializer::MaybeWrapAndOutputCompleteLines() { + if (!mSettings.MayWrap()) { + return; + } + + const uint32_t prefixwidth = mCurrentLine.DeterminePrefixWidth(); + + // Yes, wrap! + // The "+4" is to avoid wrap lines that only would be a couple + // of letters too long. We give this bonus only if the + // wrapcolumn is more than 20. + const uint32_t wrapColumn = mSettings.GetWrapColumn(); + uint32_t bonuswidth = (wrapColumn > 20) ? 4 : 0; + + while (!mCurrentLine.mContent.IsEmpty()) { + // The width of the line as it will appear on the screen (approx.). 
+ const uint32_t currentLineContentWidth = + GetUnicharStringWidth(mCurrentLine.mContent); + if (currentLineContentWidth + prefixwidth <= wrapColumn + bonuswidth) { + break; + } + + const int32_t goodSpace = + mCurrentLine.FindWrapIndexForContent(wrapColumn, mUseLineBreaker); + + const int32_t contentLength = mCurrentLine.mContent.Length(); + if ((goodSpace < contentLength) && (goodSpace > 0)) { + // Found a place to break + + // -1 (trim a char at the break position) + // only if the line break was a space. + nsAutoString restOfContent; + if (nsCRT::IsAsciiSpace(mCurrentLine.mContent.CharAt(goodSpace))) { + mCurrentLine.mContent.Right(restOfContent, + contentLength - goodSpace - 1); + } else { + mCurrentLine.mContent.Right(restOfContent, contentLength - goodSpace); + } + // if breaker was U+0020, it has to consider for delsp=yes support + const bool breakBySpace = mCurrentLine.mContent.CharAt(goodSpace) == ' '; + mCurrentLine.mContent.Truncate(goodSpace); + EndLine(true, breakBySpace); + mCurrentLine.mContent.Truncate(); + // Space stuff new line? + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + if (!restOfContent.IsEmpty() && IsSpaceStuffable(restOfContent.get()) && + mCurrentLine.mCiteQuoteLevel == + 0 // We space-stuff quoted lines anyway + ) { + // Space stuffing a la RFC 2646 (format=flowed). + mCurrentLine.mContent.Append(char16_t(' ')); + // XXX doesn't seem to work correctly for ' ' + } + } + mCurrentLine.mContent.Append(restOfContent); + mEmptyLines = -1; + } else { + // Nothing to do. Hopefully we get more data later + // to use for a place to break line + break; + } + } +} + +/** + * This function adds a piece of text to the current stored line. If we are + * wrapping text and the stored line will become too long, a suitable + * location to wrap will be found and the line that's complete will be + * output. 
+ */ +void nsPlainTextSerializer::AddToLine(const char16_t* aLineFragment, + int32_t aLineFragmentLength) { + if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines); + + if (mCurrentLine.mContent.IsEmpty()) { + if (0 == aLineFragmentLength) { + return; + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + if (IsSpaceStuffable(aLineFragment) && + mCurrentLine.mCiteQuoteLevel == + 0 // We space-stuff quoted lines anyway + ) { + // Space stuffing a la RFC 2646 (format=flowed). + mCurrentLine.mContent.Append(char16_t(' ')); + } + } + mEmptyLines = -1; + } + + mCurrentLine.mContent.Append(aLineFragment, aLineFragmentLength); + + MaybeWrapAndOutputCompleteLines(); +} + +// The signature separator (RFC 2646). +const char kSignatureSeparator[] = "-- "; + +// The OpenPGP dash-escaped signature separator in inline +// signed messages according to the OpenPGP standard (RFC 2440). +const char kDashEscapedSignatureSeparator[] = "- -- "; + +static bool IsSignatureSeparator(const nsAString& aString) { + return aString.EqualsLiteral(kSignatureSeparator) || + aString.EqualsLiteral(kDashEscapedSignatureSeparator); +} + +/** + * Outputs the contents of mCurrentLine.mContent, and resets line + * specific variables. Also adds an indentation and prefix if there is one + * specified. Strips ending spaces from the line if it isn't preformatted. + */ +void nsPlainTextSerializer::EndLine(bool aSoftLineBreak, bool aBreakBySpace) { + if (aSoftLineBreak && mCurrentLine.mContent.IsEmpty()) { + // No meaning + return; + } + + /* In non-preformatted mode, remove spaces from the end of the line for + * format=flowed compatibility. Don't do this for these special cases: + * "-- ", the signature separator (RFC 2646) shouldn't be touched and + * "- -- ", the OpenPGP dash-escaped signature separator in inline + * signed messages according to the OpenPGP standard (RFC 2440). 
+ */ + if (!mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted) && + (aSoftLineBreak || !IsSignatureSeparator(mCurrentLine.mContent))) { + mCurrentLine.mContent.Trim(" ", false, true, false); + } + + if (aSoftLineBreak && + mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed) && + (mCurrentLine.mIndentation.mLength == 0)) { + // Add the soft part of the soft linebreak (RFC 2646 4.1) + // We only do this when there is no indentation since format=flowed + // lines and indentation doesn't work well together. + + // If breaker character is ASCII space with RFC 3676 support (delsp=yes), + // add twice space. + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatDelSp) && + aBreakBySpace) { + mCurrentLine.mContent.AppendLiteral(" "); + } else { + mCurrentLine.mContent.Append(char16_t(' ')); + } + } + + if (aSoftLineBreak) { + mEmptyLines = 0; + } else { + // Hard break + if (mCurrentLine.HasContentOrIndentationHeader()) { + mEmptyLines = 0; + } else { + mEmptyLines++; + } + } + + MOZ_ASSERT(mOutputManager); + + mCurrentLine.MaybeReplaceNbspsInContent(mSettings.GetFlags()); + + // If we don't have anything "real" to output we have to + // make sure the indent doesn't end in a space since that + // would trick a format=flowed-aware receiver. + mOutputManager->Append(mCurrentLine, + OutputManager::StripTrailingWhitespaces::kMaybe); + mOutputManager->AppendLineBreak(); + mCurrentLine.ResetContentAndIndentationHeader(); + mInWhitespace = true; + mLineBreakDue = false; + mFloatingLines = -1; +} + +/** + * Creates the calculated and stored indent and text in the indentation. That is + * quote chars and numbers for numbered lists and such. 
+ */ +void nsPlainTextSerializer::CurrentLine::CreateQuotesAndIndent( + nsAString& aResult) const { + // Put the mail quote "> " chars in, if appropriate: + if (mCiteQuoteLevel > 0) { + nsAutoString quotes; + for (int i = 0; i < mCiteQuoteLevel; i++) { + quotes.Append(char16_t('>')); + } + if (!mContent.IsEmpty()) { + /* Better don't output a space here, if the line is empty, + in case a receiving format=flowed-aware UA thinks, this were a flowed + line, which it isn't - it's just empty. (Flowed lines may be joined + with the following one, so the empty line may be lost completely.) */ + quotes.Append(char16_t(' ')); + } + aResult = quotes; + } + + // Indent if necessary + int32_t indentwidth = mIndentation.mLength - mIndentation.mHeader.Length(); + if (indentwidth > 0 && HasContentOrIndentationHeader() + // Don't make empty lines look flowed + ) { + nsAutoString spaces; + for (int i = 0; i < indentwidth; ++i) spaces.Append(char16_t(' ')); + aResult += spaces; + } + + if (!mIndentation.mHeader.IsEmpty()) { + aResult += mIndentation.mHeader; + } +} + +static bool IsLineFeedCarriageReturnBlankOrTab(char16_t c) { + return ('\n' == c || '\r' == c || ' ' == c || '\t' == c); +} + +static void ReplaceVisiblyTrailingNbsps(nsAString& aString) { + const int32_t totLen = aString.Length(); + for (int32_t i = totLen - 1; i >= 0; i--) { + char16_t c = aString[i]; + if (IsLineFeedCarriageReturnBlankOrTab(c)) { + continue; + } + if (kNBSP == c) { + aString.Replace(i, 1, ' '); + } else { + break; + } + } +} + +void nsPlainTextSerializer::ConvertToLinesAndOutput(const nsAString& aString) { + const int32_t totLen = aString.Length(); + int32_t newline{0}; + + // Put the mail quote "> " chars in, if appropriate. + // Have to put it in before every line. + int32_t bol = 0; + while (bol < totLen) { + bool outputLineBreak = false; + bool spacesOnly = true; + + // Find one of '\n' or '\r' using iterators since nsAString + // doesn't have the old FindCharInSet function. 
+ nsAString::const_iterator iter; + aString.BeginReading(iter); + nsAString::const_iterator done_searching; + aString.EndReading(done_searching); + iter.advance(bol); + int32_t new_newline = bol; + newline = kNotFound; + while (iter != done_searching) { + if ('\n' == *iter || '\r' == *iter) { + newline = new_newline; + break; + } + if (' ' != *iter) { + spacesOnly = false; + } + ++new_newline; + ++iter; + } + + // Done searching + nsAutoString stringpart; + if (newline == kNotFound) { + // No new lines. + stringpart.Assign(Substring(aString, bol, totLen - bol)); + if (!stringpart.IsEmpty()) { + char16_t lastchar = stringpart.Last(); + mInWhitespace = IsLineFeedCarriageReturnBlankOrTab(lastchar); + } + mEmptyLines = -1; + bol = totLen; + } else { + // There is a newline + stringpart.Assign(Substring(aString, bol, newline - bol)); + mInWhitespace = true; + outputLineBreak = true; + mEmptyLines = 0; + bol = newline + 1; + if ('\r' == *iter && bol < totLen && '\n' == *++iter) { + // There was a CRLF in the input. This used to be illegal and + // stripped by the parser. Apparently not anymore. Let's skip + // over the LF. + bol++; + } + } + + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + if ((outputLineBreak || !spacesOnly) && // bugs 261467,125928 + !IsQuotedLine(stringpart) && !IsSignatureSeparator(stringpart)) { + stringpart.Trim(" ", false, true, true); + } + if (IsSpaceStuffable(stringpart.get()) && !IsQuotedLine(stringpart)) { + mCurrentLine.mContent.Append(char16_t(' ')); + } + } + mCurrentLine.mContent.Append(stringpart); + + mCurrentLine.MaybeReplaceNbspsInContent(mSettings.GetFlags()); + + mOutputManager->Append(mCurrentLine, + OutputManager::StripTrailingWhitespaces::kNo); + if (outputLineBreak) { + mOutputManager->AppendLineBreak(); + } + + mCurrentLine.ResetContentAndIndentationHeader(); + } + +#ifdef DEBUG_wrapping + printf("No wrapping: newline is %d, totLen is %d\n", newline, totLen); +#endif +} + +/** + * Write a string. 
This is the highlevel function to use to get text output. + * By using AddToLine, Output, EndLine and other functions it handles quotation, + * line wrapping, indentation, whitespace compression and other things. + */ +void nsPlainTextSerializer::Write(const nsAString& aStr) { + // XXX Copy necessary to use nsString methods and gain + // access to underlying buffer + nsAutoString str(aStr); + +#ifdef DEBUG_wrapping + printf("Write(%s): wrap col = %d\n", NS_ConvertUTF16toUTF8(str).get(), + mSettings.GetWrapColumn()); +#endif + + const int32_t totLen = str.Length(); + + // If the string is empty, do nothing: + if (totLen <= 0) return; + + // For Flowed text change nbsp-ses to spaces at end of lines to allow them + // to be cut off along with usual spaces if required. (bug #125928) + if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) { + ReplaceVisiblyTrailingNbsps(str); + } + + // We have two major codepaths here. One that does preformatted text and one + // that does normal formatted text. The one for preformatted text calls + // Output directly while the other code path goes through AddToLine. + if ((mPreFormattedMail && !mSettings.GetWrapColumn()) || + (IsElementPreformatted() && !mPreFormattedMail) || + (mSpanLevel > 0 && mEmptyLines >= 0 && IsQuotedLine(str))) { + // No intelligent wrapping. + + // This mustn't be mixed with intelligent wrapping without clearing + // the mCurrentLine.mContent buffer before!!! 
+ NS_ASSERTION(mCurrentLine.mContent.IsEmpty() || + (IsElementPreformatted() && !mPreFormattedMail), + "Mixed wrapping data and nonwrapping data on the same line"); + MOZ_ASSERT(mOutputManager); + + if (!mCurrentLine.mContent.IsEmpty()) { + mOutputManager->Flush(mCurrentLine); + } + + ConvertToLinesAndOutput(str); + return; + } + + // Intelligent handling of text + // If needed, strip out all "end of lines" + // and multiple whitespace between words + int32_t nextpos; + const char16_t* offsetIntoBuffer = nullptr; + + int32_t bol = 0; + while (bol < totLen) { // Loop over lines + // Find a place where we may have to do whitespace compression + nextpos = str.FindCharInSet(u" \t\n\r", bol); +#ifdef DEBUG_wrapping + nsAutoString remaining; + str.Right(remaining, totLen - bol); + foo = ToNewCString(remaining); + // printf("Next line: bol = %d, newlinepos = %d, totLen = %d, " + // "string = '%s'\n", bol, nextpos, totLen, foo); + free(foo); +#endif + + if (nextpos == kNotFound) { + // The rest of the string + offsetIntoBuffer = str.get() + bol; + AddToLine(offsetIntoBuffer, totLen - bol); + bol = totLen; + mInWhitespace = false; + } else { + // There's still whitespace left in the string + if (nextpos != 0 && (nextpos + 1) < totLen) { + offsetIntoBuffer = str.get() + nextpos; + // skip '\n' if it is between CJ chars + if (offsetIntoBuffer[0] == '\n' && IS_CJ_CHAR(offsetIntoBuffer[-1]) && + IS_CJ_CHAR(offsetIntoBuffer[1])) { + offsetIntoBuffer = str.get() + bol; + AddToLine(offsetIntoBuffer, nextpos - bol); + bol = nextpos + 1; + continue; + } + } + // If we're already in whitespace and not preformatted, just skip it: + if (mInWhitespace && (nextpos == bol) && !mPreFormattedMail && + !mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted)) { + // Skip whitespace + bol++; + continue; + } + + if (nextpos == bol) { + // Note that we are in whitespace. 
+ mInWhitespace = true; + offsetIntoBuffer = str.get() + nextpos; + AddToLine(offsetIntoBuffer, 1); + bol++; + continue; + } + + mInWhitespace = true; + + offsetIntoBuffer = str.get() + bol; + if (mPreFormattedMail || + mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted)) { + // Preserve the real whitespace character + nextpos++; + AddToLine(offsetIntoBuffer, nextpos - bol); + bol = nextpos; + } else { + // Replace the whitespace with a space + AddToLine(offsetIntoBuffer, nextpos - bol); + AddToLine(kSpace.get(), 1); + bol = nextpos + 1; // Let's eat the whitespace + } + } + } // Continue looping over the string +} + +/** + * Gets the value of an attribute in a string. If the function returns + * NS_ERROR_NOT_AVAILABLE, there was none such attribute specified. + */ +nsresult nsPlainTextSerializer::GetAttributeValue(const nsAtom* aName, + nsString& aValueRet) const { + if (mElement) { + if (mElement->GetAttr(kNameSpaceID_None, aName, aValueRet)) { + return NS_OK; + } + } + + return NS_ERROR_NOT_AVAILABLE; +} + +/** + * Returns true, if the element was inserted by Moz' TXT->HTML converter. + * In this case, we should ignore it. + */ +bool nsPlainTextSerializer::IsCurrentNodeConverted() const { + nsAutoString value; + nsresult rv = GetAttributeValue(nsGkAtoms::_class, value); + return (NS_SUCCEEDED(rv) && + (StringBeginsWith(value, u"moz-txt"_ns, + nsASCIICaseInsensitiveStringComparator) || + StringBeginsWith(value, u"\"moz-txt"_ns, + nsASCIICaseInsensitiveStringComparator))); +} + +// static +nsAtom* nsPlainTextSerializer::GetIdForContent(nsIContent* aContent) { + if (!aContent->IsHTMLElement()) { + return nullptr; + } + + nsAtom* localName = aContent->NodeInfo()->NameAtom(); + return localName->IsStatic() ? 
localName : nullptr; +} + +bool nsPlainTextSerializer::IsElementPreformatted() const { + return !mPreformatStack.empty() && mPreformatStack.top(); +} + +bool nsPlainTextSerializer::IsElementPreformatted(Element* aElement) { + RefPtr<const ComputedStyle> computedStyle = + nsComputedDOMStyle::GetComputedStyleNoFlush(aElement); + if (computedStyle) { + const nsStyleText* textStyle = computedStyle->StyleText(); + return textStyle->WhiteSpaceOrNewlineIsSignificant(); + } + // Fall back to looking at the tag, in case there is no style information. + return GetIdForContent(aElement) == nsGkAtoms::pre; +} + +bool nsPlainTextSerializer::IsCssBlockLevelElement(Element* aElement) { + RefPtr<const ComputedStyle> computedStyle = + nsComputedDOMStyle::GetComputedStyleNoFlush(aElement); + if (computedStyle) { + const nsStyleDisplay* displayStyle = computedStyle->StyleDisplay(); + return displayStyle->IsBlockOutsideStyle(); + } + // Fall back to looking at the tag, in case there is no style information. + return nsContentUtils::IsHTMLBlockLevelElement(aElement); +} + +/** + * This method is required only to identify LI's inside OL. + * Returns TRUE if we are inside an OL tag and FALSE otherwise. + */ +bool nsPlainTextSerializer::IsInOL() const { + int32_t i = mTagStackIndex; + while (--i >= 0) { + if (mTagStack[i] == nsGkAtoms::ol) return true; + if (mTagStack[i] == nsGkAtoms::ul) { + // If a UL is reached first, LI belongs the UL nested in OL. + return false; + } + } + // We may reach here for orphan LI's. 
+ return false; +} + +bool nsPlainTextSerializer::IsInOlOrUl() const { + return (mULCount > 0) || !mOLStack.IsEmpty(); +} + +/* + @return 0 = no header, 1 = h1, ..., 6 = h6 +*/ +int32_t HeaderLevel(const nsAtom* aTag) { + if (aTag == nsGkAtoms::h1) { + return 1; + } + if (aTag == nsGkAtoms::h2) { + return 2; + } + if (aTag == nsGkAtoms::h3) { + return 3; + } + if (aTag == nsGkAtoms::h4) { + return 4; + } + if (aTag == nsGkAtoms::h5) { + return 5; + } + if (aTag == nsGkAtoms::h6) { + return 6; + } + return 0; +} + +/* These functions define the column width of an ISO 10646 character + * as follows: + * + * - The null character (U+0000) has a column width of 0. + * + * - Other C0/C1 control characters and DEL will lead to a return + * value of -1. + * + * - Non-spacing and enclosing combining characters (general + * category code Mn or Me in the Unicode database) have a + * column width of 0. + * + * - Spacing characters in the East Asian Wide (W) or East Asian + * FullWidth (F) category as defined in Unicode Technical + * Report #11 have a column width of 2. + * + * - All remaining characters (including all printable + * ISO 8859-1 and WGL4 characters, Unicode control characters, + * etc.) have a column width of 1. + */ + +int32_t GetUnicharWidth(char32_t aCh) { + /* test for 8-bit control characters */ + if (aCh == 0) { + return 0; + } + if (aCh < 32 || (aCh >= 0x7f && aCh < 0xa0)) { + return -1; + } + + /* The first combining char in Unicode is U+0300 */ + if (aCh < 0x0300) { + return 1; + } + + auto gc = unicode::GetGeneralCategory(aCh); + if (gc == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK || + gc == HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) { + return 0; + } + + /* if we arrive here, ucs is not a combining or C0/C1 control character */ + + /* fast test for majority of non-wide scripts */ + if (aCh < 0x1100) { + return 1; + } + + return intl::UnicodeProperties::IsEastAsianWidthFW(aCh) ? 
2 : 1; +} + +int32_t GetUnicharStringWidth(Span<const char16_t> aString) { + int32_t width = 0; + for (auto iter = aString.begin(); iter != aString.end(); ++iter) { + char32_t c = *iter; + if (NS_IS_HIGH_SURROGATE(c) && (iter + 1) != aString.end() && + NS_IS_LOW_SURROGATE(*(iter + 1))) { + c = SURROGATE_TO_UCS4(c, *++iter); + } + const int32_t w = GetUnicharWidth(c); + // Taking 1 as the width of non-printable character, for bug 94475. + width += (w < 0 ? 1 : w); + } + return width; +} |