diff options
Diffstat (limited to 'netwerk/streamconv/converters/mozTXTToHTMLConv.cpp')
-rw-r--r-- | netwerk/streamconv/converters/mozTXTToHTMLConv.cpp | 1291 |
1 files changed, 1291 insertions, 0 deletions
diff --git a/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp b/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp new file mode 100644 index 0000000000..a9f1d5183f --- /dev/null +++ b/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp @@ -0,0 +1,1291 @@ +/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/TextUtils.h" +#include "mozTXTToHTMLConv.h" +#include "mozilla/intl/Segmenter.h" +#include "mozilla/Maybe.h" +#include "nsIThreadRetargetableStreamListener.h" +#include "nsNetUtil.h" +#include "nsUnicharUtils.h" +#include "nsUnicodeProperties.h" +#include "nsCRT.h" +#include "nsIExternalProtocolHandler.h" +#include "nsIURI.h" + +#include <algorithm> + +#ifdef DEBUG_BenB_Perf +# include "prtime.h" +# include "prinrval.h" +#endif + +using mozilla::IsAscii; +using mozilla::IsAsciiAlpha; +using mozilla::IsAsciiDigit; +using mozilla::Maybe; +using mozilla::Some; +using mozilla::Span; +using mozilla::intl::GraphemeClusterBreakIteratorUtf16; +using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16; + +const double growthRate = 1.2; + +// Bug 183111, editor now replaces multiple spaces with leading +// 0xA0's and a single ending space, so need to treat 0xA0's as spaces. +// 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)" +// Also recognize the Japanese ideographic space 0x3000 as a space. +static inline bool IsSpace(const char16_t aChar) { + return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000); +} + +// Escape Char will take ch, escape it and append the result to +// aStringToAppendTo +void mozTXTToHTMLConv::EscapeChar(const char16_t ch, + nsAString& aStringToAppendTo, + bool inAttribute) { + switch (ch) { + case '<': + aStringToAppendTo.AppendLiteral("<"); + break; + case '>': + aStringToAppendTo.AppendLiteral(">"); + break; + case '&': + aStringToAppendTo.AppendLiteral("&"); + break; + case '"': + if (inAttribute) { + aStringToAppendTo.AppendLiteral("""); + break; + } + // else fall through + [[fallthrough]]; + default: + aStringToAppendTo += ch; + } +} + +// EscapeStr takes the passed in string and +// escapes it IN PLACE. +void mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute) { + // the replace substring routines + // don't seem to work if you have a character + // in the in string that is also in the replacement + // string! =( + // aInString.ReplaceSubstring("&", "&"); + // aInString.ReplaceSubstring("<", "<"); + // aInString.ReplaceSubstring(">", ">"); + for (uint32_t i = 0; i < aInString.Length();) { + switch (aInString[i]) { + case '<': + aInString.Cut(i, 1); + aInString.InsertLiteral(u"<", i); + i += 4; // skip past the integers we just added + break; + case '>': + aInString.Cut(i, 1); + aInString.InsertLiteral(u">", i); + i += 4; // skip past the integers we just added + break; + case '&': + aInString.Cut(i, 1); + aInString.InsertLiteral(u"&", i); + i += 5; // skip past the integers we just added + break; + case '"': + if (inAttribute) { + aInString.Cut(i, 1); + aInString.InsertLiteral(u""", i); + i += 6; + break; + } + // else fall through + [[fallthrough]]; + default: + i++; + } + } +} + +void mozTXTToHTMLConv::UnescapeStr(const char16_t* aInString, int32_t aStartPos, + int32_t aLength, nsString& aOutString) { + const char16_t* subString = nullptr; + for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;) { + int32_t remainingChars = i - aStartPos; + if (aInString[i] == '&') { + subString = &aInString[i]; + if (!NS_strncmp(subString, u"<", + std::min(4, aLength - remainingChars))) { + aOutString.Append(char16_t('<')); + i += 4; + } else if (!NS_strncmp(subString, u">", + std::min(4, aLength - remainingChars))) { + aOutString.Append(char16_t('>')); + i += 4; + } else if (!NS_strncmp(subString, u"&", + std::min(5, aLength - remainingChars))) { + aOutString.Append(char16_t('&')); + i += 5; + } else if (!NS_strncmp(subString, u""", + std::min(6, aLength - remainingChars))) { + aOutString.Append(char16_t('"')); + i += 6; + } else { + aOutString += aInString[i]; + i++; + } + } else { + aOutString += aInString[i]; + i++; + } + } +} + +void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t* aInString, + int32_t aInLength, + const uint32_t pos, + nsString& aOutString) { + NS_ASSERTION(int32_t(pos) < aInLength, + "bad args to CompleteAbbreviatedURL, see bug #190851"); + if (int32_t(pos) >= aInLength) return; + + if (aInString[pos] == '@') { + // only pre-pend a mailto url if the string contains a .domain in it.. + // i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm" + nsDependentString inString(aInString, aInLength); + if (inString.FindChar('.', pos) != + kNotFound) // if we have a '.' after the @ sign.... + { + aOutString.AssignLiteral("mailto:"); + aOutString += aInString; + } + } else if (aInString[pos] == '.') { + if (ItMatchesDelimited(aInString, aInLength, u"www.", 4, LT_IGNORE, + LT_IGNORE)) { + aOutString.AssignLiteral("http://"); + aOutString += aInString; + } + } +} + +bool mozTXTToHTMLConv::FindURLStart(const char16_t* aInString, + int32_t aInLength, const uint32_t pos, + const modetype check, uint32_t& start) { + switch (check) { // no breaks, because end of blocks is never reached + case RFC1738: { + if (!NS_strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u"<URL:", 5)) { + start = pos + 1; + return true; + } + return false; + } + case RFC2396E: { + nsDependentSubstring temp(aInString, aInLength); + int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1); + if (i != kNotFound && + (temp[uint32_t(i)] == '<' || temp[uint32_t(i)] == '"')) { + start = uint32_t(++i); + return start < pos; + } + return false; + } + case freetext: { + int32_t i = pos - 1; + for (; i >= 0 && + (IsAsciiAlpha(aInString[uint32_t(i)]) || + IsAsciiDigit(aInString[uint32_t(i)]) || + aInString[uint32_t(i)] == '+' || aInString[uint32_t(i)] == '-' || + aInString[uint32_t(i)] == '.'); + i--) { + ; + } + if (++i >= 0 && uint32_t(i) < pos && + IsAsciiAlpha(aInString[uint32_t(i)])) { + start = uint32_t(i); + return true; + } + return false; + } + case abbreviated: { + int32_t i = pos - 1; + // This disallows non-ascii-characters for email. + // Currently correct, but revisit later after standards changed. + bool isEmail = aInString[pos] == (char16_t)'@'; + // These chars mark the start of the URL + for (; i >= 0 && aInString[uint32_t(i)] != '>' && + aInString[uint32_t(i)] != '<' && aInString[uint32_t(i)] != '"' && + aInString[uint32_t(i)] != '\'' && aInString[uint32_t(i)] != '`' && + aInString[uint32_t(i)] != ',' && aInString[uint32_t(i)] != '{' && + aInString[uint32_t(i)] != '[' && aInString[uint32_t(i)] != '(' && + aInString[uint32_t(i)] != '|' && aInString[uint32_t(i)] != '\\' && + !IsSpace(aInString[uint32_t(i)]) && + (!isEmail || IsAscii(aInString[uint32_t(i)])) && + (!isEmail || aInString[uint32_t(i)] != ')'); + i--) { + ; + } + if (++i >= 0 && uint32_t(i) < pos && + (IsAsciiAlpha(aInString[uint32_t(i)]) || + IsAsciiDigit(aInString[uint32_t(i)]))) { + start = uint32_t(i); + return true; + } + return false; + } + default: + return false; + } // switch +} + +bool mozTXTToHTMLConv::FindURLEnd(const char16_t* aInString, + int32_t aInStringLength, const uint32_t pos, + const modetype check, const uint32_t start, + uint32_t& end) { + switch (check) { // no breaks, because end of blocks is never reached + case RFC1738: + case RFC2396E: { + nsDependentSubstring temp(aInString, aInStringLength); + + int32_t i = temp.FindCharInSet(u"<>\"", pos + 1); + if (i != kNotFound && + temp[uint32_t(i--)] == + (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"')) { + end = uint32_t(i); + return end > pos; + } + return false; + } + case freetext: + case abbreviated: { + uint32_t i = pos + 1; + bool isEmail = aInString[pos] == (char16_t)'@'; + bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL + bool seenOpeningSquareBracket = + false; // there is a '[' earlier in the URL + for (; int32_t(i) < aInStringLength; i++) { + // These chars mark the end of the URL + if (aInString[i] == '>' || aInString[i] == '<' || aInString[i] == '"' || + aInString[i] == '`' || aInString[i] == '}' || aInString[i] == '{' || + (aInString[i] == ')' && !seenOpeningParenthesis) || + (aInString[i] == ']' && !seenOpeningSquareBracket) || + // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo. + (aInString[i] == '[' && i > 2 && + (aInString[i - 1] != '/' || aInString[i - 2] != '/')) || + IsSpace(aInString[i])) { + break; + } + // Disallow non-ascii-characters for email. + // Currently correct, but revisit later after standards changed. + if (isEmail && (aInString[i] == '(' || aInString[i] == '\'' || + !IsAscii(aInString[i]))) { + break; + } + if (aInString[i] == '(') seenOpeningParenthesis = true; + if (aInString[i] == '[') seenOpeningSquareBracket = true; + } + // These chars are allowed in the middle of the URL, but not at end. + // Technically they are, but are used in normal text after the URL. + while (--i > pos && (aInString[i] == '.' || aInString[i] == ',' || + aInString[i] == ';' || aInString[i] == '!' || + aInString[i] == '?' || aInString[i] == '-' || + aInString[i] == ':' || aInString[i] == '\'')) { + ; + } + if (i > pos) { + end = i; + return true; + } + return false; + } + default: + return false; + } // switch +} + +void mozTXTToHTMLConv::CalculateURLBoundaries( + const char16_t* aInString, int32_t aInStringLength, const uint32_t pos, + const uint32_t whathasbeendone, const modetype check, const uint32_t start, + const uint32_t end, nsString& txtURL, nsString& desc, + int32_t& replaceBefore, int32_t& replaceAfter) { + uint32_t descstart = start; + switch (check) { + case RFC1738: { + descstart = start - 5; + desc.Append(&aInString[descstart], + end - descstart + 2); // include "<URL:" and ">" + replaceAfter = end - pos + 1; + } break; + case RFC2396E: { + descstart = start - 1; + desc.Append(&aInString[descstart], + end - descstart + 2); // include brackets + replaceAfter = end - pos + 1; + } break; + case freetext: + case abbreviated: { + descstart = start; + desc.Append(&aInString[descstart], + end - start + 1); // don't include brackets + replaceAfter = end - pos; + } break; + default: + break; + } // switch + + EscapeStr(desc, false); + + txtURL.Append(&aInString[start], end - start + 1); + txtURL.StripWhitespace(); + + // FIX ME + nsAutoString temp2; + ScanTXT(nsDependentSubstring(&aInString[descstart], pos - descstart), + ~kURLs /*prevents loop*/ & whathasbeendone, temp2); + replaceBefore = temp2.Length(); +} + +bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL) { + if (!mIOService) return false; + + nsAutoCString scheme; + nsresult rv = mIOService->ExtractScheme(aURL, scheme); + if (NS_FAILED(rv)) return false; + + if (scheme == "http" || scheme == "https" || scheme == "mailto") { + return true; + } + + // Get the handler for this scheme. + nsCOMPtr<nsIProtocolHandler> handler; + rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler)); + if (NS_FAILED(rv)) return false; + + // Is it an external protocol handler? If not, linkify it. + nsCOMPtr<nsIExternalProtocolHandler> externalHandler = + do_QueryInterface(handler); + if (!externalHandler) return true; // handler is built-in, linkify it! + + // If external app exists for the scheme then linkify it. + bool exists; + rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists); + return (NS_SUCCEEDED(rv) && exists); +} + +bool mozTXTToHTMLConv::CheckURLAndCreateHTML(const nsString& txtURL, + const nsString& desc, + const modetype mode, + nsString& outputHTML) { + // Create *uri from txtURL + nsCOMPtr<nsIURI> uri; + nsresult rv; + // Lazily initialize mIOService + if (!mIOService) { + mIOService = do_GetIOService(); + + if (!mIOService) return false; + } + + // See if the url should be linkified. + NS_ConvertUTF16toUTF8 utf8URL(txtURL); + if (!ShouldLinkify(utf8URL)) return false; + + // it would be faster if we could just check to see if there is a protocol + // handler for the url and return instead of actually trying to create a + // url... + rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri)); + + // Real work + if (NS_SUCCEEDED(rv) && uri) { + outputHTML.AssignLiteral("<a class=\"moz-txt-link-"); + switch (mode) { + case RFC1738: + outputHTML.AppendLiteral("rfc1738"); + break; + case RFC2396E: + outputHTML.AppendLiteral("rfc2396E"); + break; + case freetext: + outputHTML.AppendLiteral("freetext"); + break; + case abbreviated: + outputHTML.AppendLiteral("abbreviated"); + break; + default: + break; + } + nsAutoString escapedURL(txtURL); + EscapeStr(escapedURL, true); + + outputHTML.AppendLiteral("\" href=\""); + outputHTML += escapedURL; + outputHTML.AppendLiteral("\">"); + outputHTML += desc; + outputHTML.AppendLiteral("</a>"); + return true; + } + return false; +} + +NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t* aInString, + int32_t aInLength, + int32_t aPos, + int32_t* aStartPos, + int32_t* aEndPos) { + // call FindURL on the passed in string + nsAutoString outputHTML; // we'll ignore the generated output HTML + + *aStartPos = -1; + *aEndPos = -1; + + FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos); + + return NS_OK; +} + +bool mozTXTToHTMLConv::FindURL(const char16_t* aInString, int32_t aInLength, + const uint32_t pos, + const uint32_t whathasbeendone, + nsString& outputHTML, int32_t& replaceBefore, + int32_t& replaceAfter) { + enum statetype { unchecked, invalid, startok, endok, success }; + static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated}; + + statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode + /* I don't like this abuse of enums as index for the array, + but I don't know a better method */ + + // Define, which modes to check + /* all modes but abbreviated are checked for text[pos] == ':', + only abbreviated for '.', RFC2396E and abbreviated for '@' */ + for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode; + iState = modetype(iState + 1)) { + state[iState] = aInString[pos] == ':' ? unchecked : invalid; + } + switch (aInString[pos]) { + case '@': + state[RFC2396E] = unchecked; + [[fallthrough]]; + case '.': + state[abbreviated] = unchecked; + break; + case ':': + state[abbreviated] = invalid; + break; + default: + break; + } + + // Test, first successful mode wins, sequence defined by |ranking| + int32_t iCheck = 0; // the currently tested modetype + modetype check = ranking[iCheck]; + for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success; + iCheck++) + /* check state from last run. + If this is the first, check this one, which isn't = success yet */ + { + check = ranking[iCheck]; + + uint32_t start, end; + + if (state[check] == unchecked) { + if (FindURLStart(aInString, aInLength, pos, check, start)) { + state[check] = startok; + } + } + + if (state[check] == startok) { + if (FindURLEnd(aInString, aInLength, pos, check, start, end)) { + state[check] = endok; + } + } + + if (state[check] == endok) { + nsAutoString txtURL, desc; + int32_t resultReplaceBefore, resultReplaceAfter; + + CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check, + start, end, txtURL, desc, resultReplaceBefore, + resultReplaceAfter); + + if (aInString[pos] != ':') { + nsAutoString temp = txtURL; + txtURL.SetLength(0); + CompleteAbbreviatedURL(temp.get(), temp.Length(), pos - start, txtURL); + } + + if (!txtURL.IsEmpty() && + CheckURLAndCreateHTML(txtURL, desc, check, outputHTML)) { + replaceBefore = resultReplaceBefore; + replaceAfter = resultReplaceAfter; + state[check] = success; + } + } // if + } // for + return state[check] == success; +} + +static inline bool IsAlpha(const uint32_t aChar) { + return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kLetter; +} + +static inline bool IsDigit(const uint32_t aChar) { + return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kNumber; +} + +bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString, + int32_t aInLength, + const char16_t* rep, int32_t aRepLen, + LIMTYPE before, LIMTYPE after) { + // this little method gets called a LOT. I found we were spending a + // lot of time just calculating the length of the variable "rep" + // over and over again every time we called it. So we're now passing + // an integer in here. + int32_t textLen = aInLength; + + if (((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER)) && + textLen < aRepLen) || + ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER)) && + textLen < aRepLen + 1) || + (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER && + textLen < aRepLen + 2)) { + return false; + } + + uint32_t text0 = aInString[0]; + if (aInLength > 1 && NS_IS_SURROGATE_PAIR(text0, aInString[1])) { + text0 = SURROGATE_TO_UCS4(text0, aInString[1]); + } + // find length of the char/cluster to be ignored + int32_t ignoreLen = before == LT_IGNORE ? 0 : 1; + if (ignoreLen) { + GraphemeClusterBreakIteratorUtf16 ci( + Span<const char16_t>(aInString, aInLength)); + ignoreLen = *ci.Next(); + } + + int32_t afterIndex = aRepLen + ignoreLen; + uint32_t textAfterPos = aInString[afterIndex]; + if (aInLength > afterIndex + 1 && + NS_IS_SURROGATE_PAIR(textAfterPos, aInString[afterIndex + 1])) { + textAfterPos = SURROGATE_TO_UCS4(textAfterPos, aInString[afterIndex + 1]); + } + + return !((before == LT_ALPHA && !IsAlpha(text0)) || + (before == LT_DIGIT && !IsDigit(text0)) || + (before == LT_DELIMITER && + (IsAlpha(text0) || IsDigit(text0) || text0 == *rep)) || + (after == LT_ALPHA && !IsAlpha(textAfterPos)) || + (after == LT_DIGIT && !IsDigit(textAfterPos)) || + (after == LT_DELIMITER && + (IsAlpha(textAfterPos) || IsDigit(textAfterPos) || + textAfterPos == *rep)) || + !Substring(Substring(aInString, aInString + aInLength), ignoreLen, + aRepLen) + .Equals(Substring(rep, rep + aRepLen), + nsCaseInsensitiveStringComparator)); +} + +uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString, + int32_t aInStringLength, + const char16_t* rep, int32_t aRepLen, + LIMTYPE before, LIMTYPE after) { + uint32_t result = 0; + + // Limit lookahead length to avoid pathological O(n^2) behavior; looking so + // far ahead is unlikely to be important for cases where styling marked-up + // fragments is actually useful anyhow. + const uint32_t len = + std::min(2000u, mozilla::AssertedCast<uint32_t>(aInStringLength)); + GraphemeClusterBreakIteratorUtf16 ci(Span<const char16_t>(aInString, len)); + for (uint32_t pos = 0; pos < len; pos = *ci.Next()) { + if (ItMatchesDelimited(aInString + pos, aInStringLength - pos, rep, aRepLen, + before, after)) { + result++; + } + } + return result; +} + +// NOTE: the converted html for the phrase is appended to aOutString +// tagHTML and attributeHTML are plain ASCII (literal strings, in fact) +bool mozTXTToHTMLConv::StructPhraseHit( + const char16_t* aInString, int32_t aInStringLength, bool col0, + const char16_t* tagTXT, int32_t aTagTXTLen, const char* tagHTML, + const char* attributeHTML, nsAString& aOutString, uint32_t& openTags) { + /* We're searching for the following pattern: + LT_DELIMITER - "*" - ALPHA - + [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER. + <strong> is only inserted, if existence of a pair could be verified + We use the first opening/closing tag, if we can choose */ + + const char16_t* newOffset = aInString; + int32_t newLength = aInStringLength; + if (!col0) // skip the first element? + { + newOffset = &aInString[1]; + newLength = aInStringLength - 1; + } + + // opening tag + if (ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen, + (col0 ? LT_IGNORE : LT_DELIMITER), + LT_ALPHA) // is opening tag + && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen, LT_ALPHA, + LT_DELIMITER) // remaining closing tags + > openTags) { + openTags++; + aOutString.Append('<'); + aOutString.AppendASCII(tagHTML); + aOutString.Append(char16_t(' ')); + aOutString.AppendASCII(attributeHTML); + aOutString.AppendLiteral("><span class=\"moz-txt-tag\">"); + aOutString.Append(tagTXT); + aOutString.AppendLiteral("</span>"); + return true; + } + + // closing tag + if (openTags > 0 && ItMatchesDelimited(aInString, aInStringLength, tagTXT, + aTagTXTLen, LT_ALPHA, LT_DELIMITER)) { + openTags--; + aOutString.AppendLiteral("<span class=\"moz-txt-tag\">"); + aOutString.Append(tagTXT); + aOutString.AppendLiteral("</span></"); + aOutString.AppendASCII(tagHTML); + aOutString.Append(char16_t('>')); + return true; + } + + return false; +} + +bool mozTXTToHTMLConv::SmilyHit(const char16_t* aInString, int32_t aLength, + bool col0, const char* tagTXT, + const nsString& imageName, nsString& outputHTML, + int32_t& glyphTextLen) { + if (!aInString || !tagTXT || imageName.IsEmpty()) return false; + + int32_t tagLen = strlen(tagTXT); + + uint32_t delim = (col0 ? 0 : 1) + tagLen; + + if ((col0 || IsSpace(aInString[0])) && + (aLength <= int32_t(delim) || IsSpace(aInString[delim]) || + (aLength > int32_t(delim + 1) && + (aInString[delim] == '.' || aInString[delim] == ',' || + aInString[delim] == ';' || aInString[delim] == '8' || + aInString[delim] == '>' || aInString[delim] == '!' || + aInString[delim] == '?') && + IsSpace(aInString[delim + 1]))) && + ItMatchesDelimited(aInString, aLength, + NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen, + col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE) + // Note: tests at different pos for LT_IGNORE and LT_DELIMITER + ) { + if (!col0) { + outputHTML.Truncate(); + outputHTML.Append(char16_t(' ')); + } + + outputHTML.Append(imageName); // emoji unicode + glyphTextLen = (col0 ? 0 : 1) + tagLen; + return true; + } + + return false; +} + +// the glyph is appended to aOutputString instead of the original string... +bool mozTXTToHTMLConv::GlyphHit(const char16_t* aInString, int32_t aInLength, + bool col0, nsAString& aOutputString, + int32_t& glyphTextLen) { + char16_t text0 = aInString[0]; + char16_t text1 = aInString[1]; + char16_t firstChar = (col0 ? text0 : text1); + + // temporary variable used to store the glyph html text + nsAutoString outputHTML; + bool bTestSmilie; + bool bArg = false; + int i; + + // refactor some of this mess to avoid code duplication and speed execution a + // bit there are two cases that need to be tried one after another. To avoid a + // lot of duplicate code, rolling into a loop + + i = 0; + while (i < 2) { + bTestSmilie = false; + if (!i && (firstChar == ':' || firstChar == ';' || firstChar == '=' || + firstChar == '>' || firstChar == '8' || firstChar == 'O')) { + // first test passed + + bTestSmilie = true; + bArg = col0; + } + if (i && col0 && + (text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' || + text1 == '8' || text1 == 'O')) { + // second test passed + + bTestSmilie = true; + bArg = false; + } + if (bTestSmilie && (SmilyHit(aInString, aInLength, bArg, ":-)", + u"🙂"_ns, // smile, U+1F642 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":)", + u"🙂"_ns, // smile, U+1F642 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-D", + u"😂"_ns, // laughing, U+1F602 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-(", + u"🙁"_ns, // frown, U+1F641 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":(", + u"🙁"_ns, // frown, U+1F641 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":$", + u"😳"_ns, // embarassed, U+1F633 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ";-)", + u"😉"_ns, // wink, U+1F609 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, col0, ";)", + u"😉"_ns, // wink, U+1F609 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-\\", + u"😕"_ns, // undecided, U+1F615 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-P", + u"😛"_ns, // tongue, U+1F61B + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ";-P", + u"😜"_ns, // winking face with tongue, U+1F61C + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, "=-O", + u"😮"_ns, // surprise, U+1F62E + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-*", + u"😘"_ns, // kiss, U+1F618 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ">:o", + u"🤬"_ns, // swearing, U+1F92C + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ">:-o", + u"🤬"_ns, // swearing, U+1F92C + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ">:(", + u"😠"_ns, // angry, U+1F620 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ">:-(", + u"😠"_ns, // angry, U+1F620 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, "8-)", + u"😎"_ns, // cool, U+1F60E + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-$", + u"🤑"_ns, // money, U+1F911 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-!", + u"😬"_ns, // foot, U+1F62C + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, "O:-)", + u"😇"_ns, // innocent, U+1F607 + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":'(", + u"😭"_ns, // cry, U+1F62D + outputHTML, glyphTextLen) || + + SmilyHit(aInString, aInLength, bArg, ":-X", + u"🤐"_ns, // sealed, U+1F910 + outputHTML, glyphTextLen))) { + aOutputString.Append(outputHTML); + return true; + } + i++; + } + if (text0 == '\f') { + aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>"); + glyphTextLen = 1; + return true; + } + if (text0 == '+' || text1 == '+') { + if (ItMatchesDelimited(aInString, aInLength, u" +/-", 4, LT_IGNORE, + LT_IGNORE)) { + aOutputString.AppendLiteral(" ±"); + glyphTextLen = 4; + return true; + } + if (col0 && ItMatchesDelimited(aInString, aInLength, u"+/-", 3, LT_IGNORE, + LT_IGNORE)) { + aOutputString.AppendLiteral("±"); + glyphTextLen = 3; + return true; + } + } + + // x^2 => x<sup>2</sup>, also handle powers x^-2, x^0.5 + // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/ + if (text1 == '^' && + (IsAsciiDigit(text0) || IsAsciiAlpha(text0) || text0 == ')' || + text0 == ']' || text0 == '}') && + ((2 < aInLength && IsAsciiDigit(aInString[2])) || + (3 < aInLength && aInString[2] == '-' && IsAsciiDigit(aInString[3])))) { + // Find first non-digit + int32_t delimPos = 3; // skip "^" and first digit (or '-') + for (; delimPos < aInLength && + (IsAsciiDigit(aInString[delimPos]) || + (aInString[delimPos] == '.' && delimPos + 1 < aInLength && + IsAsciiDigit(aInString[delimPos + 1]))); + delimPos++) { + ; + } + + if (delimPos < aInLength && IsAsciiAlpha(aInString[delimPos])) { + return false; + } + + outputHTML.Truncate(); + outputHTML += text0; + outputHTML.AppendLiteral( + "<sup class=\"moz-txt-sup\">" + "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">" + "^</span>"); + + aOutputString.Append(outputHTML); + aOutputString.Append(&aInString[2], delimPos - 2); + aOutputString.AppendLiteral("</sup>"); + + glyphTextLen = delimPos /* - 1 + 1 */; + return true; + } + /* + The following strings are not substituted: + |TXT |HTML |Reason + +------+---------+---------- + -> ← Bug #454 + => ⇐ dito + <- → dito + <= ⇒ dito + (tm) ™ dito + 1/4 ¼ is triggered by 1/4 Part 1, 2/4 Part 2, ... + 3/4 ¾ dito + 1/2 ½ similar + */ + return false; +} + +/*************************************************************************** + Library-internal Interface +****************************************************************************/ + +NS_IMPL_ISUPPORTS(mozTXTToHTMLConv, mozITXTToHTMLConv, nsIStreamConverter, + nsIThreadRetargetableStreamListener, nsIStreamListener, + nsIRequestObserver) + +int32_t mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line, + uint32_t& logLineStart) { + int32_t result = 0; + int32_t lineLength = NS_strlen(line); + + bool moreCites = true; + while (moreCites) { + /* E.g. the following lines count as quote: + + > text + //#ifdef QUOTE_RECOGNITION_AGGRESSIVE + >text + //#ifdef QUOTE_RECOGNITION_AGGRESSIVE + > text + ] text + USER> text + USER] text + //#endif + + logLineStart is the position of "t" in this example + */ + uint32_t i = logLineStart; + +#ifdef QUOTE_RECOGNITION_AGGRESSIVE + for (; int32_t(i) < lineLength && IsSpace(line[i]); i++) + ; + for (; int32_t(i) < lineLength && IsAsciiAlpha(line[i]) && + nsCRT::IsUpper(line[i]); + i++) + ; + if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']')) +#else + if (int32_t(i) < lineLength && line[i] == '>') +#endif + { + i++; + if (int32_t(i) < lineLength && line[i] == ' ') i++; + // sendmail/mbox + // Placed here for performance increase + const char16_t* indexString = &line[logLineStart]; + // here, |logLineStart < lineLength| is always true + uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString)); + if (Substring(indexString, indexString + minlength) + .Equals(Substring(u">From "_ns, 0, minlength), + nsCaseInsensitiveStringComparator)) { + // XXX RFC2646 + moreCites = false; + } else { + result++; + logLineStart = i; + } + } else { + moreCites = false; + } + } + + return result; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo, + nsAString& aOutString) { + if (aInString.Length() == 0) { + aOutString.Truncate(); + return NS_OK; + } + + if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate), + mozilla::fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + + bool doURLs = 0 != (whattodo & kURLs); + bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution); + bool doStructPhrase = 0 != (whattodo & kStructPhrase); + + uint32_t structPhrase_strong = 0; // Number of currently open tags + uint32_t structPhrase_underline = 0; + uint32_t structPhrase_italic = 0; + uint32_t structPhrase_code = 0; + + uint32_t endOfLastURLOutput = 0; + + nsAutoString outputHTML; // moved here for performance increase + + const char16_t* rawInputString = aInString.BeginReading(); + uint32_t inLength = aInString.Length(); + + const Span<const char16_t> inString(aInString); + GraphemeClusterBreakIteratorUtf16 ci(inString); + uint32_t i = 0; + while (i < inLength) { + if (doGlyphSubstitution) { + int32_t glyphTextLen; + if (GlyphHit(&rawInputString[i], inLength - i, i == 0, aOutString, + glyphTextLen)) { + i = *ci.Seek(i + glyphTextLen - 1); + continue; + } + } + + if (doStructPhrase) { + const char16_t* newOffset = rawInputString; + int32_t newLength = aInString.Length(); + if (i > 0) // skip the first element? + { + GraphemeClusterBreakReverseIteratorUtf16 ri( + Span<const char16_t>(rawInputString, i)); + Maybe<uint32_t> nextPos = ri.Next(); + newOffset += *nextPos; + newLength -= *nextPos; + } + + switch (aInString[i]) // Performance increase + { + case '*': + if (StructPhraseHit(newOffset, newLength, i == 0, u"*", 1, "b", + "class=\"moz-txt-star\"", aOutString, + structPhrase_strong)) { + i = *ci.Next(); + continue; + } + break; + case '/': + if (StructPhraseHit(newOffset, newLength, i == 0, u"/", 1, "i", + "class=\"moz-txt-slash\"", aOutString, + structPhrase_italic)) { + i = *ci.Next(); + continue; + } + break; + case '_': + if (StructPhraseHit(newOffset, newLength, i == 0, u"_", 1, + "span" /* <u> is deprecated */, + "class=\"moz-txt-underscore\"", aOutString, + structPhrase_underline)) { + i = *ci.Next(); + continue; + } + break; + case '|': + if (StructPhraseHit(newOffset, newLength, i == 0, u"|", 1, "code", + "class=\"moz-txt-verticalline\"", aOutString, + structPhrase_code)) { + i = *ci.Next(); + continue; + } + break; + } + } + + if (doURLs) { + switch (aInString[i]) { + case ':': + case '@': + case '.': + if ((i == 0 || ((i > 0) && aInString[i - 1] != ' ')) && + ((i == aInString.Length() - 1) || + (aInString[i + 1] != ' '))) // Performance increase + { + int32_t replaceBefore; + int32_t replaceAfter; + if (FindURL(rawInputString, aInString.Length(), i, whattodo, + outputHTML, replaceBefore, replaceAfter) && + structPhrase_strong + structPhrase_italic + + structPhrase_underline + structPhrase_code == + 0 + /* workaround for bug #19445 */) { + // Don't cut into previously inserted HTML (bug 1509493) + if (aOutString.Length() - replaceBefore < endOfLastURLOutput) { + break; + } + aOutString.Cut(aOutString.Length() - replaceBefore, + replaceBefore); + aOutString += outputHTML; + endOfLastURLOutput = aOutString.Length(); + i = *ci.Seek(i + replaceAfter); + continue; + } + } + break; + } // switch + } + + switch (aInString[i]) { + // Special symbols + case '<': + case '>': + case '&': + EscapeChar(aInString[i], aOutString, false); + i = *ci.Next(); + break; + // Normal characters + default: { + const uint32_t oldIdx = i; + i = *ci.Next(); + aOutString.Append(inString.FromTo(oldIdx, i)); + break; + } + } + } + return NS_OK; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::ScanHTML(const nsAString& input, uint32_t whattodo, + nsAString& aOutString) { + const nsPromiseFlatString& aInString = PromiseFlatString(input); + if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate), + mozilla::fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + + // some common variables we were recalculating + // every time inside the for loop... + int32_t lengthOfInString = aInString.Length(); + const char16_t* uniBuffer = aInString.get(); + +#ifdef DEBUG_BenB_Perf + PRTime parsing_start = PR_IntervalNow(); +#endif + + // Look for simple entities not included in a tags and scan them. + // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"), + // comment tag ("<!--[...]-->"), style tag, script tag or head tag. + // Unescape the rest (text between tags) and pass it to ScanTXT. + nsAutoCString canFollow(" \f\n\r\t>"); + for (int32_t i = 0; i < lengthOfInString;) { + if (aInString[i] == '<') // html tag + { + int32_t start = i; + if (i + 2 < lengthOfInString && nsCRT::ToLower(aInString[i + 1]) == 'a' && + canFollow.FindChar(aInString[i + 2]) != kNotFound) + // if a tag, skip until </a>. + // Make sure there's a white-space character after, not to match "abbr". + { + i = aInString.LowerCaseFindASCII("</a>", i); + if (i == kNotFound) { + i = lengthOfInString; + } else { + i += 4; + } + } else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--")) + // if out-commended code, skip until --> + { + i = aInString.Find(u"-->", i); + if (i == kNotFound) { + i = lengthOfInString; + } else { + i += 3; + } + } else if (i + 6 < lengthOfInString && + Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") && + canFollow.FindChar(aInString[i + 6]) != kNotFound) + // if style tag, skip until </style> + { + i = aInString.LowerCaseFindASCII("</style>", i); + if (i == kNotFound) { + i = lengthOfInString; + } else { + i += 8; + } + } else if (i + 7 < lengthOfInString && + Substring(aInString, i + 1, 6) + .LowerCaseEqualsASCII("script") && + canFollow.FindChar(aInString[i + 7]) != kNotFound) + // if script tag, skip until </script> + { + i = aInString.LowerCaseFindASCII("</script>", i); + if (i == kNotFound) { + i = lengthOfInString; + } else { + i += 9; + } + } else if (i + 5 < lengthOfInString && + Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") && + canFollow.FindChar(aInString[i + 5]) != kNotFound) + // if head tag, skip until </head> + // Make sure not to match <header>. + { + i = aInString.LowerCaseFindASCII("</head>", i); + if (i == kNotFound) { + i = lengthOfInString; + } else { + i += 7; + } + } else // just skip tag (attributes etc.) + { + i = aInString.FindChar('>', i); + if (i == kNotFound) { + i = lengthOfInString; + } else { + i++; + } + } + aOutString.Append(&uniBuffer[start], i - start); + } else { + uint32_t start = uint32_t(i); + i = aInString.FindChar('<', i); + if (i == kNotFound) i = lengthOfInString; + + nsAutoStringN<256> tempString; + tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate)); + UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString); + ScanTXT(tempString, whattodo, aOutString); + } + } + +#ifdef DEBUG_BenB_Perf + printf("ScanHTML time: %d ms\n", + PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start)); +#endif + return NS_OK; +} + +/**************************************************************************** + XPCOM Interface +*****************************************************************************/ + +NS_IMETHODIMP +mozTXTToHTMLConv::Convert(nsIInputStream* aFromStream, const char* aFromType, + const char* aToType, nsISupports* aCtxt, + nsIInputStream** _retval) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::AsyncConvertData(const char* aFromType, const char* aToType, + nsIStreamListener* aListener, + nsISupports* aCtxt) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::GetConvertedType(const nsACString& aFromType, + nsIChannel* aChannel, nsACString& aToType) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsIInputStream* inStr, + uint64_t sourceOffset, uint32_t count) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::OnDataFinished(nsresult aStatus) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::CheckListenerChain() { return NS_ERROR_NOT_IMPLEMENTED; } + +NS_IMETHODIMP +mozTXTToHTMLConv::OnStartRequest(nsIRequest* request) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsresult aStatus) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line, uint32_t* logLineStart, + uint32_t* _retval) { + if (!logLineStart || !_retval || !line) return NS_ERROR_NULL_POINTER; + *_retval = CiteLevelTXT(line, *logLineStart); + return NS_OK; +} + +nsresult MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv) { + MOZ_ASSERT(aConv != nullptr, "null ptr"); + if (!aConv) return NS_ERROR_NULL_POINTER; + + RefPtr<mozTXTToHTMLConv> conv = new mozTXTToHTMLConv(); + conv.forget(aConv); + // return (*aConv)->Init(); + return NS_OK; +} |