diff options
Diffstat (limited to '')
-rw-r--r-- | comm/mailnews/search/src/nsMsgBodyHandler.cpp | 464 |
1 files changed, 464 insertions, 0 deletions
diff --git a/comm/mailnews/search/src/nsMsgBodyHandler.cpp b/comm/mailnews/search/src/nsMsgBodyHandler.cpp new file mode 100644 index 0000000000..5b77750f63 --- /dev/null +++ b/comm/mailnews/search/src/nsMsgBodyHandler.cpp @@ -0,0 +1,464 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "msgCore.h" +#include "nsMsgSearchCore.h" +#include "nsMsgUtils.h" +#include "nsMsgBodyHandler.h" +#include "nsMsgSearchTerm.h" +#include "nsIMsgHdr.h" +#include "nsMsgMessageFlags.h" +#include "nsISeekableStream.h" +#include "nsIInputStream.h" +#include "nsIFile.h" +#include "plbase64.h" +#include "prmem.h" +#include "nsMimeTypes.h" + +nsMsgBodyHandler::nsMsgBodyHandler(nsIMsgSearchScopeTerm* scope, + uint32_t numLines, nsIMsgDBHdr* msg, + nsIMsgDatabase* db) { + m_scope = scope; + m_numLocalLines = numLines; + uint32_t flags; + m_lineCountInBodyLines = NS_SUCCEEDED(msg->GetFlags(&flags)) + ? !(flags & nsMsgMessageFlags::Offline) + : true; + // account for added x-mozilla-status lines, and envelope line. + if (!m_lineCountInBodyLines) m_numLocalLines += 3; + m_msgHdr = msg; + m_db = db; + + // the following are variables used when the body handler is handling stuff + // from filters....through this constructor, that is not the case so we set + // them to NULL. + m_headers = nullptr; + m_headersSize = 0; + m_Filtering = false; // make sure we set this before we call initialize... + + Initialize(); // common initialization stuff + OpenLocalFolder(); +} + +nsMsgBodyHandler::nsMsgBodyHandler(nsIMsgSearchScopeTerm* scope, + uint32_t numLines, nsIMsgDBHdr* msg, + nsIMsgDatabase* db, const char* headers, + uint32_t headersSize, bool Filtering) { + m_scope = scope; + m_numLocalLines = numLines; + uint32_t flags; + m_lineCountInBodyLines = NS_SUCCEEDED(msg->GetFlags(&flags)) + ? !(flags & nsMsgMessageFlags::Offline) + : true; + // account for added x-mozilla-status lines, and envelope line. + if (!m_lineCountInBodyLines) m_numLocalLines += 3; + m_msgHdr = msg; + m_db = db; + m_headers = nullptr; + m_headersSize = 0; + m_Filtering = Filtering; + + Initialize(); + + if (m_Filtering) { + m_headers = headers; + m_headersSize = headersSize; + } else { + OpenLocalFolder(); + } +} + +void nsMsgBodyHandler::Initialize() +// common initialization code regardless of what body type we are handling... +{ + // Default transformations for local message search and MAPI access + m_stripHeaders = true; + m_partIsHtml = false; + m_base64part = false; + m_partIsQP = false; + m_isMultipart = false; + m_partIsText = true; // Default is text/plain, maybe proven otherwise later. + m_pastMsgHeaders = false; + m_pastPartHeaders = false; + m_inMessageAttachment = false; + m_headerBytesRead = 0; +} + +nsMsgBodyHandler::~nsMsgBodyHandler() {} + +int32_t nsMsgBodyHandler::GetNextLine(nsCString& buf, nsCString& charset) { + int32_t length = -1; // length of incoming line or -1 eof + int32_t outLength = -1; // length of outgoing line or -1 eof + bool eatThisLine = true; + nsAutoCString nextLine; + + while (eatThisLine) { + // first, handle the filtering case...this is easy.... + if (m_Filtering) { + length = GetNextFilterLine(nextLine); + } else { + // 3 cases: Offline IMAP, POP, or we are dealing with a news message.... + // Offline cases should be same as local mail cases, since we're going + // to store offline messages in berkeley format folders. + if (m_db) { + length = GetNextLocalLine(nextLine); // (2) POP + } + } + + if (length < 0) break; // eof in + + outLength = ApplyTransformations(nextLine, length, eatThisLine, buf); + } + + if (outLength < 0) return -1; // eof out + + // For non-multipart messages, the entire message minus headers is encoded + // ApplyTransformations can only decode a part + if (!m_isMultipart && m_base64part) { + Base64Decode(buf); + m_base64part = false; + // And reapply our transformations... + outLength = ApplyTransformations(buf, buf.Length(), eatThisLine, buf); + } + + // Process aggregated HTML. + if (!m_isMultipart && m_partIsHtml) { + StripHtml(buf); + outLength = buf.Length(); + } + + charset = m_partCharset; + return outLength; +} + +void nsMsgBodyHandler::OpenLocalFolder() { + nsCOMPtr<nsIInputStream> inputStream; + nsresult rv = m_scope->GetInputStream(m_msgHdr, getter_AddRefs(inputStream)); + // Warn and return if GetInputStream fails + NS_ENSURE_SUCCESS_VOID(rv); + m_fileLineStream = do_QueryInterface(inputStream); +} + +int32_t nsMsgBodyHandler::GetNextFilterLine(nsCString& buf) { + // m_nextHdr always points to the next header in the list....the list is NULL + // terminated... + uint32_t numBytesCopied = 0; + if (m_headersSize > 0) { + // #mscott. Ugly hack! filter headers list have CRs & LFs inside the NULL + // delimited list of header strings. It is possible to have: To NULL CR LF + // From. We want to skip over these CR/LFs if they start at the beginning of + // what we think is another header. + + while (m_headersSize > 0 && (m_headers[0] == '\r' || m_headers[0] == '\n' || + m_headers[0] == ' ' || m_headers[0] == '\0')) { + m_headers++; // skip over these chars... + m_headersSize--; + } + + if (m_headersSize > 0) { + numBytesCopied = strlen(m_headers) + 1; + buf.Assign(m_headers); + m_headers += numBytesCopied; + // be careful...m_headersSize is unsigned. Don't let it go negative or we + // overflow to 2^32....*yikes* + if (m_headersSize < numBytesCopied) + m_headersSize = 0; + else + m_headersSize -= numBytesCopied; // update # bytes we have read from + // the headers list + + return (int32_t)numBytesCopied; + } + } else if (m_headersSize == 0) { + buf.Truncate(); + } + return -1; +} + +// return -1 if no more local lines, length of next line otherwise. + +int32_t nsMsgBodyHandler::GetNextLocalLine(nsCString& buf) +// returns number of bytes copied +{ + if (m_numLocalLines) { + // I the line count is in body lines, only decrement once we have + // processed all the headers. Otherwise the line is not in body + // lines and we want to decrement for every line. + if (m_pastMsgHeaders || !m_lineCountInBodyLines) m_numLocalLines--; + // do we need to check the return value here? + if (m_fileLineStream) { + bool more = false; + nsresult rv = m_fileLineStream->ReadLine(buf, &more); + if (NS_SUCCEEDED(rv)) return buf.Length(); + } + } + + return -1; +} + +/** + * This method applies a sequence of transformations to the line. + * + * It applies the following sequences in order + * * Removes headers if the searcher doesn't want them + * (sets m_past*Headers) + * * Determines the current MIME type. + * (via SniffPossibleMIMEHeader) + * * Strips any HTML if the searcher doesn't want it + * * Strips non-text parts + * * Decodes any base64 part + * (resetting part variables: m_base64part, m_pastPartHeaders, m_partIsHtml, + * m_partIsText) + * + * @param line (in) the current line + * @param length (in) the length of said line + * @param eatThisLine (out) whether or not to ignore this line + * @param buf (inout) if m_base64part, the current part as needed for + * decoding; else, it is treated as an out param (a + * redundant version of line). + * @return the length of the line after applying transformations + */ +int32_t nsMsgBodyHandler::ApplyTransformations(const nsCString& line, + int32_t length, + bool& eatThisLine, + nsCString& buf) { + eatThisLine = false; + + if (!m_pastPartHeaders) // line is a line from the part headers + { + if (m_stripHeaders) eatThisLine = true; + + // We have already grabbed all worthwhile information from the headers, + // so there is no need to keep track of the current lines + buf.Assign(line); + + SniffPossibleMIMEHeader(buf); + + if (buf.IsEmpty() || buf.First() == '\r' || buf.First() == '\n') { + if (!m_inMessageAttachment) { + m_pastPartHeaders = true; + } else { + // We're in a message attachment and have just read past the + // part header for the attached message. We now need to read + // the message headers and any part headers. + // We can now forget about the special handling of attached messages. + m_inMessageAttachment = false; + } + } + + // We set m_pastMsgHeaders to 'true' only once. + if (m_pastPartHeaders) m_pastMsgHeaders = true; + + return length; + } + + // Check to see if this is one of our boundary strings. + bool matchedBoundary = false; + if (m_isMultipart && m_boundaries.Length() > 0) { + for (int32_t i = (int32_t)m_boundaries.Length() - 1; i >= 0; i--) { + if (StringBeginsWith(line, m_boundaries[i])) { + matchedBoundary = true; + // If we matched a boundary, we won't need the nested/later ones any + // more. + m_boundaries.SetLength(i + 1); + break; + } + } + } + if (matchedBoundary) { + if (m_base64part && m_partIsText) { + Base64Decode(buf); + // Work on the parsed string + if (!buf.Length()) { + NS_WARNING("Trying to transform an empty buffer"); + eatThisLine = true; + } else { + // It is wrong to call ApplyTransformations() here since this will + // lead to the buffer being doubled-up at |buf.Append(line);| + // below. ApplyTransformations(buf, buf.Length(), eatThisLine, buf); + // Avoid spurious failures + eatThisLine = false; + } + } else if (!m_partIsHtml) { + buf.Truncate(); + eatThisLine = true; // We have no content... + } + + if (m_partIsHtml) { + StripHtml(buf); + } + + // Reset all assumed headers + m_base64part = false; + // Get ready to sniff new part headers, but do not reset m_pastMsgHeaders + // since it will screw the body line count. + m_pastPartHeaders = false; + m_partIsHtml = false; + // If we ever see a multipart message, each part needs to set + // 'm_partIsText', so no more defaulting to 'true' when the part is done. + m_partIsText = false; + + // Note: we cannot reset 'm_partIsQP' yet since we still need it to process + // the last buffer returned here. Parsing the next part will set a new + // value. + return buf.Length(); + } + + if (!m_partIsText) { + // Ignore non-text parts + buf.Truncate(); + eatThisLine = true; + return 0; + } + + // Accumulate base64 parts and HTML parts for later decoding or tag stripping. + if (m_base64part || m_partIsHtml) { + if (m_partIsHtml && !m_base64part) { + size_t bufLength = buf.Length(); + if (!m_partIsQP || bufLength == 0 || !StringEndsWith(buf, "="_ns)) { + // Replace newline in HTML with a space. + buf.Append(' '); + } else { + // Strip the soft line break. + buf.SetLength(bufLength - 1); + } + } + buf.Append(line); + eatThisLine = true; + return buf.Length(); + } + + buf.Assign(line); + return buf.Length(); +} + +void nsMsgBodyHandler::StripHtml(nsCString& pBufInOut) { + char* pBuf = (char*)PR_Malloc(pBufInOut.Length() + 1); + if (pBuf) { + char* pWalk = pBuf; + + char* pWalkInOut = (char*)pBufInOut.get(); + bool inTag = false; + while (*pWalkInOut) // throw away everything inside < > + { + if (!inTag) { + if (*pWalkInOut == '<') + inTag = true; + else + *pWalk++ = *pWalkInOut; + } else { + if (*pWalkInOut == '>') inTag = false; + } + pWalkInOut++; + } + *pWalk = 0; // null terminator + + pBufInOut.Adopt(pBuf); + } +} + +/** + * Determines the MIME type, if present, from the current line. + * + * m_partIsHtml, m_isMultipart, m_partIsText, m_base64part, and boundary are + * all set by this method at various points in time. + * + * @param line (in) a header line that may contain a MIME header + */ +void nsMsgBodyHandler::SniffPossibleMIMEHeader(const nsCString& line) { + // Some parts of MIME are case-sensitive and other parts are case-insensitive; + // specifically, the headers are all case-insensitive and the values we care + // about are also case-insensitive, with the sole exception of the boundary + // string, so we can't just take the input line and make it lower case. + nsCString lowerCaseLine(line); + ToLowerCase(lowerCaseLine); + + if (StringBeginsWith(lowerCaseLine, "content-transfer-encoding:"_ns)) + m_partIsQP = lowerCaseLine.Find("quoted-printable") != kNotFound; + + if (StringBeginsWith(lowerCaseLine, "content-type:"_ns)) { + if (lowerCaseLine.LowerCaseFindASCII("text/html") != kNotFound) { + m_partIsText = true; + m_partIsHtml = true; + } else if (lowerCaseLine.Find("multipart/") != kNotFound) { + if (m_isMultipart) { + // Nested multipart, get ready for new headers. + m_base64part = false; + m_partIsQP = false; + m_pastPartHeaders = false; + m_partIsHtml = false; + m_partIsText = false; + } + m_isMultipart = true; + m_partCharset.Truncate(); + } else if (lowerCaseLine.Find("message/") != kNotFound) { + // Initialise again. + m_base64part = false; + m_partIsQP = false; + m_pastPartHeaders = false; + m_partIsHtml = false; + m_partIsText = + true; // Default is text/plain, maybe proven otherwise later. + m_inMessageAttachment = true; + } else if (lowerCaseLine.Find("text/") != kNotFound) + m_partIsText = true; + else if (lowerCaseLine.Find("text/") == kNotFound) + m_partIsText = false; // We have disproven our assumption. + } + + int32_t start; + if (m_isMultipart && (start = lowerCaseLine.Find("boundary=")) != kNotFound) { + start += 9; // strlen("boundary=") + if (line[start] == '\"') start++; + int32_t end = line.RFindChar('\"'); + if (end == -1) end = line.Length(); + + // Collect all boundaries. Since we only react to crossing a boundary, + // we can simply collect the boundaries instead of forming a tree + // structure from the message. Keep it simple ;-) + nsCString boundary; + boundary.AssignLiteral("--"); + boundary.Append(Substring(line, start, end - start)); + if (!m_boundaries.Contains(boundary)) m_boundaries.AppendElement(boundary); + } + + if (m_isMultipart && (start = lowerCaseLine.Find("charset=")) != kNotFound) { + start += 8; // strlen("charset=") + bool foundQuote = false; + if (line[start] == '\"') { + start++; + foundQuote = true; + } + int32_t end = line.FindChar(foundQuote ? '\"' : ';', start); + if (end == -1) end = line.Length(); + + m_partCharset.Assign(Substring(line, start, end - start)); + } + + if (StringBeginsWith(lowerCaseLine, "content-transfer-encoding:"_ns) && + lowerCaseLine.LowerCaseFindASCII(ENCODING_BASE64) != kNotFound) + m_base64part = true; +} + +/** + * Decodes the given base64 string. + * + * It returns its decoded string in its input. + * + * @param pBufInOut (inout) a buffer of the string + */ +void nsMsgBodyHandler::Base64Decode(nsCString& pBufInOut) { + char* decodedBody = + PL_Base64Decode(pBufInOut.get(), pBufInOut.Length(), nullptr); + if (decodedBody) { + // Replace CR LF with spaces. + char* q = decodedBody; + while (*q) { + if (*q == '\n' || *q == '\r') *q = ' '; + q++; + } + pBufInOut.Adopt(decodedBody); + } +} |