From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 19:32:43 +0200
Subject: Adding upstream version 1:115.7.0.

Signed-off-by: Daniel Baumann
---
 .../bayesian-spam-filter/components.conf           |   15 +
 .../extensions/bayesian-spam-filter/moz.build      |   16 +
 .../bayesian-spam-filter/nsBayesianFilter.cpp      | 2548 ++++++++++++++++++++
 .../bayesian-spam-filter/nsBayesianFilter.h        |  397 +++
 .../bayesian-spam-filter/nsIncompleteGamma.h       |  239 ++
 .../bayesian-spam-filter/test/unit/head_bayes.js   |   28 +
 .../test/unit/resources/aliases.dat                |  Bin 0 -> 446 bytes
 .../test/unit/resources/aliases1.eml               |    6 +
 .../test/unit/resources/aliases2.eml               |    6 +
 .../test/unit/resources/aliases3.eml               |    6 +
 .../test/unit/resources/ham1.eml                   |    7 +
 .../test/unit/resources/ham2.eml                   |    8 +
 .../test/unit/resources/msgCorpus.dat              |  Bin 0 -> 2447 bytes
 .../test/unit/resources/spam1.eml                  |    7 +
 .../test/unit/resources/spam2.eml                  |    8 +
 .../test/unit/resources/spam3.eml                  |    7 +
 .../test/unit/resources/spam4.eml                  |    8 +
 .../test/unit/resources/tokenTest.eml              |   14 +
 .../test/unit/resources/trainingfile.js            |  108 +
 .../test/unit/test_bug228675.js                    |  136 ++
 .../test/unit/test_customTokenization.js           |  197 ++
 .../test/unit/test_junkAsTraits.js                 |  574 +++++
 .../test/unit/test_msgCorpus.js                    |  144 ++
 .../test/unit/test_traitAliases.js                 |  172 ++
 .../bayesian-spam-filter/test/unit/test_traits.js  |  287 +++
 .../bayesian-spam-filter/test/unit/xpcshell.ini    |   11 +
 26 files changed, 4949 insertions(+)
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/components.conf
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/moz.build
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.cpp
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/nsIncompleteGamma.h
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/head_bayes.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases.dat
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases1.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases2.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases3.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham1.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham2.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/msgCorpus.dat
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam1.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam2.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam3.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam4.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/tokenTest.eml
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/trainingfile.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_customTokenization.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_junkAsTraits.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_msgCorpus.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_traitAliases.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_traits.js
 create mode 100644 comm/mailnews/extensions/bayesian-spam-filter/test/unit/xpcshell.ini

diff --git a/comm/mailnews/extensions/bayesian-spam-filter/components.conf b/comm/mailnews/extensions/bayesian-spam-filter/components.conf
new file mode 100644
index 0000000000..98fe2d6aeb
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/components.conf
@@ -0,0 +1,15 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, you can obtain one at http://mozilla.org/MPL/2.0/.
+
+Classes = [
+    {
+        "cid": "{f1070bfa-d539-11d6-90ca-00039310a47a}",
+        "contract_ids": ["@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter"],
+        "type": "nsBayesianFilter",
+        "init_method": "Init",
+        "headers": [
+            "/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h"
+        ],
+    },
+]
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/moz.build b/comm/mailnews/extensions/bayesian-spam-filter/moz.build
new file mode 100644
index 0000000000..329fdcafa4
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/moz.build
@@ -0,0 +1,16 @@
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+SOURCES += [
+    "nsBayesianFilter.cpp",
+]
+
+FINAL_LIBRARY = "mail"
+
+XPCSHELL_TESTS_MANIFESTS += ["test/unit/xpcshell.ini"]
+
+XPCOM_MANIFESTS += [
+    "components.conf",
+]
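
An illustrative sketch (not part of the patch itself): given the registration in components.conf above, a C++ consumer would typically reach the filter through the component manager, roughly

    nsCOMPtr<nsIJunkMailPlugin> plugin = do_GetService(
        "@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter");

using the contract ID registered above; nsIJunkMailPlugin is one of the interfaces nsBayesianFilter implements (see its NS_IMPL_ISUPPORTS in the source below).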
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.cpp b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.cpp
new file mode 100644
index 0000000000..8a4cca905b
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.cpp
@@ -0,0 +1,2548 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsBayesianFilter.h"
+#include "nsIInputStream.h"
+#include "nsIStreamListener.h"
+#include "nsNetUtil.h"
+#include "nsQuickSort.h"
+#include "nsIMsgMessageService.h"
+#include "nsMsgUtils.h"  // for GetMessageServiceFromURI
+#include "prnetdb.h"
+#include "nsIMsgWindow.h"
+#include "mozilla/Logging.h"
+#include "nsAppDirectoryServiceDefs.h"
+#include "nsUnicharUtils.h"
+#include "nsDirectoryServiceUtils.h"
+#include "nsIMIMEHeaderParam.h"
+#include "nsNetCID.h"
+#include "nsIMsgMailNewsUrl.h"
+#include "nsIPrefService.h"
+#include "nsIPrefBranch.h"
+#include "nsIStringEnumerator.h"
+#include "nsIObserverService.h"
+#include "nsIChannel.h"
+#include "nsIMailChannel.h"
+#include "nsDependentSubstring.h"
+#include "nsMemory.h"
+#include "nsUnicodeProperties.h"
+
+#include "mozilla/ArenaAllocatorExtensions.h"  // for ArenaStrdup
+
+using namespace mozilla;
+using mozilla::intl::Script;
+using mozilla::intl::UnicodeProperties;
+
+// needed to mark attachment flag on the db hdr
+#include "nsIMsgHdr.h"
+
+// needed to strip html out of the body
+#include "nsLayoutCID.h"
+#include "nsIParserUtils.h"
+#include "nsIDocumentEncoder.h"
+
+#include "nsIncompleteGamma.h"
+#include <math.h>
+#include <prmem.h>
+#include "nsIMsgTraitService.h"
+#include "mozilla/Services.h"
+#include "mozilla/Attributes.h"
+#include <cstdlib>  // for std::abs(int/long)
+#include <cmath>    // for std::abs(float/double)
+
+static mozilla::LazyLogModule BayesianFilterLogModule("BayesianFilter");
+
+#define kDefaultJunkThreshold .99  // we override this value via a pref
+static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f.";
+static unsigned int kMinLengthForToken =
+    3;  // lower bound on the number of characters in a word before we treat it
+        // as a token
+static unsigned int kMaxLengthForToken =
+    12;  // upper bound on the number of characters in a word to be declared as
+         // a token
+
+#define FORGED_RECEIVED_HEADER_HINT "may be forged"_ns
+
+#ifndef M_LN2
+#  define M_LN2 0.69314718055994530942
+#endif
+
+#ifndef M_E
+#  define M_E 2.7182818284590452354
+#endif
+
+// provide base implementation of hash lookup of a string
+struct BaseToken : public PLDHashEntryHdr {
+  const char* mWord;
+};
+
+// token for a particular message
+// mCount, mAnalysisLink are initialized to zero by the hash code
+struct Token : public BaseToken {
+  uint32_t mCount;
+  uint32_t mAnalysisLink;  // index in mAnalysisStore of the AnalysisPerToken
+                           // object for the first trait for this token
+  // Helper to support Tokenizer::copyTokens()
+  void clone(const Token& other) {
+    mWord = other.mWord;
+    mCount = other.mCount;
+    mAnalysisLink = other.mAnalysisLink;
+  }
+};
+
+// token stored in a training file for a group of messages
+// mTraitLink is initialized to 0 by the hash code
+struct CorpusToken : public BaseToken {
+  uint32_t mTraitLink;  // index in mTraitStore of the TraitPerToken
+                        // object for the first trait for this token
+};
+
+// set the value of a TraitPerToken object
+TraitPerToken::TraitPerToken(uint32_t aTraitId, uint32_t aCount)
+    : mId(aTraitId), mCount(aCount), mNextLink(0) {}
+
+// shorthand representations of trait ids for junk and good
+static const uint32_t kJunkTrait = nsIJunkMailPlugin::JUNK_TRAIT;
+static const uint32_t kGoodTrait = nsIJunkMailPlugin::GOOD_TRAIT;
+
+// set the value of an AnalysisPerToken object
+AnalysisPerToken::AnalysisPerToken(uint32_t aTraitIndex, double aDistance,
+                                   double aProbability)
+    : mTraitIndex(aTraitIndex),
+      mDistance(aDistance),
+      mProbability(aProbability),
+      mNextLink(0) {}
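+
+// Illustration (not from the upstream source) of this index-linked storage:
+// entry 0 of each store is a dummy, so a link value of 0 means "end of
+// list" (see the dummy 0th element added in the nsBayesianFilter
+// constructor below). A CorpusToken with mTraitLink == 5 might chain,
+// using hypothetical indices:
+//   mTraitStore[5]: mId = kJunkTrait, mCount = 3, mNextLink = 9
+//   mTraitStore[9]: mId = kGoodTrait, mCount = 1, mNextLink = 0  // end
+// i.e. this word was seen 3 times in junk training and once in good.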
+
+// the initial size of the AnalysisPerToken linked list storage
+const uint32_t kAnalysisStoreCapacity = 2048;
+
+// the initial size of the TraitPerToken linked list storage
+const uint32_t kTraitStoreCapacity = 16384;
+
+// Size of Auto arrays representing per trait information
+const uint32_t kTraitAutoCapacity = 10;
+
+TokenEnumeration::TokenEnumeration(PLDHashTable* table)
+    : mIterator(table->Iter()) {}
+
+inline bool TokenEnumeration::hasMoreTokens() { return !mIterator.Done(); }
+
+inline BaseToken* TokenEnumeration::nextToken() {
+  auto token = static_cast<BaseToken*>(mIterator.Get());
+  mIterator.Next();
+  return token;
+}
+
+// member variables
+static const PLDHashTableOps gTokenTableOps = {
+    PLDHashTable::HashStringKey, PLDHashTable::MatchStringKey,
+    PLDHashTable::MoveEntryStub, PLDHashTable::ClearEntryStub, nullptr};
+
+TokenHash::TokenHash(uint32_t aEntrySize)
+    : mTokenTable(&gTokenTableOps, aEntrySize, 128) {
+  mEntrySize = aEntrySize;
+}
+
+TokenHash::~TokenHash() {}
+
+nsresult TokenHash::clearTokens() {
+  // we re-use the tokenizer when classifying multiple messages,
+  // so this gets called after every message classification.
+  mTokenTable.ClearAndPrepareForLength(128);
+  mWordPool.Clear();
+  return NS_OK;
+}
+
+char* TokenHash::copyWord(const char* word, uint32_t len) {
+  return ArenaStrdup(Substring(word, len), mWordPool);
+}
+
+inline BaseToken* TokenHash::get(const char* word) {
+  PLDHashEntryHdr* entry = mTokenTable.Search(word);
+  if (entry) return static_cast<BaseToken*>(entry);
+  return NULL;
+}
+
+BaseToken* TokenHash::add(const char* word) {
+  if (!word || !*word) {
+    NS_ERROR("Trying to add a null word");
+    return nullptr;
+  }
+
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("add word: %s", word));
+
+  PLDHashEntryHdr* entry = mTokenTable.Add(word, mozilla::fallible);
+  BaseToken* token = static_cast<BaseToken*>(entry);
+  if (token) {
+    if (token->mWord == NULL) {
+      uint32_t len = strlen(word);
+      NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
+      if (!len)
+        MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+                ("adding zero length word to tokenizer"));
+      token->mWord = copyWord(word, len);
+      NS_ASSERTION(token->mWord, "copyWord failed");
+      if (!token->mWord) {
+        MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
+                ("copyWord failed: %s (%d)", word, len));
+        mTokenTable.RawRemove(entry);
+        return NULL;
+      }
+    }
+  }
+  return token;
+}
+
+inline uint32_t TokenHash::countTokens() { return mTokenTable.EntryCount(); }
+
+inline TokenEnumeration TokenHash::getTokens() {
+  return TokenEnumeration(&mTokenTable);
+}
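+
+// Illustrative sketch (not part of the upstream source): the arena-backed
+// hash interns each distinct word once, so repeated adds share one copy of
+// the string and just bump the per-message count:
+//
+//   Tokenizer tok;
+//   Token* first = tok.add("cheap");  // mCount == 1
+//   Token* again = tok.add("cheap");  // same entry returned, mCount == 2
+//   // first == again: PLDHashTable::Add found the existing entry, so
+//   // copyWord() allocated from mWordPool only once.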
+
+Tokenizer::Tokenizer()
+    : TokenHash(sizeof(Token)),
+      mBodyDelimiters(kBayesianFilterTokenDelimiters),
+      mHeaderDelimiters(kBayesianFilterTokenDelimiters),
+      mCustomHeaderTokenization(false),
+      mMaxLengthForToken(kMaxLengthForToken),
+      mIframeToDiv(false) {
+  nsresult rv;
+  nsCOMPtr<nsIPrefService> prefs =
+      do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
+  NS_ENSURE_SUCCESS_VOID(rv);
+
+  nsCOMPtr<nsIPrefBranch> prefBranch;
+  rv = prefs->GetBranch("mailnews.bayesian_spam_filter.",
+                        getter_AddRefs(prefBranch));
+  NS_ENSURE_SUCCESS_VOID(rv);  // no branch defined, just use defaults
+
+  /*
+   * RSS feeds store their summary as alternate content of an iframe. But due
+   * to bug 365953, this is not seen by the serializer. As a workaround, allow
+   * the tokenizer to replace the iframe with div for tokenization.
+   */
+  rv = prefBranch->GetBoolPref("iframe_to_div", &mIframeToDiv);
+  if (NS_FAILED(rv)) mIframeToDiv = false;
+
+  /*
+   * The list of delimiters used to tokenize the message headers and body
+   * defaults to the value in kBayesianFilterTokenDelimiters, but may be
+   * set with the following preferences for the body and header
+   * separately.
+   *
+   * \t, \n, \v, \f, \r, and \\ will be escaped to their normal
+   * C-library values; all other two-letter combinations beginning with \
+   * will be ignored.
+   */
+
+  prefBranch->GetCharPref("body_delimiters", mBodyDelimiters);
+  if (!mBodyDelimiters.IsEmpty())
+    UnescapeCString(mBodyDelimiters);
+  else  // prefBranch empties the result when it fails :(
+    mBodyDelimiters.Assign(kBayesianFilterTokenDelimiters);
+
+  prefBranch->GetCharPref("header_delimiters", mHeaderDelimiters);
+  if (!mHeaderDelimiters.IsEmpty())
+    UnescapeCString(mHeaderDelimiters);
+  else
+    mHeaderDelimiters.Assign(kBayesianFilterTokenDelimiters);
+
+  /*
+   * Extensions may wish to enable or disable tokenization of certain headers.
+   * Define any headers to enable/disable in a string preference like this:
+   *   "mailnews.bayesian_spam_filter.tokenizeheader.headername"
+   *
+   * where "headername" is the header to tokenize. For example, to tokenize
+   * the header "x-spam-status" use the preference:
+   *
+   *   "mailnews.bayesian_spam_filter.tokenizeheader.x-spam-status"
+   *
+   * The value of the string preference will be interpreted in one of
+   * four ways, depending on the value:
+   *
+   *   If "false" then do not tokenize that header
+   *   If "full" then add the entire header value as a token,
+   *     without breaking up into subtokens using delimiters
+   *   If "standard" then tokenize the header using as delimiters the current
+   *     value of the generic header delimiters
+   *   Any other string is interpreted as a list of delimiters to use to
+   *     parse the header. \t, \n, \v, \f, \r, and \\ will be escaped to
+   *     their normal C-library values; all other two-letter combinations
+   *     beginning with \ will be ignored.
+   *
+   * Header names in the preference should be all lower case.
+   *
+   * Extensions may also set the maximum length of a token (default is
+   * kMaxLengthForToken) by setting the int preference:
+   *   "mailnews.bayesian_spam_filter.maxlengthfortoken"
+   */
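+
+  // Illustrative examples (not part of the upstream source) of hypothetical
+  // profile prefs, using the escaping rules documented above:
+  //   mailnews.bayesian_spam_filter.body_delimiters = " \\t.,"
+  //     -> space, tab, period and comma delimit body tokens
+  //   mailnews.bayesian_spam_filter.tokenizeheader.x-spam-status = "full"
+  //     -> the whole X-Spam-Status value becomes one token
+  //   mailnews.bayesian_spam_filter.tokenizeheader.list-id = "false"
+  //     -> the List-Id header is not tokenized at all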
+
+  nsTArray<nsCString> headers;
+
+  // get customized maximum token length
+  int32_t maxLengthForToken;
+  rv = prefBranch->GetIntPref("maxlengthfortoken", &maxLengthForToken);
+  mMaxLengthForToken =
+      NS_SUCCEEDED(rv) ? uint32_t(maxLengthForToken) : kMaxLengthForToken;
+
+  rv = prefs->GetBranch("mailnews.bayesian_spam_filter.tokenizeheader.",
+                        getter_AddRefs(prefBranch));
+  if (NS_SUCCEEDED(rv)) rv = prefBranch->GetChildList("", headers);
+
+  if (NS_SUCCEEDED(rv)) {
+    mCustomHeaderTokenization = true;
+    for (auto& header : headers) {
+      nsCString value;
+      prefBranch->GetCharPref(header.get(), value);
+      if (value.EqualsLiteral("false")) {
+        mDisabledHeaders.AppendElement(header);
+        continue;
+      }
+      mEnabledHeaders.AppendElement(header);
+      if (value.EqualsLiteral("standard"))
+        value.SetIsVoid(true);  // Void means use default delimiter
+      else if (value.EqualsLiteral("full"))
+        value.Truncate();  // Empty means add full header
+      else
+        UnescapeCString(value);
+      mEnabledHeadersDelimiters.AppendElement(value);
+    }
+  }
+}
+
+Tokenizer::~Tokenizer() {}
+
+inline Token* Tokenizer::get(const char* word) {
+  return static_cast<Token*>(TokenHash::get(word));
+}
+
+Token* Tokenizer::add(const char* word, uint32_t count) {
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+          ("add word: %s (count=%d)", word, count));
+
+  Token* token = static_cast<Token*>(TokenHash::add(word));
+  if (token) {
+    token->mCount += count;  // hash code initializes this to zero
+    MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+            ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word,
+             count, token->mCount));
+  }
+  return token;
+}
+
+static bool isDecimalNumber(const char* word) {
+  const char* p = word;
+  if (*p == '-') ++p;
+  char c;
+  while ((c = *p++)) {
+    if (!isdigit((unsigned char)c)) return false;
+  }
+  return true;
+}
+
+static bool isASCII(const char* word) {
+  const unsigned char* p = (const unsigned char*)word;
+  unsigned char c;
+  while ((c = *p++)) {
+    if (c > 127) return false;
+  }
+  return true;
+}
+
+inline bool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
+
+static char* toLowerCase(char* str) {
+  char c, *p = str;
+  while ((c = *p++)) {
+    if (isUpperCase(c)) p[-1] = c + ('a' - 'A');
+  }
+  return str;
+}
+
+void Tokenizer::addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
+                                  bool aTokenizeValue,
+                                  const char* aDelimiters) {
+  if (aValue.Length()) {
+    ToLowerCase(aValue);
+    if (!aTokenizeValue) {
+      nsCString tmpStr;
+      tmpStr.Assign(aTokenPrefix);
+      tmpStr.Append(':');
+      tmpStr.Append(aValue);
+
+      add(tmpStr.get());
+    } else {
+      char* word;
+      nsCString str(aValue);
+      char* next = str.BeginWriting();
+      const char* delimiters =
+          !aDelimiters ? mHeaderDelimiters.get() : aDelimiters;
+      while ((word = NS_strtok(delimiters, &next)) != NULL) {
+        if (strlen(word) < kMinLengthForToken) continue;
+        if (isDecimalNumber(word)) continue;
+        if (isASCII(word)) {
+          nsCString tmpStr;
+          tmpStr.Assign(aTokenPrefix);
+          tmpStr.Append(':');
+          tmpStr.Append(word);
+          add(tmpStr.get());
+        }
+      }
+    }
+  }
+}
+
+void Tokenizer::tokenizeAttachments(
+    nsTArray<RefPtr<nsIPropertyBag2>>& attachments) {
+  for (auto attachment : attachments) {
+    nsCString contentType;
+    attachment->GetPropertyAsAUTF8String(u"contentType"_ns, contentType);
+    ToLowerCase(contentType);
+    addTokenForHeader("attachment/content-type", contentType);
+
+    nsCString displayName;
+    attachment->GetPropertyAsAUTF8String(u"displayName"_ns, displayName);
+    ToLowerCase(displayName);
+    addTokenForHeader("attachment/filename", displayName);
+  }
+}
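+
+// Illustrative sketch (not part of the upstream source): for an attachment
+// with contentType "application/PDF" and displayName "Invoice.pdf", the
+// loop above adds the tokens
+//   "attachment/content-type:application/pdf"
+//   "attachment/filename:invoice.pdf"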
+
+void Tokenizer::tokenizeHeaders(nsTArray<nsCString>& aHeaderNames,
+                                nsTArray<nsCString>& aHeaderValues) {
+  nsCString headerValue;
+  nsAutoCString
+      headerName;  // we'll be normalizing all header names to lower case
+
+  for (uint32_t i = 0; i < aHeaderNames.Length(); i++) {
+    headerName = aHeaderNames[i];
+    ToLowerCase(headerName);
+    headerValue = aHeaderValues[i];
+
+    bool headerProcessed = false;
+    if (mCustomHeaderTokenization) {
+      // Process any exceptions set from preferences
+      for (uint32_t i = 0; i < mEnabledHeaders.Length(); i++)
+        if (headerName.Equals(mEnabledHeaders[i])) {
+          if (mEnabledHeadersDelimiters[i].IsVoid())
+            // tokenize with standard delimiters for all headers
+            addTokenForHeader(headerName.get(), headerValue, true);
+          else if (mEnabledHeadersDelimiters[i].IsEmpty())
+            // do not break the header into tokens
+            addTokenForHeader(headerName.get(), headerValue);
+          else
+            // use the delimiter in mEnabledHeadersDelimiters
+            addTokenForHeader(headerName.get(), headerValue, true,
+                              mEnabledHeadersDelimiters[i].get());
+          headerProcessed = true;
+          break;  // we found the header, no need to look for more custom
+                  // values
+        }
+
+      for (uint32_t i = 0; i < mDisabledHeaders.Length(); i++) {
+        if (headerName.Equals(mDisabledHeaders[i])) {
+          headerProcessed = true;
+          break;
+        }
+      }
+
+      if (headerProcessed) continue;
+    }
+
+    switch (headerName.First()) {
+      case 'c':
+        if (headerName.EqualsLiteral("content-type")) {
+          nsresult rv;
+          nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar =
+              do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
+          if (NS_FAILED(rv)) break;
+
+          // extract the charset parameter
+          nsCString parameterValue;
+          mimehdrpar->GetParameterInternal(headerValue, "charset", nullptr,
+                                           nullptr,
+                                           getter_Copies(parameterValue));
+          addTokenForHeader("charset", parameterValue);
+
+          // create a token containing just the content type
+          mimehdrpar->GetParameterInternal(headerValue, "type", nullptr,
+                                           nullptr,
+                                           getter_Copies(parameterValue));
+          if (!parameterValue.Length())
+            mimehdrpar->GetParameterInternal(
+                headerValue, nullptr /* use first unnamed param */, nullptr,
+                nullptr, getter_Copies(parameterValue));
+          addTokenForHeader("content-type/type", parameterValue);
+
+          // XXX: should we add a token for the entire content-type header as
+          // well or just these parts we have extracted?
+        }
+        break;
+      case 'r':
+        if (headerName.EqualsLiteral("received")) {
+          // look for the string "may be forged" in the received headers;
+          // sendmail sometimes adds this hint. This does not compile on
+          // linux yet. Need to figure out why. Commenting out for now
+          // if (FindInReadable(FORGED_RECEIVED_HEADER_HINT, headerValue))
+          //   addTokenForHeader(headerName.get(),
+          //                     FORGED_RECEIVED_HEADER_HINT);
+        }
+
+        // leave out reply-to
+        break;
+      case 's':
+        if (headerName.EqualsLiteral("subject")) {
+          // we want to tokenize the subject
+          addTokenForHeader(headerName.get(), headerValue, true);
+        }
+
+        // important: leave out sender field. Too strong of an indicator
+        break;
+      case 'x':  // (2) X-Mailer / user-agent works best if it is untokenized,
+                 // just fold the case and any leading/trailing white space
+        // all headers beginning with x-mozilla are being changed by us, so
+        // ignore
+        if (StringBeginsWith(headerName, "x-mozilla"_ns)) break;
+        // fall through
+        [[fallthrough]];
+      case 'u':
+        addTokenForHeader(headerName.get(), headerValue);
+        break;
+      default:
+        addTokenForHeader(headerName.get(), headerValue);
+        break;
+    }  // end switch
+  }
+}
+
+void Tokenizer::tokenize_ascii_word(char* aWord) {
+  // always deal with normalized lower case strings
+  toLowerCase(aWord);
+  uint32_t wordLength = strlen(aWord);
+
+  // if the wordLength is within our accepted token limit, then add it
+  if (wordLength >= kMinLengthForToken && wordLength <= mMaxLengthForToken)
+    add(aWord);
+  else if (wordLength > mMaxLengthForToken) {
+    // don't skip over the word if it looks like an email address,
+    // there is value in adding tokens for addresses
+    nsDependentCString word(
+        aWord, wordLength);  // CHEAP, no allocation occurs here...
+
+    // XXX: i think the 40 byte check is just for perf reasons...if the email
+    // address is longer than that then forget about it.
+    const char* atSign = strchr(aWord, '@');
+    if (wordLength < 40 && strchr(aWord, '.') && atSign &&
+        !strchr(atSign + 1, '@')) {
+      uint32_t numBytesToSep = atSign - aWord;
+      if (numBytesToSep <
+          wordLength - 1)  // if the @ sign is the last character, it must not
+                           // be an email address
+      {
+        // split the john@foo.com into john and foo.com, treat them as
+        // separate tokens
+        nsCString emailNameToken;
+        emailNameToken.AssignLiteral("email name:");
+        emailNameToken.Append(Substring(word, 0, numBytesToSep++));
+        add(emailNameToken.get());
+        nsCString emailAddrToken;
+        emailAddrToken.AssignLiteral("email addr:");
+        emailAddrToken.Append(
+            Substring(word, numBytesToSep, wordLength - numBytesToSep));
+        add(emailAddrToken.get());
+        return;
+      }
+    }
+
+    // there is value in generating a token indicating the number
+    // of characters we are skipping. We'll round to the nearest 10
+    nsCString skipToken;
+    skipToken.AssignLiteral("skip:");
+    skipToken.Append(word[0]);
+    skipToken.Append(' ');
+    skipToken.AppendInt((wordLength / 10) * 10);
+    add(skipToken.get());
+  }
+}
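+
+// Worked example (illustrative, not part of the upstream source): a word
+// longer than mMaxLengthForToken that parses as an address, such as
+// "firstname.lastname@example.com", yields "email name:firstname.lastname"
+// and "email addr:example.com"; a 47-character non-address word starting
+// with 'x' yields the single token "skip:x 40" (47 rounds down to 40).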
+
+// Copied from mozilla/intl/lwbrk/WordBreaker.cpp
+
+#define ASCII_IS_ALPHA(c) \
+  ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
+#define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9'))
+#define ASCII_IS_SPACE(c) \
+  ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c)))
+#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)
+
+// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect
+// Unicode 3.0
+#define IS_HAN(c) \
+  ((0x3400 <= (c)) && ((c) <= 0x9fff)) || ((0xf900 <= (c)) && ((c) <= 0xfaff))
+#define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
+#define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
+#define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
+
+// Return true if aChar belongs to a SEAsian script that is written without
+// word spaces, so we need to use the "complex breaker" to find possible word
+// boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua)
+// (How well this works depends on the level of platform support for finding
+// possible line breaks - or possible word boundaries - in the particular
+// script. Thai, at least, works pretty well on the major desktop OSes. If
+// the script is not supported by the platform, we just won't find any useful
+// boundaries.)
+static bool IsScriptioContinua(char16_t aChar) {
+  Script sc = UnicodeProperties::GetScriptCode(aChar);
+  return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER ||
+         sc == Script::JAVANESE || sc == Script::BALINESE ||
+         sc == Script::SUNDANESE || sc == Script::LAO;
+}
+
+// one subtract and one conditional jump should be faster than two
+// conditional jumps on most recent systems.
+#define IN_RANGE(x, low, high) ((uint16_t)((x) - (low)) <= (high) - (low))
+
+#define IS_JA_HIRAGANA(x) IN_RANGE(x, 0x3040, 0x309F)
+// swapping the range using xor operation to reduce conditional jumps.
+#define IS_JA_KATAKANA(x) \
+  (IN_RANGE(x ^ 0x0004, 0x30A0, 0x30FE) || (IN_RANGE(x, 0xFF66, 0xFF9F)))
+#define IS_JA_KANJI(x) \
+  (IN_RANGE(x, 0x2E80, 0x2FDF) || IN_RANGE(x, 0x4E00, 0x9FAF))
+#define IS_JA_KUTEN(x) (((x) == 0x3001) || ((x) == 0xFF64) || ((x) == 0xFF0E))
+#define IS_JA_TOUTEN(x) (((x) == 0x3002) || ((x) == 0xFF61) || ((x) == 0xFF0C))
+#define IS_JA_SPACE(x) ((x) == 0x3000)
+#define IS_JA_FWLATAIN(x) IN_RANGE(x, 0xFF01, 0xFF5E)
+#define IS_JA_FWNUMERAL(x) IN_RANGE(x, 0xFF10, 0xFF19)
+
+#define IS_JAPANESE_SPECIFIC(x) \
+  (IN_RANGE(x, 0x3040, 0x30FF) || IN_RANGE(x, 0xFF01, 0xFF9F))
+
+enum char_class {
+  others = 0,
+  space,
+  hiragana,
+  katakana,
+  kanji,
+  kuten,
+  touten,
+  kigou,
+  fwlatain,
+  ascii
+};
+
+static char_class getCharClass(char16_t c) {
+  char_class charClass = others;
+
+  if (IS_JA_HIRAGANA(c))
+    charClass = hiragana;
+  else if (IS_JA_KATAKANA(c))
+    charClass = katakana;
+  else if (IS_JA_KANJI(c))
+    charClass = kanji;
+  else if (IS_JA_KUTEN(c))
+    charClass = kuten;
+  else if (IS_JA_TOUTEN(c))
+    charClass = touten;
+  else if (IS_JA_FWLATAIN(c))
+    charClass = fwlatain;
+
+  return charClass;
+}
+
+static bool isJapanese(const char* word) {
+  nsString text = NS_ConvertUTF8toUTF16(word);
+  const char16_t* p = (const char16_t*)text.get();
+  char16_t c;
+
+  // it is a japanese chunk if it contains any hiragana or katakana
+  while ((c = *p++))
+    if (IS_JAPANESE_SPECIFIC(c)) return true;
+
+  return false;
+}
+
+static bool isFWNumeral(const char16_t* p1, const char16_t* p2) {
+  for (; p1 < p2; p1++)
+    if (!IS_JA_FWNUMERAL(*p1)) return false;
+
+  return true;
+}
+
+// The japanese tokenizer was added as part of Bug #277354
+void Tokenizer::tokenize_japanese_word(char* chunk) {
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+          ("entering tokenize_japanese_word(%s)", chunk));
+
+  nsString srcStr = NS_ConvertUTF8toUTF16(chunk);
+  const char16_t* p1 = srcStr.get();
+  const char16_t* p2 = p1;
+  if (!*p2) return;
+
+  char_class cc = getCharClass(*p2);
+  while (*(++p2)) {
+    if (cc == getCharClass(*p2)) continue;
+
+    nsCString token = NS_ConvertUTF16toUTF8(p1, p2 - p1);
+    if ((!isDecimalNumber(token.get())) && (!isFWNumeral(p1, p2))) {
+      nsCString tmpStr;
+      tmpStr.AppendLiteral("JA:");
+      tmpStr.Append(token);
+      add(tmpStr.get());
+    }
+
+    cc = getCharClass(*p2);
+    p1 = p2;
+  }
+}
+
+nsresult Tokenizer::stripHTML(const nsAString& inString,
+                              nsAString& outString) {
+  uint32_t flags = nsIDocumentEncoder::OutputLFLineBreak |
+                   nsIDocumentEncoder::OutputNoScriptContent |
+                   nsIDocumentEncoder::OutputNoFramesContent |
+                   nsIDocumentEncoder::OutputBodyOnly;
+  nsCOMPtr<nsIParserUtils> utils = do_GetService(NS_PARSERUTILS_CONTRACTID);
+  return utils->ConvertToPlainText(inString, flags, 80, outString);
+}
+
+// Copied from WordBreaker.cpp due to changes in bug 1728708.
+enum WordBreakClass : uint8_t {
+  kWbClassSpace = 0,
+  kWbClassAlphaLetter,
+  kWbClassPunct,
+  kWbClassHanLetter,
+  kWbClassKatakanaLetter,
+  kWbClassHiraganaLetter,
+  kWbClassHWKatakanaLetter,
+  kWbClassScriptioContinua
+};
+
+WordBreakClass GetWordBreakClass(char16_t c) {
+  // begin of the hack
+
+  if (IS_ALPHABETICAL_SCRIPT(c)) {
+    if (IS_ASCII(c)) {
+      if (ASCII_IS_SPACE(c)) {
+        return WordBreakClass::kWbClassSpace;
+      }
+      if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) || (c == '_')) {
+        return WordBreakClass::kWbClassAlphaLetter;
+      }
+      return WordBreakClass::kWbClassPunct;
+    }
+    if (c == 0x00A0 /*NBSP*/) {
+      return WordBreakClass::kWbClassSpace;
+    }
+    if (mozilla::unicode::GetGenCategory(c) == nsUGenCategory::kPunctuation) {
+      return WordBreakClass::kWbClassPunct;
+    }
+    if (IsScriptioContinua(c)) {
+      return WordBreakClass::kWbClassScriptioContinua;
+    }
+    return WordBreakClass::kWbClassAlphaLetter;
+  }
+  if (IS_HAN(c)) {
+    return WordBreakClass::kWbClassHanLetter;
+  }
+  if (IS_KATAKANA(c)) {
+    return kWbClassKatakanaLetter;
+  }
+  if (IS_HIRAGANA(c)) {
+    return WordBreakClass::kWbClassHiraganaLetter;
+  }
+  if (IS_HALFWIDTHKATAKANA(c)) {
+    return WordBreakClass::kWbClassHWKatakanaLetter;
+  }
+  if (mozilla::unicode::GetGenCategory(c) == nsUGenCategory::kPunctuation) {
+    return WordBreakClass::kWbClassPunct;
+  }
+  if (IsScriptioContinua(c)) {
+    return WordBreakClass::kWbClassScriptioContinua;
+  }
+  return WordBreakClass::kWbClassAlphaLetter;
+}
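+
+// Illustrative mapping (not part of the upstream source):
+//   'A'    -> kWbClassAlphaLetter      (ASCII letter)
+//   ' '    -> kWbClassSpace
+//   U+4E2D -> kWbClassHanLetter        (CJK ideograph)
+//   U+30A2 -> kWbClassKatakanaLetter
+//   U+0E01 -> kWbClassScriptioContinua (Thai, via IsScriptioContinua)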
+
+// Copied from nsSemanticUnitScanner.cpp which was removed in bug 1368418.
+nsresult Tokenizer::ScannerNext(const char16_t* text, int32_t length,
+                                int32_t pos, bool isLastBuffer,
+                                int32_t* begin, int32_t* end, bool* _retval) {
+  // if we reach the end, just return
+  if (pos >= length) {
+    *begin = pos;
+    *end = pos;
+    *_retval = false;
+    return NS_OK;
+  }
+
+  WordBreakClass char_class = GetWordBreakClass(text[pos]);
+
+  // If we are in Chinese mode, return one Han letter at a time.
+  // We should not do this if we are in Japanese or Korean mode.
+  if (WordBreakClass::kWbClassHanLetter == char_class) {
+    *begin = pos;
+    *end = pos + 1;
+    *_retval = true;
+    return NS_OK;
+  }
+
+  int32_t next;
+  // Find the next "word".
+  next =
+      mozilla::intl::WordBreaker::Next(text, (uint32_t)length, (uint32_t)pos);
+
+  // If we don't have enough text to make a decision, return.
+  if (next == NS_WORDBREAKER_NEED_MORE_TEXT) {
+    *begin = pos;
+    *end = isLastBuffer ? length : pos;
+    *_retval = isLastBuffer;
+    return NS_OK;
+  }
+
+  // If what we got is space or punct, look at the next break.
+  if (char_class == WordBreakClass::kWbClassSpace ||
+      char_class == WordBreakClass::kWbClassPunct) {
+    // If the next "word" is not letters,
+    // call itself recursively with the new pos.
+    return ScannerNext(text, length, next, isLastBuffer, begin, end, _retval);
+  }
+
+  // For the rest, return.
+  *begin = pos;
+  *end = next;
+  *_retval = true;
+  return NS_OK;
+}
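+
+// Illustrative trace (not part of the upstream source; boundary positions
+// assume WordBreaker behaves as for plain ASCII) for text "foo bar"
+// (length 7, isLastBuffer true):
+//   ScannerNext(text, 7, 0, ...) -> *begin 0, *end 3, *_retval true ("foo")
+//   ScannerNext(text, 7, 3, ...) -> lands on the space, so it recurses from
+//                                   the next break and yields *begin 4,
+//                                   *end 7 for "bar"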
+
+void Tokenizer::tokenize(const char* aText) {
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("tokenize: %s", aText));
+
+  // strip out HTML tags before we begin processing
+  // uggh but first we have to blow up our string into UCS2
+  // since that's what the document encoder wants. UTF8/UCS2, I wish we all
+  // spoke the same language here..
+  nsString text = NS_ConvertUTF8toUTF16(aText);
+  nsString strippedUCS2;
+
+  // RSS feeds store their summary information as an iframe. But due to
+  // bug 365953, we can't see those in the plaintext serializer. As a
+  // workaround, allow an option to replace iframe with div in the message
+  // text. We disable this by default, since most people won't be applying
+  // bayes to RSS.
+
+  if (mIframeToDiv) {
+    text.ReplaceSubstring(u"<iframe"_ns, u"<div"_ns);
+    text.ReplaceSubstring(u"/iframe>"_ns, u"/div>"_ns);
+  }
+
+  stripHTML(text, strippedUCS2);
+
+  // convert 0x3000 (full width space) into 0x0020
+  char16_t* substr_start = strippedUCS2.BeginWriting();
+  char16_t* substr_end = strippedUCS2.EndWriting();
+  while (substr_start != substr_end) {
+    if (*substr_start == 0x3000) *substr_start = 0x0020;
+    ++substr_start;
+  }
+
+  nsCString strippedStr = NS_ConvertUTF16toUTF8(strippedUCS2);
+  char* strippedText = strippedStr.BeginWriting();
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+          ("tokenize stripped html: %s", strippedText));
+
+  char* word;
+  char* next = strippedText;
+  while ((word = NS_strtok(mBodyDelimiters.get(), &next)) != NULL) {
+    if (!*word) continue;
+    if (isDecimalNumber(word)) continue;
+    if (isASCII(word))
+      tokenize_ascii_word(word);
+    else if (isJapanese(word))
+      tokenize_japanese_word(word);
+    else {
+      nsresult rv;
+      // Convert this word from UTF-8 into UCS2.
+      NS_ConvertUTF8toUTF16 uword(word);
+      ToLowerCase(uword);
+      const char16_t* utext = uword.get();
+      int32_t len = uword.Length(), pos = 0, begin, end;
+      bool gotUnit;
+      while (pos < len) {
+        rv = ScannerNext(utext, len, pos, true, &begin, &end, &gotUnit);
+        if (NS_SUCCEEDED(rv) && gotUnit) {
+          NS_ConvertUTF16toUTF8 utfUnit(utext + begin, end - begin);
+          add(utfUnit.get());
+          // Advance to end of current unit.
+          pos = end;
+        } else {
+          break;
+        }
+      }
+    }
+  }
+}
+
+// helper function to un-escape \n, \t, etc from a CString
+void Tokenizer::UnescapeCString(nsCString& aCString) {
+  nsAutoCString result;
+
+  const char* readEnd = aCString.EndReading();
+  result.SetLength(aCString.Length());
+  char* writeStart = result.BeginWriting();
+  char* writeIter = writeStart;
+
+  bool inEscape = false;
+  for (const char* readIter = aCString.BeginReading(); readIter != readEnd;
+       readIter++) {
+    if (!inEscape) {
+      if (*readIter == '\\')
+        inEscape = true;
+      else
+        *(writeIter++) = *readIter;
+    } else {
+      inEscape = false;
+      switch (*readIter) {
+        case '\\':
+          *(writeIter++) = '\\';
+          break;
+        case 't':
+          *(writeIter++) = '\t';
+          break;
+        case 'n':
+          *(writeIter++) = '\n';
+          break;
+        case 'v':
+          *(writeIter++) = '\v';
+          break;
+        case 'f':
+          *(writeIter++) = '\f';
+          break;
+        case 'r':
+          *(writeIter++) = '\r';
+          break;
+        default:
+          // all other escapes are ignored
+          break;
+      }
+    }
+  }
+  result.Truncate(writeIter - writeStart);
+  aCString.Assign(result);
+}
+
+Token* Tokenizer::copyTokens() {
+  uint32_t count = countTokens();
+  if (count > 0) {
+    Token* tokens = new Token[count];
+    if (tokens) {
+      Token* tp = tokens;
+      TokenEnumeration e(&mTokenTable);
+      while (e.hasMoreTokens()) {
+        Token* src = static_cast<Token*>(e.nextToken());
+        tp->clone(*src);
+        ++tp;
+      }
+    }
+    return tokens;
+  }
+  return NULL;
+}
+
+class TokenAnalyzer {
+ public:
+  virtual ~TokenAnalyzer() {}
+
+  virtual void analyzeTokens(Tokenizer& tokenizer) = 0;
+  void setTokenListener(nsIStreamListener* aTokenListener) {
+    mTokenListener = aTokenListener;
+  }
+
+  void setSource(const nsACString& sourceURI) { mTokenSource = sourceURI; }
+
+  nsCOMPtr<nsIStreamListener> mTokenListener;
+  nsCString mTokenSource;
+};
+
+/**
+ * This class downloads the raw content of an email message, buffering until
+ * complete segments are seen, that is until a linefeed is seen, although
+ * any of the valid token separators would do. This could be a further
+ * refinement.
+ */
+class TokenStreamListener : public nsIStreamListener {
+ public:
+  NS_DECL_ISUPPORTS
+  NS_DECL_NSIREQUESTOBSERVER
+  NS_DECL_NSISTREAMLISTENER
+
+  explicit TokenStreamListener(TokenAnalyzer* analyzer);
+
+ protected:
+  virtual ~TokenStreamListener();
+  TokenAnalyzer* mAnalyzer;
+  char* mBuffer;
+  uint32_t mBufferSize;
+  uint32_t mLeftOverCount;
+  Tokenizer mTokenizer;
+  bool mSetAttachmentFlag;
+};
+
+const uint32_t kBufferSize = 16384;
+
+TokenStreamListener::TokenStreamListener(TokenAnalyzer* analyzer)
+    : mAnalyzer(analyzer),
+      mBuffer(NULL),
+      mBufferSize(kBufferSize),
+      mLeftOverCount(0),
+      mSetAttachmentFlag(false) {}
+
+TokenStreamListener::~TokenStreamListener() {
+  delete[] mBuffer;
+  delete mAnalyzer;
+}
+
+NS_IMPL_ISUPPORTS(TokenStreamListener, nsIRequestObserver, nsIStreamListener)
+
+/* void onStartRequest (in nsIRequest aRequest); */
+NS_IMETHODIMP TokenStreamListener::OnStartRequest(nsIRequest* aRequest) {
+  mLeftOverCount = 0;
+  if (!mBuffer) {
+    mBuffer = new char[mBufferSize];
+    NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
+  }
+
+  return NS_OK;
+}
+
+/* void onDataAvailable (in nsIRequest aRequest, in nsIInputStream
+ * aInputStream, in unsigned long long aOffset, in unsigned long aCount); */
+NS_IMETHODIMP TokenStreamListener::OnDataAvailable(
+    nsIRequest* aRequest, nsIInputStream* aInputStream, uint64_t aOffset,
+    uint32_t aCount) {
+  nsresult rv = NS_OK;
+
+  while (aCount > 0) {
+    uint32_t readCount, totalCount = (aCount + mLeftOverCount);
+    if (totalCount >= mBufferSize) {
+      readCount = mBufferSize - mLeftOverCount - 1;
+    } else {
+      readCount = aCount;
+    }
+
+    // mBuffer is supposed to be allocated in onStartRequest. But something
+    // is causing that to not happen, so as a last-ditch attempt we'll
+    // do it here.
+    if (!mBuffer) {
+      mBuffer = new char[mBufferSize];
+      NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
+    }
+
+    char* buffer = mBuffer;
+    rv = aInputStream->Read(buffer + mLeftOverCount, readCount, &readCount);
+    if (NS_FAILED(rv)) break;
+
+    if (readCount == 0) {
+      rv = NS_ERROR_UNEXPECTED;
+      NS_WARNING("failed to tokenize");
+      break;
+    }
+
+    aCount -= readCount;
+
+    /* consume the tokens up to the last legal token delimiter in the
+     * buffer. */
+    totalCount = (readCount + mLeftOverCount);
+    buffer[totalCount] = '\0';
+    char* lastDelimiter = NULL;
+    char* scan = buffer + totalCount;
+    while (scan > buffer) {
+      if (strchr(mTokenizer.mBodyDelimiters.get(), *--scan)) {
+        lastDelimiter = scan;
+        break;
+      }
+    }
+
+    if (lastDelimiter) {
+      *lastDelimiter = '\0';
+      mTokenizer.tokenize(buffer);
+
+      uint32_t consumedCount = 1 + (lastDelimiter - buffer);
+      mLeftOverCount = totalCount - consumedCount;
+      if (mLeftOverCount)
+        memmove(buffer, buffer + consumedCount, mLeftOverCount);
+    } else {
+      /* didn't find a delimiter, keep the whole buffer around. */
+      mLeftOverCount = totalCount;
+      if (totalCount >= (mBufferSize / 2)) {
+        uint32_t newBufferSize = mBufferSize * 2;
+        char* newBuffer = new char[newBufferSize];
+        NS_ENSURE_TRUE(newBuffer, NS_ERROR_OUT_OF_MEMORY);
+        memcpy(newBuffer, mBuffer, mLeftOverCount);
+        delete[] mBuffer;
+        mBuffer = newBuffer;
+        mBufferSize = newBufferSize;
+      }
+    }
+  }
+
+  return rv;
+}
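+
+// Illustrative walk-through (not part of the upstream source): if one
+// OnDataAvailable call delivers "cheap pills no", the last delimiter is the
+// second space, so only "cheap pills" is tokenized; the trailing "no" stays
+// in mBuffer (mLeftOverCount == 2) to be completed by the next chunk or
+// flushed in OnStopRequest below.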
+
+/* void onStopRequest (in nsIRequest aRequest, in nsresult aStatusCode); */
+NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest* aRequest,
+                                                 nsresult aStatusCode) {
+  nsCOMPtr<nsIMailChannel> mailChannel = do_QueryInterface(aRequest);
+  if (mailChannel) {
+    nsTArray<nsCString> headerNames;
+    nsTArray<nsCString> headerValues;
+    mailChannel->GetHeaderNames(headerNames);
+    mailChannel->GetHeaderValues(headerValues);
+    mTokenizer.tokenizeHeaders(headerNames, headerValues);
+
+    nsTArray<RefPtr<nsIPropertyBag2>> attachments;
+    mailChannel->GetAttachments(attachments);
+    mTokenizer.tokenizeAttachments(attachments);
+  }
+
+  if (mLeftOverCount) {
+    /* assume final buffer is complete. */
+    mBuffer[mLeftOverCount] = '\0';
+    mTokenizer.tokenize(mBuffer);
+  }
+
+  /* finally, analyze the tokenized message. */
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+          ("analyze the tokenized message"));
+  if (mAnalyzer) mAnalyzer->analyzeTokens(mTokenizer);
+
+  return NS_OK;
+}
+
+/* Implementation file */
+
+NS_IMPL_ISUPPORTS(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin,
+                  nsIMsgCorpus, nsISupportsWeakReference, nsIObserver)
+
+nsBayesianFilter::nsBayesianFilter() : mTrainingDataDirty(false) {
+  int32_t junkThreshold = 0;
+  nsresult rv;
+  nsCOMPtr<nsIPrefBranch> pPrefBranch(
+      do_GetService(NS_PREFSERVICE_CONTRACTID, &rv));
+  if (pPrefBranch)
+    pPrefBranch->GetIntPref("mail.adaptivefilters.junk_threshold",
+                            &junkThreshold);
+
+  mJunkProbabilityThreshold = (static_cast<double>(junkThreshold)) / 100.0;
+  if (mJunkProbabilityThreshold == 0 || mJunkProbabilityThreshold >= 1)
+    mJunkProbabilityThreshold = kDefaultJunkThreshold;
+
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
+          ("junk probability threshold: %f", mJunkProbabilityThreshold));
+
+  mCorpus.readTrainingData();
+
+  // get parameters for training data flushing, from the prefs
+
+  nsCOMPtr<nsIPrefBranch> prefBranch;
+
+  nsCOMPtr<nsIPrefService> prefs =
+      do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
+  NS_ASSERTION(NS_SUCCEEDED(rv), "failed accessing preferences service");
+  rv = prefs->GetBranch(nullptr, getter_AddRefs(prefBranch));
+  NS_ASSERTION(NS_SUCCEEDED(rv), "failed getting preferences branch");
+
+  rv = prefBranch->GetIntPref(
+      "mailnews.bayesian_spam_filter.flush.minimum_interval",
+      &mMinFlushInterval);
+  // it is not a good idea to allow a minimum interval of under 1 second
+  if (NS_FAILED(rv) || (mMinFlushInterval <= 1000))
+    mMinFlushInterval = DEFAULT_MIN_INTERVAL_BETWEEN_WRITES;
+
+  rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.junk_maxtokens",
+                              &mMaximumTokenCount);
+  if (NS_FAILED(rv))
+    mMaximumTokenCount = 0;  // which means do not limit token counts
+  MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
+          ("maximum junk tokens: %d", mMaximumTokenCount));
+
+  // give a default capacity to the memory structure used to store
+  // per-message/per-trait token data
+  mAnalysisStore.SetCapacity(kAnalysisStoreCapacity);
+
+  // dummy 0th element. Index 0 means "end of list" so we need to
+  // start from 1
+  AnalysisPerToken analysisPT(0, 0.0, 0.0);
+  mAnalysisStore.AppendElement(analysisPT);
+  mNextAnalysisIndex = 1;
+}
+
+nsresult nsBayesianFilter::Init() {
+  nsCOMPtr<nsIObserverService> observerService =
+      mozilla::services::GetObserverService();
+  if (observerService)
+    observerService->AddObserver(this, "profile-before-change", true);
+  return NS_OK;
+}
+
+void nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure) {
+  // we will flush the training data to disk after enough time has passed
+  // since the first time a message has been classified after the last flush
+
+  nsBayesianFilter* filter = static_cast<nsBayesianFilter*>(aClosure);
+  filter->mCorpus.writeTrainingData(filter->mMaximumTokenCount);
+  filter->mTrainingDataDirty = false;
+}
+
+nsBayesianFilter::~nsBayesianFilter() {
+  if (mTimer) {
+    mTimer->Cancel();
+    mTimer = nullptr;
+  }
+  // call shutdown when we are going away in case we need
+  // to flush the training set to disk
+  Shutdown();
+}
+
+// this object is used for one call to classifyMessage or classifyMessages().
+// So if we're classifying multiple messages, this object will be used for
+// each message. It's going to hold a reference to itself, basically, to stay
+// in memory.
+class MessageClassifier : public TokenAnalyzer {
+ public:
+  // full classifier with arbitrary traits
+  MessageClassifier(nsBayesianFilter* aFilter,
+                    nsIJunkMailClassificationListener* aJunkListener,
+                    nsIMsgTraitClassificationListener* aTraitListener,
+                    nsIMsgTraitDetailListener* aDetailListener,
+                    const nsTArray<uint32_t>& aProTraits,
+                    const nsTArray<uint32_t>& aAntiTraits,
+                    nsIMsgWindow* aMsgWindow,
+                    const nsTArray<nsCString>& aMessageURIs)
+      : mFilter(aFilter),
+        mJunkMailPlugin(aFilter),
+        mJunkListener(aJunkListener),
+        mTraitListener(aTraitListener),
+        mDetailListener(aDetailListener),
+        mProTraits(aProTraits.Clone()),
+        mAntiTraits(aAntiTraits.Clone()),
+        mMsgWindow(aMsgWindow),
+        mMessageURIs(aMessageURIs.Clone()),
+        mCurMessageToClassify(0) {
+    MOZ_ASSERT(aProTraits.Length() == aAntiTraits.Length());
+  }
+
+  // junk-only classifier
+  MessageClassifier(nsBayesianFilter* aFilter,
+                    nsIJunkMailClassificationListener* aJunkListener,
+                    nsIMsgWindow* aMsgWindow,
+                    const nsTArray<nsCString>& aMessageURIs)
+      : mFilter(aFilter),
+        mJunkMailPlugin(aFilter),
+        mJunkListener(aJunkListener),
+        mTraitListener(nullptr),
+        mDetailListener(nullptr),
+        mMsgWindow(aMsgWindow),
+        mMessageURIs(aMessageURIs.Clone()),
+        mCurMessageToClassify(0) {
+    mProTraits.AppendElement(kJunkTrait);
+    mAntiTraits.AppendElement(kGoodTrait);
+  }
+
+  virtual ~MessageClassifier() {}
+  virtual void analyzeTokens(Tokenizer& tokenizer) {
+    mFilter->classifyMessage(tokenizer, mTokenSource, mProTraits, mAntiTraits,
+                             mJunkListener, mTraitListener, mDetailListener);
+    tokenizer.clearTokens();
+    classifyNextMessage();
+  }
+
+  virtual void classifyNextMessage() {
+    if (++mCurMessageToClassify < mMessageURIs.Length()) {
+      MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
+              ("classifyNextMessage(%s)",
+               mMessageURIs[mCurMessageToClassify].get()));
+      mFilter->tokenizeMessage(mMessageURIs[mCurMessageToClassify],
+                               mMsgWindow, this);
+    } else {
+      // call all listeners with null parameters to signify end of batch
+      if (mJunkListener)
+        mJunkListener->OnMessageClassified(
+            EmptyCString(), nsIJunkMailPlugin::UNCLASSIFIED, 0);
+      if (mTraitListener) {
+        nsTArray<uint32_t> nullTraits;
+        nsTArray<uint32_t> nullPercents;
+        mTraitListener->OnMessageTraitsClassified(EmptyCString(), nullTraits,
+                                                  nullPercents);
+      }
+      mTokenListener = nullptr;  // this breaks the circular ref that keeps
+                                 // this object alive, so we will be
+                                 // destroyed as a result.
+    }
+  }
+
+ private:
+  nsBayesianFilter* mFilter;
+  nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
+  nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
+  nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
+  nsCOMPtr<nsIMsgTraitDetailListener> mDetailListener;
+  nsTArray<uint32_t> mProTraits;
+  nsTArray<uint32_t> mAntiTraits;
+  nsCOMPtr<nsIMsgWindow> mMsgWindow;
+  nsTArray<nsCString> mMessageURIs;
+  uint32_t mCurMessageToClassify;  // 0-based index
+};
+
+nsresult nsBayesianFilter::tokenizeMessage(const nsACString& aMessageURI,
+                                           nsIMsgWindow* aMsgWindow,
+                                           TokenAnalyzer* aAnalyzer) {
+  nsCOMPtr<nsIMsgMessageService> msgService;
+  nsresult rv =
+      GetMessageServiceFromURI(aMessageURI, getter_AddRefs(msgService));
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  aAnalyzer->setSource(aMessageURI);
+  nsCOMPtr<nsIURI> dummyNull;
+  return msgService->StreamMessage(
+      aMessageURI, aAnalyzer->mTokenListener, aMsgWindow, nullptr,
+      true /* convert data */, "filter"_ns, false, getter_AddRefs(dummyNull));
+}
+
+// a TraitAnalysis is the per-token representation of the statistical
+// calculations, basically created to group information that is then
+// sorted by mDistance
+struct TraitAnalysis {
+  uint32_t mTokenIndex;
+  double mDistance;
+  double mProbability;
+};
+
+// comparator required to sort an nsTArray
+class compareTraitAnalysis {
+ public:
+  bool Equals(const TraitAnalysis& a, const TraitAnalysis& b) const {
+    return a.mDistance == b.mDistance;
+  }
+  bool LessThan(const TraitAnalysis& a, const TraitAnalysis& b) const {
+    return a.mDistance < b.mDistance;
+  }
+};
+
+inline double dmax(double x, double y) { return (x > y ? x : y); }
+inline double dmin(double x, double y) { return (x < y ? x : y); }
+
+// Chi square functions are implemented by an incomplete gamma function.
+// Note that chi2P's callers multiply the arguments by 2 but chi2P
+// divides them by 2 again. Inlining chi2P gives the compiler a
+// chance to notice this.
+
+// Both chi2P and nsIncompleteGammaP set *error negative on domain
+// errors and nsIncompleteGammaP sets it positive on internal errors.
+// This may be useful but the chi2P callers treat any error as fatal.
+
+// Note that converting unsigned ints to floating point can be slow on
+// some platforms (like Intel) so use signed quantities for the numeric
+// routines.
+static inline double chi2P(double chi2, double nu, int32_t* error) {
+  // domain checks; set error and return a dummy value
+  if (chi2 < 0.0 || nu <= 0.0) {
+    *error = -1;
+    return 0.0;
+  }
+  // reversing the arguments is intentional
+  return nsIncompleteGammaP(nu / 2.0, chi2 / 2.0, error);
+}
+
+void nsBayesianFilter::classifyMessage(
+    Tokenizer& tokenizer, const nsACString& messageURI,
+    nsTArray<uint32_t>& aProTraits, nsTArray<uint32_t>& aAntiTraits,
+    nsIJunkMailClassificationListener* listener,
+    nsIMsgTraitClassificationListener* aTraitListener,
+    nsIMsgTraitDetailListener* aDetailListener) {
+  if (aProTraits.Length() != aAntiTraits.Length()) {
+    NS_ERROR("Each Pro trait needs a matching Anti trait");
+    return;
+  }
+  Token* tokens = tokenizer.copyTokens();
+  uint32_t tokenCount;
+  if (!tokens) {
+    // This can happen with problems with UTF conversion
+    NS_ERROR("Trying to classify a null or invalid message");
+    tokenCount = 0;
+    // don't return so that we still call the listeners
+  } else {
+    tokenCount = tokenizer.countTokens();
+  }
+
+  /* this part is similar to the Graham algorithm with some adjustments. */
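+
+  // Worked example (illustrative, not from the upstream source): suppose a
+  // token was trained 3 times as junk (pro) and once as good (anti), with
+  // 100 junk and 200 good messages in the corpus. The loop below computes
+  //   prob     = (3 * 200) / (3 * 200 + 1 * 100) = 600 / 700 ~= 0.857
+  //   adjusted = (0.225 + 4 * 0.857) / (0.45 + 4)            ~= 0.821
+  //   distance = |0.821 - 0.5| = 0.321 >= 0.1
+  // so the token is recorded via setAnalysis() as a clue for the
+  // chi-square combination further down.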
+  uint32_t traitCount = aProTraits.Length();
+
+  // pro message counts per trait index
+  AutoTArray<uint32_t, kTraitAutoCapacity> numProMessages;
+  // anti message counts per trait index
+  AutoTArray<uint32_t, kTraitAutoCapacity> numAntiMessages;
+  // array of pro aliases per trait index
+  AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> proAliasArrays;
+  // array of anti aliases per trait index
+  AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> antiAliasArrays;
+  // construct the outgoing listener arrays
+  AutoTArray<uint32_t, kTraitAutoCapacity> traits;
+  AutoTArray<uint32_t, kTraitAutoCapacity> percents;
+  if (traitCount > kTraitAutoCapacity) {
+    traits.SetCapacity(traitCount);
+    percents.SetCapacity(traitCount);
+    numProMessages.SetCapacity(traitCount);
+    numAntiMessages.SetCapacity(traitCount);
+    proAliasArrays.SetCapacity(traitCount);
+    antiAliasArrays.SetCapacity(traitCount);
+  }
+
+  nsresult rv;
+  nsCOMPtr<nsIMsgTraitService> traitService(
+      do_GetService("@mozilla.org/msg-trait-service;1", &rv));
+  if (NS_FAILED(rv)) {
+    NS_ERROR("Failed to get trait service");
+    MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
+            ("Failed to get trait service"));
+  }
+
+  // get aliases and message counts for the pro and anti traits
+  for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
+    nsresult rv;
+
+    // pro trait
+    nsTArray<uint32_t> proAliases;
+    uint32_t proTrait = aProTraits[traitIndex];
+    if (traitService) {
+      rv = traitService->GetAliases(proTrait, proAliases);
+      if (NS_FAILED(rv)) {
+        NS_ERROR("trait service failed to get aliases");
+        MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
+                ("trait service failed to get aliases"));
+      }
+    }
+    proAliasArrays.AppendElement(proAliases.Clone());
+    uint32_t proMessageCount = mCorpus.getMessageCount(proTrait);
+    for (uint32_t aliasIndex = 0; aliasIndex < proAliases.Length();
+         aliasIndex++)
+      proMessageCount += mCorpus.getMessageCount(proAliases[aliasIndex]);
+    numProMessages.AppendElement(proMessageCount);
+
+    // anti trait
+    nsTArray<uint32_t> antiAliases;
+    uint32_t antiTrait = aAntiTraits[traitIndex];
+    if (traitService) {
+      rv = traitService->GetAliases(antiTrait, antiAliases);
+      if (NS_FAILED(rv)) {
+        NS_ERROR("trait service failed to get aliases");
+        MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
+                ("trait service failed to get aliases"));
+      }
+    }
+    antiAliasArrays.AppendElement(antiAliases.Clone());
+    uint32_t antiMessageCount = mCorpus.getMessageCount(antiTrait);
+    for (uint32_t aliasIndex = 0; aliasIndex < antiAliases.Length();
+         aliasIndex++)
+      antiMessageCount += mCorpus.getMessageCount(antiAliases[aliasIndex]);
+    numAntiMessages.AppendElement(antiMessageCount);
+  }
+
+  for (uint32_t i = 0; i < tokenCount; ++i) {
+    Token& token = tokens[i];
+    CorpusToken* t = mCorpus.get(token.mWord);
+    if (!t) continue;
+    for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
+      uint32_t iProCount = mCorpus.getTraitCount(t, aProTraits[traitIndex]);
+      // add in any counts for aliases to proTrait
+      for (uint32_t aliasIndex = 0;
+           aliasIndex < proAliasArrays[traitIndex].Length(); aliasIndex++)
+        iProCount +=
+            mCorpus.getTraitCount(t, proAliasArrays[traitIndex][aliasIndex]);
+      double proCount = static_cast<double>(iProCount);
+
+      uint32_t iAntiCount = mCorpus.getTraitCount(t, aAntiTraits[traitIndex]);
+      // add in any counts for aliases to antiTrait
+      for (uint32_t aliasIndex = 0;
+           aliasIndex < antiAliasArrays[traitIndex].Length(); aliasIndex++)
+        iAntiCount +=
+            mCorpus.getTraitCount(t, antiAliasArrays[traitIndex][aliasIndex]);
+      double antiCount = static_cast<double>(iAntiCount);
+
+      double prob, denom;
+      // Prevent a divide by zero error by setting defaults for prob
+
+      // If there are no matching tokens at all, ignore.
+      if (antiCount == 0.0 && proCount == 0.0) continue;
+      // if only anti match, set probability to 0%
+      if (proCount == 0.0) prob = 0.0;
+      // if only pro match, set probability to 100%
+      else if (antiCount == 0.0)
+        prob = 1.0;
+      // not really needed, but just to be sure check the denom as well
+      else if ((denom = proCount * numAntiMessages[traitIndex] +
+                        antiCount * numProMessages[traitIndex]) == 0.0)
+        continue;
+      else
+        prob = (proCount * numAntiMessages[traitIndex]) / denom;
+
+      double n = proCount + antiCount;
+      prob = (0.225 + n * prob) / (.45 + n);
+      double distance = std::abs(prob - 0.5);
+      if (distance >= .1) {
+        mozilla::DebugOnly<nsresult> rv =
+            setAnalysis(token, traitIndex, distance, prob);
+        NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
+      }
+    }
+  }
+
+  for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
+    AutoTArray traitAnalyses;
+    // copy valid tokens into an array to sort
+    for (uint32_t tokenIndex = 0; tokenIndex < tokenCount; tokenIndex++) {
+      uint32_t storeIndex = getAnalysisIndex(tokens[tokenIndex], traitIndex);
+      if (storeIndex) {
+        TraitAnalysis ta = {tokenIndex, mAnalysisStore[storeIndex].mDistance,
+                            mAnalysisStore[storeIndex].mProbability};
+        traitAnalyses.AppendElement(ta);
+      }
+    }
+
+    // sort the array by the distances
+    traitAnalyses.Sort(compareTraitAnalysis());
+    uint32_t count = traitAnalyses.Length();
+    uint32_t first, last = count;
+    const uint32_t kMaxTokens = 150;
+    first = (count > kMaxTokens) ? count - kMaxTokens : 0;
+
+    // Setup the arrays to save details if needed
+    nsTArray<double> sArray;
+    nsTArray<double> hArray;
+    uint32_t usedTokenCount = (count > kMaxTokens) ? kMaxTokens : count;
+    if (aDetailListener) {
+      sArray.SetCapacity(usedTokenCount);
+      hArray.SetCapacity(usedTokenCount);
+    }
+
+    double H = 1.0, S = 1.0;
+    int32_t Hexp = 0, Sexp = 0;
+    uint32_t goodclues = 0;
+    int e;
+
+    // index from end to analyze most significant first
+    for (uint32_t ip1 = last; ip1 != first; --ip1) {
+      TraitAnalysis& ta = traitAnalyses[ip1 - 1];
+      if (ta.mDistance > 0.0) {
+        goodclues++;
+        double value = ta.mProbability;
+        S *= (1.0 - value);
+        H *= value;
+        if (S < 1e-200) {
+          S = frexp(S, &e);
+          Sexp += e;
+        }
+        if (H < 1e-200) {
+          H = frexp(H, &e);
+          Hexp += e;
+        }
+        MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
+                ("token probability (%s) is %f", tokens[ta.mTokenIndex].mWord,
+                 ta.mProbability));
+      }
+      if (aDetailListener) {
+        sArray.AppendElement(log(S) + Sexp * M_LN2);
+        hArray.AppendElement(log(H) + Hexp * M_LN2);
+      }
+    }
+
+    S = log(S) + Sexp * M_LN2;
+    H = log(H) + Hexp * M_LN2;
+
+    double prob;
+    if (goodclues > 0) {
+      int32_t chi_error;
+      S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
+      if (!chi_error) H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
+      // if any error toss the entire calculation
+      if (!chi_error)
+        prob = (S - H + 1.0) / 2.0;
+      else
+        prob = 0.5;
+    } else
+      prob = 0.5;
+
+    if (aDetailListener) {
+      // Prepare output arrays
+      nsTArray<uint32_t> tokenPercents(usedTokenCount);
+      nsTArray<uint32_t> runningPercents(usedTokenCount);
+      nsTArray<nsString> tokenStrings(usedTokenCount);
+
+      double clueCount = 1.0;
+      for (uint32_t tokenIndex = 0; tokenIndex < usedTokenCount;
+           tokenIndex++) {
+        TraitAnalysis& ta = traitAnalyses[last - 1 - tokenIndex];
+        int32_t chi_error;
+        S = chi2P(-2.0 * sArray[tokenIndex], 2.0 * clueCount, &chi_error);
+        if (!chi_error)
+          H = chi2P(-2.0 * hArray[tokenIndex], 2.0 * clueCount, &chi_error);
+        clueCount += 1.0;
+        double runningProb;
+        if (!chi_error)
+          runningProb = (S - H + 1.0) / 2.0;
+        else
+          runningProb = 0.5;
+        runningPercents.AppendElement(
+            static_cast<uint32_t>(runningProb * 100. + .5));
+        tokenPercents.AppendElement(
+            static_cast<uint32_t>(ta.mProbability * 100. + .5));
+        tokenStrings.AppendElement(
+            NS_ConvertUTF8toUTF16(tokens[ta.mTokenIndex].mWord));
+      }
+
+      aDetailListener->OnMessageTraitDetails(messageURI,
+                                             aProTraits[traitIndex],
+                                             tokenStrings, tokenPercents,
+                                             runningPercents);
+    }
+
+    uint32_t proPercent = static_cast<uint32_t>(prob * 100. + .5);
+
+    // directly classify junk to maintain backwards compatibility
+    if (aProTraits[traitIndex] == kJunkTrait) {
+      bool isJunk = (prob >= mJunkProbabilityThreshold);
+      MOZ_LOG(BayesianFilterLogModule, LogLevel::Info,
+              ("%s is junk probability = (%f) HAM SCORE:%f SPAM SCORE:%f",
+               PromiseFlatCString(messageURI).get(), prob, H, S));
+
+      // the algorithm in "A Plan For Spam" assumes that you have a large
+      // good corpus and a large junk corpus.
+      // that won't be the case with users who first use the junk mail trait
+      // so, we do certain things to encourage them to train.
+      //
+      // if there are no good tokens, assume the message is junk
+      // this will "encourage" the user to train
+      // and if there are no bad tokens, assume the message is not junk
+      // this will also "encourage" the user to train
+      // see bug #194238
+
+      if (listener && !mCorpus.getMessageCount(kGoodTrait))
+        isJunk = true;
+      else if (listener && !mCorpus.getMessageCount(kJunkTrait))
+        isJunk = false;
+
+      if (listener)
+        listener->OnMessageClassified(
+            messageURI,
+            isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK)
+                   : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD),
+            proPercent);
+    }
+
+    if (aTraitListener) {
+      traits.AppendElement(aProTraits[traitIndex]);
+      percents.AppendElement(proPercent);
+    }
+  }
+
+  if (aTraitListener)
+    aTraitListener->OnMessageTraitsClassified(messageURI, traits, percents);
+
+  delete[] tokens;
+  // reuse mAnalysisStore without clearing memory
+  mNextAnalysisIndex = 1;
+  // but shrink it back to the default size
+  if (mAnalysisStore.Length() > kAnalysisStoreCapacity)
+    mAnalysisStore.RemoveElementsAt(
+        kAnalysisStoreCapacity,
+        mAnalysisStore.Length() - kAnalysisStoreCapacity);
+  mAnalysisStore.Compact();
+}
+
+void nsBayesianFilter::classifyMessage(
+    Tokenizer& tokens, const nsACString& messageURI,
+    nsIJunkMailClassificationListener* aJunkListener) {
+  AutoTArray<uint32_t, 1> proTraits;
+  AutoTArray<uint32_t, 1> antiTraits;
+  proTraits.AppendElement(kJunkTrait);
+  antiTraits.AppendElement(kGoodTrait);
+  classifyMessage(tokens, messageURI, proTraits, antiTraits, aJunkListener,
+                  nullptr, nullptr);
+}
+
+NS_IMETHODIMP
+nsBayesianFilter::Observe(nsISupports* aSubject, const char* aTopic,
+                          const char16_t* someData) {
+  if (!strcmp(aTopic, "profile-before-change")) Shutdown();
+  return NS_OK;
+}
+
+/* void shutdown (); */
+NS_IMETHODIMP nsBayesianFilter::Shutdown() {
+  if (mTrainingDataDirty) mCorpus.writeTrainingData(mMaximumTokenCount);
+  mTrainingDataDirty = false;
+
+  return NS_OK;
+}
+ *aShouldDownloadAllHeaders = false; + return NS_OK; +} + +/* void classifyMessage (in string aMsgURL, in nsIJunkMailClassificationListener + * aListener); */ +NS_IMETHODIMP nsBayesianFilter::ClassifyMessage( + const nsACString& aMessageURL, nsIMsgWindow* aMsgWindow, + nsIJunkMailClassificationListener* aListener) { + AutoTArray urls = {PromiseFlatCString(aMessageURL)}; + MessageClassifier* analyzer = + new MessageClassifier(this, aListener, aMsgWindow, urls); + NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY); + TokenStreamListener* tokenListener = new TokenStreamListener(analyzer); + NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY); + analyzer->setTokenListener(tokenListener); + return tokenizeMessage(aMessageURL, aMsgWindow, analyzer); +} + +/* void classifyMessages(in Array aMsgURIs, + * in nsIMsgWindow aMsgWindow, + * in nsIJunkMailClassificationListener aListener); */ +NS_IMETHODIMP nsBayesianFilter::ClassifyMessages( + const nsTArray& aMsgURLs, nsIMsgWindow* aMsgWindow, + nsIJunkMailClassificationListener* aListener) { + TokenAnalyzer* analyzer = + new MessageClassifier(this, aListener, aMsgWindow, aMsgURLs); + NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY); + TokenStreamListener* tokenListener = new TokenStreamListener(analyzer); + NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY); + analyzer->setTokenListener(tokenListener); + return tokenizeMessage(aMsgURLs[0], aMsgWindow, analyzer); +} + +nsresult nsBayesianFilter::setAnalysis(Token& token, uint32_t aTraitIndex, + double aDistance, double aProbability) { + uint32_t nextLink = token.mAnalysisLink; + uint32_t lastLink = 0; + uint32_t linkCount = 0, maxLinks = 100; + + // try to find an existing element. Limit the search to maxLinks + // as a precaution + for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) { + AnalysisPerToken& rAnalysis = mAnalysisStore[nextLink]; + if (rAnalysis.mTraitIndex == aTraitIndex) { + rAnalysis.mDistance = aDistance; + rAnalysis.mProbability = aProbability; + return NS_OK; + } + lastLink = nextLink; + nextLink = rAnalysis.mNextLink; + } + if (linkCount >= maxLinks) return NS_ERROR_FAILURE; + + // trait does not exist, so add it + + AnalysisPerToken analysis(aTraitIndex, aDistance, aProbability); + if (mAnalysisStore.Length() == mNextAnalysisIndex) + mAnalysisStore.InsertElementAt(mNextAnalysisIndex, analysis); + else if (mAnalysisStore.Length() > mNextAnalysisIndex) + mAnalysisStore.ReplaceElementsAt(mNextAnalysisIndex, 1, analysis); + else // we can only insert at the end of the array + return NS_ERROR_FAILURE; + + if (lastLink) + // the token had at least one link, so update the last link to point to + // the new item + mAnalysisStore[lastLink].mNextLink = mNextAnalysisIndex; + else + // need to update the token's first link + token.mAnalysisLink = mNextAnalysisIndex; + mNextAnalysisIndex++; + return NS_OK; +} + +uint32_t nsBayesianFilter::getAnalysisIndex(Token& token, + uint32_t aTraitIndex) { + uint32_t nextLink; + uint32_t linkCount = 0, maxLinks = 100; + for (nextLink = token.mAnalysisLink; nextLink && linkCount < maxLinks; + linkCount++) { + AnalysisPerToken& rAnalysis = mAnalysisStore[nextLink]; + if (rAnalysis.mTraitIndex == aTraitIndex) return nextLink; + nextLink = rAnalysis.mNextLink; + } + NS_ASSERTION(linkCount < maxLinks, "corrupt analysis store"); + + // Trait not found, indicate by zero + return 0; +} + +NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessage( + const nsACString& aMsgURI, const nsTArray& aProTraits, + const nsTArray& aAntiTraits, + 
nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
+    nsIJunkMailClassificationListener* aJunkListener) {
+  AutoTArray<nsCString, 1> uris = {PromiseFlatCString(aMsgURI)};
+  return ClassifyTraitsInMessages(uris, aProTraits, aAntiTraits, aTraitListener,
+                                  aMsgWindow, aJunkListener);
+}
+
+NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessages(
+    const nsTArray<nsCString>& aMsgURIs, const nsTArray<uint32_t>& aProTraits,
+    const nsTArray<uint32_t>& aAntiTraits,
+    nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
+    nsIJunkMailClassificationListener* aJunkListener) {
+  MOZ_ASSERT(aProTraits.Length() == aAntiTraits.Length());
+  MessageClassifier* analyzer =
+      new MessageClassifier(this, aJunkListener, aTraitListener, nullptr,
+                            aProTraits, aAntiTraits, aMsgWindow, aMsgURIs);
+
+  TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
+
+  analyzer->setTokenListener(tokenListener);
+  return tokenizeMessage(aMsgURIs[0], aMsgWindow, analyzer);
+}
+
+class MessageObserver : public TokenAnalyzer {
+ public:
+  MessageObserver(nsBayesianFilter* filter,
+                  const nsTArray<uint32_t>& aOldClassifications,
+                  const nsTArray<uint32_t>& aNewClassifications,
+                  nsIJunkMailClassificationListener* aJunkListener,
+                  nsIMsgTraitClassificationListener* aTraitListener)
+      : mFilter(filter),
+        mJunkMailPlugin(filter),
+        mJunkListener(aJunkListener),
+        mTraitListener(aTraitListener),
+        mOldClassifications(aOldClassifications.Clone()),
+        mNewClassifications(aNewClassifications.Clone()) {}
+
+  virtual void analyzeTokens(Tokenizer& tokenizer) {
+    mFilter->observeMessage(tokenizer, mTokenSource, mOldClassifications,
+                            mNewClassifications, mJunkListener, mTraitListener);
+    // release reference to listener, which will allow us to go away as well.
+    mTokenListener = nullptr;
+  }
+
+ private:
+  nsBayesianFilter* mFilter;
+  nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
+  nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
+  nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
+  nsTArray<uint32_t> mOldClassifications;
+  nsTArray<uint32_t> mNewClassifications;
+};
+
+NS_IMETHODIMP nsBayesianFilter::SetMsgTraitClassification(
+    const nsACString& aMsgURI, const nsTArray<uint32_t>& aOldTraits,
+    const nsTArray<uint32_t>& aNewTraits,
+    nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
+    nsIJunkMailClassificationListener* aJunkListener) {
+  MessageObserver* analyzer = new MessageObserver(
+      this, aOldTraits, aNewTraits, aJunkListener, aTraitListener);
+  NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+
+  TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
+  NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+
+  analyzer->setTokenListener(tokenListener);
+  return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
+}
+
+// set new message classifications for a message
+void nsBayesianFilter::observeMessage(
+    Tokenizer& tokenizer, const nsACString& messageURL,
+    nsTArray<uint32_t>& oldClassifications,
+    nsTArray<uint32_t>& newClassifications,
+    nsIJunkMailClassificationListener* aJunkListener,
+    nsIMsgTraitClassificationListener* aTraitListener) {
+  bool trainingDataWasDirty = mTrainingDataDirty;
+
+  // Uhoh...if the user is re-training then the message may already be
+  // classified and we are classifying it again with the same classification.
+  // the old code would have removed the tokens for this message then added them
+  // back. But this really hurts the message occurrence count for tokens if you
+  // just removed training.dat and are re-training. See Bug #237095 for more
+  // details. What can we do here?
Well we can skip the token removal step if + // the classifications are the same and assume the user is just re-training. + // But this then allows users to re-classify the same message on the same + // training set over and over again leading to data skew. But that's all I can + // think to do right now to address this..... + uint32_t oldLength = oldClassifications.Length(); + for (uint32_t index = 0; index < oldLength; index++) { + uint32_t trait = oldClassifications.ElementAt(index); + // skip removing if trait is also in the new set + if (newClassifications.Contains(trait)) continue; + // remove the tokens from the token set it is currently in + uint32_t messageCount; + messageCount = mCorpus.getMessageCount(trait); + if (messageCount > 0) { + mCorpus.setMessageCount(trait, messageCount - 1); + mCorpus.forgetTokens(tokenizer, trait, 1); + mTrainingDataDirty = true; + } + } + + nsMsgJunkStatus newClassification = nsIJunkMailPlugin::UNCLASSIFIED; + uint32_t junkPercent = + 0; // 0 here is no possibility of meeting the classification + uint32_t newLength = newClassifications.Length(); + for (uint32_t index = 0; index < newLength; index++) { + uint32_t trait = newClassifications.ElementAt(index); + mCorpus.setMessageCount(trait, mCorpus.getMessageCount(trait) + 1); + mCorpus.rememberTokens(tokenizer, trait, 1); + mTrainingDataDirty = true; + + if (aJunkListener) { + if (trait == kJunkTrait) { + junkPercent = nsIJunkMailPlugin::IS_SPAM_SCORE; + newClassification = nsIJunkMailPlugin::JUNK; + } else if (trait == kGoodTrait) { + junkPercent = nsIJunkMailPlugin::IS_HAM_SCORE; + newClassification = nsIJunkMailPlugin::GOOD; + } + } + } + + if (aJunkListener) + aJunkListener->OnMessageClassified(messageURL, newClassification, + junkPercent); + + if (aTraitListener) { + // construct the outgoing listener arrays + AutoTArray traits; + AutoTArray percents; + uint32_t newLength = newClassifications.Length(); + if (newLength > kTraitAutoCapacity) { + traits.SetCapacity(newLength); + percents.SetCapacity(newLength); + } + traits.AppendElements(newClassifications); + for (uint32_t index = 0; index < newLength; index++) + percents.AppendElement(100); // This is 100 percent, or certainty + aTraitListener->OnMessageTraitsClassified(messageURL, traits, percents); + } + + if (mTrainingDataDirty && !trainingDataWasDirty) { + // if training data became dirty just now, schedule flush + // mMinFlushInterval msec from now + MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, + ("starting training data flush timer %i msec", mMinFlushInterval)); + + nsresult rv = NS_NewTimerWithFuncCallback( + getter_AddRefs(mTimer), nsBayesianFilter::TimerCallback, (void*)this, + mMinFlushInterval, nsITimer::TYPE_ONE_SHOT, + "nsBayesianFilter::TimerCallback", nullptr); + if (NS_FAILED(rv)) { + NS_WARNING("Could not start nsBayesianFilter timer"); + } + } +} + +NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(bool* aResult) { + *aResult = ((mCorpus.getMessageCount(kGoodTrait) + + mCorpus.getMessageCount(kJunkTrait)) && + mCorpus.countTokens()); + return NS_OK; +} + +// Set message classification (only allows junk and good) +NS_IMETHODIMP nsBayesianFilter::SetMessageClassification( + const nsACString& aMsgURL, nsMsgJunkStatus aOldClassification, + nsMsgJunkStatus aNewClassification, nsIMsgWindow* aMsgWindow, + nsIJunkMailClassificationListener* aListener) { + AutoTArray oldClassifications; + AutoTArray newClassifications; + + // convert between classifications and trait + if (aOldClassification == nsIJunkMailPlugin::JUNK) + 
oldClassifications.AppendElement(kJunkTrait); + else if (aOldClassification == nsIJunkMailPlugin::GOOD) + oldClassifications.AppendElement(kGoodTrait); + if (aNewClassification == nsIJunkMailPlugin::JUNK) + newClassifications.AppendElement(kJunkTrait); + else if (aNewClassification == nsIJunkMailPlugin::GOOD) + newClassifications.AppendElement(kGoodTrait); + + MessageObserver* analyzer = new MessageObserver( + this, oldClassifications, newClassifications, aListener, nullptr); + NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY); + + TokenStreamListener* tokenListener = new TokenStreamListener(analyzer); + NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY); + + analyzer->setTokenListener(tokenListener); + return tokenizeMessage(aMsgURL, aMsgWindow, analyzer); +} + +NS_IMETHODIMP nsBayesianFilter::ResetTrainingData() { + return mCorpus.resetTrainingData(); +} + +NS_IMETHODIMP nsBayesianFilter::DetailMessage( + const nsACString& aMsgURI, uint32_t aProTrait, uint32_t aAntiTrait, + nsIMsgTraitDetailListener* aDetailListener, nsIMsgWindow* aMsgWindow) { + AutoTArray proTraits = {aProTrait}; + AutoTArray antiTraits = {aAntiTrait}; + AutoTArray uris = {PromiseFlatCString(aMsgURI)}; + + MessageClassifier* analyzer = + new MessageClassifier(this, nullptr, nullptr, aDetailListener, proTraits, + antiTraits, aMsgWindow, uris); + NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY); + + TokenStreamListener* tokenListener = new TokenStreamListener(analyzer); + NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY); + + analyzer->setTokenListener(tokenListener); + return tokenizeMessage(aMsgURI, aMsgWindow, analyzer); +} + +// nsIMsgCorpus implementation + +NS_IMETHODIMP nsBayesianFilter::CorpusCounts(uint32_t aTrait, + uint32_t* aMessageCount, + uint32_t* aTokenCount) { + NS_ENSURE_ARG_POINTER(aTokenCount); + *aTokenCount = mCorpus.countTokens(); + if (aTrait && aMessageCount) *aMessageCount = mCorpus.getMessageCount(aTrait); + return NS_OK; +} + +NS_IMETHODIMP nsBayesianFilter::ClearTrait(uint32_t aTrait) { + return mCorpus.ClearTrait(aTrait); +} + +NS_IMETHODIMP +nsBayesianFilter::UpdateData(nsIFile* aFile, bool aIsAdd, + const nsTArray& aFromTraits, + const nsTArray& aToTraits) { + MOZ_ASSERT(aFromTraits.Length() == aToTraits.Length()); + return mCorpus.UpdateData(aFile, aIsAdd, aFromTraits, aToTraits); +} + +NS_IMETHODIMP +nsBayesianFilter::GetTokenCount(const nsACString& aWord, uint32_t aTrait, + uint32_t* aCount) { + NS_ENSURE_ARG_POINTER(aCount); + CorpusToken* t = mCorpus.get(PromiseFlatCString(aWord).get()); + uint32_t count = mCorpus.getTraitCount(t, aTrait); + *aCount = count; + return NS_OK; +} + +/* Corpus Store */ + +/* + Format of the training file for version 1: + [0xFEEDFACE] + [number good messages][number bad messages] + [number good tokens] + [count][length of word]word + ... + [number bad tokens] + [count][length of word]word + ... 
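+
+  As an illustration (an example of the spec above, not part of it): a
+  corpus holding one good message whose only token is "work", and no junk
+  messages, would be written as
+    [0xFEEDFACE][1][0][1][1][4]work[0]
+  i.e. cookie, 1 good message, 0 junk messages, 1 good token with count 1
+  and the 4-byte word "work", then 0 junk tokens. All 32-bit values are
+  written big-endian (see writeUInt32 below).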
+
+   Format of the trait file for version 1:
+   [0xFCA93601]  (the 01 is the version)
+   for each trait to write
+   [id of trait to write] (0 means end of list)
+   [number of messages per trait]
+   for each token with non-zero count
+   [count]
+   [length of word]word
+*/
+
+CorpusStore::CorpusStore()
+    : TokenHash(sizeof(CorpusToken)),
+      mNextTraitIndex(1)  // skip 0 since index=0 will mean end of linked list
+{
+  getTrainingFile(getter_AddRefs(mTrainingFile));
+  mTraitStore.SetCapacity(kTraitStoreCapacity);
+  TraitPerToken traitPT(0, 0);
+  mTraitStore.AppendElement(traitPT);  // dummy 0th element
+}
+
+CorpusStore::~CorpusStore() {}
+
+inline int writeUInt32(FILE* stream, uint32_t value) {
+  value = PR_htonl(value);
+  return fwrite(&value, sizeof(uint32_t), 1, stream);
+}
+
+inline int readUInt32(FILE* stream, uint32_t* value) {
+  int n = fread(value, sizeof(uint32_t), 1, stream);
+  if (n == 1) {
+    *value = PR_ntohl(*value);
+  }
+  return n;
+}
+
+void CorpusStore::forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
+                               uint32_t aCount) {
+  // if we are forgetting the tokens for a message, we should only
+  // subtract 1 from the occurrence count for that token in the training set,
+  // because we assume we only bumped the training set count once per message
+  // containing the token.
+  TokenEnumeration tokens = aTokenizer.getTokens();
+  while (tokens.hasMoreTokens()) {
+    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+    remove(token->mWord, aTraitId, aCount);
+  }
+}
+
+void CorpusStore::rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
+                                 uint32_t aCount) {
+  TokenEnumeration tokens = aTokenizer.getTokens();
+  while (tokens.hasMoreTokens()) {
+    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+    if (!token) {
+      NS_ERROR("null token");
+      continue;
+    }
+    add(token->mWord, aTraitId, aCount);
+  }
+}
+
+bool CorpusStore::writeTokens(FILE* stream, bool shrink, uint32_t aTraitId) {
+  uint32_t tokenCount = countTokens();
+  uint32_t newTokenCount = 0;
+
+  // calculate the tokens for this trait to write
+
+  TokenEnumeration tokens = getTokens();
+  for (uint32_t i = 0; i < tokenCount; ++i) {
+    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+    uint32_t count = getTraitCount(token, aTraitId);
+    // Shrinking the token database is accomplished by dividing all token counts
+    // by 2.
If shrinking, we'll ignore counts < 2, otherwise only ignore counts + // of < 1 + if ((shrink && count > 1) || (!shrink && count)) newTokenCount++; + } + + if (writeUInt32(stream, newTokenCount) != 1) return false; + + if (newTokenCount > 0) { + TokenEnumeration tokens = getTokens(); + for (uint32_t i = 0; i < tokenCount; ++i) { + CorpusToken* token = static_cast(tokens.nextToken()); + uint32_t wordCount = getTraitCount(token, aTraitId); + if (shrink) wordCount /= 2; + if (!wordCount) continue; // Don't output zero count words + if (writeUInt32(stream, wordCount) != 1) return false; + uint32_t tokenLength = strlen(token->mWord); + if (writeUInt32(stream, tokenLength) != 1) return false; + if (fwrite(token->mWord, tokenLength, 1, stream) != 1) return false; + } + } + return true; +} + +bool CorpusStore::readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId, + bool aIsAdd) { + uint32_t tokenCount; + if (readUInt32(stream, &tokenCount) != 1) return false; + + int64_t fpos = ftell(stream); + if (fpos < 0) return false; + + uint32_t bufferSize = 4096; + char* buffer = new char[bufferSize]; + if (!buffer) return false; + + for (uint32_t i = 0; i < tokenCount; ++i) { + uint32_t count; + if (readUInt32(stream, &count) != 1) break; + uint32_t size; + if (readUInt32(stream, &size) != 1) break; + fpos += 8; + if (fpos + size > fileSize) { + delete[] buffer; + return false; + } + if (size >= bufferSize) { + delete[] buffer; + while (size >= bufferSize) { + bufferSize *= 2; + if (bufferSize == 0) return false; + } + buffer = new char[bufferSize]; + if (!buffer) return false; + } + if (fread(buffer, size, 1, stream) != 1) break; + fpos += size; + buffer[size] = '\0'; + if (aIsAdd) + add(buffer, aTraitId, count); + else + remove(buffer, aTraitId, count); + } + + delete[] buffer; + + return true; +} + +nsresult CorpusStore::getTrainingFile(nsIFile** aTrainingFile) { + // should we cache the profile manager's directory? + nsCOMPtr profileDir; + + nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, + getter_AddRefs(profileDir)); + NS_ENSURE_SUCCESS(rv, rv); + rv = profileDir->Append(u"training.dat"_ns); + NS_ENSURE_SUCCESS(rv, rv); + + return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void**)aTrainingFile); +} + +nsresult CorpusStore::getTraitFile(nsIFile** aTraitFile) { + // should we cache the profile manager's directory? 
+ nsCOMPtr profileDir; + + nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, + getter_AddRefs(profileDir)); + NS_ENSURE_SUCCESS(rv, rv); + + rv = profileDir->Append(u"traits.dat"_ns); + NS_ENSURE_SUCCESS(rv, rv); + + return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void**)aTraitFile); +} + +static const char kMagicCookie[] = {'\xFE', '\xED', '\xFA', '\xCE'}; + +// random string used to identify trait file and version (last byte is version) +static const char kTraitCookie[] = {'\xFC', '\xA9', '\x36', '\x01'}; + +void CorpusStore::writeTrainingData(uint32_t aMaximumTokenCount) { + MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, + ("writeTrainingData() entered")); + if (!mTrainingFile) return; + + /* + * For backwards compatibility, write the good and junk tokens to + * training.dat; additional traits are added to a different file + */ + + // open the file, and write out training data + FILE* stream; + nsresult rv = mTrainingFile->OpenANSIFileDesc("wb", &stream); + if (NS_FAILED(rv)) return; + + // If the number of tokens exceeds our limit, set the shrink flag + bool shrink = false; + if ((aMaximumTokenCount > 0) && // if 0, do not limit tokens + (countTokens() > aMaximumTokenCount)) { + shrink = true; + MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning, + ("shrinking token data file")); + } + + // We implement shrink by dividing counts by two + uint32_t shrinkFactor = shrink ? 2 : 1; + + if (!((fwrite(kMagicCookie, sizeof(kMagicCookie), 1, stream) == 1) && + (writeUInt32(stream, getMessageCount(kGoodTrait) / shrinkFactor)) && + (writeUInt32(stream, getMessageCount(kJunkTrait) / shrinkFactor)) && + writeTokens(stream, shrink, kGoodTrait) && + writeTokens(stream, shrink, kJunkTrait))) { + NS_WARNING("failed to write training data."); + fclose(stream); + // delete the training data file, since it is potentially corrupt. + mTrainingFile->Remove(false); + } else { + fclose(stream); + } + + /* + * Write the remaining data to a second file traits.dat + */ + + if (!mTraitFile) { + getTraitFile(getter_AddRefs(mTraitFile)); + if (!mTraitFile) return; + } + + // open the file, and write out training data + rv = mTraitFile->OpenANSIFileDesc("wb", &stream); + if (NS_FAILED(rv)) return; + + uint32_t numberOfTraits = mMessageCounts.Length(); + bool error; + while (1) // break on error or done + { + if ((error = (fwrite(kTraitCookie, sizeof(kTraitCookie), 1, stream) != 1))) + break; + + for (uint32_t index = 0; index < numberOfTraits; index++) { + uint32_t trait = mMessageCountsId[index]; + if (trait == 1 || trait == 2) + continue; // junk traits are stored in training.dat + if ((error = (writeUInt32(stream, trait) != 1))) break; + if ((error = (writeUInt32(stream, mMessageCounts[index] / shrinkFactor) != + 1))) + break; + if ((error = !writeTokens(stream, shrink, trait))) break; + } + break; + } + // we add a 0 at the end to represent end of trait list + error = writeUInt32(stream, 0) != 1; + + fclose(stream); + if (error) { + NS_WARNING("failed to write trait data."); + // delete the trait data file, since it is probably corrupt. + mTraitFile->Remove(false); + } + + if (shrink) { + // We'll clear the tokens, and read them back in from the file. + // Yes this is slower than in place, but this is a rare event. 
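+    // (readTrainingData() below re-reads training.dat and traits.dat, whose
+    // token counts were already halved by the writeTokens() calls above, so
+    // the in-memory store ends up shrunk as well.)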
+ + if (countTokens()) { + clearTokens(); + for (uint32_t index = 0; index < numberOfTraits; index++) + mMessageCounts[index] = 0; + } + + readTrainingData(); + } +} + +void CorpusStore::readTrainingData() { + /* + * To maintain backwards compatibility, good and junk traits + * are stored in a file "training.dat" + */ + if (!mTrainingFile) return; + + bool exists; + nsresult rv = mTrainingFile->Exists(&exists); + if (NS_FAILED(rv) || !exists) return; + + FILE* stream; + rv = mTrainingFile->OpenANSIFileDesc("rb", &stream); + if (NS_FAILED(rv)) return; + + int64_t fileSize; + rv = mTrainingFile->GetFileSize(&fileSize); + if (NS_FAILED(rv)) return; + + // FIXME: should make sure that the tokenizers are empty. + char cookie[4]; + uint32_t goodMessageCount = 0, junkMessageCount = 0; + if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) && + (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) && + (readUInt32(stream, &goodMessageCount) == 1) && + (readUInt32(stream, &junkMessageCount) == 1) && + readTokens(stream, fileSize, kGoodTrait, true) && + readTokens(stream, fileSize, kJunkTrait, true))) { + NS_WARNING("failed to read training data."); + MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, + ("failed to read training data.")); + } + setMessageCount(kGoodTrait, goodMessageCount); + setMessageCount(kJunkTrait, junkMessageCount); + + fclose(stream); + + /* + * Additional traits are stored in traits.dat + */ + + if (!mTraitFile) { + getTraitFile(getter_AddRefs(mTraitFile)); + if (!mTraitFile) return; + } + + rv = mTraitFile->Exists(&exists); + if (NS_FAILED(rv) || !exists) return; + + nsTArray empty; + rv = UpdateData(mTraitFile, true, empty, empty); + + if (NS_FAILED(rv)) { + NS_WARNING("failed to read training data."); + MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, + ("failed to read training data.")); + } + return; +} + +nsresult CorpusStore::resetTrainingData() { + // clear out our in memory training tokens... + if (countTokens()) clearTokens(); + + uint32_t length = mMessageCounts.Length(); + for (uint32_t index = 0; index < length; index++) mMessageCounts[index] = 0; + + if (mTrainingFile) mTrainingFile->Remove(false); + if (mTraitFile) mTraitFile->Remove(false); + return NS_OK; +} + +inline CorpusToken* CorpusStore::get(const char* word) { + return static_cast(TokenHash::get(word)); +} + +nsresult CorpusStore::updateTrait(CorpusToken* token, uint32_t aTraitId, + int32_t aCountChange) { + NS_ENSURE_ARG_POINTER(token); + uint32_t nextLink = token->mTraitLink; + uint32_t lastLink = 0; + + uint32_t linkCount, maxLinks = 100; // sanity check + for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) { + TraitPerToken& traitPT = mTraitStore[nextLink]; + if (traitPT.mId == aTraitId) { + // be careful with signed versus unsigned issues here + if (static_cast(traitPT.mCount) + aCountChange > 0) + traitPT.mCount += aCountChange; + else + traitPT.mCount = 0; + // we could delete zero count traits here, but let's not. It's rare + // anyway. 
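+      // (A zero count is harmless to leave behind: writeTokens() skips
+      // zero-count words, so such entries vanish the next time the corpus
+      // is written out.)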
+ return NS_OK; + } + lastLink = nextLink; + nextLink = traitPT.mNextLink; + } + if (linkCount >= maxLinks) return NS_ERROR_FAILURE; + + // trait does not exist, so add it + + if (aCountChange > 0) // don't set a negative count + { + TraitPerToken traitPT(aTraitId, aCountChange); + if (mTraitStore.Length() == mNextTraitIndex) + mTraitStore.InsertElementAt(mNextTraitIndex, traitPT); + else if (mTraitStore.Length() > mNextTraitIndex) + mTraitStore.ReplaceElementsAt(mNextTraitIndex, 1, traitPT); + else + return NS_ERROR_FAILURE; + if (lastLink) + // the token had a parent, so update it + mTraitStore[lastLink].mNextLink = mNextTraitIndex; + else + // need to update the token's root link + token->mTraitLink = mNextTraitIndex; + mNextTraitIndex++; + } + return NS_OK; +} + +uint32_t CorpusStore::getTraitCount(CorpusToken* token, uint32_t aTraitId) { + uint32_t nextLink; + if (!token || !(nextLink = token->mTraitLink)) return 0; + + uint32_t linkCount, maxLinks = 100; // sanity check + for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) { + TraitPerToken& traitPT = mTraitStore[nextLink]; + if (traitPT.mId == aTraitId) return traitPT.mCount; + nextLink = traitPT.mNextLink; + } + NS_ASSERTION(linkCount < maxLinks, "Corrupt trait count store"); + + // trait not found (or error), so count is zero + return 0; +} + +CorpusToken* CorpusStore::add(const char* word, uint32_t aTraitId, + uint32_t aCount) { + CorpusToken* token = static_cast(TokenHash::add(word)); + if (token) { + MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, + ("adding word to corpus store: %s (Trait=%d) (deltaCount=%d)", word, + aTraitId, aCount)); + updateTrait(token, aTraitId, aCount); + } + return token; +} + +void CorpusStore::remove(const char* word, uint32_t aTraitId, uint32_t aCount) { + MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, + ("remove word: %s (TraitId=%d) (Count=%d)", word, aTraitId, aCount)); + CorpusToken* token = get(word); + if (token) updateTrait(token, aTraitId, -static_cast(aCount)); +} + +uint32_t CorpusStore::getMessageCount(uint32_t aTraitId) { + size_t index = mMessageCountsId.IndexOf(aTraitId); + if (index == mMessageCountsId.NoIndex) return 0; + return mMessageCounts.ElementAt(index); +} + +void CorpusStore::setMessageCount(uint32_t aTraitId, uint32_t aCount) { + size_t index = mMessageCountsId.IndexOf(aTraitId); + if (index == mMessageCountsId.NoIndex) { + mMessageCounts.AppendElement(aCount); + mMessageCountsId.AppendElement(aTraitId); + } else { + mMessageCounts[index] = aCount; + } +} + +nsresult CorpusStore::UpdateData(nsIFile* aFile, bool aIsAdd, + const nsTArray& aFromTraits, + const nsTArray& aToTraits) { + NS_ENSURE_ARG_POINTER(aFile); + MOZ_ASSERT(aFromTraits.Length() == aToTraits.Length()); + + int64_t fileSize; + nsresult rv = aFile->GetFileSize(&fileSize); + NS_ENSURE_SUCCESS(rv, rv); + + FILE* stream; + rv = aFile->OpenANSIFileDesc("rb", &stream); + NS_ENSURE_SUCCESS(rv, rv); + + bool error; + do // break on error or done + { + char cookie[4]; + if ((error = (fread(cookie, sizeof(cookie), 1, stream) != 1))) break; + + if ((error = memcmp(cookie, kTraitCookie, sizeof(cookie)))) break; + + uint32_t fileTrait; + while (!(error = (readUInt32(stream, &fileTrait) != 1)) && fileTrait) { + uint32_t count; + if ((error = (readUInt32(stream, &count) != 1))) break; + + uint32_t localTrait = fileTrait; + // remap the trait + for (uint32_t i = 0; i < aFromTraits.Length(); i++) { + if (aFromTraits[i] == fileTrait) localTrait = aToTraits[i]; + } + + uint32_t messageCount = 
getMessageCount(localTrait); + if (aIsAdd) + messageCount += count; + else if (count > messageCount) + messageCount = 0; + else + messageCount -= count; + setMessageCount(localTrait, messageCount); + + if ((error = !readTokens(stream, fileSize, localTrait, aIsAdd))) break; + } + break; + } while (0); + + fclose(stream); + + if (error) return NS_ERROR_FAILURE; + return NS_OK; +} + +nsresult CorpusStore::ClearTrait(uint32_t aTrait) { + // clear message counts + setMessageCount(aTrait, 0); + + TokenEnumeration tokens = getTokens(); + while (tokens.hasMoreTokens()) { + CorpusToken* token = static_cast(tokens.nextToken()); + int32_t wordCount = static_cast(getTraitCount(token, aTrait)); + updateTrait(token, aTrait, -wordCount); + } + return NS_OK; +} diff --git a/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h new file mode 100644 index 0000000000..70d0a1a02b --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h @@ -0,0 +1,397 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsBayesianFilter_h__ +#define nsBayesianFilter_h__ + +#include +#include "nsCOMPtr.h" +#include "nsIMsgFilterPlugin.h" +#include "PLDHashTable.h" +#include "nsITimer.h" +#include "nsTArray.h" +#include "nsString.h" +#include "nsWeakReference.h" +#include "nsIObserver.h" +#include "nsHashPropertyBag.h" +#include "mozilla/intl/WordBreaker.h" + +#include "mozilla/ArenaAllocator.h" + +#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15 * 60 * 1000 + +struct Token; +class TokenEnumeration; +class TokenAnalyzer; +class nsIMsgWindow; +class nsIUTF8StringEnumerator; +struct BaseToken; +struct CorpusToken; + +/** + * Helper class to enumerate Token objects in a PLDHashTable + * safely and without copying (see bugzilla #174859). The + * enumeration is safe to use until an Add() + * or Remove() is performed on the table. + */ +class TokenEnumeration { + public: + explicit TokenEnumeration(PLDHashTable* table); + bool hasMoreTokens(); + BaseToken* nextToken(); + + private: + PLDHashTable::Iterator mIterator; +}; + +// A trait is some aspect of a message, like being junk or tagged as +// Personal, that the statistical classifier should track. The Trait +// structure is a per-token representation of information pertaining to +// a message trait. +// +// Traits per token are maintained as a linked list. +// +struct TraitPerToken { + uint32_t mId; // identifying number for a trait + uint32_t mCount; // count of messages with this token and trait + uint32_t mNextLink; // index in mTraitStore for the next trait, or 0 + // for none + TraitPerToken(uint32_t aId, uint32_t aCount); // inititializer +}; + +// An Analysis is the statistical results for a particular message, a +// particular token, and for a particular pair of trait/antitrait, that +// is then used in subsequent analysis to score the message. +// +// Analyses per token are maintained as a linked list. +// +struct AnalysisPerToken { + uint32_t mTraitIndex; // index representing a protrait/antitrait pair. + // So if we are analyzing 3 different traits, then + // the first trait is 0, the second 1, etc. 
+ double mDistance; // absolute value of mProbability - 0.5 + double mProbability; // relative indicator of match of trait to token + uint32_t mNextLink; // index in mAnalysisStore for the Analysis object + // for the next trait index, or 0 for none. + // initializer + AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability); +}; + +class TokenHash { + public: + virtual ~TokenHash(); + /** + * Clears out the previous message tokens. + */ + nsresult clearTokens(); + uint32_t countTokens(); + TokenEnumeration getTokens(); + BaseToken* add(const char* word); + + protected: + explicit TokenHash(uint32_t entrySize); + mozilla::ArenaAllocator<16384, 2> mWordPool; + uint32_t mEntrySize; + PLDHashTable mTokenTable; + char* copyWord(const char* word, uint32_t len); + BaseToken* get(const char* word); +}; + +class Tokenizer : public TokenHash { + public: + Tokenizer(); + ~Tokenizer(); + + Token* get(const char* word); + + // The training set keeps an occurrence count on each word. This count + // is supposed to count the # of messages it occurs in. + // When add/remove is called while tokenizing a message and NOT the training + // set, + // + Token* add(const char* word, uint32_t count = 1); + + Token* copyTokens(); + + void tokenize(const char* text); + + /** + * Creates specific tokens based on the mime headers for the message being + * tokenized + */ + void tokenizeHeaders(nsTArray& aHeaderNames, + nsTArray& aHeaderValues); + + void tokenizeAttachments(nsTArray>& attachments); + + nsCString mBodyDelimiters; // delimiters for body tokenization + nsCString mHeaderDelimiters; // delimiters for header tokenization + + // arrays of extra headers to tokenize / to not tokenize + nsTArray mEnabledHeaders; + nsTArray mDisabledHeaders; + // Delimiters used in tokenizing a particular header. + // Parallel array to mEnabledHeaders + nsTArray mEnabledHeadersDelimiters; + bool mCustomHeaderTokenization; // Are there any preference-set tokenization + // customizations? + uint32_t mMaxLengthForToken; // maximum length of a token + // should we convert iframe to div during tokenization? + bool mIframeToDiv; + + private: + void tokenize_ascii_word(char* word); + void tokenize_japanese_word(char* chunk); + inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue, + bool aTokenizeValue = false, + const char* aDelimiters = nullptr); + nsresult stripHTML(const nsAString& inString, nsAString& outString); + // helper function to escape \n, \t, etc from a CString + void UnescapeCString(nsCString& aCString); + nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos, + bool isLastBuffer, int32_t* begin, int32_t* end, + bool* _retval); +}; + +/** + * Implements storage of a collection of message tokens and counts for + * a corpus of classified messages + */ + +class CorpusStore : public TokenHash { + public: + CorpusStore(); + ~CorpusStore(); + + /** + * retrieve the token structure for a particular string + * + * @param word the character representation of the token + * + * @return token structure containing counts, null if not found + */ + CorpusToken* get(const char* word); + + /** + * add tokens to the storage, or increment counts if already exists. 
+ * + * @param aTokenizer tokenizer for the list of tokens to remember + * @param aTraitId id for the trait whose counts will be remembered + * @param aCount number of new messages represented by the token list + */ + void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId, + uint32_t aCount); + + /** + * decrement counts for tokens in the storage, removing if all counts + * are zero + * + * @param aTokenizer tokenizer for the list of tokens to forget + * @param aTraitId id for the trait whose counts will be removed + * @param aCount number of messages represented by the token list + */ + void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount); + + /** + * write the corpus information to file storage + * + * @param aMaximumTokenCount prune tokens if number of tokens exceeds + * this value. == 0 for no pruning + */ + void writeTrainingData(uint32_t aMaximumTokenCount); + + /** + * read the corpus information from file storage + */ + void readTrainingData(); + + /** + * delete the local corpus storage file and data + */ + nsresult resetTrainingData(); + + /** + * get the count of messages whose tokens are stored that are associated + * with a trait + * + * @param aTraitId identifier for the trait + * @return number of messages for that trait + */ + uint32_t getMessageCount(uint32_t aTraitId); + + /** + * set the count of messages whose tokens are stored that are associated + * with a trait + * + * @param aTraitId identifier for the trait + * @param aCount number of messages for that trait + */ + void setMessageCount(uint32_t aTraitId, uint32_t aCount); + + /** + * get the count of messages associated with a particular token and trait + * + * @param token the token string and associated counts + * @param aTraitId identifier for the trait + */ + uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId); + + /** + * Add (or remove) data from a particular file to the corpus data. + * + * @param aFile the file with the data, in the format: + * + * Format of the trait file for version 1: + * [0xFCA93601] (the 01 is the version) + * for each trait to write: + * [id of trait to write] (0 means end of list) + * [number of messages per trait] + * for each token with non-zero count + * [count] + * [length of word]word + * + * @param aIsAdd should the data be added, or removed? true if adding, + * else removing. + * + * @param aFromTraits array of trait ids used in aFile. If aFile contains + * trait ids that are not in this array, they are not + * remapped, but assumed to be local trait ids. + * + * @param aToTraits array of trait ids, corresponding to elements of + * aFromTraits, that represent the local trait ids to be + * used in storing data from aFile into the local corpus. 
+ * + */ + nsresult UpdateData(nsIFile* aFile, bool aIsAdd, + const nsTArray& aFromTraits, + const nsTArray& aToTraits); + + /** + * remove all counts (message and tokens) for a trait id + * + * @param aTrait trait id for the trait to remove + */ + nsresult ClearTrait(uint32_t aTrait); + + protected: + /** + * return the local corpus storage file for junk traits + */ + nsresult getTrainingFile(nsIFile** aFile); + + /** + * return the local corpus storage file for non-junk traits + */ + nsresult getTraitFile(nsIFile** aFile); + + /** + * read token strings from the data file + * + * @param stream file stream with token data + * @param fileSize file size + * @param aTraitId id for the trait whose counts will be read + * @param aIsAdd true to add the counts, false to remove them + * + * @return true if successful, false if error + */ + bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId, + bool aIsAdd); + + /** + * write token strings to the data file + */ + bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId); + + /** + * remove counts for a token string + */ + void remove(const char* word, uint32_t aTraitId, uint32_t aCount); + + /** + * add counts for a token string, adding the token string if new + */ + CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount); + + /** + * change counts in a trait in the traits array, adding the trait if needed + */ + nsresult updateTrait(CorpusToken* token, uint32_t aTraitId, + int32_t aCountChange); + nsCOMPtr mTrainingFile; // file used to store junk training data + nsCOMPtr mTraitFile; // file used to store non-junk + // training data + nsTArray mTraitStore; // memory for linked-list of counts + uint32_t mNextTraitIndex; // index in mTraitStore to first empty + // TraitPerToken + nsTArray mMessageCounts; // count of messages per trait + // represented in the store + nsTArray mMessageCountsId; // Parallel array to mMessageCounts, + // with the corresponding trait ID +}; + +class nsBayesianFilter : public nsIJunkMailPlugin, + nsIMsgCorpus, + nsIObserver, + nsSupportsWeakReference { + public: + NS_DECL_ISUPPORTS + NS_DECL_NSIMSGFILTERPLUGIN + NS_DECL_NSIJUNKMAILPLUGIN + NS_DECL_NSIMSGCORPUS + NS_DECL_NSIOBSERVER + + nsBayesianFilter(); + + nsresult Init(); + + nsresult tokenizeMessage(const nsACString& messageURI, + nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer); + void classifyMessage(Tokenizer& tokens, const nsACString& messageURI, + nsIJunkMailClassificationListener* listener); + + void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI, + nsTArray& aProTraits, + nsTArray& aAntiTraits, + nsIJunkMailClassificationListener* listener, + nsIMsgTraitClassificationListener* aTraitListener, + nsIMsgTraitDetailListener* aDetailListener); + + void observeMessage(Tokenizer& tokens, const nsACString& messageURI, + nsTArray& oldClassifications, + nsTArray& newClassifications, + nsIJunkMailClassificationListener* listener, + nsIMsgTraitClassificationListener* aTraitListener); + + protected: + virtual ~nsBayesianFilter(); + + static void TimerCallback(nsITimer* aTimer, void* aClosure); + + CorpusStore mCorpus; + double mJunkProbabilityThreshold; + int32_t mMaximumTokenCount; + bool mTrainingDataDirty; + int32_t mMinFlushInterval; // in milliseconds, must be positive + // and not too close to 0 + nsCOMPtr mTimer; + + // index in mAnalysisStore for first empty AnalysisPerToken + uint32_t mNextAnalysisIndex; + // memory for linked list of AnalysisPerToken objects + nsTArray mAnalysisStore; + /** + * Determine 
the location in mAnalysisStore where the AnalysisPerToken
+   * object for a particular token and trait is stored
+   */
+  uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);
+  /**
+   * Set the value of the AnalysisPerToken object for a particular
+   * token and trait
+   */
+  nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance,
+                       double aProbability);
+};
+
+#endif  // nsBayesianFilter_h__
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/nsIncompleteGamma.h b/comm/mailnews/extensions/bayesian-spam-filter/nsIncompleteGamma.h
new file mode 100644
index 0000000000..9b96459c7c
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/nsIncompleteGamma.h
@@ -0,0 +1,239 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsIncompleteGamma_h__
+#define nsIncompleteGamma_h__
+
+/* An implementation of the incomplete gamma functions for real
+   arguments. P is defined as
+
+     P(a, x) = 1/Gamma(a) * integral from 0 to x of t^(a-1) e^(-t) dt
+
+   and
+
+     Q(a, x) = 1/Gamma(a) * integral from x to infinity of t^(a-1) e^(-t) dt
+
+   so that P(a,x) + Q(a,x) = 1.
+
+   Both a series expansion and a continued fraction exist. This
+   implementation uses the more efficient method based on the arguments.
+
+   Either case involves calculating a multiplicative term:
+   e^(-x)*x^a/Gamma(a).
+   Here we calculate the log of this term. Most math libraries have a
+   "lgamma" function but it is not re-entrant. Some libraries have a
+   "lgamma_r" which is re-entrant. Use it if possible. I have included a
+   simple replacement but it is certainly not as accurate.
+
+   Relative errors are almost always < 1e-10 and usually < 1e-14. Very
+   small and very large arguments cause trouble.
+
+   The region where a < 0.5 and x < 0.5 has poor error properties and is
+   not too stable. Get a better routine if you need results in this
+   region.
+
+   The error argument will be set negative if there is a domain error or
+   positive for an internal calculation error, currently lack of
+   convergence. A value is always returned, though.
+ */
+
+#include <math.h>
+#include <float.h>
+
+// the main routine
+static double nsIncompleteGammaP(double a, double x, int* error);
+
+// nsLnGamma(z): either a wrapper around lgamma_r or the internal function.
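+//
+// A quick sanity check for nsIncompleteGammaP: for a = 1 the integral is
+// elementary, P(1, x) = 1 - e^(-x), so
+//   int err;
+//   double p = nsIncompleteGammaP(1.0, 1.0, &err);
+// should yield p ~= 0.6321205588 (that is, 1 - 1/e) with err == 0.
+//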
+// C_m = B[2*m]/(2*m*(2*m-1)) where B is a Bernoulli number +static const double C_1 = 1.0 / 12.0; +static const double C_2 = -1.0 / 360.0; +static const double C_3 = 1.0 / 1260.0; +static const double C_4 = -1.0 / 1680.0; +static const double C_5 = 1.0 / 1188.0; +static const double C_6 = -691.0 / 360360.0; +static const double C_7 = 1.0 / 156.0; +static const double C_8 = -3617.0 / 122400.0; +static const double C_9 = 43867.0 / 244188.0; +static const double C_10 = -174611.0 / 125400.0; +static const double C_11 = 77683.0 / 5796.0; + +// truncated asymptotic series in 1/z +static inline double lngamma_asymp(double z) { + double w, w2, sum; + w = 1.0 / z; + w2 = w * w; + sum = + w * + (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (C_11 * w2 + C_10) + + C_9) + + C_8) + + C_7) + + C_6) + + C_5) + + C_4) + + C_3) + + C_2) + + C_1); + + return sum; +} + +struct fact_table_s { + double fact; + double lnfact; +}; + +// for speed and accuracy +static const struct fact_table_s FactTable[] = { + {1.000000000000000, 0.0000000000000000000000e+00}, + {1.000000000000000, 0.0000000000000000000000e+00}, + {2.000000000000000, 6.9314718055994530942869e-01}, + {6.000000000000000, 1.7917594692280550007892e+00}, + {24.00000000000000, 3.1780538303479456197550e+00}, + {120.0000000000000, 4.7874917427820459941458e+00}, + {720.0000000000000, 6.5792512120101009952602e+00}, + {5040.000000000000, 8.5251613610654142999881e+00}, + {40320.00000000000, 1.0604602902745250228925e+01}, + {362880.0000000000, 1.2801827480081469610995e+01}, + {3628800.000000000, 1.5104412573075515295248e+01}, + {39916800.00000000, 1.7502307845873885839769e+01}, + {479001600.0000000, 1.9987214495661886149228e+01}, + {6227020800.000000, 2.2552163853123422886104e+01}, + {87178291200.00000, 2.5191221182738681499610e+01}, + {1307674368000.000, 2.7899271383840891566988e+01}, + {20922789888000.00, 3.0671860106080672803835e+01}, + {355687428096000.0, 3.3505073450136888885825e+01}, + {6402373705728000., 3.6395445208033053576674e+01}}; +#define FactTableLength (int)(sizeof(FactTable) / sizeof(FactTable[0])) + +// for speed +static const double ln_2pi_2 = 0.918938533204672741803; // log(2*PI)/2 + +/* A simple lgamma function, not very robust. + + Valid for z_in > 0 ONLY. + + For z_in > 8 precision is quite good, relative errors < 1e-14 and + usually better. For z_in < 8 relative errors increase but are usually + < 1e-10. In two small regions, 1 +/- .001 and 2 +/- .001 errors + increase quickly. +*/ +static double nsLnGamma(double z_in, int* gsign) { + double scale, z, sum, result; + *gsign = 1; + + int zi = (int)z_in; + if (z_in == (double)zi) { + if (0 < zi && zi <= FactTableLength) + return FactTable[zi - 1].lnfact; // gamma(z) = (z-1)! 
+ } + + for (scale = 1.0, z = z_in; z < 8.0; ++z) scale *= z; + + sum = lngamma_asymp(z); + result = (z - 0.5) * log(z) - z + ln_2pi_2 - log(scale); + result += sum; + return result; +} + +// log( e^(-x)*x^a/Gamma(a) ) +static inline double lnPQfactor(double a, double x) { + int gsign; // ignored because a > 0 + return a * log(x) - x - nsLnGamma(a, &gsign); +} + +static double Pseries(double a, double x, int* error) { + double sum, term; + const double eps = 2.0 * DBL_EPSILON; + const int imax = 5000; + int i; + + sum = term = 1.0 / a; + for (i = 1; i < imax; ++i) { + term *= x / (a + i); + sum += term; + if (fabs(term) < eps * fabs(sum)) break; + } + + if (i >= imax) *error = 1; + + return sum; +} + +static double Qcontfrac(double a, double x, int* error) { + double result, D, C, e, f, term; + const double eps = 2.0 * DBL_EPSILON; + const double small = DBL_EPSILON * DBL_EPSILON * DBL_EPSILON * DBL_EPSILON; + const int imax = 5000; + int i; + + // modified Lentz method + f = x - a + 1.0; + if (fabs(f) < small) f = small; + C = f + 1.0 / small; + D = 1.0 / f; + result = D; + for (i = 1; i < imax; ++i) { + e = i * (a - i); + f += 2.0; + D = f + e * D; + if (fabs(D) < small) D = small; + D = 1.0 / D; + C = f + e / C; + if (fabs(C) < small) C = small; + term = C * D; + result *= term; + if (fabs(term - 1.0) < eps) break; + } + + if (i >= imax) *error = 1; + return result; +} + +static double nsIncompleteGammaP(double a, double x, int* error) { + double result, dom, ldom; + // domain errors. the return values are meaningless but have + // to return something. + *error = -1; + if (a <= 0.0) return 1.0; + if (x < 0.0) return 0.0; + *error = 0; + if (x == 0.0) return 0.0; + + ldom = lnPQfactor(a, x); + dom = exp(ldom); + // might need to adjust the crossover point + if (a <= 0.5) { + if (x < a + 1.0) + result = dom * Pseries(a, x, error); + else + result = 1.0 - dom * Qcontfrac(a, x, error); + } else { + if (x < a) + result = dom * Pseries(a, x, error); + else + result = 1.0 - dom * Qcontfrac(a, x, error); + } + + // not clear if this can ever happen + if (result > 1.0) result = 1.0; + if (result < 0.0) result = 0.0; + return result; +} + +#endif diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/head_bayes.js b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/head_bayes.js new file mode 100644 index 0000000000..b502dcc2e5 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/head_bayes.js @@ -0,0 +1,28 @@ +var { MailServices } = ChromeUtils.import( + "resource:///modules/MailServices.jsm" +); +var { XPCOMUtils } = ChromeUtils.importESModule( + "resource://gre/modules/XPCOMUtils.sys.mjs" +); +var { mailTestUtils } = ChromeUtils.import( + "resource://testing-common/mailnews/MailTestUtils.jsm" +); +var { localAccountUtils } = ChromeUtils.import( + "resource://testing-common/mailnews/LocalAccountUtils.jsm" +); + +var CC = Components.Constructor; + +// Ensure the profile directory is set up +do_get_profile(); + +function getSpec(aFileName) { + var file = do_get_file("resources/" + aFileName); + var uri = Services.io.newFileURI(file).QueryInterface(Ci.nsIURL); + uri = uri.mutate().setQuery("type=application/x-message-display").finalize(); + return uri.spec; +} + +registerCleanupFunction(function () { + load("../../../../resources/mailShutdown.js"); +}); diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases.dat b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases.dat new file mode 100644 index 
0000000000..31162459e4 Binary files /dev/null and b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases.dat differ diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases1.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases1.eml new file mode 100644 index 0000000000..4720467fe6 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases1.eml @@ -0,0 +1,6 @@ +From - Sat Jan 26 08:43:42 2008 +Subject: test1 +Content-Type: text/plain; charset=iso-8859-1 + +important + diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases2.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases2.eml new file mode 100644 index 0000000000..9a251486a9 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases2.eml @@ -0,0 +1,6 @@ +From - Sat Jan 26 08:43:42 2008 +Subject: test2 +Content-Type: text/plain; charset=iso-8859-1 + +work + diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases3.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases3.eml new file mode 100644 index 0000000000..de31992ac5 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/aliases3.eml @@ -0,0 +1,6 @@ +From - Sat Jan 26 08:43:42 2008 +Subject: test3 +Content-Type: text/plain; charset=iso-8859-1 + +very important work + diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham1.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham1.eml new file mode 100644 index 0000000000..6a63f587b8 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham1.eml @@ -0,0 +1,7 @@ +Date: Tue, 30 Apr 2008 00:12:17 -0700 +From: Mom +To: Careful Reader +Subject: eat your vegetables +MIME-Version: 1.0 + +vegetables are very important for your health and wealth. diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham2.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham2.eml new file mode 100644 index 0000000000..cd6691b921 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/ham2.eml @@ -0,0 +1,8 @@ +Date: Tue, 27 Apr 2006 00:13:23 -0700 +From: Evil Despot +To: Careful Reader +Subject: finish your report +MIME-Version: 1.0 + +If you want to keep your sorry job and health, finish that +important report before the close of business today. 
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/msgCorpus.dat b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/msgCorpus.dat new file mode 100644 index 0000000000..f273a4f10c Binary files /dev/null and b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/msgCorpus.dat differ diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam1.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam1.eml new file mode 100644 index 0000000000..ea629213cc --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam1.eml @@ -0,0 +1,7 @@ +Date: Tue, 29 Apr 2008 00:10:07 -0700 +From: Spam King +To: Careful Reader +Subject: viagra is your nigerian xxx dream +MIME-Version: 1.0 + +click here to make lots of money and wealth diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam2.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam2.eml new file mode 100644 index 0000000000..817d328cf2 --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam2.eml @@ -0,0 +1,8 @@ +Date: Mon, 27 Apr 2008 01:02:03 -0700 +From: Stock Pusher +To: Careful Reader +Subject: ABCD Corporation will soar tomorrow! +MIME-Version: 1.0 + +Make lots of money! Put all of your money into ACBD Corporation +Stock! diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam3.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam3.eml new file mode 100644 index 0000000000..0a524e604b --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam3.eml @@ -0,0 +1,7 @@ +Date: Wed, 30 Apr 2008 01:11:17 -0700 +From: Spam King +To: Careful Reader +Subject: we have your nigerian xxx dream +MIME-Version: 1.0 + +Not making lots of money and wealth? Call me! diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam4.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam4.eml new file mode 100644 index 0000000000..775d3b41fa --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/spam4.eml @@ -0,0 +1,8 @@ +Date: Tue, 28 Apr 2008 01:02:04 -0700 +From: Stock Pusher +To: Careful Reader +Subject: ABCD Corporation will really soar this time! +MIME-Version: 1.0 + +Make lots of money! Put all of your money into ABCD Corporation +Stock! (We really mean it this time!) diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/tokenTest.eml b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/tokenTest.eml new file mode 100644 index 0000000000..d6e7e0ae3d --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/tokenTest.eml @@ -0,0 +1,14 @@ +Date: Tue, 30 Apr 2008 00:12:17 -0700 +From: Mom +To: Careful Reader +Subject: eat your vegetables to live long +Received: from c-1-2-3-4.hsd1.wa.example.net ([1.2.3.4] helo=theComputer) + by host301.example.com with esmtpa (Exim 4.69) + (envelope-from ) + id 1LeEgH-0003GN-Rr + for reader@example.org; Mon, 02 Mar 2009 13:24:06 -0700 +MIME-Version: 1.0 +Message-Id: 14159 +Sender: Bugzilla Test Setup + +This is a sentence. Important URL is http://www.example.org Check it out! 
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/trainingfile.js b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/trainingfile.js new file mode 100644 index 0000000000..b6d37e879b --- /dev/null +++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/resources/trainingfile.js @@ -0,0 +1,108 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// service class to manipulate the junk training.dat file +// code is adapted from Mnehy Thunderbird Extension + +/* exported TrainingData */ +function TrainingData() { + // local constants + + const CC = Components.Constructor; + + // public methods + + this.read = read; + + // public variables + + this.mGoodTokens = 0; + this.mJunkTokens = 0; + this.mGoodMessages = 0; + this.mJunkMessages = 0; + this.mGoodCounts = {}; + this.mJunkCounts = {}; + + // helper functions + + function getJunkStatFile() { + var sBaseDir = Services.dirsvc.get("ProfD", Ci.nsIFile); + var CFileByFile = new CC( + "@mozilla.org/file/local;1", + "nsIFile", + "initWithFile" + ); + var oFile = new CFileByFile(sBaseDir); + oFile.append("training.dat"); + return oFile; + } + + function getBinStream(oFile) { + if (oFile && oFile.exists()) { + var oUri = Services.io.newFileURI(oFile); + // open stream (channel) + let channel = Services.io.newChannelFromURI( + oUri, + null, + Services.scriptSecurityManager.getSystemPrincipal(), + null, + Ci.nsILoadInfo.SEC_ALLOW_CROSS_ORIGIN_SEC_CONTEXT_IS_NULL, + Ci.nsIContentPolicy.TYPE_OTHER + ); + var oStream = channel.open(); + // buffer it + var oBufStream = Cc[ + "@mozilla.org/network/buffered-input-stream;1" + ].createInstance(Ci.nsIBufferedInputStream); + oBufStream.init(oStream, oFile.fileSize); + // read as binary + var oBinStream = Cc["@mozilla.org/binaryinputstream;1"].createInstance( + Ci.nsIBinaryInputStream + ); + oBinStream.setInputStream(oBufStream); + // return it + return oBinStream; + } + return null; + } + + // method specifications + + function read() { + var file = getJunkStatFile(); + + // does the file exist? 
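+    // (For reference, training.dat as written by the C++ side is laid out as
+    //  [0xFEEDFACE][good messages][junk messages]
+    //  [good token count]([count][word length]word)*
+    //  [junk token count]([count][word length]word)*
+    //  with every 32-bit value big-endian, which is what read32() expects.)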
+  // method specifications
+
+  function read() {
+    var file = getJunkStatFile();
+
+    // does the file exist?
+    Assert.ok(file.exists());
+
+    var fileStream = getBinStream(file);
+
+    // check magic number
+    var iMagicNumber = fileStream.read32();
+    Assert.equal(iMagicNumber, 0xfeedface);
+
+    // get ham'n'spam numbers
+    this.mGoodMessages = fileStream.read32();
+    this.mJunkMessages = fileStream.read32();
+
+    // Read good tokens
+    this.mGoodTokens = fileStream.read32();
+    var iRefCount, iTokenLen, sToken;
+    for (let i = 0; i < this.mGoodTokens; ++i) {
+      iRefCount = fileStream.read32();
+      iTokenLen = fileStream.read32();
+      sToken = fileStream.readBytes(iTokenLen);
+      this.mGoodCounts[sToken] = iRefCount;
+    }
+
+    // we have no further good tokens, so read junk tokens
+    this.mJunkTokens = fileStream.read32();
+    for (let i = 0; i < this.mJunkTokens; i++) {
+      // read token data
+      iRefCount = fileStream.read32();
+      iTokenLen = fileStream.read32();
+      sToken = fileStream.readBytes(iTokenLen);
+      this.mJunkCounts[sToken] = iRefCount;
+    }
+  }
+}
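For reference, the tests that follow use this service class in two steps: classify messages through the junk service (which flushes training.dat on shutdown), then read the file back for inspection. A minimal sketch of that usage, assuming a populated training.dat in the profile directory:

    load("resources/trainingfile.js");

    var data = new TrainingData();
    data.read();
    // counts are keyed by token text, e.g. data.mGoodCounts["important"]
    dump(data.mGoodMessages + " good / " + data.mJunkMessages + " junk messages\n");
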
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js
new file mode 100644
index 0000000000..40180006d7
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_bug228675.js
@@ -0,0 +1,136 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// tests reduction in size of training.dat
+
+// main setup
+
+/* import-globals-from resources/trainingfile.js */
+load("resources/trainingfile.js");
+
+var { MailServices } = ChromeUtils.import(
+  "resource:///modules/MailServices.jsm"
+);
+
+// Before the shrink, the trained messages have 76 tokens. Force a shrink.
+Services.prefs.setIntPref("mailnews.bayesian_spam_filter.junk_maxtokens", 75);
+
+// local constants
+var kUnclassified = MailServices.junk.UNCLASSIFIED;
+var kJunk = MailServices.junk.JUNK;
+var kGood = MailServices.junk.GOOD;
+
+var emails = [
+  "ham1.eml",
+  "ham2.eml",
+  "spam1.eml",
+  "spam2.eml",
+  "spam3.eml",
+  "spam4.eml",
+];
+var classifications = [kGood, kGood, kJunk, kJunk, kJunk, kJunk];
+var trainingData;
+
+// main test
+function run_test() {
+  localAccountUtils.loadLocalMailAccount();
+  MailServices.junk.resetTrainingData();
+
+  do_test_pending();
+
+  var email = emails.shift();
+  var classification = classifications.shift();
+  // additional calls to setMessageClassification are done in the callback
+  MailServices.junk.setMessageClassification(
+    getSpec(email),
+    kUnclassified,
+    classification,
+    null,
+    doTestingListener
+  );
+}
+
+var doTestingListener = {
+  onMessageClassified(aMsgURI, aClassification, aJunkPercent) {
+    if (!aMsgURI) {
+      // Ignore end-of-batch signal.
+      return;
+    }
+    var email = emails.shift();
+    var classification = classifications.shift();
+    if (email) {
+      MailServices.junk.setMessageClassification(
+        getSpec(email),
+        kUnclassified,
+        classification,
+        null,
+        doTestingListener
+      );
+      return;
+    }
+
+    // all done classifying, time to test
+    MailServices.junk.shutdown(); // just flushes training.dat
+    trainingData = new TrainingData();
+    trainingData.read();
+
+    /*
+    // List training.dat information for debugging
+    dump("training.dat results: goodMessages=" + trainingData.mGoodMessages +
+      " junkMessages = " + trainingData.mJunkMessages +
+      " goodTokens = " + trainingData.mGoodTokens +
+      " junkTokens = " + trainingData.mJunkTokens +
+      "\n");
+    print("Good counts");
+    for (var token in trainingData.mGoodCounts)
+      dump("count: " + trainingData.mGoodCounts[token] + " token: " + token + "\n");
+    print("Junk counts");
+    for (var token in trainingData.mJunkCounts)
+      dump("count: " + trainingData.mJunkCounts[token] + " token: " + token + "\n");
+    */
+
+    /* Selected pre-shrink counts after training
+    training.dat results: goodMessages=2 junkMessages = 4 tokens = 78
+    Good counts
+    count: 1 token: subject:report
+    count: 2 token: important
+    count: 2 token: to:careful reader <reader@example.org>
+
+    Junk counts
+    count: 3 token: make
+    count: 4 token: money
+    count: 4 token: to:careful reader <reader@example.org>
+    count: 2 token: money!
+    */
+
+    // Shrinking divides all counts by two (integer division). The comments
+    // below show the calculation for each test, (pre-shrink count)/2.
+
+    Assert.equal(trainingData.mGoodMessages, 1); // 2/2
+    Assert.equal(trainingData.mJunkMessages, 2); // 4/2
+    checkToken("money", 0, 2); // (0/2, 4/2)
+    checkToken("subject:report", 0, 0); // (1/2, 0/2)
+    checkToken("to:careful reader <reader@example.org>", 1, 2); // (2/2, 4/2)
+    checkToken("make", 0, 1); // (0/2, 3/2)
+    checkToken("important", 1, 0); // (2/2, 0/2)
+
+    do_test_finished();
+  },
+};
+
+// helper functions
+
+function checkToken(aToken, aGoodCount, aJunkCount) {
+  print(" checking " + aToken);
+  var goodCount = trainingData.mGoodCounts[aToken];
+  var junkCount = trainingData.mJunkCounts[aToken];
+  if (!goodCount) {
+    goodCount = 0;
+  }
+  if (!junkCount) {
+    junkCount = 0;
+  }
+  Assert.equal(goodCount, aGoodCount);
+  Assert.equal(junkCount, aJunkCount);
+}
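The halving asserted above is the point of the junk_maxtokens pref: once the trained corpus exceeds the limit, message and token counts are divided by two with integer truncation, so a count of 1 falls to 0 (as "subject:report" shows). A hypothetical sketch of that operation, an assumption about its shape rather than nsBayesianFilter's actual code:

    // Hypothetical: halve every token count with integer division.
    function shrinkCounts(counts) {
      for (const token in counts) {
        const halved = Math.floor(counts[token] / 2);
        if (halved) {
          counts[token] = halved;
        } else {
          // assumption: tokens whose count falls to zero are dropped
          delete counts[token];
        }
      }
    }
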
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_customTokenization.js b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_customTokenization.js
new file mode 100644
index 0000000000..222a9557d8
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/test_customTokenization.js
@@ -0,0 +1,197 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Tests use of custom tokenization, originally introduced in bug 476389
+
+var { MailServices } = ChromeUtils.import(
+  "resource:///modules/MailServices.jsm"
+);
+
+// command functions for test data
+var kTrain = 0; // train a file
+var kTest = 1; // test headers returned from detail
+var kSetup = 2; // run a setup function
+
+// trait ids
+var kProArray = [3];
+var kAntiArray = [4];
+
+var gTest; // currently active test
+
+// The tests array defines the tests to attempt.
+
+var tests = [
+  // test a few tokens using defaults
+  {
+    command: kTrain,
+    fileName: "tokenTest.eml",
+  },
+  {
+    command: kTest,
+    fileName: "tokenTest.eml",
+    tokens: ["important", "subject:eat", "message-id:14159", "http://www"],
+    nottokens: ["idonotexist", "subject:to"],
+  },
+
+  // enable received, disable message-id
+  // switch tokenization of body to catch full urls (no "." delimiter)
+  // enable sender, keeping full value
+  {
+    command: kSetup,
+    operation() {
+      Services.prefs.setCharPref(
+        "mailnews.bayesian_spam_filter.tokenizeheader.received",
+        "standard"
+      );
+      Services.prefs.setCharPref(
+        "mailnews.bayesian_spam_filter.tokenizeheader.message-id",
+        "false"
+      );
+      Services.prefs.setCharPref(
+        "mailnews.bayesian_spam_filter.body_delimiters",
+        " \t\r\n\v"
+      );
+      Services.prefs.setCharPref(
+        "mailnews.bayesian_spam_filter.tokenizeheader.sender",
+        "full"
+      );
+    },
+  },
+  {
+    command: kTrain,
+    fileName: "tokenTest.eml",
+  },
+  {
+    command: kTest,
+    fileName: "tokenTest.eml",
+    tokens: [
+      "important",
+      "subject:eat",
+      "received:reader@example",
+      "skip:h 20",
+      "sender:bugzilla test setup ",
+      "received:;, remove . from standard header delimiters to better capture emails
+  // use custom delimiters on sender, without "." or "<>"
+  {
+    command: kSetup,
+    operation() {
+      Services.prefs.setIntPref(
+        "mailnews.bayesian_spam_filter.maxlengthfortoken",
+        50
+      );
+      Services.prefs.setCharPref(
+        "mailnews.bayesian_spam_filter.header_delimiters",
+        " ;<>\t\r\n\v"
+      );
+      Services.prefs.setCharPref(
+        "mailnews.bayesian_spam_filter.tokenizeheader.sender",
+        " \t\r\n\v"
+      );
+    },
+  },
+  {
+    command: kTrain,
+    fileName: "tokenTest.eml",
+  },
+  {
+    command: kTest,
+    fileName: "tokenTest.eml",
+    tokens: [
+      "received:someone@example.com",
+      "http://www.example.org",
+      "received:reader@example.org",
+      "sender:",
+    ],
+    nottokens: ["skip:h 20", "received: aMsgURIs,
+        proArray, // in array aProTraits,
+        antiArray, // in array aAntiTraits
+        listener
+      ); // in nsIMsgTraitClassificationListener aTraitListener
+      // null, // [optional] in nsIMsgWindow aMsgWindow
+      // null, // [optional] in nsIJunkMailClassificationListener aJunkListener
+      }
+      break;
+    }
+    case kDetail:
+      // detail message
+      nsIJunkMailPlugin.detailMessage(
+        getSpec(gTest.fileName), // in string aMsgURI
+        gTest.traitIds[0], // proTrait
+        gTest.traitAntiIds[0], // antiTrait
+        listener
+      ); // in nsIMsgTraitDetailListener aDetailListener
+      break;
+    case kReset:
+      // reload a new nsIJunkMailPlugin, reading the file in the process
+      nsIJunkMailPlugin.shutdown(); // writes files
+      nsIJunkMailPlugin = null;
+      nsIJunkMailPlugin = Cc[
+        "@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter"
+      ].createInstance(Ci.nsIJunkMailPlugin);
+      // does not do a callback, so we must restart the next command
+      startCommand();
+      break;
+  }
+}
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/test/unit/xpcshell.ini b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/xpcshell.ini
new file mode 100644
index 0000000000..86776834ba
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/test/unit/xpcshell.ini
@@ -0,0 +1,11 @@
+[DEFAULT]
+head = head_bayes.js
+tail =
+support-files = resources/*
+
+[test_bug228675.js]
+[test_customTokenization.js]
+[test_junkAsTraits.js]
+[test_msgCorpus.js]
+[test_traitAliases.js]
+[test_traits.js]
--
cgit v1.2.3