summaryrefslogtreecommitdiffstats
path: root/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h397
1 files changed, 397 insertions, 0 deletions
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h
new file mode 100644
index 0000000000..70d0a1a02b
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h
@@ -0,0 +1,397 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsBayesianFilter_h__
+#define nsBayesianFilter_h__
+
+#include <stdio.h>
+#include "nsCOMPtr.h"
+#include "nsIMsgFilterPlugin.h"
+#include "PLDHashTable.h"
+#include "nsITimer.h"
+#include "nsTArray.h"
+#include "nsString.h"
+#include "nsWeakReference.h"
+#include "nsIObserver.h"
+#include "nsHashPropertyBag.h"
+#include "mozilla/intl/WordBreaker.h"
+
+#include "mozilla/ArenaAllocator.h"
+
+#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15 * 60 * 1000
+
+struct Token;
+class TokenEnumeration;
+class TokenAnalyzer;
+class nsIMsgWindow;
+class nsIUTF8StringEnumerator;
+struct BaseToken;
+struct CorpusToken;
+
+/**
+ * Helper class to enumerate Token objects in a PLDHashTable
+ * safely and without copying (see bugzilla #174859). The
+ * enumeration is safe to use until an Add()
+ * or Remove() is performed on the table.
+ */
+class TokenEnumeration {
+ public:
+ explicit TokenEnumeration(PLDHashTable* table);
+ bool hasMoreTokens();
+ BaseToken* nextToken();
+
+ private:
+ PLDHashTable::Iterator mIterator;
+};
+
+// A trait is some aspect of a message, like being junk or tagged as
+// Personal, that the statistical classifier should track. The Trait
+// structure is a per-token representation of information pertaining to
+// a message trait.
+//
+// Traits per token are maintained as a linked list.
+//
+struct TraitPerToken {
+ uint32_t mId; // identifying number for a trait
+ uint32_t mCount; // count of messages with this token and trait
+ uint32_t mNextLink; // index in mTraitStore for the next trait, or 0
+ // for none
+ TraitPerToken(uint32_t aId, uint32_t aCount); // inititializer
+};
+
+// An Analysis is the statistical results for a particular message, a
+// particular token, and for a particular pair of trait/antitrait, that
+// is then used in subsequent analysis to score the message.
+//
+// Analyses per token are maintained as a linked list.
+//
+struct AnalysisPerToken {
+ uint32_t mTraitIndex; // index representing a protrait/antitrait pair.
+ // So if we are analyzing 3 different traits, then
+ // the first trait is 0, the second 1, etc.
+ double mDistance; // absolute value of mProbability - 0.5
+ double mProbability; // relative indicator of match of trait to token
+ uint32_t mNextLink; // index in mAnalysisStore for the Analysis object
+ // for the next trait index, or 0 for none.
+ // initializer
+ AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability);
+};
+
+class TokenHash {
+ public:
+ virtual ~TokenHash();
+ /**
+ * Clears out the previous message tokens.
+ */
+ nsresult clearTokens();
+ uint32_t countTokens();
+ TokenEnumeration getTokens();
+ BaseToken* add(const char* word);
+
+ protected:
+ explicit TokenHash(uint32_t entrySize);
+ mozilla::ArenaAllocator<16384, 2> mWordPool;
+ uint32_t mEntrySize;
+ PLDHashTable mTokenTable;
+ char* copyWord(const char* word, uint32_t len);
+ BaseToken* get(const char* word);
+};
+
+class Tokenizer : public TokenHash {
+ public:
+ Tokenizer();
+ ~Tokenizer();
+
+ Token* get(const char* word);
+
+ // The training set keeps an occurrence count on each word. This count
+ // is supposed to count the # of messages it occurs in.
+ // When add/remove is called while tokenizing a message and NOT the training
+ // set,
+ //
+ Token* add(const char* word, uint32_t count = 1);
+
+ Token* copyTokens();
+
+ void tokenize(const char* text);
+
+ /**
+ * Creates specific tokens based on the mime headers for the message being
+ * tokenized
+ */
+ void tokenizeHeaders(nsTArray<nsCString>& aHeaderNames,
+ nsTArray<nsCString>& aHeaderValues);
+
+ void tokenizeAttachments(nsTArray<RefPtr<nsIPropertyBag2>>& attachments);
+
+ nsCString mBodyDelimiters; // delimiters for body tokenization
+ nsCString mHeaderDelimiters; // delimiters for header tokenization
+
+ // arrays of extra headers to tokenize / to not tokenize
+ nsTArray<nsCString> mEnabledHeaders;
+ nsTArray<nsCString> mDisabledHeaders;
+ // Delimiters used in tokenizing a particular header.
+ // Parallel array to mEnabledHeaders
+ nsTArray<nsCString> mEnabledHeadersDelimiters;
+ bool mCustomHeaderTokenization; // Are there any preference-set tokenization
+ // customizations?
+ uint32_t mMaxLengthForToken; // maximum length of a token
+ // should we convert iframe to div during tokenization?
+ bool mIframeToDiv;
+
+ private:
+ void tokenize_ascii_word(char* word);
+ void tokenize_japanese_word(char* chunk);
+ inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
+ bool aTokenizeValue = false,
+ const char* aDelimiters = nullptr);
+ nsresult stripHTML(const nsAString& inString, nsAString& outString);
+ // helper function to escape \n, \t, etc from a CString
+ void UnescapeCString(nsCString& aCString);
+ nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos,
+ bool isLastBuffer, int32_t* begin, int32_t* end,
+ bool* _retval);
+};
+
+/**
+ * Implements storage of a collection of message tokens and counts for
+ * a corpus of classified messages
+ */
+
+class CorpusStore : public TokenHash {
+ public:
+ CorpusStore();
+ ~CorpusStore();
+
+ /**
+ * retrieve the token structure for a particular string
+ *
+ * @param word the character representation of the token
+ *
+ * @return token structure containing counts, null if not found
+ */
+ CorpusToken* get(const char* word);
+
+ /**
+ * add tokens to the storage, or increment counts if already exists.
+ *
+ * @param aTokenizer tokenizer for the list of tokens to remember
+ * @param aTraitId id for the trait whose counts will be remembered
+ * @param aCount number of new messages represented by the token list
+ */
+ void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
+ uint32_t aCount);
+
+ /**
+ * decrement counts for tokens in the storage, removing if all counts
+ * are zero
+ *
+ * @param aTokenizer tokenizer for the list of tokens to forget
+ * @param aTraitId id for the trait whose counts will be removed
+ * @param aCount number of messages represented by the token list
+ */
+ void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * write the corpus information to file storage
+ *
+ * @param aMaximumTokenCount prune tokens if number of tokens exceeds
+ * this value. == 0 for no pruning
+ */
+ void writeTrainingData(uint32_t aMaximumTokenCount);
+
+ /**
+ * read the corpus information from file storage
+ */
+ void readTrainingData();
+
+ /**
+ * delete the local corpus storage file and data
+ */
+ nsresult resetTrainingData();
+
+ /**
+ * get the count of messages whose tokens are stored that are associated
+ * with a trait
+ *
+ * @param aTraitId identifier for the trait
+ * @return number of messages for that trait
+ */
+ uint32_t getMessageCount(uint32_t aTraitId);
+
+ /**
+ * set the count of messages whose tokens are stored that are associated
+ * with a trait
+ *
+ * @param aTraitId identifier for the trait
+ * @param aCount number of messages for that trait
+ */
+ void setMessageCount(uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * get the count of messages associated with a particular token and trait
+ *
+ * @param token the token string and associated counts
+ * @param aTraitId identifier for the trait
+ */
+ uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId);
+
+ /**
+ * Add (or remove) data from a particular file to the corpus data.
+ *
+ * @param aFile the file with the data, in the format:
+ *
+ * Format of the trait file for version 1:
+ * [0xFCA93601] (the 01 is the version)
+ * for each trait to write:
+ * [id of trait to write] (0 means end of list)
+ * [number of messages per trait]
+ * for each token with non-zero count
+ * [count]
+ * [length of word]word
+ *
+ * @param aIsAdd should the data be added, or removed? true if adding,
+ * else removing.
+ *
+ * @param aFromTraits array of trait ids used in aFile. If aFile contains
+ * trait ids that are not in this array, they are not
+ * remapped, but assumed to be local trait ids.
+ *
+ * @param aToTraits array of trait ids, corresponding to elements of
+ * aFromTraits, that represent the local trait ids to be
+ * used in storing data from aFile into the local corpus.
+ *
+ */
+ nsresult UpdateData(nsIFile* aFile, bool aIsAdd,
+ const nsTArray<uint32_t>& aFromTraits,
+ const nsTArray<uint32_t>& aToTraits);
+
+ /**
+ * remove all counts (message and tokens) for a trait id
+ *
+ * @param aTrait trait id for the trait to remove
+ */
+ nsresult ClearTrait(uint32_t aTrait);
+
+ protected:
+ /**
+ * return the local corpus storage file for junk traits
+ */
+ nsresult getTrainingFile(nsIFile** aFile);
+
+ /**
+ * return the local corpus storage file for non-junk traits
+ */
+ nsresult getTraitFile(nsIFile** aFile);
+
+ /**
+ * read token strings from the data file
+ *
+ * @param stream file stream with token data
+ * @param fileSize file size
+ * @param aTraitId id for the trait whose counts will be read
+ * @param aIsAdd true to add the counts, false to remove them
+ *
+ * @return true if successful, false if error
+ */
+ bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
+ bool aIsAdd);
+
+ /**
+ * write token strings to the data file
+ */
+ bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId);
+
+ /**
+ * remove counts for a token string
+ */
+ void remove(const char* word, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * add counts for a token string, adding the token string if new
+ */
+ CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * change counts in a trait in the traits array, adding the trait if needed
+ */
+ nsresult updateTrait(CorpusToken* token, uint32_t aTraitId,
+ int32_t aCountChange);
+ nsCOMPtr<nsIFile> mTrainingFile; // file used to store junk training data
+ nsCOMPtr<nsIFile> mTraitFile; // file used to store non-junk
+ // training data
+ nsTArray<TraitPerToken> mTraitStore; // memory for linked-list of counts
+ uint32_t mNextTraitIndex; // index in mTraitStore to first empty
+ // TraitPerToken
+ nsTArray<uint32_t> mMessageCounts; // count of messages per trait
+ // represented in the store
+ nsTArray<uint32_t> mMessageCountsId; // Parallel array to mMessageCounts,
+ // with the corresponding trait ID
+};
+
+class nsBayesianFilter : public nsIJunkMailPlugin,
+ nsIMsgCorpus,
+ nsIObserver,
+ nsSupportsWeakReference {
+ public:
+ NS_DECL_ISUPPORTS
+ NS_DECL_NSIMSGFILTERPLUGIN
+ NS_DECL_NSIJUNKMAILPLUGIN
+ NS_DECL_NSIMSGCORPUS
+ NS_DECL_NSIOBSERVER
+
+ nsBayesianFilter();
+
+ nsresult Init();
+
+ nsresult tokenizeMessage(const nsACString& messageURI,
+ nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer);
+ void classifyMessage(Tokenizer& tokens, const nsACString& messageURI,
+ nsIJunkMailClassificationListener* listener);
+
+ void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI,
+ nsTArray<uint32_t>& aProTraits,
+ nsTArray<uint32_t>& aAntiTraits,
+ nsIJunkMailClassificationListener* listener,
+ nsIMsgTraitClassificationListener* aTraitListener,
+ nsIMsgTraitDetailListener* aDetailListener);
+
+ void observeMessage(Tokenizer& tokens, const nsACString& messageURI,
+ nsTArray<uint32_t>& oldClassifications,
+ nsTArray<uint32_t>& newClassifications,
+ nsIJunkMailClassificationListener* listener,
+ nsIMsgTraitClassificationListener* aTraitListener);
+
+ protected:
+ virtual ~nsBayesianFilter();
+
+ static void TimerCallback(nsITimer* aTimer, void* aClosure);
+
+ CorpusStore mCorpus;
+ double mJunkProbabilityThreshold;
+ int32_t mMaximumTokenCount;
+ bool mTrainingDataDirty;
+ int32_t mMinFlushInterval; // in milliseconds, must be positive
+ // and not too close to 0
+ nsCOMPtr<nsITimer> mTimer;
+
+ // index in mAnalysisStore for first empty AnalysisPerToken
+ uint32_t mNextAnalysisIndex;
+ // memory for linked list of AnalysisPerToken objects
+ nsTArray<AnalysisPerToken> mAnalysisStore;
+ /**
+ * Determine the location in mAnalysisStore where the AnalysisPerToken
+ * object for a particular token and trait is stored
+ */
+ uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);
+ /**
+ * Set the value of the AnalysisPerToken object for a particular
+ * token and trait
+ */
+ nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance,
+ double aProbability);
+};
+
+#endif // _nsBayesianFilter_h__