1 files changed, 397 insertions, 0 deletions
diff --git a/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h
new file mode 100644
index 0000000000..70d0a1a02b
--- /dev/null
+++ b/comm/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h
@@ -0,0 +1,397 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsBayesianFilter_h__
+#define nsBayesianFilter_h__
+
+#include <stdio.h>
+#include "nsCOMPtr.h"
+#include "nsIMsgFilterPlugin.h"
+#include "PLDHashTable.h"
+#include "nsITimer.h"
+#include "nsTArray.h"
+#include "nsString.h"
+#include "nsWeakReference.h"
+#include "nsIObserver.h"
+#include "nsHashPropertyBag.h"
+#include "mozilla/intl/WordBreaker.h"
+
+#include "mozilla/ArenaAllocator.h"
+
+#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15 * 60 * 1000
+
+struct Token;
+class TokenEnumeration;
+class TokenAnalyzer;
+class nsIMsgWindow;
+class nsIUTF8StringEnumerator;
+struct BaseToken;
+struct CorpusToken;
+
+/**
+ * Helper class to enumerate Token objects in a PLDHashTable
+ * safely and without copying (see bugzilla #174859). The
+ * enumeration is safe to use until an Add()
+ * or Remove() is performed on the table.
+ */
+class TokenEnumeration {
+ public:
+  explicit TokenEnumeration(PLDHashTable* table);
+  bool hasMoreTokens();
+  BaseToken* nextToken();
+
+ private:
+  PLDHashTable::Iterator mIterator;
+};
+
+// A trait is some aspect of a message, like being junk or tagged as
+// Personal, that the statistical classifier should track. The Trait
+// structure is a per-token representation of information pertaining to
+// a message trait.
+//
+// Traits per token are maintained as a linked list.
+//
+struct TraitPerToken {
+  uint32_t mId;        // identifying number for a trait
+  uint32_t mCount;     // count of messages with this token and trait
+  uint32_t mNextLink;  // index in mTraitStore for the next trait, or 0
+                       // for none
+  TraitPerToken(uint32_t aId, uint32_t aCount);  // inititializer
+};
+
+// An Analysis is the statistical results for a particular message, a
+// particular token, and for a particular pair of trait/antitrait, that
+// is then used in subsequent analysis to score the message.
+//
+// Analyses per token are maintained as a linked list.
+//
+struct AnalysisPerToken {
+  uint32_t mTraitIndex;  // index representing a protrait/antitrait pair.
+                         // So if we are analyzing 3 different traits, then
+                         // the first trait is 0, the second 1, etc.
+  double mDistance;      // absolute value of mProbability - 0.5
+  double mProbability;   // relative indicator of match of trait to token
+  uint32_t mNextLink;    // index in mAnalysisStore for the Analysis object
+                         // for the next trait index, or 0 for none.
+  // initializer
+  AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability);
+};
+
+class TokenHash {
+ public:
+  virtual ~TokenHash();
+  /**
+   * Clears out the previous message tokens.
+   */
+  nsresult clearTokens();
+  uint32_t countTokens();
+  TokenEnumeration getTokens();
+  BaseToken* add(const char* word);
+
+ protected:
+  explicit TokenHash(uint32_t entrySize);
+  mozilla::ArenaAllocator<16384, 2> mWordPool;
+  uint32_t mEntrySize;
+  PLDHashTable mTokenTable;
+  char* copyWord(const char* word, uint32_t len);
+  BaseToken* get(const char* word);
+};
+
+class Tokenizer : public TokenHash {
+ public:
+  Tokenizer();
+  ~Tokenizer();
+
+  Token* get(const char* word);
+
+  // The training set keeps an occurrence count on each word. This count
+  // is supposed to count the # of messages it occurs in.
+  // When add/remove is called while tokenizing a message and NOT the training
+  // set,
+  //
+  Token* add(const char* word, uint32_t count = 1);
+
+  Token* copyTokens();
+
+  void tokenize(const char* text);
+
+  /**
+   *  Creates specific tokens based on the mime headers for the message being
+   * tokenized
+   */
+  void tokenizeHeaders(nsTArray<nsCString>& aHeaderNames,
+                       nsTArray<nsCString>& aHeaderValues);
+
+  void tokenizeAttachments(nsTArray<RefPtr<nsIPropertyBag2>>& attachments);
+
+  nsCString mBodyDelimiters;    // delimiters for body tokenization
+  nsCString mHeaderDelimiters;  // delimiters for header tokenization
+
+  // arrays of extra headers to tokenize / to not tokenize
+  nsTArray<nsCString> mEnabledHeaders;
+  nsTArray<nsCString> mDisabledHeaders;
+  // Delimiters used in tokenizing a particular header.
+  // Parallel array to mEnabledHeaders
+  nsTArray<nsCString> mEnabledHeadersDelimiters;
+  bool mCustomHeaderTokenization;  // Are there any preference-set tokenization
+                                   // customizations?
+  uint32_t mMaxLengthForToken;     // maximum length of a token
+  // should we convert iframe to div during tokenization?
+  bool mIframeToDiv;
+
+ private:
+  void tokenize_ascii_word(char* word);
+  void tokenize_japanese_word(char* chunk);
+  inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
+                                bool aTokenizeValue = false,
+                                const char* aDelimiters = nullptr);
+  nsresult stripHTML(const nsAString& inString, nsAString& outString);
+  // helper function to escape \n, \t, etc from a CString
+  void UnescapeCString(nsCString& aCString);
+  nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos,
+                       bool isLastBuffer, int32_t* begin, int32_t* end,
+                       bool* _retval);
+};
+
+/**
+ * Implements storage of a collection of message tokens and counts for
+ * a corpus of classified messages
+ */
+
+class CorpusStore : public TokenHash {
+ public:
+  CorpusStore();
+  ~CorpusStore();
+
+  /**
+   * retrieve the token structure for a particular string
+   *
+   * @param word  the character representation of the token
+   *
+   * @return      token structure containing counts, null if not found
+   */
+  CorpusToken* get(const char* word);
+
+  /**
+   * add tokens to the storage, or increment counts if already exists.
+   *
+   * @param aTokenizer tokenizer for the list of tokens to remember
+   * @param aTraitId   id for the trait whose counts will be remembered
+   * @param aCount     number of new messages represented by the token list
+   */
+  void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
+                      uint32_t aCount);
+
+  /**
+   * decrement counts for tokens in the storage, removing if all counts
+   * are zero
+   *
+   * @param aTokenizer tokenizer for the list of tokens to forget
+   * @param aTraitId   id for the trait whose counts will be removed
+   * @param aCount     number of messages represented by the token list
+   */
+  void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);
+
+  /**
+   * write the corpus information to file storage
+   *
+   * @param aMaximumTokenCount  prune tokens if number of tokens exceeds
+   *                            this value.  == 0  for no pruning
+   */
+  void writeTrainingData(uint32_t aMaximumTokenCount);
+
+  /**
+   * read the corpus information from file storage
+   */
+  void readTrainingData();
+
+  /**
+   * delete the local corpus storage file and data
+   */
+  nsresult resetTrainingData();
+
+  /**
+   * get the count of messages whose tokens are stored that are associated
+   * with a trait
+   *
+   * @param aTraitId  identifier for the trait
+   * @return          number of messages for that trait
+   */
+  uint32_t getMessageCount(uint32_t aTraitId);
+
+  /**
+   * set the count of messages whose tokens are stored that are associated
+   * with a trait
+   *
+   * @param aTraitId  identifier for the trait
+   * @param aCount    number of messages for that trait
+   */
+  void setMessageCount(uint32_t aTraitId, uint32_t aCount);
+
+  /**
+   * get the count of messages associated with a particular token and trait
+   *
+   * @param  token     the token string and associated counts
+   * @param  aTraitId  identifier for the trait
+   */
+  uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId);
+
+  /**
+   * Add (or remove) data from a particular file to the corpus data.
+   *
+   * @param aFile       the file with the data, in the format:
+   *
+   *                    Format of the trait file for version 1:
+   *                    [0xFCA93601]  (the 01 is the version)
+   *                    for each trait to write:
+   *                    [id of trait to write] (0 means end of list)
+   *                    [number of messages per trait]
+   *                    for each token with non-zero count
+   *                    [count]
+   *                    [length of word]word
+   *
+   * @param aIsAdd      should the data be added, or removed? true if adding,
+   *                    else removing.
+   *
+   * @param aFromTraits array of trait ids used in aFile. If aFile contains
+   *                    trait ids that are not in this array, they are not
+   *                    remapped, but assumed to be local trait ids.
+   *
+   * @param aToTraits   array of trait ids, corresponding to elements of
+   *                    aFromTraits, that represent the local trait ids to be
+   *                    used in storing data from aFile into the local corpus.
+   *
+   */
+  nsresult UpdateData(nsIFile* aFile, bool aIsAdd,
+                      const nsTArray<uint32_t>& aFromTraits,
+                      const nsTArray<uint32_t>& aToTraits);
+
+  /**
+   * remove all counts (message and tokens) for a trait id
+   *
+   * @param aTrait  trait id for the trait to remove
+   */
+  nsresult ClearTrait(uint32_t aTrait);
+
+ protected:
+  /**
+   * return the local corpus storage file for junk traits
+   */
+  nsresult getTrainingFile(nsIFile** aFile);
+
+  /**
+   * return the local corpus storage file for non-junk traits
+   */
+  nsresult getTraitFile(nsIFile** aFile);
+
+  /**
+   * read token strings from the data file
+   *
+   * @param stream     file stream with token data
+   * @param fileSize   file size
+   * @param aTraitId   id for the trait whose counts will be read
+   * @param aIsAdd     true to add the counts, false to remove them
+   *
+   * @return           true if successful, false if error
+   */
+  bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
+                  bool aIsAdd);
+
+  /**
+   * write token strings to the data file
+   */
+  bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId);
+
+  /**
+   * remove counts for a token string
+   */
+  void remove(const char* word, uint32_t aTraitId, uint32_t aCount);
+
+  /**
+   * add counts for a token string, adding the token string if new
+   */
+  CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount);
+
+  /**
+   * change counts in a trait in the traits array, adding the trait if needed
+   */
+  nsresult updateTrait(CorpusToken* token, uint32_t aTraitId,
+                       int32_t aCountChange);
+  nsCOMPtr<nsIFile> mTrainingFile;      // file used to store junk training data
+  nsCOMPtr<nsIFile> mTraitFile;         // file used to store non-junk
+                                        // training data
+  nsTArray<TraitPerToken> mTraitStore;  // memory for linked-list of counts
+  uint32_t mNextTraitIndex;             // index in mTraitStore to first empty
+                                        // TraitPerToken
+  nsTArray<uint32_t> mMessageCounts;    // count of messages per trait
+                                        // represented in the store
+  nsTArray<uint32_t> mMessageCountsId;  // Parallel array to mMessageCounts,
+                                        // with the corresponding trait ID
+};
+
+class nsBayesianFilter : public nsIJunkMailPlugin,
+                         nsIMsgCorpus,
+                         nsIObserver,
+                         nsSupportsWeakReference {
+ public:
+  NS_DECL_ISUPPORTS
+  NS_DECL_NSIMSGFILTERPLUGIN
+  NS_DECL_NSIJUNKMAILPLUGIN
+  NS_DECL_NSIMSGCORPUS
+  NS_DECL_NSIOBSERVER
+
+  nsBayesianFilter();
+
+  nsresult Init();
+
+  nsresult tokenizeMessage(const nsACString& messageURI,
+                           nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer);
+  void classifyMessage(Tokenizer& tokens, const nsACString& messageURI,
+                       nsIJunkMailClassificationListener* listener);
+
+  void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI,
+                       nsTArray<uint32_t>& aProTraits,
+                       nsTArray<uint32_t>& aAntiTraits,
+                       nsIJunkMailClassificationListener* listener,
+                       nsIMsgTraitClassificationListener* aTraitListener,
+                       nsIMsgTraitDetailListener* aDetailListener);
+
+  void observeMessage(Tokenizer& tokens, const nsACString& messageURI,
+                      nsTArray<uint32_t>& oldClassifications,
+                      nsTArray<uint32_t>& newClassifications,
+                      nsIJunkMailClassificationListener* listener,
+                      nsIMsgTraitClassificationListener* aTraitListener);
+
+ protected:
+  virtual ~nsBayesianFilter();
+
+  static void TimerCallback(nsITimer* aTimer, void* aClosure);
+
+  CorpusStore mCorpus;
+  double mJunkProbabilityThreshold;
+  int32_t mMaximumTokenCount;
+  bool mTrainingDataDirty;
+  int32_t mMinFlushInterval;  // in milliseconds, must be positive
+                              // and not too close to 0
+  nsCOMPtr<nsITimer> mTimer;
+
+  // index in mAnalysisStore for first empty AnalysisPerToken
+  uint32_t mNextAnalysisIndex;
+  // memory for linked list of AnalysisPerToken objects
+  nsTArray<AnalysisPerToken> mAnalysisStore;
+  /**
+   * Determine the location in mAnalysisStore where the AnalysisPerToken
+   * object for a particular token and trait is stored
+   */
+  uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);
+  /**
+   * Set the value of the AnalysisPerToken object for a particular
+   * token and trait
+   */
+  nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance,
+                       double aProbability);
+};
+
+#endif  // _nsBayesianFilter_h__