1 files changed, 561 insertions, 0 deletions
diff --git a/comm/mailnews/import/src/rtfDecoder.cpp b/comm/mailnews/import/src/rtfDecoder.cpp
new file mode 100644
index 0000000000..86a8151618
--- /dev/null
+++ b/comm/mailnews/import/src/rtfDecoder.cpp
@@ -0,0 +1,561 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <climits>
+#include <cstring>
+#include <map>
+#include <sstream>
+#include <stack>
+#include <string>
+#include "windows.h"
+#include "rtfDecoder.h"
+
+// Number of elements in a statically sized array `x` (not valid for pointers).
+#define SIZEOF(x) (sizeof(x) / sizeof((x)[0]))
+// True when `i` lies in the range '0'..'9'. The argument is evaluated twice.
+#define IS_DIGIT(i) ((i) >= '0' && (i) <= '9')
+// True when `VAL` lies in 'a'..'z' or 'A'..'Z'. The argument is evaluated up
+// to four times, so it must not have side effects.
+#define IS_ALPHA(VAL) \
+  (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z'))
+
+// Converts a single hexadecimal digit character to its numeric value (0..15).
+// Both upper- and lower-case letters are accepted; any character that is not
+// a hexadecimal digit yields 0.
+inline int HexToInt(char ch) {
+  if (ch >= '0' && ch <= '9') return ch - '0';
+  if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
+  if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
+  return 0;
+}
+
+// Maps an RTF font charset number (the \fcharsetN argument) to a code page
+// identifier. Charsets whose code page is not known (82 - Mac Johab (old),
+// 179 - Arabic Traditional (old), 180 - Arabic user (old), 181 - Hebrew user
+// (old)) and every value not listed in the table yield CP_ACP.
+inline int CharsetToCP(int charset) {
+  struct Mapping {
+    int charset;
+    int codepage;
+  };
+  static const Mapping kMappings[] = {
+      {0, 1252},    // ANSI
+      {1, 0},       // Default
+      {2, 1252},    // Symbol (a mapping to 42 exists only as a disabled note)
+      {77, 10000},  // Mac Roman
+      {78, 10001},  // Mac Shift Jis
+      {79, 10003},  // Mac Hangul
+      {80, 10008},  // Mac GB2312
+      {81, 10002},  // Mac Big5
+      {83, 10005},  // Mac Hebrew
+      {84, 10004},  // Mac Arabic
+      {85, 10006},  // Mac Greek
+      {86, 10081},  // Mac Turkish
+      {87, 10021},  // Mac Thai
+      {88, 10029},  // Mac East Europe
+      {89, 10007},  // Mac Russian
+      {128, 932},   // Shift JIS
+      {129, 949},   // Hangul
+      {130, 1361},  // Johab
+      {134, 936},   // GB2312
+      {136, 950},   // Big5
+      {161, 1253},  // Greek
+      {162, 1254},  // Turkish
+      {163, 1258},  // Vietnamese
+      {177, 1255},  // Hebrew
+      {178, 1256},  // Arabic
+      {186, 1257},  // Baltic
+      {204, 1251},  // Russian
+      {222, 874},   // Thai
+      {238, 1250},  // Eastern European
+      {254, 437},   // PC 437
+      {255, 850},   // OEM
+  };
+  const size_t count = sizeof(kMappings) / sizeof(kMappings[0]);
+  for (size_t idx = 0; idx < count; ++idx) {
+    if (kMappings[idx].charset == charset) return kMappings[idx].codepage;
+  }
+  return CP_ACP;
+}
+
+// Code page related settings collected for a single font table entry.
+struct FontInfo {
+  // Bit flags recording which of the fields below have been set explicitly.
+  enum Options { has_fcharset = 0x0001, has_cpg = 0x0002 };
+  unsigned int options;
+  int fcharset;
+  unsigned int cpg;
+  FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {}
+  // Resolves the code page of the font: an explicitly set cpg wins, then the
+  // code page derived from fcharset; 0xFFFFFFFF means neither was set.
+  unsigned int Codepage() {
+    if (options & has_cpg) return cpg;
+    if (options & has_fcharset) return CharsetToCP(fcharset);
+    return 0xFFFFFFFF;
+  }
+};
+// Font table: maps a font index to the settings recorded for that font.
+typedef std::map<int, FontInfo> Fonttbl;
+
+// Per-group parser state. One copy lives on the state stack for every open
+// group; a new group starts with a copy of its parent's state.
+struct LocalState {
+  bool fonttbl;  // True while fonts are being defined (set by \fonttbl)
+  int f;  // Index of the font being defined/used; defines the codepage if no
+          // \cpg is in effect. -1 means no font is selected.
+  unsigned int uc;        // ucN keyword value; its default is 1
+  unsigned int codepage;  // defined by \cpg; 0xFFFFFFFF when \cpg is not used
+};
+// Stack of group states; the top element belongs to the innermost open group.
+typedef std::stack<LocalState> StateStack;
+
+// Document-wide decoder state: the input stream, the font table, the stack of
+// per-group states and the buffer that accumulates a run of 8-bit text until
+// it can be converted to UTF-16 in one go.
+struct GlobalState {
+  // Progress of the current run of 8-bit text:
+  //   pcdsno       - no run is being collected;
+  //   pcdsin       - the last lexem was 8-bit text and was appended to it;
+  //   pcdsfinished - the run ended with the previous lexem; flush it now.
+  enum Pcdata_state { pcdsno, pcdsin, pcdsfinished };
+  std::istream& stream;
+  Fonttbl fonttbl;
+  StateStack stack;
+  unsigned int codepage;  // defined by \ansi, \mac, \pc, \pca, and \ansicpgN
+  int deff;               // default font index (\deffN); -1 when not set
+  std::stringstream pcdata_a;      // bytes of the current 8-bit text run
+  unsigned int pcdata_a_codepage;  // code page captured when the run started
+  Pcdata_state pcdata_a_state;
+
+  // Starts with a single group state on the stack: no font selected, uc = 1,
+  // no \cpg override. pcdata_a_codepage is given a defined value so that it
+  // is never read uninitialized, even though it is reassigned at the start of
+  // every text run.
+  explicit GlobalState(std::istream& s)
+      : stream(s),
+        codepage(CP_ACP),
+        deff(-1),
+        pcdata_a_codepage(CP_ACP),
+        pcdata_a_state(pcdsno) {
+    LocalState st;
+    st.fonttbl = false;
+    st.f = -1;
+    st.uc = 1;
+    st.codepage = 0xFFFFFFFF;
+    stack.push(st);
+  }
+
+  // Returns the code page that applies to 8-bit text at the current position.
+  // Precedence: a \cpg override of the current group, then the code page of
+  // the current font (or of the default font when none is selected), then the
+  // document-level setting.
+  unsigned int GetCurrentCP() {
+    // Calling top() on an empty stack is undefined; fall back to the
+    // document-level code page once every group has been closed.
+    if (stack.empty()) return codepage;
+    const LocalState& current = stack.top();
+    if (current.codepage != 0xFFFFFFFF)  // \cpg in use
+      return current.codepage;
+    // \cpg not used; use font settings
+    const int f = (current.f != -1) ? current.f : deff;
+    if (f != -1) {
+      Fonttbl::iterator iter = fonttbl.find(f);
+      if (iter != fonttbl.end()) {
+        const unsigned int cp = iter->second.Codepage();
+        if (cp != 0xFFFFFFFF) return cp;
+      }
+    }
+    return codepage;  // No overrides; use the top-level legacy setting
+  }
+};
+
+// A parsed RTF control word or control symbol.
+struct Keyword {
+  char name[33];  // NUL-terminated name without the leading backslash
+  bool hasVal;    // True when a numeric parameter followed the name
+  int val;        // The numeric parameter; meaningful only when hasVal is set
+};
+
+// One lexical element of the RTF stream: a group delimiter, a keyword, a
+// single 8-bit or UTF-16 character, a block of binary data, end of input, or
+// an error marker.
+//
+// Copy semantics are unusual: copying or assigning a lexem that holds binary
+// data TRANSFERS ownership of the buffer to the destination and turns the
+// source into an ltError lexem. This is why the copy operations take a
+// non-const reference.
+class Lexem {
+ public:
+  enum Type {
+    ltGroupBegin,
+    ltGroupEnd,
+    ltKeyword,
+    ltPCDATA_A,
+    ltPCDATA_W,
+    ltBDATA,
+    ltEOF,
+    ltError
+  };
+  explicit Lexem(Type t = ltError) : m_type(t) {}
+  Lexem(Lexem& from) : m_type(ltError) { TakeFrom(from); }
+  ~Lexem() { Clear(); }
+  Lexem& operator=(Lexem& from) {
+    if (&from != this) {
+      Clear();
+      TakeFrom(from);
+    }
+    return *this;
+  }
+  Type type() const { return m_type; }
+  // Turns the lexem into a single 8-bit character.
+  void SetPCDATA_A(char chdata) {
+    Clear();
+    m_pcdata_a = chdata;
+    m_type = ltPCDATA_A;
+  }
+  // Turns the lexem into a single UTF-16 code unit.
+  void SetPCDATA_W(wchar_t chdata) {
+    Clear();
+    m_pcdata_w = chdata;
+    m_type = ltPCDATA_W;
+  }
+  // Turns the lexem into a binary block holding a private copy of the `sz`
+  // bytes at `data`. `data` may point into the buffer currently owned by this
+  // lexem. A negative size, or a null pointer with a non-zero size, yields an
+  // ltError lexem instead of reaching new[]/memcpy with an invalid length.
+  void SetBDATA(const char* data, int sz) {
+    if (sz < 0 || (!data && sz > 0)) {
+      Clear();
+      m_type = ltError;
+      return;
+    }
+    // Copy before releasing the old buffer: `data` may alias it.
+    char* tmp = new char[sz];
+    if (sz > 0) memcpy(tmp, data, static_cast<size_t>(sz));
+    Clear();
+    m_bdata.data = tmp;
+    m_bdata.sz = static_cast<size_t>(sz);
+    m_type = ltBDATA;
+  }
+  // Turns the lexem into a copy of the keyword `src`.
+  void SetKeyword(const Keyword& src) {
+    Clear();
+    m_type = ltKeyword;
+    m_keyword = src;
+  }
+  // Turns the lexem into the keyword `name` (truncated to fit the name
+  // buffer). `name` may point at this lexem's own keyword name; a null
+  // pointer is treated as an empty name.
+  void SetKeyword(const char* name, bool hasVal = false, int val = 0) {
+    char tmp[SIZEOF(m_keyword.name)];
+    // Stage the name in a temporary to allow copying from itself.
+    strncpy(tmp, name ? name : "", SIZEOF(m_keyword.name) - 1);
+    tmp[SIZEOF(m_keyword.name) - 1] = 0;
+    Clear();
+    m_type = ltKeyword;
+    memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name));
+    m_keyword.hasVal = hasVal;
+    m_keyword.val = val;
+  }
+  // Accessors. Each returns 0 when the lexem is not of the matching type;
+  // KeywordVal() also returns 0 when the keyword has no numeric parameter.
+  const char* KeywordName() const {
+    return (m_type == ltKeyword) ? m_keyword.name : 0;
+  }
+  const int* KeywordVal() const {
+    return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0;
+  }
+  char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; }
+  wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; }
+  const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; }
+  int bdata_sz() const {
+    return (m_type == ltBDATA) ? static_cast<int>(m_bdata.sz) : 0;
+  }
+  // Shared payload-free instances used as assignment sources.
+  static Lexem eof;
+  static Lexem groupBegin;
+  static Lexem groupEnd;
+  static Lexem error;
+
+ private:
+  struct BDATA {
+    size_t sz;
+    char* data;
+  };
+
+  Type m_type;
+  // Payload; the active member is selected by m_type.
+  union {
+    Keyword m_keyword;
+    char m_pcdata_a;
+    wchar_t m_pcdata_w;
+    BDATA m_bdata;
+  };
+  // Releases the binary buffer, if any. This function leaves the object in
+  // the broken state (m_type is not reset). Must be followed by a correct
+  // initialization.
+  void Clear() {
+    if (m_type == ltBDATA) delete[] m_bdata.data;
+  }
+  // Copies type and payload from `from`. The caller must have released this
+  // object's own payload first. A binary buffer is moved, not duplicated, and
+  // `from` is invalidated so that the buffer has exactly one owner.
+  void TakeFrom(Lexem& from) {
+    m_type = from.m_type;
+    switch (m_type) {
+      case ltKeyword:
+        m_keyword = from.m_keyword;
+        break;
+      case ltPCDATA_A:
+        m_pcdata_a = from.m_pcdata_a;
+        break;
+      case ltPCDATA_W:
+        m_pcdata_w = from.m_pcdata_w;
+        break;
+      case ltBDATA:
+        m_bdata = from.m_bdata;  // Move pointers when copying.
+        from.m_type = ltError;   // Invalidate the original. Not nice.
+        break;
+      default:  // Group delimiters, EOF and errors carry no payload.
+        break;
+    }
+  }
+};
+
+Lexem Lexem::eof(ltEOF);
+Lexem Lexem::groupBegin(ltGroupBegin);
+Lexem Lexem::groupEnd(ltGroupEnd);
+Lexem Lexem::error(ltError);
+
+// Reads the control word or control symbol that follows a backslash.
+//
+// On entry the stream must be positioned on the character right after the
+// backslash. On return it is positioned after the keyword, after its numeric
+// parameter (if any) and after a single delimiting space (if present).
+//
+// Returns a keyword with an empty name when the stream ends right after the
+// backslash. Names longer than 32 characters are truncated. Only the first
+// 10 digits of a parameter are taken into account and the result is clamped
+// to the range of int, so an oversized parameter cannot overflow.
+Keyword GetKeyword(std::istream& stream) {
+  Keyword keyword = {"", false, 0};
+  char ch = 0;
+  if (stream.get(ch).eof()) return keyword;
+  // Control word; maybe delimiter and value
+  if (IS_ALPHA(ch)) {
+    int i = 0;
+    do {
+      // We take up to 32 characters into account, skipping over extra
+      // characters (allowing for some non-conformant implementation).
+      if (i < 32) keyword.name[i++] = ch;
+    } while (!stream.get(ch).eof() && IS_ALPHA(ch));
+    keyword.name[i] = 0;                                   // NULL-terminating
+    if (!stream.eof() && (IS_DIGIT(ch) || (ch == '-'))) {  // Value begin
+      keyword.hasVal = true;
+      const bool negative = (ch == '-');
+      if (negative) stream.get(ch);
+      // Accumulate in a wider type: 10 decimal digits do not fit into int.
+      long long magnitude = 0;
+      int digits = 0;
+      while (!stream.eof() && IS_DIGIT(ch)) {
+        // We take into account only 10 digits, skip other. Older specs stated
+        // that we must be ready for an arbitrary number of digits.
+        if (digits < 10) {
+          magnitude = magnitude * 10 + (ch - '0');
+          ++digits;
+        }
+        stream.get(ch);
+      }
+      long long value = negative ? -magnitude : magnitude;
+      if (value > INT_MAX) value = INT_MAX;
+      if (value < INT_MIN) value = INT_MIN;
+      keyword.val = static_cast<int>(value);
+    }
+    // End of control word; the space is just a delimiter - skip it
+    if (!stream.eof() && !(ch == ' ')) stream.unget();
+  } else {  // Control symbol
+    keyword.name[0] = ch;
+    keyword.name[1] = 0;
+  }
+  return keyword;
+}
+
+// Reads the next raw lexem from `stream` into `result`.
+//
+// We always stay at the beginning of the next lexem or a crlf.
+// - Unescaped CR/LF characters are skipped.
+// - A brace is a group begin/end.
+// - A backslash starts a keyword; it is returned unprocessed (see
+//   PreprocessLexem for \u, \', \bin and the escaped characters).
+// - A tab character (0x09) is returned as the keyword \tab.
+// - Any other character is returned as 8-bit PCDATA.
+// When nothing can be read - end of input or a failed stream - `result`
+// becomes the EOF lexem.
+void GetLexem(std::istream& stream, Lexem& result) {
+  char ch = 0;
+  while (stream.get(ch) && ((ch == '\n') || (ch == '\r')))
+    ;  // Skip crlf
+  // Test the whole stream state, not just eofbit: after a failure that does
+  // not set eofbit `ch` holds no character read from the stream.
+  if (!stream) {
+    result = Lexem::eof;
+    return;
+  }
+  switch (ch) {
+    case '{':  // Group begin
+      result = Lexem::groupBegin;
+      break;
+    case '}':  // Group end
+      result = Lexem::groupEnd;
+      break;
+    case '\\':  // Keyword
+      result.SetKeyword(GetKeyword(stream));
+      break;
+    case '\t':  // tab
+      result.SetKeyword("tab");
+      break;
+    default:  // PCDATA
+      result.SetPCDATA_A(ch);
+      break;
+  }
+}
+
+// Skips the fallback representation that follows a \uN keyword.
+//
+// `count` is the current \ucN value: the number of skippable characters. Per
+// the RTF specification a control word or control symbol - including \'hh
+// together with its two hex digits, and \binN together with its data - counts
+// as ONE skippable character, and a group delimiter ends the skippable data
+// early. Unescaped CR/LF are not counted, matching GetLexem which never
+// reports them as data.
+static void SkipUnicodeFallback(std::istream& stream, int count) {
+  while (count > 0) {
+    const int next = stream.peek();
+    if (next == std::istream::traits_type::eof()) return;
+    if (next == '{' || next == '}') return;  // Leave the brace for the parser
+    stream.get();
+    if (next == '\n' || next == '\r') continue;
+    if (next == '\\') {
+      const Keyword skipped = GetKeyword(stream);
+      if (eq(skipped.name, "'"))
+        stream.ignore(2);  // the two hex digits of \'hh
+      else if (eq(skipped.name, "bin") && skipped.hasVal && skipped.val > 0)
+        stream.ignore(skipped.val);
+    }
+    --count;
+  }
+}
+
+// Converts the keywords that really denote data into data lexems:
+// - an empty keyword (input ended after a backslash) becomes an error;
+// - \uN becomes a UTF-16 character and its fallback text is skipped
+//   (`uc` is the \ucN value in effect);
+// - \'hh becomes an 8-bit character, to be decoded with the current codepage;
+// - \\, \{ and \} become the escaped 8-bit character;
+// - \binN becomes a binary block of the N bytes that follow; a missing or
+//   negative N, or a short read, yields an error;
+// - an escaped CR or LF becomes the keyword \par.
+// All other lexems are left untouched.
+void PreprocessLexem(/*inout*/ Lexem& lexem, std::istream& stream, int uc) {
+  if (lexem.type() != Lexem::ltKeyword) return;
+
+  if (lexem.KeywordName()[0] == 0) {  // Empty keyword - maybe eof?
+    lexem = Lexem::error;
+  } else if (eq(lexem.KeywordName(), "u")) {
+    // Unicode character - get the UTF16 and skip the uc characters
+    if (const int* val = lexem.KeywordVal()) {
+      lexem.SetPCDATA_W(static_cast<wchar_t>(*val));
+      SkipUnicodeFallback(stream, uc);
+    } else {
+      lexem = Lexem::error;
+    }
+  } else if (eq(lexem.KeywordName(), "'")) {
+    // 8-bit character (\'hh) -> use current codepage
+    char ch = 0;
+    int value = 0;
+    if (!stream.get(ch).eof()) value = HexToInt(ch);
+    if (!stream.get(ch).eof()) value = (value << 4) + HexToInt(ch);
+    lexem.SetPCDATA_A(static_cast<char>(value));
+  } else if (eq(lexem.KeywordName(), "\\") || eq(lexem.KeywordName(), "{") ||
+             eq(lexem.KeywordName(), "}")) {  // escaped characters
+    lexem.SetPCDATA_A(lexem.KeywordName()[0]);
+  } else if (eq(lexem.KeywordName(), "bin")) {
+    const int* val = lexem.KeywordVal();
+    // The length comes straight from the (untrusted) document: a negative
+    // value must never reach new[] or read().
+    // NOTE(review): there is no upper bound on the length; a huge value still
+    // requests a huge allocation.
+    if (!val || *val < 0) {
+      lexem = Lexem::error;
+    } else {
+      const int size = *val;  // Copy: `val` points into the lexem itself
+      char* data = new char[size];
+      stream.read(data, size);
+      if (stream.fail())
+        lexem = Lexem::error;
+      else
+        lexem.SetBDATA(data, size);
+      delete[] data;
+    }
+  } else if (eq(lexem.KeywordName(), "\n") || eq(lexem.KeywordName(), "\r")) {
+    // escaped cr or lf
+    lexem.SetKeyword("par");
+  }
+}
+
+// Folds one lexem into the decoder state: advances the 8-bit text run state
+// machine, maintains the group stack, and records the keywords that influence
+// the codepage (charset family, default font, font table entries, \cpg, \uc).
+void UpdateState(const Lexem& lexem, /*inout*/ GlobalState& globalState) {
+  // Age the text run by one lexem. A run that was "in" becomes "finished"
+  // here and is switched back to "in" below if this lexem continues it.
+  if (globalState.pcdata_a_state == GlobalState::pcdsfinished) {
+    // Last time we finished the pcdata
+    globalState.pcdata_a_state = GlobalState::pcdsno;
+  } else if (globalState.pcdata_a_state == GlobalState::pcdsin) {
+    globalState.pcdata_a_state = GlobalState::pcdsfinished;
+  }
+
+  const Lexem::Type kind = lexem.type();
+
+  if (kind == Lexem::ltGroupBegin) {
+    // A new group inherits the state of the enclosing one.
+    globalState.stack.push(globalState.stack.top());
+    return;
+  }
+  if (kind == Lexem::ltGroupEnd) {
+    globalState.stack.pop();
+    return;
+  }
+  if (kind == Lexem::ltPCDATA_A) {
+    if (globalState.pcdata_a_state == GlobalState::pcdsno) {
+      // Beginning of the pcdata: remember the codepage to use later when the
+      // run is converted to utf16.
+      globalState.pcdata_a_codepage = globalState.GetCurrentCP();
+    }
+    globalState.pcdata_a_state = GlobalState::pcdsin;
+    globalState.pcdata_a << lexem.pcdata_a();
+    return;
+  }
+  if (kind != Lexem::ltKeyword) return;
+
+  const char* const name = lexem.KeywordName();
+  const int* const arg = lexem.KeywordVal();
+
+  if (eq(name, "ansi")) {
+    globalState.codepage = CP_ACP;
+  } else if (eq(name, "mac")) {
+    globalState.codepage = CP_MACCP;
+  } else if (eq(name, "pc")) {
+    globalState.codepage = 437;
+  } else if (eq(name, "pca")) {
+    globalState.codepage = 850;
+  } else if (eq(name, "ansicpg") && arg) {
+    globalState.codepage = static_cast<unsigned int>(*arg);
+  } else if (eq(name, "deff") && arg) {
+    globalState.deff = *arg;
+  } else if (eq(name, "fonttbl")) {
+    globalState.stack.top().fonttbl = true;
+  } else if (eq(name, "f") && arg) {
+    globalState.stack.top().f = *arg;
+  } else if (eq(name, "fcharset") && globalState.stack.top().fonttbl &&
+             (globalState.stack.top().f != -1) && arg) {
+    // Charset of the font currently being defined in the font table.
+    FontInfo& font = globalState.fonttbl[globalState.stack.top().f];
+    font.options |= FontInfo::has_fcharset;
+    font.fcharset = *arg;
+  } else if (eq(name, "cpg") && arg) {
+    const bool definingFont = globalState.stack.top().fonttbl &&
+                              (globalState.stack.top().f != -1);
+    if (definingFont) {
+      FontInfo& font = globalState.fonttbl[globalState.stack.top().f];
+      font.options |= FontInfo::has_cpg;
+      font.cpg = *arg;
+    } else {
+      // Overriding the codepage for the block - may be in filenames
+      globalState.stack.top().codepage = *arg;
+    }
+  } else if (eq(name, "plain")) {
+    globalState.stack.top().f = -1;
+  } else if (eq(name, "uc") && arg) {
+    globalState.stack.top().uc = *arg;
+  }
+}
+
+// Parses the RTF document in `rtf` and reports its structure to `decoder`:
+// group begin/end, keywords, text (always as UTF-16) and binary data.
+//
+// Nothing past the opening brace is reported unless the input starts with
+// "{\rtf1". Runs of 8-bit text are buffered and converted to UTF-16 in one
+// call, using the codepage that was in effect when the run started, so that
+// multi-byte characters are not split. Erroneous lexems are skipped silently.
+// NOTE(review): when the header check fails after the opening brace,
+// BeginGroup() has been reported without a matching EndGroup() - confirm
+// that CRTFDecoder implementations tolerate this.
+void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder) {
+  // Check if this is the rtf
+  Lexem lexem;
+  GetLexem(rtf, lexem);
+  if (lexem.type() != Lexem::ltGroupBegin) return;
+  decoder.BeginGroup();
+  GetLexem(rtf, lexem);
+  if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") ||
+      !lexem.KeywordVal() || (*lexem.KeywordVal() != 1))
+    return;
+  decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+
+  GlobalState state(rtf);
+  // Level is the count of elements in the stack
+
+  // good() rather than !eof(): a stream that failed without reaching the end
+  // would otherwise never terminate this loop. An empty stack means the
+  // global group has been closed - don't go past it.
+  while (state.stream.good() && !state.stack.empty()) {
+    GetLexem(state.stream, lexem);
+    PreprocessLexem(lexem, state.stream, state.stack.top().uc);
+    UpdateState(lexem, state);
+
+    if (state.pcdata_a_state == GlobalState::pcdsfinished) {
+      // The run of 8-bit text ended with the previous lexem: convert it.
+      const std::string s = state.pcdata_a.str();
+      const int srcLen = static_cast<int>(s.size());
+      const int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0,
+                                           s.c_str(), srcLen, 0, 0);
+      if (sz > 0) {
+        // The buffer is owned by a string so it is released on every path.
+        std::wstring wide(static_cast<size_t>(sz), L'\0');
+        const int written = ::MultiByteToWideChar(
+            state.pcdata_a_codepage, 0, s.c_str(), srcLen, &wide[0], sz);
+        if (written > 0) decoder.PCDATA(&wide[0], written);
+      }
+      state.pcdata_a.str("");  // reset
+    }
+
+    switch (lexem.type()) {
+      case Lexem::ltGroupBegin:
+        decoder.BeginGroup();
+        break;
+      case Lexem::ltGroupEnd:
+        decoder.EndGroup();
+        break;
+      case Lexem::ltKeyword:
+        decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+        break;
+      case Lexem::ltPCDATA_W: {
+        wchar_t ch = lexem.pcdata_w();
+        decoder.PCDATA(&ch, 1);
+      } break;
+      case Lexem::ltBDATA:
+        decoder.BDATA(lexem.bdata(), lexem.bdata_sz());
+        break;
+      case Lexem::ltError:
+        break;  // Just silently skip the erroneous data - basic error recovery
+      default:
+        break;  // 8-bit text is buffered above; EOF ends the loop
+    }
+  }  // while
+}  // DecodeRTF