summaryrefslogtreecommitdiffstats
path: root/comm/mailnews/import/src/rtfDecoder.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'comm/mailnews/import/src/rtfDecoder.cpp')
-rw-r--r--comm/mailnews/import/src/rtfDecoder.cpp561
1 files changed, 561 insertions, 0 deletions
diff --git a/comm/mailnews/import/src/rtfDecoder.cpp b/comm/mailnews/import/src/rtfDecoder.cpp
new file mode 100644
index 0000000000..86a8151618
--- /dev/null
+++ b/comm/mailnews/import/src/rtfDecoder.cpp
@@ -0,0 +1,561 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <climits>
+#include <cstring>
+#include <map>
+#include <sstream>
+#include <stack>
+#include <string>
+#include "windows.h"
+#include "rtfDecoder.h"
+
+// Number of elements in a statically sized array. Must not be used on a
+// pointer: sizeof would then measure the pointer, not the array.
+#define SIZEOF(x) (sizeof(x) / sizeof((x)[0]))
+// True when i is an ASCII decimal digit. The argument is evaluated twice.
+#define IS_DIGIT(i) ((i) >= '0' && (i) <= '9')
+// True when VAL is an ASCII letter (either case). The argument is evaluated
+// more than once, so it must not have side effects.
+#define IS_ALPHA(VAL) \
+ (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z'))
+
+// Converts one hexadecimal digit character ('0'-'9', 'A'-'F', 'a'-'f') to its
+// numeric value in the range 0..15. Any other character yields 0.
+inline int HexToInt(char ch) {
+  if (ch >= '0' && ch <= '9') return ch - '0';
+  if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
+  if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
+  return 0;  // Not a hexadecimal digit
+}
+
+// Maps an RTF \fcharsetN value to a Windows code page identifier.
+// Charsets that are not listed in the table map to CP_ACP.
+inline int CharsetToCP(int charset) {
+  struct Mapping {
+    int charset;
+    int codepage;
+  };
+  // We don't know the code page for the charsets that are only mentioned in
+  // comments below; they fall through to the CP_ACP default.
+  static const Mapping kMappings[] = {
+      {0, 1252},     // ANSI
+      {1, 0},        // Default
+      {2, 1252},     // Symbol (code page 42 is deliberately not used)
+      {77, 10000},   // Mac Roman
+      {78, 10001},   // Mac Shift Jis
+      {79, 10003},   // Mac Hangul
+      {80, 10008},   // Mac GB2312
+      {81, 10002},   // Mac Big5
+                     // 82: Mac Johab (old)
+      {83, 10005},   // Mac Hebrew
+      {84, 10004},   // Mac Arabic
+      {85, 10006},   // Mac Greek
+      {86, 10081},   // Mac Turkish
+      {87, 10021},   // Mac Thai
+      {88, 10029},   // Mac East Europe
+      {89, 10007},   // Mac Russian
+      {128, 932},    // Shift JIS
+      {129, 949},    // Hangul
+      {130, 1361},   // Johab
+      {134, 936},    // GB2312
+      {136, 950},    // Big5
+      {161, 1253},   // Greek
+      {162, 1254},   // Turkish
+      {163, 1258},   // Vietnamese
+      {177, 1255},   // Hebrew
+      {178, 1256},   // Arabic
+                     // 179: Arabic Traditional (old)
+                     // 180: Arabic user (old)
+                     // 181: Hebrew user (old)
+      {186, 1257},   // Baltic
+      {204, 1251},   // Russian
+      {222, 874},    // Thai
+      {238, 1250},   // Eastern European
+      {254, 437},    // PC 437
+      {255, 850},    // OEM
+  };
+  const unsigned int count = sizeof(kMappings) / sizeof(kMappings[0]);
+  for (unsigned int idx = 0; idx < count; ++idx) {
+    if (kMappings[idx].charset == charset) return kMappings[idx].codepage;
+  }
+  return CP_ACP;
+}
+
+// Charset / code page settings recorded for one font table entry.
+struct FontInfo {
+  // Bits of `options`, telling which of the fields below were actually set.
+  enum Options { has_fcharset = 0x0001, has_cpg = 0x0002 };
+  unsigned int options;
+  int fcharset;      // value of \fcharsetN, valid when has_fcharset is set
+  unsigned int cpg;  // value of \cpgN, valid when has_cpg is set
+  FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {}
+  // Returns the code page of the font: an explicit \cpg wins over the one
+  // derived from \fcharset. 0xFFFFFFFF means that neither was specified.
+  unsigned int Codepage() {
+    if ((options & has_cpg) != 0) return cpg;
+    if ((options & has_fcharset) != 0) return CharsetToCP(fcharset);
+    return 0xFFFFFFFF;
+  }
+};
+// Font table: font index (\fN) -> settings of that font.
+typedef std::map<int, FontInfo> Fonttbl;
+
+// Parser state scoped to one RTF group. UpdateState pushes a copy of the
+// current top on every group begin and pops it on every group end, so changes
+// made inside a group disappear when the group closes.
+struct LocalState {
+ bool fonttbl; // When fonts are being defined
+ int f; // Index of the font being defined/used; defines the codepage if no
+ // \cpg. -1 means that no font is selected.
+ unsigned int uc; // ucN keyword value; its default is 1
+ unsigned int codepage; // defined by \cpg; 0xFFFFFFFF means "not set"
+};
+// Stack of group states; the top element belongs to the innermost open group.
+typedef std::stack<LocalState> StateStack;
+
+// Document-wide decoding state: the input stream, the font table, the stack
+// of per-group states and the buffer that collects a run of 8-bit text until
+// it can be converted to UTF-16 in one go.
+struct GlobalState {
+  // Tracks the run of 8-bit text characters:
+  //   pcdsno       - no run is being collected;
+  //   pcdsin       - the last lexem belonged to the run;
+  //   pcdsfinished - the run has just ended and is ready to be flushed.
+  enum Pcdata_state { pcdsno, pcdsin, pcdsfinished };
+  std::istream& stream;
+  Fonttbl fonttbl;
+  StateStack stack;
+  unsigned int codepage;  // defined by \ansi, \mac, \pc, \pca, and \ansicpgN
+  int deff;               // default font index (\deffN); -1 when not given
+  std::stringstream pcdata_a;      // bytes of the current 8-bit text run
+  unsigned int pcdata_a_codepage;  // code page in effect when the run began
+  Pcdata_state pcdata_a_state;
+
+  // Starts with a single group state on the stack; it stands for the
+  // outermost group of the document.
+  explicit GlobalState(std::istream& s)
+      : stream(s),
+        codepage(CP_ACP),
+        deff(-1),
+        // Give the run code page a defined value even before the first run
+        // starts, so it can never be read while indeterminate.
+        pcdata_a_codepage(CP_ACP),
+        pcdata_a_state(pcdsno) {
+    LocalState st;
+    st.fonttbl = false;
+    st.f = -1;
+    st.uc = 1;
+    st.codepage = 0xFFFFFFFF;
+    stack.push(st);
+  }
+
+  // Returns the code page that applies to 8-bit text at the current position.
+  // Priority: \cpg of the current group, then the code page of the current
+  // (or default) font, then the document-level setting.
+  unsigned int GetCurrentCP() {
+    // Calling top() on an empty stack is undefined; with no open group only
+    // the document-level setting can apply.
+    if (stack.empty()) return codepage;
+    const LocalState& top = stack.top();
+    if (top.codepage != 0xFFFFFFFF)  // \cpg in use
+      return top.codepage;
+    // \cpg not used; use font settings
+    const int f = (top.f != -1) ? top.f : deff;
+    if (f != -1) {
+      Fonttbl::iterator iter = fonttbl.find(f);
+      if (iter != fonttbl.end()) {
+        const unsigned int cp = iter->second.Codepage();
+        if (cp != 0xFFFFFFFF) return cp;
+      }
+    }
+    return codepage;  // No overrides; use the top-level legacy setting
+  }
+};
+
+// A parsed RTF control word or control symbol (the part after the backslash).
+struct Keyword {
+ char name[33]; // NUL-terminated; GetKeyword keeps at most 32 characters
+ bool hasVal; // true when a numeric parameter followed the name
+ int val; // the numeric parameter; 0 when hasVal is false
+};
+
+// One lexical unit of an RTF stream: a group delimiter, a control word or
+// symbol, a single 8-bit or 16-bit text character, a \bin payload, the end of
+// the input, or an error marker. A lexem of type ltBDATA owns its buffer.
+class Lexem {
+ public:
+  enum Type {
+    ltGroupBegin,
+    ltGroupEnd,
+    ltKeyword,
+    ltPCDATA_A,
+    ltPCDATA_W,
+    ltBDATA,
+    ltEOF,
+    ltError
+  };
+
+  // Creates a lexem of the given type with an empty payload, so that a
+  // directly constructed ltKeyword or ltBDATA lexem never exposes (or frees)
+  // indeterminate data.
+  explicit Lexem(Type t = ltError) : m_type(t) {
+    if (t == ltBDATA) {
+      m_bdata.sz = 0;
+      m_bdata.data = 0;
+    } else {
+      m_keyword.name[0] = 0;
+      m_keyword.hasVal = false;
+      m_keyword.val = 0;
+    }
+  }
+
+  // Copying duplicates the payload (including the \bin buffer) and leaves the
+  // source untouched, so constants and temporaries can be copied as well.
+  Lexem(const Lexem& from) : m_type(ltError) { CopyFrom(from); }
+
+  ~Lexem() { Clear(); }
+
+  Lexem& operator=(const Lexem& from) {
+    if (&from != this) {
+      Clear();
+      // Keep the object destructible should the copy below fail to allocate.
+      m_type = ltError;
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  Type type() const { return m_type; }
+
+  // Turns the lexem into a single 8-bit text character.
+  void SetPCDATA_A(char chdata) {
+    Clear();
+    m_pcdata_a = chdata;
+    m_type = ltPCDATA_A;
+  }
+
+  // Turns the lexem into a single UTF-16 text character.
+  void SetPCDATA_W(wchar_t chdata) {
+    Clear();
+    m_pcdata_w = chdata;
+    m_type = ltPCDATA_W;
+  }
+
+  // Turns the lexem into a binary payload holding a private copy of the sz
+  // bytes at data. A negative size (or a null pointer with a non-zero size)
+  // turns the lexem into ltError instead.
+  void SetBDATA(const char* data, int sz) {
+    if (sz < 0 || (sz > 0 && !data)) {
+      Clear();
+      m_type = ltError;
+      return;
+    }
+    // Copy before Clear() to allow getting the data from itself. operator new
+    // reports failure by throwing/aborting, never by returning null.
+    char* tmp = new char[sz];
+    if (sz > 0) memcpy(tmp, data, sz);
+    Clear();
+    m_bdata.data = tmp;
+    m_bdata.sz = sz;
+    m_type = ltBDATA;
+  }
+
+  // Turns the lexem into a copy of the given keyword.
+  void SetKeyword(const Keyword& src) {
+    // Copy first: src may refer to this object's own payload.
+    const Keyword tmp = src;
+    Clear();
+    m_type = ltKeyword;
+    m_keyword = tmp;
+  }
+
+  // Turns the lexem into a keyword with the given name (truncated to fit the
+  // name buffer) and optional numeric parameter.
+  void SetKeyword(const char* name, bool hasVal = false, int val = 0) {
+    char tmp[SIZEOF(m_keyword.name)];
+    strncpy(tmp, name,
+            SIZEOF(m_keyword.name) - 1);  // to allow copy from itself
+    tmp[SIZEOF(m_keyword.name) - 1] = 0;
+    Clear();
+    m_type = ltKeyword;
+    memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name));
+    m_keyword.hasVal = hasVal;
+    m_keyword.val = val;
+  }
+
+  // Accessors. Each returns 0 (null) when the lexem is of a different type;
+  // KeywordVal() also returns null when the keyword has no parameter.
+  const char* KeywordName() const {
+    return (m_type == ltKeyword) ? m_keyword.name : 0;
+  }
+  const int* KeywordVal() const {
+    return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0;
+  }
+  char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; }
+  wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; }
+  const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; }
+  int bdata_sz() const {
+    return (m_type == ltBDATA) ? static_cast<int>(m_bdata.sz) : 0;
+  }
+
+  // Shared payload-free lexems.
+  static Lexem eof;
+  static Lexem groupBegin;
+  static Lexem groupEnd;
+  static Lexem error;
+
+ private:
+  struct BDATA {
+    size_t sz;
+    char* data;
+  };
+
+  Type m_type;
+  union {
+    Keyword m_keyword;
+    char m_pcdata_a;
+    wchar_t m_pcdata_w;
+    BDATA m_bdata;
+  };
+
+  // Copies type and payload of `from` into this object. The caller must have
+  // released any buffer this object owned beforehand.
+  void CopyFrom(const Lexem& from) {
+    switch (from.m_type) {
+      case ltKeyword:
+        m_keyword = from.m_keyword;
+        break;
+      case ltPCDATA_A:
+        m_pcdata_a = from.m_pcdata_a;
+        break;
+      case ltPCDATA_W:
+        m_pcdata_w = from.m_pcdata_w;
+        break;
+      case ltBDATA: {
+        char* copy = new char[from.m_bdata.sz];
+        if (from.m_bdata.sz > 0 && from.m_bdata.data)
+          memcpy(copy, from.m_bdata.data, from.m_bdata.sz);
+        m_bdata.data = copy;
+        m_bdata.sz = from.m_bdata.sz;
+      } break;
+      default:  // The remaining types carry no payload
+        break;
+    }
+    m_type = from.m_type;
+  }
+
+  // This function leaves the object in the broken state. Must be followed
+  // by a correct initialization.
+  void Clear() {
+    if (m_type == ltBDATA) delete[] m_bdata.data;
+  }
+};
+
+// Definitions of the shared payload-free lexems declared in Lexem. The parser
+// assigns from these instead of building a new object each time.
+Lexem Lexem::eof(ltEOF);
+Lexem Lexem::groupBegin(ltGroupBegin);
+Lexem Lexem::groupEnd(ltGroupEnd);
+Lexem Lexem::error(ltError);
+
+// Reads the control word or control symbol that follows a backslash. The
+// backslash itself must already have been consumed from the stream.
+//
+// A control word is a run of ASCII letters, optionally followed by a decimal
+// parameter (with an optional leading '-') and by a single space that acts as
+// a delimiter and is consumed. Any other first character is returned as a
+// one-character control symbol. At the end of the input an empty keyword
+// (name "") is returned.
+//
+// Parameters that do not fit into an int are clamped to +/-INT_MAX.
+Keyword GetKeyword(std::istream& stream) {
+  Keyword keyword = {"", false, 0};
+  char ch = 0;
+  if (stream.get(ch).eof()) return keyword;
+  // Control word; maybe delimiter and value
+  if (IS_ALPHA(ch)) {
+    int i = 0;
+    do {
+      // We take up to 32 characters into account, skipping over extra
+      // characters (allowing for some non-conformant implementation).
+      if (i < 32) keyword.name[i++] = ch;
+    } while (!stream.get(ch).eof() && IS_ALPHA(ch));
+    keyword.name[i] = 0;  // NULL-terminating
+    if (!stream.eof() && (IS_DIGIT(ch) || (ch == '-'))) {  // Value begin
+      keyword.hasVal = true;
+      const bool negative = (ch == '-');
+      if (negative) stream.get(ch);
+      int digits = 0;
+      int magnitude = 0;
+      while (!stream.eof() && IS_DIGIT(ch)) {
+        // We take into account only 10 digits, skip other. Older specs stated
+        // that we must be ready for an arbitrary number of digits.
+        if (digits < 10) {
+          ++digits;
+          const int digit = ch - '0';
+          // Ten digits may exceed INT_MAX; saturate instead of overflowing
+          // (signed overflow is undefined behaviour).
+          if (magnitude > (INT_MAX - digit) / 10)
+            magnitude = INT_MAX;
+          else
+            magnitude = magnitude * 10 + digit;
+        }
+        stream.get(ch);
+      }
+      keyword.val = negative ? -magnitude : magnitude;
+    }
+    // End of control word; the space is just a delimiter - skip it
+    if (!stream.eof() && !(ch == ' ')) stream.unget();
+  } else {  // Control symbol
+    keyword.name[0] = ch;
+    keyword.name[1] = 0;
+  }
+  return keyword;
+}
+
+// Reads the next raw lexem from the stream into `result`.
+//
+// We always stay at the beginning of the next lexem or a crlf.
+//  - CR and LF characters between lexems are skipped.
+//  - End of input            -> Lexem::eof
+//  - '{' / '}'               -> group begin / group end
+//  - '\\'                    -> keyword, read by GetKeyword (further handling
+//                               of \u, \', \bin etc. is PreprocessLexem's job)
+//  - 0x09 (tab)              -> the keyword \tab
+//  - any other character     -> one 8-bit PCDATA character
+void GetLexem(std::istream& stream, Lexem& result) {
+  char current = 0;
+  // Skip crlf
+  for (;;) {
+    stream.get(current);
+    if (stream.eof()) {
+      result = Lexem::eof;
+      return;
+    }
+    if (current != '\n' && current != '\r') break;
+  }
+
+  if (current == '{') {  // Group begin
+    result = Lexem::groupBegin;
+  } else if (current == '}') {  // Group end
+    result = Lexem::groupEnd;
+  } else if (current == '\\') {  // Keyword
+    result.SetKeyword(GetKeyword(stream));
+  } else if (current == '\t') {  // tab
+    result.SetKeyword("tab");
+  } else {  // PCDATA
+    result.SetPCDATA_A(current);
+  }
+}
+
+// Rewrites a raw keyword lexem into what it really denotes; lexems of any
+// other type are left untouched.
+//   - empty keyword    -> error (the input ended right after a backslash)
+//   - \uN              -> UTF-16 character N; the next `uc` characters of the
+//                         stream (the fallback text) are skipped
+//   - \'hh             -> 8-bit character with hexadecimal code hh
+//   - \\ \{ \}         -> the escaped character itself, as 8-bit text
+//   - \binN            -> binary payload made of the next N bytes
+//   - backslash + CR/LF -> the keyword \par
+// lexem  - in/out: the lexem produced by GetLexem
+// stream - the input; extra characters are consumed for \u, \' and \bin
+// uc     - the number of characters to skip after a \u keyword
+void PreprocessLexem(/*inout*/ Lexem& lexem, std::istream& stream, int uc) {
+  if (lexem.type() != Lexem::ltKeyword) return;
+
+  if (lexem.KeywordName()[0] == 0) {  // Empty keyword - maybe eof?
+    lexem = Lexem::error;
+  } else if (eq(lexem.KeywordName(), "u")) {
+    // Unicode character - get the UTF16 and skip the uc characters
+    if (const int* val = lexem.KeywordVal()) {
+      lexem.SetPCDATA_W(*val);
+      stream.ignore(uc);
+    } else {
+      lexem = Lexem::error;
+    }
+  } else if (eq(lexem.KeywordName(), "'")) {
+    // 8-bit character (\'hh) -> use current codepage
+    char ch = 0, ch1 = 0;
+    if (!stream.get(ch).eof()) ch1 = HexToInt(ch);
+    if (!stream.get(ch).eof()) (ch1 <<= 4) += HexToInt(ch);
+    lexem.SetPCDATA_A(ch1);
+  } else if (eq(lexem.KeywordName(), "\\") || eq(lexem.KeywordName(), "{") ||
+             eq(lexem.KeywordName(), "}")) {  // escaped characters
+    lexem.SetPCDATA_A(lexem.KeywordName()[0]);
+  } else if (eq(lexem.KeywordName(), "bin")) {
+    const int* val = lexem.KeywordVal();
+    if (!val || *val < 0) {
+      // No length, or a negative one: there is nothing sensible to read.
+      lexem = Lexem::error;
+    } else {
+      // The length comes from the (untrusted) document. Read in fixed-size
+      // chunks so that memory grows only with the data that really exists,
+      // instead of allocating the announced size up front.
+      int remaining = *val;
+      std::string payload;
+      char chunk[4096];
+      while (remaining > 0 && !stream.fail()) {
+        const int want = (remaining < static_cast<int>(sizeof(chunk)))
+                             ? remaining
+                             : static_cast<int>(sizeof(chunk));
+        stream.read(chunk, want);
+        const int got = static_cast<int>(stream.gcount());
+        payload.append(chunk, static_cast<size_t>(got));
+        remaining -= got;
+      }
+      if (remaining > 0 || stream.fail())
+        lexem = Lexem::error;  // The stream ended before the payload did
+      else
+        lexem.SetBDATA(payload.data(), static_cast<int>(payload.size()));
+    }
+  } else if (eq(lexem.KeywordName(), "\n") || eq(lexem.KeywordName(), "\r")) {
+    // escaped cr or lf
+    lexem.SetKeyword("par");
+  }
+}
+
+// Applies one preprocessed lexem to the decoding state:
+//   - advances the tracker of the 8-bit text run;
+//   - pushes/pops the per-group state on group begin/end;
+//   - records the code page, font and \uc related keywords;
+//   - appends 8-bit text characters to the pending run, remembering the code
+//     page that was in effect when the run started.
+void UpdateState(const Lexem& lexem, /*inout*/ GlobalState& globalState) {
+  if (globalState.pcdata_a_state == GlobalState::pcdsfinished) {
+    // Last time we finished the pcdata
+    globalState.pcdata_a_state = GlobalState::pcdsno;
+  } else if (globalState.pcdata_a_state == GlobalState::pcdsin) {
+    // to be reset later if still in the pcdata
+    globalState.pcdata_a_state = GlobalState::pcdsfinished;
+  }
+
+  StateStack& groups = globalState.stack;
+  const Lexem::Type kind = lexem.type();
+
+  if (kind == Lexem::ltGroupBegin) {
+    groups.push(groups.top());
+  } else if (kind == Lexem::ltGroupEnd) {
+    groups.pop();
+  } else if (kind == Lexem::ltPCDATA_A) {
+    if (globalState.pcdata_a_state == GlobalState::pcdsno) {
+      // Beginning of the pcdata: remember the code page to use later to
+      // convert to utf16
+      globalState.pcdata_a_codepage = globalState.GetCurrentCP();
+    }
+    globalState.pcdata_a_state = GlobalState::pcdsin;
+    globalState.pcdata_a << lexem.pcdata_a();
+  } else if (kind == Lexem::ltKeyword) {
+    const char* const name = lexem.KeywordName();
+    const int* const val = lexem.KeywordVal();
+    if (eq(name, "ansi")) {
+      globalState.codepage = CP_ACP;
+    } else if (eq(name, "mac")) {
+      globalState.codepage = CP_MACCP;
+    } else if (eq(name, "pc")) {
+      globalState.codepage = 437;
+    } else if (eq(name, "pca")) {
+      globalState.codepage = 850;
+    } else if (eq(name, "ansicpg") && val) {
+      globalState.codepage = static_cast<unsigned int>(*val);
+    } else if (eq(name, "deff") && val) {
+      globalState.deff = *val;
+    } else if (eq(name, "fonttbl")) {
+      groups.top().fonttbl = true;
+    } else if (eq(name, "f") && val) {
+      groups.top().f = *val;
+    } else if (eq(name, "fcharset") && groups.top().fonttbl &&
+               (groups.top().f != -1) && val) {
+      FontInfo& font = globalState.fonttbl[groups.top().f];
+      font.options |= FontInfo::has_fcharset;
+      font.fcharset = *val;
+    } else if (eq(name, "cpg") && val) {
+      if (groups.top().fonttbl && (groups.top().f != -1)) {
+        // Defining a font
+        FontInfo& font = globalState.fonttbl[groups.top().f];
+        font.options |= FontInfo::has_cpg;
+        font.cpg = *val;
+      } else {
+        // Overriding the codepage for the block - may be in filenames
+        groups.top().codepage = *val;
+      }
+    } else if (eq(name, "plain")) {
+      groups.top().f = -1;
+    } else if (eq(name, "uc") && val) {
+      groups.top().uc = *val;
+    }
+  }
+}
+
+// Decodes an RTF document read from `rtf`, reporting its structure to
+// `decoder` through BeginGroup/EndGroup/Keyword/PCDATA/BDATA calls.
+//
+// The input must start with "{\rtf1"; otherwise the function returns early
+// (after BeginGroup has been reported if the opening brace was present).
+// Runs of 8-bit text are collected and converted to UTF-16 with the code page
+// that was in effect when the run started; the converted run is reported
+// before the lexem that ended it. Erroneous lexems are skipped silently.
+void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder) {
+  // Check if this is the rtf
+  Lexem lexem;
+  GetLexem(rtf, lexem);
+  if (lexem.type() != Lexem::ltGroupBegin) return;
+  decoder.BeginGroup();
+  GetLexem(rtf, lexem);
+  if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") ||
+      !lexem.KeywordVal() || (*lexem.KeywordVal() != 1))
+    return;
+  decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+
+  GlobalState state(rtf);
+  // Level is the count of elements in the stack
+
+  while (!state.stream.eof() &&
+         (state.stack.size() > 0)) {  // Don't go past the global group
+    GetLexem(state.stream, lexem);
+    PreprocessLexem(lexem, state.stream, state.stack.top().uc);
+    UpdateState(lexem, state);
+
+    if (state.pcdata_a_state == GlobalState::pcdsfinished) {
+      // A run of 8-bit text has just ended: convert and report it.
+      const std::string s = state.pcdata_a.str();
+      const int srcLen = static_cast<int>(s.size());
+      // First call: ask for the required number of UTF-16 code units.
+      const int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0,
+                                           s.c_str(), srcLen, 0, 0);
+      if (sz > 0) {
+        // The string owns the buffer, so it is released even if the decoder
+        // callback throws.
+        std::wstring wide(static_cast<size_t>(sz), L'\0');
+        const int written = ::MultiByteToWideChar(
+            state.pcdata_a_codepage, 0, s.c_str(), srcLen, &wide[0], sz);
+        // Report only what was really converted.
+        if (written > 0) decoder.PCDATA(&wide[0], written);
+      }
+      state.pcdata_a.str("");  // reset
+    }
+
+    switch (lexem.type()) {
+      case Lexem::ltGroupBegin:
+        decoder.BeginGroup();
+        break;
+      case Lexem::ltGroupEnd:
+        decoder.EndGroup();
+        break;
+      case Lexem::ltKeyword:
+        decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+        break;
+      case Lexem::ltPCDATA_W: {
+        wchar_t ch = lexem.pcdata_w();
+        decoder.PCDATA(&ch, 1);
+      } break;
+      case Lexem::ltBDATA:
+        decoder.BDATA(lexem.bdata(), lexem.bdata_sz());
+        break;
+      case Lexem::ltError:
+        break;  // Just silently skip the erroneous data - basic error recovery
+      default:
+        break;  // 8-bit text is reported in runs above; EOF needs no action
+    }
+  }  // while
+}  // DecodeRTF