diff options
Diffstat (limited to 'comm/mailnews/import/src/rtfDecoder.cpp')
-rw-r--r-- | comm/mailnews/import/src/rtfDecoder.cpp | 561 |
1 files changed, 561 insertions, 0 deletions
diff --git a/comm/mailnews/import/src/rtfDecoder.cpp b/comm/mailnews/import/src/rtfDecoder.cpp new file mode 100644 index 0000000000..86a8151618 --- /dev/null +++ b/comm/mailnews/import/src/rtfDecoder.cpp @@ -0,0 +1,561 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <stack> +#include <map> +#include <sstream> +#include "windows.h" +#include "rtfDecoder.h" + +#define SIZEOF(x) (sizeof(x) / sizeof((x)[0])) +#define IS_DIGIT(i) ((i) >= '0' && (i) <= '9') +#define IS_ALPHA(VAL) \ + (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z')) + +inline int HexToInt(char ch) { + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return ch - '0'; + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + return ch - 'A' + 10; + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + return ch - 'a' + 10; + default: + return 0; + } +} + +inline int CharsetToCP(int charset) { + // We don't know the Code page for the commented out charsets. + switch (charset) { + case 0: + return 1252; // ANSI + case 1: + return 0; // Default + // case 2: return 42; // Symbol + case 2: + return 1252; // Symbol + case 77: + return 10000; // Mac Roman + case 78: + return 10001; // Mac Shift Jis + case 79: + return 10003; // Mac Hangul + case 80: + return 10008; // Mac GB2312 + case 81: + return 10002; // Mac Big5 + // case 82: Mac Johab (old) + case 83: + return 10005; // Mac Hebrew + case 84: + return 10004; // Mac Arabic + case 85: + return 10006; // Mac Greek + case 86: + return 10081; // Mac Turkish + case 87: + return 10021; // Mac Thai + case 88: + return 10029; // Mac East Europe + case 89: + return 10007; // Mac Russian + case 128: + return 932; // Shift JIS + case 129: + return 949; // Hangul + case 130: + return 1361; // Johab + case 134: + return 936; // GB2312 + case 136: + return 950; // Big5 + case 161: + return 1253; // Greek + case 162: + return 1254; // Turkish + case 163: + return 1258; // Vietnamese + case 177: + return 1255; // Hebrew + case 178: + return 1256; // Arabic + // case 179: Arabic Traditional (old) + // case 180: Arabic user (old) + // case 181: Hebrew user (old) + case 186: + return 1257; // Baltic + case 204: + return 1251; // Russian + case 222: + return 874; // Thai + case 238: + return 1250; // Eastern European + case 254: + return 437; // PC 437 + case 255: + return 850; // OEM + default: + return CP_ACP; + } +} + +struct FontInfo { + enum Options { has_fcharset = 0x0001, has_cpg = 0x0002 }; + unsigned int options; + int fcharset; + unsigned int cpg; + FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {} + unsigned int Codepage() { + if (options & has_cpg) + return cpg; + else if (options & has_fcharset) + return CharsetToCP(fcharset); + else + return 0xFFFFFFFF; + } +}; +typedef std::map<int, FontInfo> Fonttbl; + +struct LocalState { + bool fonttbl; // When fonts are being defined + int f; // Index of the font being defined/used; defines the codepage if no + // \cpg + unsigned int uc; // ucN keyword value; its default is 1 + unsigned int codepage; // defined by \cpg +}; +typedef std::stack<LocalState> StateStack; + +struct GlobalState { + enum Pcdata_state { pcdsno, pcdsin, pcdsfinished }; + std::istream& stream; + Fonttbl fonttbl; + StateStack stack; + unsigned int codepage; // defined by \ansi, \mac, \pc, \pca, and \ansicpgN + int deff; + std::stringstream pcdata_a; + unsigned int pcdata_a_codepage; + Pcdata_state pcdata_a_state; + + explicit GlobalState(std::istream& s) + : stream(s), codepage(CP_ACP), deff(-1), pcdata_a_state(pcdsno) { + LocalState st; + st.fonttbl = false; + st.f = -1; + st.uc = 1; + st.codepage = 0xFFFFFFFF; + stack.push(st); + } + unsigned int GetCurrentCP() { + if (stack.top().codepage != 0xFFFFFFFF) // \cpg in use + return stack.top().codepage; + // \cpg not used; use font settings + int f = (stack.top().f != -1) ? stack.top().f : deff; + if (f != -1) { + Fonttbl::iterator iter = fonttbl.find(f); + if (iter != fonttbl.end()) { + unsigned int cp = iter->second.Codepage(); + if (cp != 0xFFFFFFFF) return cp; + } + } + return codepage; // No overrides; use the top-level legacy setting + } +}; + +struct Keyword { + char name[33]; + bool hasVal; + int val; +}; + +class Lexem { + public: + enum Type { + ltGroupBegin, + ltGroupEnd, + ltKeyword, + ltPCDATA_A, + ltPCDATA_W, + ltBDATA, + ltEOF, + ltError + }; + explicit Lexem(Type t = ltError) : m_type(t) {} + Lexem(Lexem& from) { + switch (m_type = from.m_type) { + case ltKeyword: + m_keyword = from.m_keyword; + break; + case ltPCDATA_A: + m_pcdata_a = from.m_pcdata_a; + break; + case ltPCDATA_W: + m_pcdata_w = from.m_pcdata_w; + break; + case ltBDATA: + m_bdata = from.m_bdata; // Move pointers when copying. + from.m_type = ltError; // Invalidate the original. Not nice. + break; + } + } + ~Lexem() { Clear(); } + Lexem& operator=(Lexem& from) { + if (&from != this) { + Clear(); + switch (m_type = from.m_type) { + case ltKeyword: + m_keyword = from.m_keyword; + break; + case ltPCDATA_A: + m_pcdata_a = from.m_pcdata_a; + break; + case ltPCDATA_W: + m_pcdata_w = from.m_pcdata_w; + break; + case ltBDATA: + m_bdata = from.m_bdata; // Move pointers when copying. + from.m_type = ltError; // Invalidate the original. Not nice. + break; + } + } + return *this; + } + Type type() const { return m_type; } + void SetPCDATA_A(char chdata) { + Clear(); + m_pcdata_a = chdata; + m_type = ltPCDATA_A; + } + void SetPCDATA_W(wchar_t chdata) { + Clear(); + m_pcdata_w = chdata; + m_type = ltPCDATA_W; + } + void SetBDATA(const char* data, int sz) { + char* tmp = new char[sz]; // to allow getting the data from itself + if (tmp) { + memcpy(tmp, data, sz); + Clear(); + m_bdata.data = tmp; + m_bdata.sz = sz; + m_type = ltBDATA; + } else + m_type = ltError; + } + void SetKeyword(const Keyword& src) { + Clear(); + m_type = ltKeyword; + m_keyword = src; + } + void SetKeyword(const char* name, bool hasVal = false, int val = 0) { + char tmp[SIZEOF(m_keyword.name)]; + strncpy(tmp, name, + SIZEOF(m_keyword.name) - 1); // to allow copy drom itself + tmp[SIZEOF(m_keyword.name) - 1] = 0; + Clear(); + m_type = ltKeyword; + memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name)); + m_keyword.hasVal = hasVal; + m_keyword.val = val; + } + const char* KeywordName() const { + return (m_type == ltKeyword) ? m_keyword.name : 0; + } + const int* KeywordVal() const { + return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0; + } + char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; } + wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; } + const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; } + int bdata_sz() const { return (m_type == ltBDATA) ? m_bdata.sz : 0; } + static Lexem eof; + static Lexem groupBegin; + static Lexem groupEnd; + static Lexem error; + + private: + struct BDATA { + size_t sz; + char* data; + }; + + Type m_type; + union { + Keyword m_keyword; + char m_pcdata_a; + wchar_t m_pcdata_w; + BDATA m_bdata; + }; + // This function leaves the object in the broken state. Must be followed + // by a correct initialization. + void Clear() { + switch (m_type) { + case ltBDATA: + delete[] m_bdata.data; + break; + } + // m_type = ltError; + } +}; + +Lexem Lexem::eof(ltEOF); +Lexem Lexem::groupBegin(ltGroupBegin); +Lexem Lexem::groupEnd(ltGroupEnd); +Lexem Lexem::error(ltError); + +// This function moves pos. When calling the function, pos must be next to the +// backslash; pos must be in the same sequence and before end! +Keyword GetKeyword(std::istream& stream) { + Keyword keyword = {"", false, 0}; + char ch; + if (stream.get(ch).eof()) return keyword; + // Control word; maybe delimiter and value + if (IS_ALPHA(ch)) { + int i = 0; + do { + // We take up to 32 characters into account, skipping over extra + // characters (allowing for some non-conformant implementation). + if (i < 32) keyword.name[i++] = ch; + } while (!stream.get(ch).eof() && IS_ALPHA(ch)); + keyword.name[i] = 0; // NULL-terminating + if (!stream.eof() && (IS_DIGIT(ch) || (ch == '-'))) { // Value begin + keyword.hasVal = true; + bool negative = (ch == '-'); + if (negative) stream.get(ch); + i = 0; + while (!stream.eof() && IS_DIGIT(ch)) { + // We take into account only 10 digits, skip other. Older specs stated + // that we must be ready for an arbitrary number of digits. + if (i++ < 10) keyword.val = keyword.val * 10 + (ch - '0'); + stream.get(ch); + } + if (negative) keyword.val = -keyword.val; + } + // End of control word; the space is just a delimiter - skip it + if (!stream.eof() && !(ch == ' ')) stream.unget(); + } else { // Control symbol + keyword.name[0] = ch; + keyword.name[1] = 0; + } + return keyword; +} + +void GetLexem(std::istream& stream, Lexem& result) { + // We always stay at the beginning of the next lexem or a crlf + // If it's a brace then it's group begin/end + // If it's a backslash -> Preprocess + // - if it's a \u or \' -> make UTF16 character + // - else it's a keyword -> Process (e.g., remember the codepage) + // - (if the keyword is \bin then the following is #BDATA) + // If it's some other character -> Preprocess + // - if it's 0x09 -> it's the keyword \tab + // - else it's a PCDATA + char ch; + while (!stream.get(ch).eof() && ((ch == '\n') || (ch == '\r'))) + ; // Skip crlf + if (stream.eof()) + result = Lexem::eof; + else { + switch (ch) { + case '{': // Group begin + case '}': // Group end + result = (ch == '{') ? Lexem::groupBegin : Lexem::groupEnd; + break; + case '\\': // Keyword + result.SetKeyword(GetKeyword(stream)); + break; + case '\t': // tab + result.SetKeyword("tab"); + break; + default: // PSDATA? + result.SetPCDATA_A(ch); + break; + } + } +} + +void PreprocessLexem(/*inout*/ Lexem& lexem, std::istream& stream, int uc) { + if (lexem.type() == Lexem::ltKeyword) { + if (lexem.KeywordName()[0] == 0) // Empty keyword - maybe eof? + lexem = Lexem::error; + else if (eq(lexem.KeywordName(), "u")) { + // Unicode character - get the UTF16 and skip the uc characters + if (const int* val = lexem.KeywordVal()) { + lexem.SetPCDATA_W(*val); + stream.ignore(uc); + } else + lexem = Lexem::error; + } else if (eq(lexem.KeywordName(), "'")) { + // 8-bit character (\'hh) -> use current codepage + char ch = 0, ch1 = 0; + if (!stream.get(ch).eof()) ch1 = HexToInt(ch); + if (!stream.get(ch).eof()) (ch1 <<= 4) += HexToInt(ch); + lexem.SetPCDATA_A(ch1); + } else if (eq(lexem.KeywordName(), "\\") || eq(lexem.KeywordName(), "{") || + eq(lexem.KeywordName(), "}")) // escaped characters + lexem.SetPCDATA_A(lexem.KeywordName()[0]); + else if (eq(lexem.KeywordName(), "bin")) { + if (const int* i = lexem.KeywordVal()) { + char* data = new char[*i]; + if (data) { + stream.read(data, *i); + if (stream.fail()) + lexem = Lexem::error; + else + lexem.SetBDATA(data, *i); + delete[] data; + } else + lexem = Lexem::error; + } else + lexem = Lexem::error; + } else if (eq(lexem.KeywordName(), "\n") || eq(lexem.KeywordName(), "\r")) { + // escaped cr or lf + lexem.SetKeyword("par"); + } + } +} + +void UpdateState(const Lexem& lexem, /*inout*/ GlobalState& globalState) { + switch (globalState.pcdata_a_state) { + case GlobalState::pcdsfinished: // Last time we finished the pcdata + globalState.pcdata_a_state = GlobalState::pcdsno; + break; + case GlobalState::pcdsin: + // to be reset later if still in the pcdata + globalState.pcdata_a_state = GlobalState::pcdsfinished; + break; + } + + switch (lexem.type()) { + case Lexem::ltGroupBegin: + globalState.stack.push(globalState.stack.top()); + break; + case Lexem::ltGroupEnd: + globalState.stack.pop(); + break; + case Lexem::ltKeyword: { + const int* val = lexem.KeywordVal(); + if (eq(lexem.KeywordName(), "ansi")) + globalState.codepage = CP_ACP; + else if (eq(lexem.KeywordName(), "mac")) + globalState.codepage = CP_MACCP; + else if (eq(lexem.KeywordName(), "pc")) + globalState.codepage = 437; + else if (eq(lexem.KeywordName(), "pca")) + globalState.codepage = 850; + else if (eq(lexem.KeywordName(), "ansicpg") && val) + globalState.codepage = static_cast<unsigned int>(*val); + else if (eq(lexem.KeywordName(), "deff") && val) + globalState.deff = *val; + else if (eq(lexem.KeywordName(), "fonttbl")) + globalState.stack.top().fonttbl = true; + else if (eq(lexem.KeywordName(), "f") && val) { + globalState.stack.top().f = *val; + } else if (eq(lexem.KeywordName(), "fcharset") && + globalState.stack.top().fonttbl && + (globalState.stack.top().f != -1) && val) { + FontInfo& f = globalState.fonttbl[globalState.stack.top().f]; + f.options |= FontInfo::has_fcharset; + f.fcharset = *val; + } else if (eq(lexem.KeywordName(), "cpg") && val) { + if (globalState.stack.top().fonttbl && + (globalState.stack.top().f != -1)) { // Defining a font + FontInfo& f = globalState.fonttbl[globalState.stack.top().f]; + f.options |= FontInfo::has_cpg; + f.cpg = *val; + } else { // Overriding the codepage for the block - may be in filenames + globalState.stack.top().codepage = *val; + } + } else if (eq(lexem.KeywordName(), "plain")) + globalState.stack.top().f = -1; + else if (eq(lexem.KeywordName(), "uc") && val) + globalState.stack.top().uc = *val; + } break; + case Lexem::ltPCDATA_A: + if (globalState.pcdata_a_state == + GlobalState::pcdsno) // Beginning of the pcdata + globalState.pcdata_a_codepage = + globalState.GetCurrentCP(); // to use later to convert to utf16 + globalState.pcdata_a_state = GlobalState::pcdsin; + globalState.pcdata_a << lexem.pcdata_a(); + break; + } +} + +void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder) { + // Check if this is the rtf + Lexem lexem; + GetLexem(rtf, lexem); + if (lexem.type() != Lexem::ltGroupBegin) return; + decoder.BeginGroup(); + GetLexem(rtf, lexem); + if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") || + !lexem.KeywordVal() || (*lexem.KeywordVal() != 1)) + return; + decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal()); + + GlobalState state(rtf); + // Level is the count of elements in the stack + + while (!state.stream.eof() && + (state.stack.size() > 0)) { // Don't go past the global group + GetLexem(state.stream, lexem); + PreprocessLexem(lexem, state.stream, state.stack.top().uc); + UpdateState(lexem, state); + + if (state.pcdata_a_state == GlobalState::pcdsfinished) { + std::string s = state.pcdata_a.str(); + int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), + s.size(), 0, 0); + if (sz) { + wchar_t* data = new wchar_t[sz]; + ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), + data, sz); + decoder.PCDATA(data, sz); + delete[] data; + } + state.pcdata_a.str(""); // reset + } + + switch (lexem.type()) { + case Lexem::ltGroupBegin: + decoder.BeginGroup(); + break; + case Lexem::ltGroupEnd: + decoder.EndGroup(); + break; + case Lexem::ltKeyword: + decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal()); + break; + case Lexem::ltPCDATA_W: { + wchar_t ch = lexem.pcdata_w(); + decoder.PCDATA(&ch, 1); + } break; + case Lexem::ltBDATA: + decoder.BDATA(lexem.bdata(), lexem.bdata_sz()); + break; + case Lexem::ltError: + break; // Just silently skip the erroneous data - basic error recovery + } + } // while +} // DecodeRTF |