summary | refs | log | tree | commit | diff | stats
path: root/writerfilter/source/rtftok/rtftokenizer.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'writerfilter/source/rtftok/rtftokenizer.cxx')
-rw-r--r-- writerfilter/source/rtftok/rtftokenizer.cxx | 329
1 files changed, 329 insertions, 0 deletions
diff --git a/writerfilter/source/rtftok/rtftokenizer.cxx b/writerfilter/source/rtftok/rtftokenizer.cxx
new file mode 100644
index 000000000..4dc80416c
--- /dev/null
+++ b/writerfilter/source/rtftok/rtftokenizer.cxx
@@ -0,0 +1,329 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "rtftokenizer.hxx"
+#include <tools/stream.hxx>
+#include <svx/dialmgr.hxx>
+#include <svx/strings.hrc>
+#include <rtl/strbuf.hxx>
+#include <rtl/character.hxx>
+#include <sal/log.hxx>
+#include "rtfskipdestination.hxx"
+#include <com/sun/star/io/BufferSizeExceededException.hpp>
+#include <com/sun/star/task/XStatusIndicator.hpp>
+#include <filter/msfilter/rtfutil.hxx>
+
+using namespace com::sun::star;
+
+namespace writerfilter::rtftok
+{
+// Keyword -> symbol table shared by every tokenizer instance; it is filled by
+// the first RTFTokenizer constructor that runs.
+std::unordered_map<OString, RTFSymbol> RTFTokenizer::s_aRTFControlWords;
+// True once s_aRTFControlWords has been filled.
+bool RTFTokenizer::s_bControlWordsInitialised = false;
+// Sorted copy of the math control word table, searched by lookupMathKeyword().
+std::vector<RTFMathSymbol> RTFTokenizer::s_aRTFMathControlWords;
+// True once s_aRTFMathControlWords has been filled and sorted.
+bool RTFTokenizer::s_bMathControlWordsSorted = false;
+
+/**
+ * Creates a tokenizer that reads RTF from pInStream and reports what it finds
+ * to rImport; xStatusIndicator, when set, is used by resolveParse() to show
+ * progress.
+ *
+ * The first instance to be constructed also prepares the shared keyword
+ * tables: the control word map is built from aRTFControlWords, and the math
+ * control words are copied from aRTFMathControlWords and sorted so that
+ * lookupMathKeyword() can binary-search them.
+ *
+ * NOTE(review): the two "already done" flags are plain bools, so this lazy
+ * setup is presumably only safe if tokenizers are never constructed
+ * concurrently -- confirm against the callers.
+ */
+RTFTokenizer::RTFTokenizer(RTFListener& rImport, SvStream* pInStream,
+                           uno::Reference<task::XStatusIndicator> const& xStatusIndicator)
+    : m_rImport(rImport)
+    , m_pInStream(pInStream)
+    , m_xStatusIndicator(xStatusIndicator)
+    , m_nGroup(0)
+    , m_nLineNumber(0)
+    , m_nLineStartPos(0)
+    , m_nGroupStart(0)
+{
+    if (!s_bControlWordsInitialised)
+    {
+        // The flag is raised before the table is filled, as it always was.
+        s_bControlWordsInitialised = true;
+        for (int nIndex = 0; nIndex < nRTFControlWords; ++nIndex)
+        {
+            auto& rEntry = aRTFControlWords[nIndex];
+            s_aRTFControlWords.emplace(OString(rEntry.GetKeyword()), rEntry);
+        }
+    }
+    if (!s_bMathControlWordsSorted)
+    {
+        s_bMathControlWordsSorted = true;
+        // Copy the static table, then sort the copy for std::lower_bound().
+        s_aRTFMathControlWords.assign(aRTFMathControlWords,
+                                      aRTFMathControlWords + nRTFMathControlWords);
+        std::sort(s_aRTFMathControlWords.begin(), s_aRTFMathControlWords.end());
+    }
+}
+
+RTFTokenizer::~RTFTokenizer() = default; // defaulted: nothing to do beyond destroying the members
+
+/**
+ * Main tokenizer loop: reads the stream one character at a time and forwards
+ * group starts/ends, keywords and text to the import listener.
+ *
+ * If a status indicator was supplied, it is started with the end position of
+ * the stream and then updated whenever more than 1/100 of that value has been
+ * consumed since the last update.
+ *
+ * @return RTFError::OK when a '}' brings the group count back to zero, or
+ *         when the stream ends with no group open. Otherwise the first
+ *         non-OK result reported by the listener or by resolveKeyword(), or
+ *         GROUP_UNDER / GROUP_OVER / CHAR_OVER / HEX_INVALID when the
+ *         tokenizer itself detects the problem.
+ */
+RTFError RTFTokenizer::resolveParse()
+{
+    SAL_INFO("writerfilter.rtf", __func__);
+    char ch;
+    RTFError ret;
+    // value collected so far and digits still expected, for hex chars
+    int nHexValue = 0;
+    int nHexDigitsLeft = 2;
+    std::size_t nPercentSize = 0;
+    sal_uInt64 nLastPos = 0;
+
+    if (m_xStatusIndicator.is())
+    {
+        OUString sDocLoad(SvxResId(RID_SVXSTR_DOC_LOAD));
+
+        sal_uInt64 const nStartPos = Strm().Tell();
+        sal_uInt64 const nEndPos = nStartPos + Strm().remainingSize();
+        m_xStatusIndicator->start(sDocLoad, nEndPos);
+        nPercentSize = nEndPos / 100;
+
+        nLastPos = nStartPos;
+        m_xStatusIndicator->setValue(nLastPos);
+    }
+
+    for (;;)
+    {
+        Strm().ReadChar(ch);
+        if (Strm().eof())
+            break;
+
+        //SAL_INFO("writerfilter", __func__ << ": parsing character '" << ch << "'");
+
+        // Position just behind the character that was read.
+        sal_uInt64 const nCurrentPos = Strm().Tell();
+        if (m_xStatusIndicator.is() && nCurrentPos > (nLastPos + nPercentSize))
+        {
+            nLastPos = nCurrentPos;
+            m_xStatusIndicator->setValue(nLastPos);
+        }
+
+        if (m_nGroup < 0)
+            return RTFError::GROUP_UNDER;
+
+        if (m_nGroup > 0 && m_rImport.getInternalState() == RTFInternalState::BIN)
+        {
+            // Binary payload: hand every byte over untouched, whatever it is.
+            ret = m_rImport.resolveChars(ch);
+            if (ret != RTFError::OK)
+                return ret;
+            continue;
+        }
+
+        switch (ch)
+        {
+            case '{':
+                // Stream offset of the brace that was just consumed.
+                m_nGroupStart = Strm().Tell() - 1;
+                ret = m_rImport.pushState();
+                if (ret != RTFError::OK)
+                    return ret;
+                break;
+            case '}':
+                ret = m_rImport.popState();
+                if (ret != RTFError::OK)
+                    return ret;
+                if (m_nGroup == 0)
+                {
+                    // The outermost group is closed: parsing is complete.
+                    if (m_rImport.isSubstream())
+                        m_rImport.finishSubstream();
+                    return RTFError::OK;
+                }
+                break;
+            case '\\':
+                ret = resolveKeyword();
+                if (ret != RTFError::OK)
+                    return ret;
+                break;
+            case 0x0d:
+                // carriage return: ignored
+                break;
+            case 0x0a:
+                // line feed: only tracked for getPosition()
+                m_nLineNumber++;
+                m_nLineStartPos = nCurrentPos;
+                break;
+            default:
+                if (m_nGroup == 0)
+                    return RTFError::CHAR_OVER;
+                if (m_rImport.getInternalState() != RTFInternalState::NORMAL)
+                {
+                    SAL_INFO("writerfilter.rtf", __func__ << ": hex internal state");
+                    // Assume that \'<number><junk> means \'0<number>: a
+                    // non-hex character still uses up one of the two digit
+                    // slots, but adds nothing to the value.
+                    bool const bHexDigit = rtl::isAsciiDigit(static_cast<unsigned char>(ch))
+                                           || (ch >= 'a' && ch <= 'f')
+                                           || (ch >= 'A' && ch <= 'F');
+                    if (bHexDigit)
+                    {
+                        nHexValue <<= 4;
+                        sal_Int8 parsed = msfilter::rtfutil::AsHex(ch);
+                        if (parsed == -1)
+                            return RTFError::HEX_INVALID;
+                        nHexValue += parsed;
+                    }
+                    --nHexDigitsLeft;
+                    if (nHexDigitsLeft == 0)
+                    {
+                        ret = m_rImport.resolveChars(nHexValue);
+                        if (ret != RTFError::OK)
+                            return ret;
+                        // Both digits consumed: reset and leave the hex state.
+                        nHexDigitsLeft = 2;
+                        nHexValue = 0;
+                        m_rImport.setInternalState(RTFInternalState::NORMAL);
+                    }
+                    break;
+                }
+                // Ordinary text character.
+                ret = m_rImport.resolveChars(ch);
+                if (ret != RTFError::OK)
+                    return ret;
+                break;
+        }
+    }
+
+    // End of stream: the group count tells whether the braces were balanced.
+    if (m_nGroup == 0)
+        return RTFError::OK;
+    return m_nGroup < 0 ? RTFError::GROUP_UNDER : RTFError::GROUP_OVER;
+}
+
+/// Increments the count of currently open groups.
+void RTFTokenizer::pushGroup()
+{
+    ++m_nGroup;
+}
+
+/// Decrements the count of currently open groups; resolveParse() reports a
+/// negative count as RTFError::GROUP_UNDER.
+void RTFTokenizer::popGroup()
+{
+    --m_nGroup;
+}
+
+/**
+ * Reads what follows a backslash -- a one-character control symbol, or a
+ * control word with an optional (possibly negative) decimal parameter -- and
+ * passes it to dispatchKeyword().
+ *
+ * A single space directly after a control word is swallowed as its
+ * delimiter; any other terminating character is pushed back onto the stream.
+ *
+ * @return RTFError::UNEXPECTED_EOF if the stream ends directly after the
+ *         backslash or directly after a '-' sign; otherwise the result of
+ *         dispatchKeyword().
+ * @throws io::BufferSizeExceededException if the control word is longer than
+ *         32 letters.
+ */
+RTFError RTFTokenizer::resolveKeyword()
+{
+    char ch;
+
+    Strm().ReadChar(ch);
+    if (Strm().eof())
+        return RTFError::UNEXPECTED_EOF;
+
+    if (!rtl::isAsciiAlpha(static_cast<unsigned char>(ch)))
+    {
+        // control symbols aren't followed by a space, so we can return here
+        // without doing any SeekRel()
+        return dispatchKeyword(OString(ch), false, 0);
+    }
+
+    // ch is known to be a letter here, so the body runs at least once.
+    OStringBuffer aKeywordBuf(32);
+    do
+    {
+        aKeywordBuf.append(ch);
+        if (aKeywordBuf.getLength() > 32)
+            // See RTF spec v1.9.1, page 7
+            // A control word's name cannot be longer than 32 letters.
+            throw io::BufferSizeExceededException();
+        Strm().ReadChar(ch);
+        if (Strm().eof())
+        {
+            // Treat end of stream like the delimiting space.
+            ch = ' ';
+            break;
+        }
+    } while (rtl::isAsciiAlpha(static_cast<unsigned char>(ch)));
+
+    // in case we'll have a parameter, that will be negative
+    // NOTE(review): when no digit follows, the '-' has been consumed and is
+    // not pushed back -- confirm that this is intended.
+    bool const bNegative = (ch == '-');
+    if (bNegative)
+    {
+        Strm().ReadChar(ch);
+        if (Strm().eof())
+            return RTFError::UNEXPECTED_EOF;
+    }
+
+    bool bParam = false;
+    int nParam = 0;
+    if (rtl::isAsciiDigit(static_cast<unsigned char>(ch)))
+    {
+        // we have a parameter
+        bParam = true;
+        OStringBuffer aDigits;
+        do
+        {
+            aDigits.append(ch);
+            Strm().ReadChar(ch);
+            if (Strm().eof())
+            {
+                ch = ' ';
+                break;
+            }
+        } while (rtl::isAsciiDigit(static_cast<unsigned char>(ch)));
+        nParam = aDigits.makeStringAndClear().toInt32();
+        if (bNegative)
+            nParam = -nParam;
+    }
+
+    // Anything but the delimiting space belongs to what comes next.
+    if (ch != ' ')
+        Strm().SeekRel(-1);
+    return dispatchKeyword(aKeywordBuf.makeStringAndClear(), bParam, nParam);
+}
+
+/**
+ * Binary-searches the sorted math control word table for an entry that
+ * compares equivalent to rSymbol under operator<.
+ *
+ * @param rSymbol in: the symbol to search for; out: overwritten with the
+ *        matching table entry when one is found, otherwise left untouched.
+ * @return true if a matching entry was found.
+ */
+bool RTFTokenizer::lookupMathKeyword(RTFMathSymbol& rSymbol)
+{
+    auto const itEnd = s_aRTFMathControlWords.end();
+    auto const itCandidate = std::lower_bound(s_aRTFMathControlWords.begin(), itEnd, rSymbol);
+    // lower_bound() yields the first entry that is not less than rSymbol; it
+    // is a match only if rSymbol is not less than that entry either.
+    bool const bFound = itCandidate != itEnd && !(rSymbol < *itCandidate);
+    if (bFound)
+        rSymbol = *itCandidate;
+    return bFound;
+}
+
+/**
+ * Looks up a keyword and forwards it to the matching dispatch method of the
+ * import listener.
+ *
+ * @param rKeyword the control word or control symbol, without the backslash
+ * @param bParam   whether a numeric parameter was present in the input
+ * @param nParam   the parameter value; only meaningful when bParam is true
+ * @return RTFError::OK while the listener is skipping the current
+ *         destination and for unknown keywords; otherwise the result of the
+ *         listener's dispatch method.
+ */
+RTFError RTFTokenizer::dispatchKeyword(OString const& rKeyword, bool bParam, int nParam)
+{
+    if (m_rImport.getDestination() == Destination::SKIP)
+    {
+        // skip binary data explicitly, to not trip over rtf markup
+        // control characters
+        if (rKeyword == "bin" && nParam > 0)
+            Strm().SeekRel(nParam);
+        return RTFError::OK;
+    }
+    SAL_INFO("writerfilter.rtf", __func__ << ": keyword '\\" << rKeyword << "' with param? "
+                                          << (bParam ? 1 : 0) << " param val: '"
+                                          << (bParam ? nParam : 0) << "'");
+    auto const itSymbol = s_aRTFControlWords.find(rKeyword);
+    if (itSymbol == s_aRTFControlWords.end())
+    {
+        SAL_INFO("writerfilter.rtf", __func__ << ": unknown keyword '\\" << rKeyword << "'");
+        // NOTE(review): presumably the helper's destructor deals with the
+        // unknown destination once it is marked as not parsed -- see
+        // rtfskipdestination.hxx.
+        RTFSkipDestination aSkip(m_rImport);
+        aSkip.setParsed(false);
+        return RTFError::OK;
+    }
+
+    RTFSymbol const& rSymbol = itSymbol->second;
+    switch (rSymbol.GetControlType())
+    {
+        case RTFControlType::FLAG:
+            // flags ignore any parameter by definition
+            return m_rImport.dispatchFlag(rSymbol.GetIndex());
+        case RTFControlType::DESTINATION:
+            // same for destinations
+            return m_rImport.dispatchDestination(rSymbol.GetIndex());
+        case RTFControlType::SYMBOL:
+            // and symbols
+            return m_rImport.dispatchSymbol(rSymbol.GetIndex());
+        case RTFControlType::TOGGLE:
+            return m_rImport.dispatchToggle(rSymbol.GetIndex(), bParam, nParam);
+        case RTFControlType::VALUE:
+            // without an explicit parameter, the symbol's default value applies
+            if (!bParam)
+                nParam = rSymbol.GetDefValue();
+            return m_rImport.dispatchValue(rSymbol.GetIndex(), nParam);
+    }
+
+    // Only reached for a control type that none of the cases above handles.
+    return RTFError::OK;
+}
+
+/**
+ * Formats the current stream position as "<line>,<column>".
+ *
+ * The line is m_nLineNumber + 1; the column is the distance of the current
+ * stream position from m_nLineStartPos, plus one.
+ */
+OUString RTFTokenizer::getPosition()
+{
+    auto const nLine = m_nLineNumber + 1;
+    auto const nColumn = Strm().Tell() - m_nLineStartPos + 1;
+    return OUString::number(nLine) + "," + OUString::number(nColumn);
+}
+
+} // namespace writerfilter::rtftok
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */