1 files changed, 708 insertions, 0 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
new file mode 100644
index 0000000000..82d69f7881
--- /dev/null
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -0,0 +1,708 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+#include <sal/log.hxx>
+
+#include <comphelper/scopeguard.hxx>
+
+#include <rtl/character.hxx>
+#include <rtl/strbuf.hxx>
+#include <rtl/tencinfo.h>
+#include <rtl/ustrbuf.hxx>
+#include <tools/stream.hxx>
+#include <tools/debug.hxx>
+#include <svtools/rtftoken.h>
+#include <svtools/parrtf.hxx>
+
+const int MAX_STRING_LEN = 1024;
+
+#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
+#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
+
+SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
+    : SvParser<int>( rIn, nStackSize )
+    , nOpenBrackets(0)
+    , nUPRLevel(0)
+    , eCodeSet(RTL_TEXTENCODING_MS_1252)
+    , nUCharOverread(1)
+{
+    // default is ANSI-CodeSet
+    SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
+    bRTF_InTextRead = false;
+}
+
+SvRTFParser::~SvRTFParser()
+{
+}
+
+
+int SvRTFParser::GetNextToken_()
+{
+    int nRet = 0;
+    do {
+        bool bNextCh = true;
+        switch( nNextCh )
+        {
+        case '\\':
+            {
+                // control characters
+                nNextCh = GetNextChar();
+                switch( nNextCh )
+                {
+                case '{':
+                case '}':
+                case '\\':
+                case '+':       // I found it in a RTF-file
+                case '~':       // nonbreaking space
+                case '-':       // optional hyphen
+                case '_':       // nonbreaking hyphen
+                case '\'':      // HexValue
+                    nNextCh = '\\';
+                    rInput.SeekRel( -1 );
+                    ScanText();
+                    nRet = RTF_TEXTTOKEN;
+                    bNextCh = 0 == nNextCh;
+                    break;
+
+                case '*':       // ignoreflag
+                    nRet = RTF_IGNOREFLAG;
+                    break;
+                case ':':       // subentry in an index entry
+                    nRet = RTF_SUBENTRYINDEX;
+                    break;
+                case '|':       // formula-character
+                    nRet = RTF_FORMULA;
+                    break;
+
+                case 0x0a:
+                case 0x0d:
+                    nRet = RTF_PAR;
+                    break;
+
+                default:
+                    if( RTF_ISALPHA( nNextCh ) )
+                    {
+                        aToken = "\\";
+                        {
+                            do {
+                                aToken.appendUtf32(nNextCh);
+                                nNextCh = GetNextChar();
+                            } while( RTF_ISALPHA( nNextCh ) );
+                        }
+
+                        // minus before numeric parameters
+                        bool bNegValue = false;
+                        if( '-' == nNextCh )
+                        {
+                            bNegValue = true;
+                            nNextCh = GetNextChar();
+                        }
+
+                        // possible numeric parameter
+                        if( RTF_ISDIGIT( nNextCh ) )
+                        {
+                            OUStringBuffer aNumber;
+                            do {
+                                aNumber.append(static_cast<sal_Unicode>(nNextCh));
+                                nNextCh = GetNextChar();
+                            } while( RTF_ISDIGIT( nNextCh ) );
+                            nTokenValue = OUString::unacquired(aNumber).toInt32();
+                            if( bNegValue )
+                                nTokenValue = -nTokenValue;
+                            bTokenHasValue=true;
+                        }
+                        else if( bNegValue )        // restore minus
+                        {
+                            nNextCh = '-';
+                            rInput.SeekRel( -1 );
+                        }
+                        if( ' ' == nNextCh )        // blank is part of token!
+                            nNextCh = GetNextChar();
+
+                        // search for the token in the table:
+                        if( 0 == (nRet = GetRTFToken( aToken )) )
+                            // Unknown Control
+                            nRet = RTF_UNKNOWNCONTROL;
+
+                        // bug 76812 - unicode token handled as normal text
+                        bNextCh = false;
+                        switch( nRet )
+                        {
+                        case RTF_UC:
+                            if( 0 <= nTokenValue )
+                            {
+                                nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
+                                if (!aParserStates.empty())
+                                {
+                                    //cmc: other ifdef breaks #i3584
+                                    aParserStates.top().nUCharOverread = nUCharOverread;
+                                }
+                            }
+                            aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
+                            // read next token
+                            nRet = 0;
+                            break;
+
+                        case RTF_UPR:
+                            if (!_inSkipGroup)
+                            {
+                                if (nUPRLevel > 256) // fairly sure > 1 is probably an error, but provide some leeway
+                                {
+                                    SAL_WARN("svtools", "urp stack too deep");
+                                    eState = SvParserState::Error;
+                                    break;
+                                }
+
+                                ++nUPRLevel;
+
+                                // UPR - overread the group with the ansi
+                                //       information
+                                int nNextToken;
+                                do
+                                {
+                                    nNextToken = GetNextToken_();
+                                }
+                                while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
+
+                                SkipGroup();
+                                GetNextToken_();  // overread the last bracket
+                                nRet = 0;
+
+                                --nUPRLevel;
+                            }
+                            break;
+
+                        case RTF_U:
+                            if( !bRTF_InTextRead )
+                            {
+                                nRet = RTF_TEXTTOKEN;
+                                aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
+
+                                // overread the next n "RTF" characters. This
+                                // can be also \{, \}, \'88
+                                for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
+                                {
+                                    sal_uInt32 cAnsi = nNextCh;
+                                    while( 0xD == cAnsi )
+                                        cAnsi = GetNextChar();
+                                    while( 0xA == cAnsi )
+                                        cAnsi = GetNextChar();
+
+                                    if( '\\' == cAnsi &&
+                                        '\'' == GetNextChar() )
+                                        // skip HexValue
+                                        GetHexValue();
+                                    nNextCh = GetNextChar();
+                                }
+                                ScanText();
+                                bNextCh = 0 == nNextCh;
+                            }
+                            break;
+                        }
+                    }
+                    else if( SvParserState::Pending != eState )
+                    {
+                        // Bug 34631 - "\ " read on - Blank as character
+                        // eState = SvParserState::Error;
+                        bNextCh = false;
+                    }
+                    break;
+                }
+            }
+            break;
+
+        case sal_Unicode(EOF):
+            eState = SvParserState::Accepted;
+            nRet = nNextCh;
+            break;
+
+        case '{':
+            {
+                if( 0 <= nOpenBrackets )
+                {
+                    RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
+                    aParserStates.push( aState );
+                }
+                ++nOpenBrackets;
+                DBG_ASSERT(
+                    static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
+                    "ParserStateStack unequal to bracket count" );
+                nRet = nNextCh;
+            }
+            break;
+
+        case '}':
+            --nOpenBrackets;
+            if( 0 <= nOpenBrackets )
+            {
+                aParserStates.pop();
+                if( !aParserStates.empty() )
+                {
+                    const RtfParserState_Impl& rRPS =
+                            aParserStates.top();
+                    nUCharOverread = rRPS.nUCharOverread;
+                    SetSrcEncoding( rRPS.eCodeSet );
+                }
+                else
+                {
+                    nUCharOverread = 1;
+                    SetSrcEncoding( GetCodeSet() );
+                }
+            }
+            DBG_ASSERT(
+                static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
+                "ParserStateStack unequal to bracket count" );
+            nRet = nNextCh;
+            break;
+
+        case 0x0d:
+        case 0x0a:
+            break;
+
+        default:
+            // now normal text follows
+            ScanText();
+            nRet = RTF_TEXTTOKEN;
+            bNextCh = 0 == nNextCh;
+            break;
+        }
+
+        if( bNextCh )
+            nNextCh = GetNextChar();
+
+    } while( !nRet && SvParserState::Working == eState );
+    return nRet;
+}
+
+
+sal_Unicode SvRTFParser::GetHexValue()
+{
+    // collect Hex values
+    int n;
+    sal_Unicode nHexVal = 0;
+
+    for( n = 0; n < 2; ++n )
+    {
+        nHexVal *= 16;
+        nNextCh = GetNextChar();
+        if( nNextCh >= '0' && nNextCh <= '9' )
+            nHexVal += (nNextCh - 48);
+        else if( nNextCh >= 'a' && nNextCh <= 'f' )
+            nHexVal += (nNextCh - 87);
+        else if( nNextCh >= 'A' && nNextCh <= 'F' )
+            nHexVal += (nNextCh - 55);
+    }
+    return nHexVal;
+}
+
+void SvRTFParser::ScanText()
+{
+    const sal_Unicode cBreak = 0;
+    OUStringBuffer aStrBuffer;
+    bool bContinue = true;
+    while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
+    {
+        bool bNextCh = true;
+        switch( nNextCh )
+        {
+        case '\\':
+            {
+                nNextCh = GetNextChar();
+                switch (nNextCh)
+                {
+                case '\'':
+                    {
+
+                        OStringBuffer aByteString;
+                        while (true)
+                        {
+                            char c = static_cast<char>(GetHexValue());
+                            /*
+                             * Note: \'00 is a valid internal character in  a
+                             * string in RTF. OStringBuffer supports
+                             * appending nulls fine
+                             */
+                            aByteString.append(c);
+
+                            bool bBreak = false;
+                            bool bEOF = false;
+                            char nSlash = '\\';
+                            while (!bBreak)
+                            {
+                                auto next = GetNextChar();
+                                if (sal_Unicode(EOF) == next)
+                                {
+                                    bEOF = true;
+                                    break;
+                                }
+                                if (next>0xFF) // fix for #i43933# and #i35653#
+                                {
+                                    if (!aByteString.isEmpty())
+                                    {
+                                        aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
+                                        aByteString.setLength(0);
+                                    }
+                                    aStrBuffer.append(static_cast<sal_Unicode>(next));
+
+                                    continue;
+                                }
+                                nSlash = static_cast<char>(next);
+                                while (nSlash == 0xD || nSlash == 0xA)
+                                    nSlash = static_cast<char>(GetNextChar());
+
+                                switch (nSlash)
+                                {
+                                    case '{':
+                                    case '}':
+                                    case '\\':
+                                        bBreak = true;
+                                        break;
+                                    default:
+                                        aByteString.append(nSlash);
+                                        break;
+                                }
+                            }
+
+                            if (bEOF)
+                            {
+                                bContinue = false;        // abort, string together
+                                break;
+                            }
+
+                            nNextCh = GetNextChar();
+
+                            if (nSlash != '\\' || nNextCh != '\'')
+                            {
+                                rInput.SeekRel(-1);
+                                nNextCh = static_cast<unsigned char>(nSlash);
+                                break;
+                            }
+                        }
+
+                        bNextCh = false;
+
+                        if (!aByteString.isEmpty())
+                        {
+                            aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
+                            aByteString.setLength(0);
+                        }
+                    }
+                    break;
+                case '\\':
+                case '}':
+                case '{':
+                case '+':       // I found in a RTF file
+                    aStrBuffer.append(sal_Unicode(nNextCh));
+                    break;
+                case '~':       // nonbreaking space
+                    aStrBuffer.append(u'\x00A0');
+                    break;
+                case '-':       // optional hyphen
+                    aStrBuffer.append(u'\x00AD');
+                    break;
+                case '_':       // nonbreaking hyphen
+                    aStrBuffer.append(u'\x2011');
+                    break;
+
+                case 'u':
+                    // read UNI-Code characters
+                    {
+                        nNextCh = GetNextChar();
+                        rInput.SeekRel( -2 );
+
+                        if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
+                        {
+                            bRTF_InTextRead = true;
+
+                            OUString sSave( aToken ); // GetNextToken_() overwrites this
+                            nNextCh = '\\';
+                            int nToken = GetNextToken_();
+                            DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
+                            // don't convert symbol chars
+                            aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
+
+                            // overread the next n "RTF" characters. This
+                            // can be also \{, \}, \'88
+                            for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
+                            {
+                                sal_Unicode cAnsi = nNextCh;
+                                while( 0xD == cAnsi )
+                                    cAnsi = GetNextChar();
+                                while( 0xA == cAnsi )
+                                    cAnsi = GetNextChar();
+
+                                if( '\\' == cAnsi &&
+                                    '\'' == GetNextChar() )
+                                    // skip HexValue
+                                    GetHexValue();
+                                nNextCh = GetNextChar();
+                            }
+                            bNextCh = false;
+                            aToken = sSave;
+                            bRTF_InTextRead = false;
+                        }
+                        else if ( 'c' == nNextCh )
+                        {
+                            // Prevent text breaking into multiple tokens.
+                            rInput.SeekRel( 2 );
+                            nNextCh = GetNextChar();
+                            if (RTF_ISDIGIT( nNextCh ))
+                            {
+                                sal_uInt8 nNewOverread = 0 ;
+                                do {
+                                    nNewOverread *= 10;
+                                    nNewOverread += nNextCh - '0';
+                                    nNextCh = GetNextChar();
+                                } while ( RTF_ISDIGIT( nNextCh ) );
+                                nUCharOverread = nNewOverread;
+                                if (!aParserStates.empty())
+                                    aParserStates.top().nUCharOverread = nNewOverread;
+                            }
+                            bNextCh = 0x20 == nNextCh;
+                        }
+                        else
+                        {
+                            nNextCh = '\\';
+                            bContinue = false;        // abort, string together
+                        }
+                    }
+                    break;
+
+                default:
+                    rInput.SeekRel( -1 );
+                    nNextCh = '\\';
+                    bContinue = false;        // abort, string together
+                    break;
+                }
+            }
+            break;
+
+        case sal_Unicode(EOF):
+            eState = SvParserState::Error;
+            [[fallthrough]];
+        case '{':
+        case '}':
+            bContinue = false;
+            break;
+
+        case 0x0a:
+        case 0x0d:
+            break;
+
+        default:
+            if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
+                bContinue = false;
+            else
+            {
+                do {
+                    // all other characters end up in the text
+                    aStrBuffer.appendUtf32(nNextCh);
+
+                    if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
+                    {
+                        if (!aStrBuffer.isEmpty())
+                            aToken.append( aStrBuffer );
+                        return;
+                    }
+                } while
+                (
+                    (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
+                    (aStrBuffer.getLength() < MAX_STRING_LEN)
+                );
+                bNextCh = false;
+            }
+        }
+
+        if( bContinue && bNextCh )
+            nNextCh = GetNextChar();
+    }
+
+    if (!aStrBuffer.isEmpty())
+        aToken.append( aStrBuffer );
+}
+
+
+short SvRTFParser::_inSkipGroup=0;
+
+void SvRTFParser::SkipGroup()
+{
+    short nBrackets=1;
+    if (_inSkipGroup>0)
+        return;
+    _inSkipGroup++;
+//#i16185# faking \bin keyword
+    do
+    {
+        switch (nNextCh)
+        {
+            case '{':
+                ++nBrackets;
+                break;
+            case '}':
+                if (!--nBrackets) {
+                    _inSkipGroup--;
+                    return;
+                }
+                break;
+        }
+        int nToken = GetNextToken_();
+        if (nToken == RTF_BIN)
+        {
+            rInput.SeekRel(-1);
+            SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
+            if (nTokenValue > 0)
+                rInput.SeekRel(nTokenValue);
+            nNextCh = GetNextChar();
+        }
+        while (nNextCh==0xa || nNextCh==0xd)
+        {
+            nNextCh = GetNextChar();
+        }
+    } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
+
+    if( SvParserState::Pending != eState && '}' != nNextCh )
+        eState = SvParserState::Error;
+    _inSkipGroup--;
+}
+
+void SvRTFParser::ReadUnknownData() { SkipGroup(); }
+void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
+
+
+SvParserState SvRTFParser::CallParser()
+{
+    char cFirstCh(0);
+    nNextChPos = rInput.Tell();
+    rInput.ReadChar( cFirstCh );
+    nNextCh = static_cast<unsigned char>(cFirstCh);
+    eState = SvParserState::Working;
+    nOpenBrackets = 0;
+    eCodeSet = RTL_TEXTENCODING_MS_1252;
+    SetSrcEncoding( eCodeSet );
+
+    // the first two tokens should be '{' and \\rtf !!
+    if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
+    {
+        AddFirstRef();
+        // call ReleaseRef at end of this scope, even in the face of exceptions
+        comphelper::ScopeGuard g([this] {
+            if( SvParserState::Pending != eState )
+                ReleaseRef();       // now parser is not needed anymore
+        });
+        Continue( 0 );
+    }
+    else
+        eState = SvParserState::Error;
+
+    return eState;
+}
+
+void SvRTFParser::Continue( int nToken )
+{
+//  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
+//              "Characterset was changed." );
+
+    if( !nToken )
+        nToken = GetNextToken();
+
+    bool bLooping = false;
+
+    while (IsParserWorking() && !bLooping)
+    {
+        auto nCurrentTokenIndex = m_nTokenIndex;
+        auto nCurrentToken = nToken;
+
+        SaveState( nToken );
+        switch( nToken )
+        {
+        case '}':
+            if( nOpenBrackets )
+                goto NEXTTOKEN;
+            eState = SvParserState::Accepted;
+            break;
+
+        case '{':
+            // an unknown group ?
+            {
+                if( RTF_IGNOREFLAG != GetNextToken() )
+                    nToken = SkipToken();
+                else if( RTF_UNKNOWNCONTROL != GetNextToken() )
+                    nToken = SkipToken( -2 );
+                else
+                {
+                    // filter immediately
+                    ReadUnknownData();
+                    nToken = GetNextToken();
+                    if( '}' != nToken )
+                        eState = SvParserState::Error;
+                    break;      // move to next token!!
+                }
+            }
+            goto NEXTTOKEN;
+
+        case RTF_UNKNOWNCONTROL:
+            break;      // skip unknown token
+        case RTF_NEXTTYPE:
+        case RTF_ANSITYPE:
+            eCodeSet = RTL_TEXTENCODING_MS_1252;
+            SetSrcEncoding( eCodeSet );
+            break;
+        case RTF_MACTYPE:
+            eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
+            SetSrcEncoding( eCodeSet );
+            break;
+        case RTF_PCTYPE:
+            eCodeSet = RTL_TEXTENCODING_IBM_437;
+            SetSrcEncoding( eCodeSet );
+            break;
+        case RTF_PCATYPE:
+            eCodeSet = RTL_TEXTENCODING_IBM_850;
+            SetSrcEncoding( eCodeSet );
+            break;
+        case RTF_ANSICPG:
+            eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
+            SetSrcEncoding(eCodeSet);
+            break;
+        default:
+NEXTTOKEN:
+            NextToken( nToken );
+            break;
+        }
+        if( IsParserWorking() )
+            SaveState( 0 );         // processed till here,
+                                    // continue with new token!
+        nToken = GetNextToken();
+        bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
+    }
+    if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
+        eState = SvParserState::Error;
+}
+
+void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
+{
+    if (eEnc == RTL_TEXTENCODING_DONTKNOW)
+        eEnc = GetCodeSet();
+
+    if (!aParserStates.empty())
+        aParserStates.top().eCodeSet = eEnc;
+    SetSrcEncoding(eEnc);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */