diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 05:54:39 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 05:54:39 +0000 |
commit | 267c6f2ac71f92999e969232431ba04678e7437e (patch) | |
tree | 358c9467650e1d0a1d7227a21dac2e3d08b622b2 /svtools/source/svrtf/svparser.cxx | |
parent | Initial commit. (diff) | |
download | libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip |
Adding upstream version 4:24.2.0.upstream/4%24.2.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'svtools/source/svrtf/svparser.cxx')
-rw-r--r-- | svtools/source/svrtf/svparser.cxx | 682 |
1 files changed, 682 insertions, 0 deletions
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx new file mode 100644 index 0000000000..c003bcae21 --- /dev/null +++ b/svtools/source/svrtf/svparser.cxx @@ -0,0 +1,682 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <svtools/svparser.hxx> +#include <svtools/htmltokn.h> +#include <tools/stream.hxx> +#include <tools/debug.hxx> +#include <rtl/textcvt.h> +#include <rtl/tencinfo.h> +#include <rtl/character.hxx> +#include <sal/log.hxx> +#include <unicode/ucsdet.h> +#include <unotools/configmgr.hxx> + +#include <vector> + +// structure to store the actual data +template<typename T> +struct SvParser_Impl +{ + OUString aToken; // gescanntes Token + sal_uInt64 nFilePos; // actual position in stream + sal_uInt32 nlLineNr; // actual line number + sal_uInt32 nlLinePos; // actual column number + tools::Long nTokenValue; // extra value (RTF) + bool bTokenHasValue; // indicates whether nTokenValue is valid + T nToken; // actual Token + sal_uInt32 nNextCh; // actual character + T nSaveToken; // the token from Continue + + rtl_TextToUnicodeConverter hConv; + rtl_TextToUnicodeContext hContext; + + SvParser_Impl() + : nFilePos(0) + , nlLineNr(0) + , nlLinePos(0) + , nTokenValue(0) + , bTokenHasValue(false) + , nToken(static_cast<T>(0)) + , nNextCh(0) + , nSaveToken(static_cast<T>(0)) + , hConv( nullptr ) + , hContext( reinterpret_cast<rtl_TextToUnicodeContext>(1) ) + { + } + +}; + + +template<typename T> +SvParser<T>::TokenStackType::TokenStackType() + : nTokenValue(0) + , bTokenHasValue(false) + , nTokenId(static_cast<T>(0)) +{ +} + +// Constructor +template<typename T> +SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize ) + : rInput( rIn ) + , nlLineNr( 1 ) + , nlLinePos( 1 ) + , nConversionErrors( 0 ) + , pImplData( nullptr ) + , m_nTokenIndex(0) + , nTokenValue( 0 ) + , bTokenHasValue( false ) + , bFuzzing(utl::ConfigManager::IsFuzzing()) + , eState( SvParserState::NotStarted ) + , eSrcEnc( RTL_TEXTENCODING_DONTKNOW ) + , nNextChPos(0) + , nNextCh(0) + , bSwitchToUCS2(false) + , bRTF_InTextRead(false) + , nTokenStackSize( nStackSize ) + , nTokenStackPos( 0 ) +{ + eState = SvParserState::NotStarted; + if( nTokenStackSize < 3 ) + nTokenStackSize = 3; + pTokenStack.reset(new TokenStackType[ nTokenStackSize ]); + pTokenStackPos = pTokenStack.get(); +} + +template<typename T> +SvParser<T>::~SvParser() +{ + if( pImplData && pImplData->hConv ) + { + rtl_destroyTextToUnicodeContext( pImplData->hConv, + pImplData->hContext ); + rtl_destroyTextToUnicodeConverter( pImplData->hConv ); + } + + pTokenStack.reset(); +} + +template<typename T> SvParserState SvParser<T>::GetStatus() const { return eState; } +template<typename T> sal_uInt32 SvParser<T>::GetLineNr() const { return nlLineNr; } +template<typename T> sal_uInt32 SvParser<T>::GetLinePos() const { return nlLinePos; } +template<typename T> void SvParser<T>::IncLineNr() { ++nlLineNr; } +template<typename T> sal_uInt32 SvParser<T>::IncLinePos() { return ++nlLinePos; } +template<typename T> void SvParser<T>::SetLineNr( sal_uInt32 nlNum ) { nlLineNr = nlNum; } +template<typename T> void SvParser<T>::SetLinePos( sal_uInt32 nlPos ) { nlLinePos = nlPos; } +template<typename T> bool SvParser<T>::IsParserWorking() const { return SvParserState::Working == eState; } +template<typename T> rtl_TextEncoding SvParser<T>::GetSrcEncoding() const { return eSrcEnc; } +template<typename T> void SvParser<T>::SetSwitchToUCS2( bool bSet ) { bSwitchToUCS2 = bSet; } +template<typename T> bool SvParser<T>::IsSwitchToUCS2() const { return bSwitchToUCS2; } +template<typename T> sal_uInt16 SvParser<T>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2 == eSrcEnc) ? 2 : 1; } +template<typename T> Link<LinkParamNone*,void> SvParser<T>::GetAsynchCallLink() const +{ + return LINK( const_cast<SvParser*>(this), SvParser, NewDataRead ); +} + +template<typename T> +void SvParser<T>::ClearTxtConvContext() +{ + if( pImplData && pImplData->hConv ) + rtl_resetTextToUnicodeContext( pImplData->hConv, pImplData->hContext ); +} + +template<typename T> +void SvParser<T>::SetSrcEncoding( rtl_TextEncoding eEnc ) +{ + if( eEnc == eSrcEnc ) + return; + + if( pImplData && pImplData->hConv ) + { + rtl_destroyTextToUnicodeContext( pImplData->hConv, + pImplData->hContext ); + rtl_destroyTextToUnicodeConverter( pImplData->hConv ); + pImplData->hConv = nullptr; + pImplData->hContext = reinterpret_cast<rtl_TextToUnicodeContext>(1); + } + + if( rtl_isOctetTextEncoding(eEnc) || + RTL_TEXTENCODING_UCS2 == eEnc ) + { + eSrcEnc = eEnc; + if( !pImplData ) + pImplData.reset(new SvParser_Impl<T>); + pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc ); + DBG_ASSERT( pImplData->hConv, + "SvParser::SetSrcEncoding: no converter for source encoding" ); + if( !pImplData->hConv ) + eSrcEnc = RTL_TEXTENCODING_DONTKNOW; + else + pImplData->hContext = + rtl_createTextToUnicodeContext( pImplData->hConv ); + } + else + { + SAL_WARN( "svtools", + "SvParser::SetSrcEncoding: invalid source encoding" ); + eSrcEnc = RTL_TEXTENCODING_DONTKNOW; + } +} + +template<typename T> +void SvParser<T>::RereadLookahead() +{ + rInput.Seek(nNextChPos); + nNextCh = GetNextChar(); +} + +template<typename T> +sal_uInt32 SvParser<T>::GetNextChar() +{ + sal_uInt32 c = 0U; + + // When reading multiple bytes, we don't have to care about the file + // position when we run into the pending state. The file position is + // maintained by SaveState/RestoreState. + if( bSwitchToUCS2 && 0 == rInput.Tell() ) + { + rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW); + if (rInput.good()) + { + sal_uInt64 nPos = rInput.Tell(); + if (nPos == 2) + eSrcEnc = RTL_TEXTENCODING_UCS2; + else if (nPos == 3) + SetSrcEncoding(RTL_TEXTENCODING_UTF8); + else // Try to detect encoding without BOM + { + std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer + const size_t nSize = rInput.ReadBytes(buf.data(), buf.size()); + rInput.Seek(0); + if (nSize > 0) + { + UErrorCode uerr = U_ZERO_ERROR; + UCharsetDetector* ucd = ucsdet_open(&uerr); + ucsdet_setText(ucd, buf.data(), nSize, &uerr); + if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr)) + { + const char* pEncodingName = ucsdet_getName(match, &uerr); + + if (U_SUCCESS(uerr)) + { + if (strcmp("UTF-8", pEncodingName) == 0) + { + SetSrcEncoding(RTL_TEXTENCODING_UTF8); + } + else if (strcmp("UTF-16LE", pEncodingName) == 0) + { + eSrcEnc = RTL_TEXTENCODING_UCS2; + rInput.SetEndian(SvStreamEndian::LITTLE); + } + else if (strcmp("UTF-16BE", pEncodingName) == 0) + { + eSrcEnc = RTL_TEXTENCODING_UCS2; + rInput.SetEndian(SvStreamEndian::BIG); + } + } + } + + ucsdet_close(ucd); + } + } + } + bSwitchToUCS2 = false; + } + + bool bErr; + nNextChPos = rInput.Tell(); + + if( RTL_TEXTENCODING_UCS2 == eSrcEnc ) + { + sal_Unicode cUC; + rInput.ReadUtf16(cUC); + bErr = !rInput.good(); + if( !bErr ) + { + c = cUC; + if (rtl::isHighSurrogate(cUC)) + { + const sal_uInt64 nPos = rInput.Tell(); + rInput.ReadUtf16(cUC); + if (rtl::isLowSurrogate(cUC)) // can only be true when ReadUtf16 succeeded + c = rtl::combineSurrogates(c, cUC); + else + rInput.Seek(nPos); // process lone high surrogate + } + } + } + else + { + sal_Size nChars = 0; + do + { + char c1; // signed, that's the text converter expects + rInput.ReadChar( c1 ); + bErr = !rInput.good(); + if( !bErr ) + { + if ( + RTL_TEXTENCODING_DONTKNOW == eSrcEnc || + RTL_TEXTENCODING_SYMBOL == eSrcEnc + ) + { + // no conversion shall take place + c = reinterpret_cast<unsigned char&>( c1 ); + nChars = 1; + } + else + { + assert(pImplData && pImplData->hConv && "no text converter!"); + + sal_Unicode cUC; + sal_uInt32 nInfo = 0; + sal_Size nCvtBytes; + nChars = rtl_convertTextToUnicode( + pImplData->hConv, pImplData->hContext, + &c1, 1, &cUC, 1, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, + &nInfo, &nCvtBytes); + if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 ) + { + // The conversion wasn't successful because we haven't + // read enough characters. + if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) ) + { + sal_Unicode sCh[2]; + while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 ) + { + rInput.ReadChar( c1 ); + bErr = !rInput.good(); + if( bErr ) + break; + + nChars = rtl_convertTextToUnicode( + pImplData->hConv, pImplData->hContext, + &c1, 1, sCh , 2, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, + &nInfo, &nCvtBytes); + } + if( !bErr ) + { + if( 1 == nChars && 0 == nInfo ) + { + c = sal_uInt32( sCh[0] ); + } + else if( 2 == nChars && 0 == nInfo ) + { + c = rtl::combineSurrogates( sCh[0], sCh[1] ); + } + else if( 0 != nChars || 0 != nInfo ) + { + DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0, + "source buffer is too small" ); + DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0, + "there is a conversion error" ); + DBG_ASSERT( 0 == nChars, + "there is a converted character, but an error" ); + // There are still errors, but nothing we can + // do + c = '?'; + nChars = 1; + ++nConversionErrors; + } + } + } + else + { + char sBuffer[10]; + sBuffer[0] = c1; + sal_uInt16 nLen = 1; + while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 && + nLen < 10 ) + { + rInput.ReadChar( c1 ); + bErr = !rInput.good(); + if( bErr ) + break; + + sBuffer[nLen++] = c1; + nChars = rtl_convertTextToUnicode( + pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR| + RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR, + &nInfo, &nCvtBytes); + } + if( !bErr ) + { + if( 1 == nChars && 0 == nInfo ) + { + DBG_ASSERT( nCvtBytes == nLen, + "no all bytes have been converted!" ); + c = cUC; + } + else + { + DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0, + "source buffer is too small" ); + DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0, + "there is a conversion error" ); + DBG_ASSERT( 0 == nChars, + "there is a converted character, but an error" ); + + // There are still errors, so we use the first + // character and restart after that. + c = reinterpret_cast<unsigned char&>( sBuffer[0] ); + rInput.SeekRel( -(nLen-1) ); + nChars = 1; + ++nConversionErrors; + } + } + } + } + else if( 1 == nChars && 0 == nInfo ) + { + // The conversion was successful + DBG_ASSERT( nCvtBytes == 1, + "no all bytes have been converted!" ); + c = cUC; + } + else if( 0 != nChars || 0 != nInfo ) + { + DBG_ASSERT( 0 == nChars, + "there is a converted character, but an error" ); + DBG_ASSERT( 0 != nInfo, + "there is no converted character and no error" ); + // #73398#: If the character could not be converted, + // because a conversion is not available, do no conversion at all. + c = reinterpret_cast<unsigned char&>( c1 ); + nChars = 1; + ++nConversionErrors; + } + } + } + } + while( 0 == nChars && !bErr ); + } + + if ( ! rtl::isUnicodeScalarValue( c ) ) + c = '?' ; + + if (bFuzzing && nConversionErrors > 128) + { + SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance"); + bErr = true; + } + + if( bErr ) + { + if( ERRCODE_IO_PENDING == rInput.GetError() ) + { + eState = SvParserState::Pending; + return c; + } + else + return sal_Unicode(EOF); + } + + if( c == '\n' ) + { + IncLineNr(); + SetLinePos( 1 ); + } + else + IncLinePos(); + + return c; +} + +template<typename T> +T SvParser<T>::GetNextToken() +{ + T nRet = static_cast<T>(0); + + if( !nTokenStackPos ) + { + aToken.setLength( 0 ); // empty token buffer + nTokenValue = -1; // marker for no value read + bTokenHasValue = false; + + nRet = GetNextToken_(); + if( SvParserState::Pending == eState ) + return nRet; + } + + ++pTokenStackPos; + if( pTokenStackPos == pTokenStack.get() + nTokenStackSize ) + pTokenStackPos = pTokenStack.get(); + + // pop from stack ?? + if( nTokenStackPos ) + { + --nTokenStackPos; + nTokenValue = pTokenStackPos->nTokenValue; + bTokenHasValue = pTokenStackPos->bTokenHasValue; + aToken = pTokenStackPos->sToken; + nRet = pTokenStackPos->nTokenId; + ++m_nTokenIndex; + } + // no, now push actual value on stack + else if( SvParserState::Working == eState ) + { + pTokenStackPos->sToken = aToken; + pTokenStackPos->nTokenValue = nTokenValue; + pTokenStackPos->bTokenHasValue = bTokenHasValue; + pTokenStackPos->nTokenId = nRet; + ++m_nTokenIndex; + } + else if( SvParserState::Accepted != eState && SvParserState::Pending != eState ) + eState = SvParserState::Error; // an error occurred + + return nRet; +} + +template<typename T> +T SvParser<T>::SkipToken( short nCnt ) // "skip" n Tokens backward +{ + pTokenStackPos = GetStackPtr( nCnt ); + short nTmp = nTokenStackPos - nCnt; + if( nTmp < 0 ) + nTmp = 0; + else if( nTmp > nTokenStackSize ) + nTmp = nTokenStackSize; + nTokenStackPos = sal_uInt8(nTmp); + + m_nTokenIndex -= nTmp; + + // restore values + aToken = pTokenStackPos->sToken; + nTokenValue = pTokenStackPos->nTokenValue; + bTokenHasValue = pTokenStackPos->bTokenHasValue; + + return pTokenStackPos->nTokenId; +} + +template<typename T> +typename SvParser<T>::TokenStackType* SvParser<T>::GetStackPtr( short nCnt ) +{ + sal_uInt8 nCurrentPos = sal_uInt8(pTokenStackPos - pTokenStack.get()); + if( nCnt > 0 ) + { + if( nCnt >= nTokenStackSize ) + nCnt = (nTokenStackSize-1); + if( nCurrentPos + nCnt < nTokenStackSize ) + nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt); + else + nCurrentPos = sal::static_int_cast< sal_uInt8 >( + nCurrentPos + (nCnt - nTokenStackSize)); + } + else if( nCnt < 0 ) + { + if( -nCnt >= nTokenStackSize ) + nCnt = -nTokenStackSize+1; + if( -nCnt <= nCurrentPos ) + nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt); + else + nCurrentPos = sal::static_int_cast< sal_uInt8 >( + nCurrentPos + (nCnt + nTokenStackSize)); + } + return pTokenStack.get() + nCurrentPos; +} + +// to read asynchronous from SvStream + +template<typename T> +T SvParser<T>::GetSaveToken() const +{ + return pImplData ? pImplData->nSaveToken : static_cast<T>(0); +} + +template<typename T> +void SvParser<T>::SaveState( T nToken ) +{ + // save actual status + if( !pImplData ) + { + pImplData.reset(new SvParser_Impl<T>); + pImplData->nSaveToken = static_cast<T>(0); + } + + pImplData->nFilePos = rInput.Tell(); + pImplData->nToken = nToken; + + pImplData->aToken = aToken; + pImplData->nlLineNr = nlLineNr; + pImplData->nlLinePos = nlLinePos; + pImplData->nTokenValue= nTokenValue; + pImplData->bTokenHasValue = bTokenHasValue; + pImplData->nNextCh = nNextCh; +} + +template<typename T> +void SvParser<T>::RestoreState() +{ + // restore old status + if( !pImplData ) + return; + + if( ERRCODE_IO_PENDING == rInput.GetError() ) + rInput.ResetError(); + aToken = pImplData->aToken; + nlLineNr = pImplData->nlLineNr; + nlLinePos = pImplData->nlLinePos; + nTokenValue= pImplData->nTokenValue; + bTokenHasValue=pImplData->bTokenHasValue; + nNextCh = pImplData->nNextCh; + + pImplData->nSaveToken = pImplData->nToken; + + rInput.Seek( pImplData->nFilePos ); +} + +template<typename T> +void SvParser<T>::Continue( T ) +{ +} + + +// expanded out version of +// IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void ) +// since it can't cope with template methods +template<typename T> +void SvParser<T>::LinkStubNewDataRead(void * instance, LinkParamNone* data) { + return static_cast<SvParser<T> *>(instance)->NewDataRead(data); +} +template<typename T> +void SvParser<T>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone*) +{ + switch( eState ) + { + case SvParserState::Pending: + eState = SvParserState::Working; + RestoreState(); + + Continue( pImplData->nToken ); + + if( ERRCODE_IO_PENDING == rInput.GetError() ) + rInput.ResetError(); + + if( SvParserState::Pending != eState ) + ReleaseRef(); // ready otherwise! + break; + + case SvParserState::NotStarted: + case SvParserState::Working: + break; + + default: + ReleaseRef(); // ready otherwise! + break; + } +} + +template class SVT_DLLPUBLIC SvParser<int>; +template class SVT_DLLPUBLIC SvParser<HtmlTokenId>; + +/*======================================================================== + * + * SvKeyValueIterator. + * + *======================================================================*/ + +typedef std::vector<SvKeyValue> SvKeyValueList_Impl; + +struct SvKeyValueIterator::Impl +{ + SvKeyValueList_Impl maList; + sal_uInt16 mnPos; + + Impl() : mnPos(0) {} +}; + +SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl) {} + +SvKeyValueIterator::~SvKeyValueIterator() = default; + +bool SvKeyValueIterator::GetFirst (SvKeyValue &rKeyVal) +{ + mpImpl->mnPos = mpImpl->maList.size(); + return GetNext (rKeyVal); +} + +bool SvKeyValueIterator::GetNext (SvKeyValue &rKeyVal) +{ + if (mpImpl->mnPos > 0) + { + rKeyVal = mpImpl->maList[--mpImpl->mnPos]; + return true; + } + else + { + // Nothing to do. + return false; + } +} + +void SvKeyValueIterator::Append (const SvKeyValue &rKeyVal) +{ + mpImpl->maList.push_back(rKeyVal); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |