summaryrefslogtreecommitdiffstats
path: root/svtools/source/svhtml/parhtml.cxx
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
commit267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree358c9467650e1d0a1d7227a21dac2e3d08b622b2 /svtools/source/svhtml/parhtml.cxx
parentInitial commit. (diff)
downloadlibreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
Adding upstream version 4:24.2.0.upstream/4%24.2.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'svtools/source/svhtml/parhtml.cxx')
-rw-r--r--svtools/source/svhtml/parhtml.cxx2213
1 files changed, 2213 insertions, 0 deletions
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
new file mode 100644
index 0000000000..1f87456744
--- /dev/null
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -0,0 +1,2213 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <comphelper/string.hxx>
+#include <o3tl/safeint.hxx>
+#include <o3tl/string_view.hxx>
+#include <tools/stream.hxx>
+#include <tools/debug.hxx>
+#include <tools/color.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <rtl/character.hxx>
+#include <rtl/tencinfo.h>
+#include <sal/log.hxx>
+#include <tools/tenccvt.hxx>
+#include <tools/datetime.hxx>
+#include <unotools/datetime.hxx>
+#include <svl/inettype.hxx>
+#include <svl/lngmisc.hxx>
+#include <com/sun/star/beans/PropertyAttribute.hpp>
+#include <com/sun/star/document/XDocumentProperties.hpp>
+
+#include <svtools/parhtml.hxx>
+#include <svtools/htmltokn.h>
+#include <svtools/htmlkywd.hxx>
+
+#include <utility>
+
+using namespace ::com::sun::star;
+
+
+// Size passed to the OUStringBuffer scratch buffers of the scanner functions;
+// GetNextRawToken() additionally uses it to bound the length of a tag name.
+constexpr sal_Int32 MAX_LEN = 1024;
+
+// Upper bound for the number of characters ScanText() reads as the name of a
+// character entity (&name;).
+constexpr sal_Int32 MAX_ENTITY_LEN = 8;
+
+
+// Tables to convert option values into strings
+
+// <INPUT TYPE=xxx>
+// Keyword -> HTMLInputType pairs, handed to GetEnum() by
+// HTMLOption::GetInputType(). The last entry has a nullptr keyword.
+const HTMLOptionEnum<HTMLInputType> aInputTypeOptEnums[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_IT_text,     HTMLInputType::Text     },
+    { OOO_STRING_SVTOOLS_HTML_IT_password, HTMLInputType::Password },
+    { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTMLInputType::Checkbox },
+    { OOO_STRING_SVTOOLS_HTML_IT_radio,    HTMLInputType::Radio    },
+    { OOO_STRING_SVTOOLS_HTML_IT_range,    HTMLInputType::Range    },
+    { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTMLInputType::Scribble },
+    { OOO_STRING_SVTOOLS_HTML_IT_file,     HTMLInputType::File     },
+    { OOO_STRING_SVTOOLS_HTML_IT_hidden,   HTMLInputType::Hidden   },
+    { OOO_STRING_SVTOOLS_HTML_IT_submit,   HTMLInputType::Submit   },
+    { OOO_STRING_SVTOOLS_HTML_IT_image,    HTMLInputType::Image    },
+    { OOO_STRING_SVTOOLS_HTML_IT_reset,    HTMLInputType::Reset    },
+    { OOO_STRING_SVTOOLS_HTML_IT_button,   HTMLInputType::Button   },
+    { nullptr,                             static_cast<HTMLInputType>(0) }
+};
+
+// <TABLE FRAME=xxx>
+// Keyword -> HTMLTableFrame pairs, handed to GetEnum() by
+// HTMLOption::GetTableFrame(). Both the "box" and the "border" keyword map to
+// HTMLTableFrame::Box. The last entry has a nullptr keyword.
+const HTMLOptionEnum<HTMLTableFrame> aTableFrameOptEnums[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_TF_void,   HTMLTableFrame::Void   },
+    { OOO_STRING_SVTOOLS_HTML_TF_above,  HTMLTableFrame::Above  },
+    { OOO_STRING_SVTOOLS_HTML_TF_below,  HTMLTableFrame::Below  },
+    { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTMLTableFrame::HSides },
+    { OOO_STRING_SVTOOLS_HTML_TF_lhs,    HTMLTableFrame::LHS    },
+    { OOO_STRING_SVTOOLS_HTML_TF_rhs,    HTMLTableFrame::RHS    },
+    { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTMLTableFrame::VSides },
+    { OOO_STRING_SVTOOLS_HTML_TF_box,    HTMLTableFrame::Box    },
+    { OOO_STRING_SVTOOLS_HTML_TF_border, HTMLTableFrame::Box    },
+    { nullptr,                           static_cast<HTMLTableFrame>(0) }
+};
+
+// <TABLE RULES=xxx>
+// Keyword -> HTMLTableRules pairs, handed to GetEnum() by
+// HTMLOption::GetTableRules(). The last entry has a nullptr keyword.
+const HTMLOptionEnum<HTMLTableRules> aTableRulesOptEnums[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_TR_none,   HTMLTableRules::NONE   },
+    { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups },
+    { OOO_STRING_SVTOOLS_HTML_TR_rows,   HTMLTableRules::Rows   },
+    { OOO_STRING_SVTOOLS_HTML_TR_cols,   HTMLTableRules::Cols   },
+    { OOO_STRING_SVTOOLS_HTML_TR_all,    HTMLTableRules::All    },
+    { nullptr,                           static_cast<HTMLTableRules>(0) }
+};
+
+
+// Creates an option with id nTok; the token string and the value string are
+// moved into the corresponding members.
+HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken, OUString _aValue )
+    : aValue( std::move(_aValue) ),
+      aToken( std::move(_aToken) ),
+      nToken( nTok )
+{
+    // Debug-only check: the id has to lie in [BOOL_START, END).
+    DBG_ASSERT( !(nToken<HtmlOptionId::BOOL_START || nToken>=HtmlOptionId::END),
+                "HTMLOption: unknown token" );
+}
+
+// Returns the value as an unsigned number: leading blanks are stripped, the
+// remainder is converted with toInt32(), and a negative result yields 0.
+sal_uInt32 HTMLOption::GetNumber() const
+{
+    DBG_ASSERT( nToken==HtmlOptionId::VALUE ||
+                (nToken>=HtmlOptionId::NUMBER_START &&
+                 nToken<HtmlOptionId::NUMBER_END) ||
+                (nToken>=HtmlOptionId::CONTEXT_START &&
+                 nToken<HtmlOptionId::CONTEXT_END),
+                "GetNumber: Option not numerical" );
+
+    const OUString aStripped( comphelper::string::stripStart(aValue, ' ') );
+    const sal_Int32 nParsed = aStripped.toInt32();
+    if( nParsed < 0 )
+        return 0;
+    return static_cast<sal_uInt32>(nParsed);
+}
+
+// Returns the value as a signed number: leading blanks are stripped and the
+// remainder is converted with toInt32().
+sal_Int32 HTMLOption::GetSNumber() const
+{
+    DBG_ASSERT( (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END) ||
+                (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END),
+                "GetSNumber: Option not numerical" );
+
+    const OUString aStripped( comphelper::string::stripStart(aValue, ' ') );
+    const sal_Int32 nParsed = aStripped.toInt32();
+    return nParsed;
+}
+
+// Replaces the content of rNumbers with every run of ASCII digits found in
+// the value, each run converted to a number. All other characters only act as
+// separators; no sign or overflow handling is done.
+void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
+{
+    rNumbers.clear();
+
+    sal_uInt32 nCurrent = 0;      // number built from the current digit run
+    bool bHaveDigits = false;     // true while inside a digit run
+
+    const sal_Int32 nLen = aValue.getLength();
+    for( sal_Int32 nIdx = 0; nIdx < nLen; ++nIdx )
+    {
+        const sal_Unicode cCh = aValue[ nIdx ];
+        const bool bDigit = !( cCh < '0' || cCh > '9' );
+        if( bDigit )
+        {
+            nCurrent = nCurrent * 10 + (cCh - '0');
+            bHaveDigits = true;
+            continue;
+        }
+
+        if( !bHaveDigits )
+            continue;
+
+        // A non-digit ends the run: store the number and start over.
+        rNumbers.push_back( nCurrent );
+        nCurrent = 0;
+        bHaveDigits = false;
+    }
+
+    // The value may end within a digit run.
+    if( bHaveDigits )
+        rNumbers.push_back( nCurrent );
+}
+
+// Converts the value into a color and writes its red, green and blue
+// components into rColor.
+//
+// The lowercased value is passed to GetHTMLColor() unless it is empty or
+// starts with '#'. If no color came out of that (SAL_MAX_UINT32), six hex
+// digits are decoded from the value instead.
+void HTMLOption::GetColor( Color& rColor ) const
+{
+    DBG_ASSERT( nToken==HtmlOptionId::SIZE || (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END),
+                "GetColor: Option is not a color." );
+
+    const OUString aLower( aValue.toAsciiLowerCase() );
+    const sal_Int32 nLen = aLower.getLength();
+
+    sal_uInt32 nRGB = SAL_MAX_UINT32;
+    if( nLen > 0 && aLower[0] != '#' )
+        nRGB = GetHTMLColor( aLower );
+
+    if( nRGB == SAL_MAX_UINT32 )
+    {
+        sal_Int32 nRead = 0;
+        // Yields the next character of the value, or '0' once it is used up.
+        const auto nextChar = [&]() -> sal_Unicode
+        {
+            return nRead < nLen ? aLower[ nRead++ ] : sal_Unicode('0');
+        };
+
+        nRGB = 0;
+        for( int nDigit = 0; nDigit < 6; ++nDigit )
+        {
+            // Whatever Netscape does to get color values: per digit, up to
+            // two characters < '0' are skipped; the character read after
+            // them is used as it is.
+            sal_Unicode cCh = nextChar();
+            for( int nSkipped = 0; nSkipped < 2 && cCh < '0'; ++nSkipped )
+                cCh = nextChar();
+
+            // Characters that are no hex digit count as 0.
+            nRGB *= 16;
+            if( cCh >= '0' && cCh <= '9' )
+                nRGB += (cCh - '0');
+            else if( cCh >= 'a' && cCh <= 'f' )
+                nRGB += (cCh + 0xa - 'a');
+        }
+    }
+
+    rColor.SetRed(   static_cast<sal_uInt8>((nRGB >> 16) & 0xff) );
+    rColor.SetGreen( static_cast<sal_uInt8>((nRGB >> 8) & 0xff) );
+    rColor.SetBlue(  static_cast<sal_uInt8>(nRGB & 0xff) );
+}
+
+// Looks the value up in aInputTypeOptEnums via GetEnum(); HTMLInputType::Text
+// is passed as second argument (presumably the result for unknown values).
+HTMLInputType HTMLOption::GetInputType() const
+{
+    DBG_ASSERT( HtmlOptionId::TYPE==nToken, "GetInputType: Option not TYPE" );
+    const HTMLInputType eType = GetEnum( aInputTypeOptEnums, HTMLInputType::Text );
+    return eType;
+}
+
+// Looks the value up in aTableFrameOptEnums via GetEnum().
+HTMLTableFrame HTMLOption::GetTableFrame() const
+{
+    DBG_ASSERT( HtmlOptionId::FRAME==nToken, "GetTableFrame: Option not FRAME" );
+    const HTMLTableFrame eFrame = GetEnum( aTableFrameOptEnums );
+    return eFrame;
+}
+
+// Looks the value up in aTableRulesOptEnums via GetEnum().
+HTMLTableRules HTMLOption::GetTableRules() const
+{
+    DBG_ASSERT( HtmlOptionId::RULES==nToken, "GetTableRules: Option not RULES" );
+    const HTMLTableRules eRules = GetEnum( aTableRulesOptEnums );
+    return eRules;
+}
+
+// Sets up a parser reading from rIn. The parser starts in "header" state
+// with all special reading modes (listing, XMP, PRE, textarea, script,
+// style, comment) switched off and no pending off-token.
+HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc )
+    : SvParser<HtmlTokenId>( rIn )
+    , bNewDoc( bReadNewDoc )
+    , bIsInHeader( true )
+    , bReadListing( false )
+    , bReadXMP( false )
+    , bReadPRE( false )
+    , bReadTextArea( false )
+    , bReadScript( false )
+    , bReadStyle( false )
+    , bEndTokenFound( false )
+    , bPre_IgnoreNewPara( false )
+    , bReadNextChar( false )
+    , bReadComment( false )
+    , nPre_LinePos( 0 )
+    , mnPendingOffToken( HtmlTokenId::NONE )
+{
+    // #i76649: HTML is taken to be UTF-8 unless we learn otherwise.
+    SetSrcEncoding( RTL_TEXTENCODING_UTF8 );
+}
+
+// Nothing to clean up beyond what the members and the base class do.
+HTMLParser::~HTMLParser() = default;
+
+// Stores rNamespace followed by ':' in maNamespace, i.e. the namespace alias
+// is kept in its prefix form.
+void HTMLParser::SetNamespace(std::u16string_view rNamespace)
+{
+    OUString aPrefix = OUString::Concat(rNamespace) + ":";
+    maNamespace = std::move(aPrefix);
+}
+
+namespace
+{
+    // Scope guard for a parser run: the constructor calls AddFirstRef() on
+    // the parser, the destructor calls ReleaseRef() unless the parser status
+    // is SvParserState::Pending at that time.
+    //
+    // The guard is non-copyable: every copy would run the destructor, and
+    // with it ReleaseRef(), once more for a single AddFirstRef().
+    class RefGuard
+    {
+    private:
+        HTMLParser& m_rParser;
+    public:
+        // explicit: a parser must never convert silently into a guard.
+        explicit RefGuard(HTMLParser& rParser)
+            : m_rParser(rParser)
+        {
+            m_rParser.AddFirstRef();
+        }
+
+        RefGuard(const RefGuard&) = delete;
+        RefGuard& operator=(const RefGuard&) = delete;
+
+        ~RefGuard()
+        {
+            // A pending parser keeps its reference.
+            if (m_rParser.GetStatus() != SvParserState::Pending)
+                m_rParser.ReleaseRef(); // Parser not needed anymore
+        }
+    };
+}
+
+// Starts a parser run: switches to SvParserState::Working, reads the first
+// character, resets the <PRE> bookkeeping and processes tokens via
+// Continue(). Returns the parser state reached at the end of the run.
+SvParserState HTMLParser::CallParser()
+{
+    eState = SvParserState::Working;
+    nNextCh = GetNextChar();
+    SaveState( HtmlTokenId::NONE );
+
+    bPre_IgnoreNewPara = false;
+    nPre_LinePos = 0;
+
+    // The guard has to stay alive until eState has been read for the return
+    // value; its destructor may call ReleaseRef() (presumably able to destroy
+    // the parser - TODO confirm).
+    RefGuard aRefGuard(*this);
+
+    Continue( HtmlTokenId::NONE );
+
+    return eState;
+}
+
+// Token loop of the parser. Starts with nToken, or with a freshly read token
+// if nToken is HtmlTokenId::NONE, and runs for as long as the parser is
+// working: every token is filtered by FilterToken() and, unless the filter
+// returned NONE, handed to NextToken().
+void HTMLParser::Continue( HtmlTokenId nToken )
+{
+    if( HtmlTokenId::NONE == nToken )
+        nToken = GetNextToken();
+
+    for( ; IsParserWorking(); nToken = GetNextToken() )
+    {
+        SaveState( nToken );
+
+        const HtmlTokenId nFiltered = FilterToken( nToken );
+        if( HtmlTokenId::NONE != nFiltered )
+            NextToken( nFiltered );
+
+        // continue with new token
+        if( IsParserWorking() )
+            SaveState( HtmlTokenId::NONE );
+    }
+}
+
+// Updates the parser's mode flags for nToken and returns the token that is to
+// be passed on. HtmlTokenId::NONE is returned for tokens that are swallowed
+// (end of input and </HTML>). While PRE, listing or XMP mode is active, all
+// tokens without a case of their own go through the matching Filter*()
+// function.
+HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
+{
+    switch( nToken )
+    {
+        case HtmlTokenId(EOF):
+            // don't pass
+            return HtmlTokenId::NONE;
+
+        case HtmlTokenId::HEAD_ON:
+            bIsInHeader = true;
+            return nToken;
+
+        case HtmlTokenId::HEAD_OFF:
+        case HtmlTokenId::BODY_ON:
+        case HtmlTokenId::FRAMESET_ON:
+            bIsInHeader = false;
+            return nToken;
+
+        case HtmlTokenId::BODY_OFF:
+            bReadPRE = false;
+            bReadListing = false;
+            bReadXMP = false;
+            return nToken;
+
+        case HtmlTokenId::HTML_OFF:
+            bReadPRE = false;
+            bReadListing = false;
+            bReadXMP = false;
+            // HtmlTokenId::ON hasn't been passed either !
+            return HtmlTokenId::NONE;
+
+        case HtmlTokenId::PREFORMTXT_ON:
+            StartPRE();
+            return nToken;
+
+        case HtmlTokenId::PREFORMTXT_OFF:
+            FinishPRE();
+            return nToken;
+
+        case HtmlTokenId::LISTING_ON:
+            StartListing();
+            return nToken;
+
+        case HtmlTokenId::LISTING_OFF:
+            FinishListing();
+            return nToken;
+
+        case HtmlTokenId::XMP_ON:
+            StartXMP();
+            return nToken;
+
+        case HtmlTokenId::XMP_OFF:
+            FinishXMP();
+            return nToken;
+
+        default:
+            break;
+    }
+
+    // Any other token: the first active mode (PRE, listing, XMP) filters it.
+    if( bReadPRE )
+        return FilterPRE( nToken );
+    if( bReadListing )
+        return FilterListing( nToken );
+    if( bReadXMP )
+        return FilterXMP( nToken );
+    return nToken;
+}
+
+namespace {
+
+// True for every character value from 32 upwards except 127.
+constexpr bool HTML_ISPRINTABLE(sal_Unicode c)
+{
+    return !(c < 32 || c == 127);
+}
+
+// True for blank, tab, carriage return, line feed and vertical tab (0x0b).
+constexpr bool HTML_ISSPACE(sal_uInt32 c)
+{
+    switch (c)
+    {
+        case ' ':
+        case '\t':
+        case '\r':
+        case '\n':
+        case '\x0b':
+            return true;
+        default:
+            return false;
+    }
+}
+
+}
+
+HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak) /*
+ * Scans text starting with the current character (nNextCh) and appends it
+ * to aToken.
+ *
+ * cBreak ends the scan when it is met while no quote is open. The value '>'
+ * is special throughout this function: it marks the scan of a tag's inside
+ * (its options). In that mode '<', form feeds and line ends stay part of the
+ * text, quotes are tracked, and backslashes, quotes and blanks that come out
+ * of character references get a '\' put in front of them.
+ *
+ * Character references (&#nnn; &#xhh; &name;) are resolved unless XMP is
+ * being read. For a name that cannot be resolved the stream is rewound and
+ * the '&' is taken literally.
+ *
+ * Outside of tags, and unless listing/XMP/PRE/textarea content is read or
+ * m_bPreserveSpaces is set, runs of blanks, tabs and line ends are reduced to
+ * a single blank.
+ *
+ * Returns HtmlTokenId::TEXTTOKEN in general; HtmlTokenId::NONBREAKSPACE or
+ * HtmlTokenId::SOFTHYPH if the scan (not inside a tag) starts with the
+ * respective entity; HtmlTokenId::NONE if only blanks were read up to the
+ * end of the input.
+ */
+{
+ OUStringBuffer sTmpBuffer( MAX_LEN ); // scratch buffer, appended to aToken when text is returned
+ bool bContinue = true;
+ bool bEqSignFound = false; // set by an unquoted '=' inside a tag; a quote right after it opens a value
+ sal_uInt32 cQuote = 0U; // quote character of the value currently open inside a tag, 0 if none
+
+ while( bContinue && IsParserWorking() )
+ {
+ bool bNextCh = true; // read a new character at the end of this iteration?
+ switch( nNextCh )
+ {
+ case '&':
+ bEqSignFound = false;
+ if( bReadXMP )
+ sTmpBuffer.append( '&' );
+ else
+ {
+ sal_uInt64 nStreamPos = rInput.Tell(); // remembered to be able to rewind the stream
+ sal_uInt32 nLinePos = GetLinePos();
+
+ sal_uInt32 cChar = 0U;
+ if( '#' == (nNextCh = GetNextChar()) )
+ {
+ nNextCh = GetNextChar();
+ const bool bIsHex( 'x' == nNextCh ); // only a lowercase 'x' selects hexadecimal
+ const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
+ if ( bIsDecOrHex )
+ {
+ if ( bIsHex )
+ {
+ nNextCh = GetNextChar();
+ while ( rtl::isAsciiHexDigit(nNextCh) )
+ {
+ cChar = cChar * 16U +
+ ( nNextCh <= '9'
+ ? sal_uInt32( nNextCh - '0' )
+ : ( nNextCh <= 'F'
+ ? sal_uInt32( nNextCh - 'A' + 10 )
+ : sal_uInt32( nNextCh - 'a' + 10 ) ) );
+ nNextCh = GetNextChar();
+ }
+ }
+ else
+ {
+ do
+ {
+ cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
+ nNextCh = GetNextChar();
+ }
+ while( rtl::isAsciiDigit(nNextCh) );
+ }
+
+ // Values below 256 are taken as a byte in the source
+ // encoding, unless that is unknown, UCS2 or UTF-8.
+ if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
+ RTL_TEXTENCODING_UCS2 != eSrcEnc &&
+ RTL_TEXTENCODING_UTF8 != eSrcEnc &&
+ cChar < 256 )
+ {
+ const sal_uInt32 convertFlags =
+ RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
+ RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
+ RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
+
+ char cEncodedChar = static_cast<char>(cChar);
+ cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
+ if( 0U == cChar )
+ {
+ // If the character could not be
+ // converted, because a conversion is not
+ // available, do no conversion at all.
+ cChar = cEncodedChar; // NOTE(review): where plain char is signed, values >= 0x80 sign-extend here - confirm
+ }
+ }
+ }
+ else
+ nNextCh = 0U; // "&#" followed by neither a digit nor 'x'
+
+ if (!rtl::isUnicodeCodePoint(cChar)
+ || (linguistic::IsControlChar(cChar)
+ && cChar != '\r' && cChar != '\n' && cChar != '\t'))
+ {
+ cChar = '?'; // replaces invalid code points and control characters other than CR, LF, tab
+ }
+ }
+ else if( rtl::isAsciiAlpha( nNextCh ) )
+ {
+ OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
+ sal_Int32 nPos = 0; // number of characters read for the entity name
+ do
+ {
+ sEntityBuffer.appendUtf32( nNextCh );
+ nPos++;
+ nNextCh = GetNextChar();
+ }
+ while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
+ !rInput.eof() );
+
+ if( IsParserWorking() && !rInput.eof() )
+ {
+ std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
+ cChar = GetHTMLCharName( sEntity );
+
+ // not found ( == 0 ): plain text
+ // or a character which is inserted as attribute
+ if( 0U == cChar && ';' != nNextCh )
+ {
+ DBG_ASSERT( rInput.Tell() - nStreamPos ==
+ static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
+ "UTF-8 is failing here" );
+ for( sal_Int32 i = nPos-1; i>1; i-- ) // retry with ever shorter prefixes of the name
+ {
+ nNextCh = sEntityBuffer[i];
+ sEntityBuffer.setLength( i );
+ sEntity = sEntityBuffer.subView(0, i);
+ cChar = GetHTMLCharName( sEntity );
+ if( cChar )
+ {
+ rInput.SeekRel( -static_cast<sal_Int64>
+ (nPos-i)*GetCharSize() );
+ nlLinePos -= sal_uInt32(nPos-i);
+ nPos = i;
+ ClearTxtConvContext();
+ break;
+ }
+ }
+ }
+
+ if( !cChar ) // unknown character?
+ {
+ // back in stream, insert '&'
+ // and restart with next character
+ sTmpBuffer.append( '&' );
+
+ DBG_ASSERT( rInput.Tell()-nStreamPos ==
+ static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
+ "Wrong stream position" );
+ DBG_ASSERT( nlLinePos-nLinePos ==
+ static_cast<sal_uInt32>(nPos+1),
+ "Wrong line position" );
+ rInput.Seek( nStreamPos );
+ nlLinePos = nLinePos;
+ ClearTxtConvContext();
+ break;
+ }
+
+ assert(cChar != 0);
+
+ // 1 == Non Breaking Space
+ // 2 == SoftHyphen
+
+ if (cChar == 1 || cChar == 2)
+ {
+ if( '>' == cBreak )
+ {
+ // When reading the content of a tag we have
+ // to change it to ' ' or '-'
+ if( 1U == cChar )
+ cChar = ' ';
+ else //2U
+ cChar = '-';
+ }
+ else
+ {
+ // If not scanning a tag return token
+ aToken.append( sTmpBuffer );
+ sTmpBuffer.setLength(0);
+
+ if( !aToken.isEmpty() )
+ {
+ // restart with character
+ nNextCh = '&';
+ DBG_ASSERT( rInput.Tell()-nStreamPos ==
+ static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
+ "Wrong stream position" );
+ DBG_ASSERT( nlLinePos-nLinePos ==
+ static_cast<sal_uInt32>(nPos+1),
+ "Wrong line position" );
+ rInput.Seek( nStreamPos );
+ nlLinePos = nLinePos;
+ ClearTxtConvContext();
+ return HtmlTokenId::TEXTTOKEN;
+ }
+
+ // Hack: _GetNextChar shall not read the
+ // next character
+ if( ';' != nNextCh )
+ aToken.append( " " );
+ if( 1U == cChar )
+ return HtmlTokenId::NONBREAKSPACE;
+ else //2U
+ return HtmlTokenId::SOFTHYPH;
+ }
+ }
+ }
+ else
+ nNextCh = 0U; // parser stopped working or input ended within the entity name
+ }
+ // &{...};-JavaScript-Macros are not supported any longer.
+ else if( IsParserWorking() )
+ {
+ sTmpBuffer.append( '&' );
+ bNextCh = false;
+ break;
+ }
+
+ bNextCh = (';' == nNextCh); // a terminating ';' belongs to the reference and is skipped
+ if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
+ cChar=='\"' || cChar==' ') )
+ {
+ // ' and " have to be escaped within tags to separate
+ // them from ' and " enclosing options.
+ // \ has to be escaped as well.
+ // Space is protected because it's not a delimiter between
+ // options.
+ sTmpBuffer.append( '\\' );
+ }
+ if( IsParserWorking() )
+ {
+ if( cChar )
+ sTmpBuffer.appendUtf32( cChar );
+ }
+ else if( SvParserState::Pending==eState && '>'!=cBreak )
+ {
+ // Restart with '&', the remainder is returned as
+ // text token.
+ if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
+ {
+ // _GetNextChar() returns the previous text and
+ // during the next execution a new character is read.
+ // Thus we have to position in front of the '&'.
+ nNextCh = 0U;
+ rInput.Seek( nStreamPos - GetCharSize() );
+ nlLinePos = nLinePos-1;
+ ClearTxtConvContext();
+ bReadNextChar = true;
+ }
+ bNextCh = false;
+ }
+ }
+ break;
+ case '=':
+ if( '>'==cBreak && !cQuote )
+ bEqSignFound = true;
+ sTmpBuffer.appendUtf32( nNextCh );
+ break;
+
+ case '\\':
+ if( '>'==cBreak )
+ {
+ // mark within tags
+ sTmpBuffer.append( '\\' );
+ }
+ sTmpBuffer.append( '\\' );
+ break;
+
+ case '\"':
+ case '\'':
+ if( '>'==cBreak )
+ {
+ if( bEqSignFound )
+ cQuote = nNextCh; // a quote right after '=' opens a value
+ else if( cQuote && (cQuote==nNextCh ) )
+ cQuote = 0U; // the matching quote closes it
+ }
+ sTmpBuffer.appendUtf32( nNextCh );
+ bEqSignFound = false;
+ break;
+
+ case sal_Unicode(EOF):
+ if( rInput.eof() )
+ {
+ bContinue = false;
+ }
+ // else: ignore, not a valid code point
+ break;
+
+ case '<':
+ bEqSignFound = false;
+ if( '>'==cBreak )
+ sTmpBuffer.appendUtf32( nNextCh );
+ else
+ bContinue = false; // break, string is together
+ break;
+
+ case '\f':
+ if( '>' == cBreak )
+ {
+ // If scanning options treat it like a space, ...
+ sTmpBuffer.append( ' ' );
+ }
+ else
+ {
+ // otherwise it's a separate token.
+ bContinue = false;
+ }
+ break;
+
+ case '\r':
+ case '\n':
+ if( '>'==cBreak )
+ {
+ // cr/lf in tag is handled in GetNextToken_()
+ sTmpBuffer.appendUtf32( nNextCh );
+ break;
+ }
+ else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
+ {
+ bContinue = false;
+ break;
+ }
+ // Reduce sequence of CR/LF/BLANK/TAB to a single blank
+ [[fallthrough]];
+ case '\t':
+ if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
+ {
+ // Pass Tabs up in <PRE>
+ bContinue = false;
+ break;
+ }
+ [[fallthrough]];
+ case '\x0b':
+ if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
+ '>'!=cBreak )
+ {
+ break;
+ }
+ if (!m_bPreserveSpaces)
+ nNextCh = ' ';
+ [[fallthrough]];
+ case ' ':
+ if (!m_bPreserveSpaces)
+ {
+ sTmpBuffer.appendUtf32(nNextCh);
+ if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
+ {
+ // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
+ do
+ {
+ nNextCh = GetNextChar();
+ if (sal_Unicode(EOF) == nNextCh && rInput.eof())
+ {
+ if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
+ {
+ // Have seen s.th. aside from blanks?
+ aToken.append(sTmpBuffer);
+ sTmpBuffer.setLength(0);
+ return HtmlTokenId::TEXTTOKEN;
+ }
+ else
+ // Only read blanks: no text must be returned
+ // and GetNextToken_ has to read until EOF
+ return HtmlTokenId::NONE;
+ }
+ } while (HTML_ISSPACE(nNextCh));
+ bNextCh = false; // nNextCh already holds the first character after the blanks
+ }
+ break;
+ }
+ [[fallthrough]];
+ default:
+ bEqSignFound = false;
+ if (nNextCh == cBreak && !cQuote)
+ bContinue = false;
+ else
+ {
+ do {
+ if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
+ {
+ // All remaining characters make their way into the text.
+ sTmpBuffer.appendUtf32( nNextCh );
+ }
+
+ nNextCh = GetNextChar();
+ if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
+ !IsParserWorking() )
+ {
+ if( !sTmpBuffer.isEmpty() )
+ aToken.append( sTmpBuffer );
+ return HtmlTokenId::TEXTTOKEN;
+ }
+ } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
+ bNextCh = false; // nNextCh already holds the character that ended the run
+ }
+ }
+
+ if( bContinue && bNextCh )
+ nNextCh = GetNextChar();
+ }
+
+ if( !sTmpBuffer.isEmpty() )
+ aToken.append( sTmpBuffer );
+
+ return HtmlTokenId::TEXTTOKEN;
+}
+
+HtmlTokenId HTMLParser::GetNextRawToken() /*
+ * Reads raw content, i.e. text that is not tokenized as HTML. GetNextToken_()
+ * calls this while bReadScript or bReadStyle is set or aEndToken is not empty.
+ *
+ * Text is collected into aToken up to the end of the current line (CR, LF or
+ * CR LF), up to the end of the input, or up to a '<' that starts a tag
+ * closing the raw section:
+ *  - for scripts "</script", for a non-empty aEndToken "</" + aEndToken
+ *    (neither is recognised while inside a "<!--" comment),
+ *  - otherwise (style sheets) "</style", "</head" or "<body".
+ * In the last case the stream is rewound to that '<'.
+ *
+ * Returns HtmlTokenId::RAWDATA by default. HtmlTokenId::NONE is returned if
+ * the end was already found by the previous call, if the end is found before
+ * any text was collected, or if the parser is not working any more.
+ */
+{
+ OUStringBuffer sTmpBuffer( MAX_LEN );
+
+ if( bEndTokenFound )
+ {
+ // During the last execution we already found the end token,
+ // thus we don't have to search it again.
+ bReadScript = false;
+ bReadStyle = false;
+ aEndToken.clear();
+ bEndTokenFound = false;
+
+ return HtmlTokenId::NONE;
+ }
+
+ // Default return value: HtmlTokenId::RAWDATA
+ bool bContinue = true;
+ HtmlTokenId nToken = HtmlTokenId::RAWDATA;
+ SaveState( HtmlTokenId::NONE );
+ while( bContinue && IsParserWorking() )
+ {
+ bool bNextCh = true; // read a new character at the end of this iteration?
+ switch( nNextCh )
+ {
+ case '<':
+ {
+ // Maybe we've reached the end.
+
+ // Save what we have read previously...
+ aToken.append( sTmpBuffer );
+ sTmpBuffer.setLength(0);
+
+ // and remember position in stream.
+ sal_uInt64 nStreamPos = rInput.Tell();
+ sal_uInt32 nLineNr = GetLineNr();
+ sal_uInt32 nLinePos = GetLinePos();
+
+ // Start of an end token?
+ bool bOffState = false;
+ if( '/' == (nNextCh = GetNextChar()) )
+ {
+ bOffState = true;
+ nNextCh = GetNextChar();
+ }
+ else if( '!' == nNextCh )
+ {
+ sTmpBuffer.appendUtf32( nNextCh ); // the '!' is kept as part of the name
+ nNextCh = GetNextChar();
+ }
+
+ // Read following letters
+ while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
+ IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
+ {
+ sTmpBuffer.appendUtf32( nNextCh );
+ nNextCh = GetNextChar();
+ }
+
+ OUString aTok( sTmpBuffer.toString() );
+ aTok = aTok.toAsciiLowerCase();
+ bool bDone = false; // set when the tag closing the raw section was found
+ if( bReadScript || !aEndToken.isEmpty() )
+ {
+ if( !bReadComment )
+ {
+ if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
+ {
+ bReadComment = true;
+ }
+ else
+ {
+ // A script has to end with "</SCRIPT>". But
+ // ">" is optional for security reasons
+ bDone = bOffState &&
+ ( bReadScript
+ ? aTok == OOO_STRING_SVTOOLS_HTML_script
+ : aTok == aEndToken );
+ }
+ }
+ if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
+ {
+ // End of comment of style <!----->
+ bReadComment = false;
+ }
+ }
+ else
+ {
+ // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
+ if( bOffState )
+ bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
+ aTok == OOO_STRING_SVTOOLS_HTML_head;
+ else
+ bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
+ }
+
+ if( bDone )
+ {
+ // Done! Return the previously read string (if requested)
+ // and continue.
+
+ bContinue = false;
+
+ // nToken==0 means, GetNextToken_ continues to read
+ if( aToken.isEmpty() && (bReadStyle || bReadScript) )
+ {
+ // Immediately close environment (or context?)
+ // and parse the end token
+ bReadScript = false;
+ bReadStyle = false;
+ aEndToken.clear();
+ nToken = HtmlTokenId::NONE;
+ }
+ else
+ {
+ // Keep bReadScript/bReadStyle alive
+ // and parse end token during next execution
+ bEndTokenFound = true;
+ }
+
+ // Move backwards in stream to '<'
+ rInput.Seek( nStreamPos );
+ SetLineNr( nLineNr );
+ SetLinePos( nLinePos );
+ ClearTxtConvContext();
+ nNextCh = '<';
+
+ // Don't append string to token.
+ sTmpBuffer.setLength( 0 );
+ }
+ else
+ {
+ // remember "</" , everything else we find in the buffer
+ aToken.append( "<" );
+ if( bOffState )
+ aToken.append( "/" );
+
+ bNextCh = false; // nNextCh already holds the first character after the name
+ }
+ }
+ break;
+ case '-':
+ sTmpBuffer.appendUtf32( nNextCh );
+ if( bReadComment )
+ {
+ // Inside a comment: two or more '-' followed by '>' end it.
+ bool bTwoMinus = false;
+ nNextCh = GetNextChar();
+ while( '-' == nNextCh && IsParserWorking() )
+ {
+ bTwoMinus = true;
+ sTmpBuffer.appendUtf32( nNextCh );
+ nNextCh = GetNextChar();
+ }
+
+ if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
+ bReadComment = false;
+
+ bNextCh = false; // nNextCh already holds the character after the '-' run
+ }
+ break;
+
+ case '\r':
+ // \r\n? closes the current text token (even if it's empty)
+ nNextCh = GetNextChar();
+ if( nNextCh=='\n' )
+ nNextCh = GetNextChar();
+ bContinue = false;
+ break;
+ case '\n':
+ // \n closes the current text token (even if it's empty)
+ nNextCh = GetNextChar();
+ bContinue = false;
+ break;
+ case sal_Unicode(EOF):
+ // eof closes the current text token and behaves like having read
+ // an end token
+ if( rInput.eof() )
+ {
+ bContinue = false;
+ if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
+ {
+ bEndTokenFound = true;
+ }
+ else
+ {
+ bReadScript = false;
+ bReadStyle = false;
+ aEndToken.clear();
+ nToken = HtmlTokenId::NONE;
+ }
+ }
+ break;
+ default:
+ if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
+ {
+ // all remaining characters are appended to the buffer
+ sTmpBuffer.appendUtf32( nNextCh );
+ }
+ break;
+ }
+
+ if( !bContinue && !sTmpBuffer.isEmpty() )
+ {
+ aToken.append( sTmpBuffer );
+ sTmpBuffer.setLength(0);
+ }
+
+ if( bContinue && bNextCh )
+ nNextCh = GetNextChar();
+ }
+
+ if( IsParserWorking() )
+ SaveState( HtmlTokenId::NONE );
+ else
+ nToken = HtmlTokenId::NONE; // no token is reported once the parser stopped working
+
+ return nToken;
+}
+
+// Scan next token
+HtmlTokenId HTMLParser::GetNextToken_()
+{
+ HtmlTokenId nRet = HtmlTokenId::NONE;
+ sSaveToken.clear();
+
+ if (mnPendingOffToken != HtmlTokenId::NONE)
+ {
+ // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
+ nRet = mnPendingOffToken;
+ mnPendingOffToken = HtmlTokenId::NONE;
+ aToken.setLength( 0 );
+ return nRet;
+ }
+
+ // Delete options
+ maOptions.clear();
+
+ if( !IsParserWorking() ) // Don't continue if already an error occurred
+ return HtmlTokenId::NONE;
+
+ bool bReadNextCharSave = bReadNextChar;
+ if( bReadNextChar )
+ {
+ DBG_ASSERT( !bEndTokenFound,
+ "Read a character despite </SCRIPT> was read?" );
+ nNextCh = GetNextChar();
+ if( !IsParserWorking() ) // Don't continue if already an error occurred
+ return HtmlTokenId::NONE;
+ bReadNextChar = false;
+ }
+
+ if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
+ {
+ nRet = GetNextRawToken();
+ if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
+ return nRet;
+ }
+
+ do {
+ bool bNextCh = true;
+ switch( nNextCh )
+ {
+ case '<':
+ {
+ sal_uInt64 nStreamPos = rInput.Tell();
+ sal_uInt32 nLineNr = GetLineNr();
+ sal_uInt32 nLinePos = GetLinePos();
+
+ bool bOffState = false;
+ if( '/' == (nNextCh = GetNextChar()) )
+ {
+ bOffState = true;
+ nNextCh = GetNextChar();
+ }
+ // Assume '<?' is a start of an XML declaration, ignore it.
+ if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
+ {
+ OUStringBuffer sTmpBuffer;
+ do {
+ sTmpBuffer.appendUtf32( nNextCh );
+ nNextCh = GetNextChar();
+ if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
+ break;
+ if (bFuzzing && sTmpBuffer.getLength() > 1024)
+ {
+ SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
+ eState = SvParserState::Error;
+ break;
+ }
+ } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
+ !linguistic::IsControlChar(nNextCh) &&
+ IsParserWorking() && !rInput.eof() );
+
+ if( !sTmpBuffer.isEmpty() )
+ {
+ aToken.append( sTmpBuffer );
+ sTmpBuffer.setLength(0);
+ }
+
+ // Skip blanks
+ while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
+ nNextCh = GetNextChar();
+
+ if( !IsParserWorking() )
+ {
+ if( SvParserState::Pending == eState )
+ bReadNextChar = bReadNextCharSave;
+ break;
+ }
+
+ // Search token in table:
+ sSaveToken = aToken;
+ aToken = aToken.toString().toAsciiLowerCase();
+
+ if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
+ aToken.remove( 0, maNamespace.getLength());
+
+ if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
+ // Unknown control
+ nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
+
+ // If it's a token which can be switched off...
+ if( bOffState )
+ {
+ if( nRet >= HtmlTokenId::ONOFF_START )
+ {
+ // and there is an off token, return off token instead
+ nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
+ }
+ else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
+ {
+ // and there is no off token, return unknown token.
+ // (except for </BR>, that is treated like <BR>)
+ // No exception for XHTML, though.
+ nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
+ }
+ }
+
+ if( nRet == HtmlTokenId::COMMENT )
+ {
+ // fix: due to being case sensitive use sSaveToken as start of comment
+ // and append a blank.
+ aToken = sSaveToken;
+ if( '>'!=nNextCh )
+ aToken.append( " " );
+ sal_uInt64 nCStreamPos = 0;
+ sal_uInt32 nCLineNr = 0;
+ sal_uInt32 nCLinePos = 0;
+ sal_Int32 nCStrLen = 0;
+
+ bool bDone = false;
+ // Read until closing -->. If not found restart at first >
+ sTmpBuffer = aToken;
+ while( !bDone && !rInput.eof() && IsParserWorking() )
+ {
+ if( '>'==nNextCh )
+ {
+ if( !nCStreamPos )
+ {
+ nCStreamPos = rInput.Tell();
+ nCStrLen = sTmpBuffer.getLength();
+ nCLineNr = GetLineNr();
+ nCLinePos = GetLinePos();
+ }
+ bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
+ if( !bDone )
+ sTmpBuffer.appendUtf32(nNextCh);
+ }
+ else if (!linguistic::IsControlChar(nNextCh)
+ || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
+ {
+ sTmpBuffer.appendUtf32(nNextCh);
+ }
+ if( !bDone )
+ nNextCh = GetNextChar();
+ }
+ aToken = sTmpBuffer;
+ sTmpBuffer.setLength(0);
+ if( !bDone && IsParserWorking() && nCStreamPos )
+ {
+ rInput.Seek( nCStreamPos );
+ SetLineNr( nCLineNr );
+ SetLinePos( nCLinePos );
+ ClearTxtConvContext();
+ aToken.truncate(nCStrLen);
+ nNextCh = '>';
+ }
+ }
+ else if (nRet == HtmlTokenId::CDATA)
+ {
+ // Read until the closing ]]>.
+ bool bDone = false;
+ while (!bDone && !rInput.eof() && IsParserWorking())
+ {
+ if (nNextCh == '>')
+ {
+ if (sTmpBuffer.getLength() >= 2)
+ {
+ bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
+ && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
+ if (bDone)
+ {
+ // Ignore ]] at the end.
+ sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
+ }
+ }
+ if (!bDone)
+ {
+ sTmpBuffer.appendUtf32(nNextCh);
+ }
+ }
+ else if (!linguistic::IsControlChar(nNextCh))
+ {
+ sTmpBuffer.appendUtf32(nNextCh);
+ }
+ if (!bDone)
+ {
+ nNextCh = GetNextChar();
+ }
+ }
+ aToken = sTmpBuffer;
+ sTmpBuffer.setLength(0);
+ }
+ else
+ {
+ // TokenString not needed anymore
+ aToken.setLength( 0 );
+ }
+
+ // Read until closing '>'
+ if( '>' != nNextCh && IsParserWorking() )
+ {
+ ScanText( '>' );
+
+ // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
+ // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
+ // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
+ // which lead to fdo#56772.
+ if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
+ {
+ mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
+ aToken.setLength( aToken.getLength()-1 ); // remove trailing '/'
+ }
+ if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
+ {
+ // Move back in front of < and restart there.
+ // Return < as text.
+ rInput.Seek( nStreamPos );
+ SetLineNr( nLineNr );
+ SetLinePos( nLinePos );
+ ClearTxtConvContext();
+
+ aToken = "<";
+ nRet = HtmlTokenId::TEXTTOKEN;
+ nNextCh = GetNextChar();
+ bNextCh = false;
+ break;
+ }
+ }
+ if( SvParserState::Pending == eState )
+ bReadNextChar = bReadNextCharSave;
+ }
+ else
+ {
+ if( bOffState )
+ {
+ // simply throw away everything
+ ScanText( '>' );
+ if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
+ {
+ // Move back in front of < and restart there.
+ // Return < as text.
+ rInput.Seek( nStreamPos );
+ SetLineNr( nLineNr );
+ SetLinePos( nLinePos );
+ ClearTxtConvContext();
+
+ aToken = "<";
+ nRet = HtmlTokenId::TEXTTOKEN;
+ nNextCh = GetNextChar();
+ bNextCh = false;
+ break;
+ }
+ if( SvParserState::Pending == eState )
+ bReadNextChar = bReadNextCharSave;
+ aToken.setLength( 0 );
+ }
+ else if( '%' == nNextCh )
+ {
+ nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
+
+ sal_uInt64 nCStreamPos = rInput.Tell();
+ sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
+
+ bool bDone = false;
+ // Read until closing %>. If not found restart at first >.
+ sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
+ OUStringBuffer aTmpBuffer(aToken);
+ while( !bDone && !rInput.eof() && IsParserWorking() )
+ {
+ bDone = '>'==nNextCh && nLastTokenChar == '%';
+ if( !bDone )
+ {
+ aTmpBuffer.appendUtf32(nNextCh);
+ nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
+ nNextCh = GetNextChar();
+ }
+ }
+ if( !bDone && IsParserWorking() )
+ {
+ rInput.Seek( nCStreamPos );
+ SetLineNr( nCLineNr );
+ SetLinePos( nCLinePos );
+ ClearTxtConvContext();
+ aToken = "<%";
+ nRet = HtmlTokenId::TEXTTOKEN;
+ break;
+ }
+ aToken = aTmpBuffer;
+ aTmpBuffer.setLength(0);
+ if( IsParserWorking() )
+ {
+ sSaveToken = aToken;
+ aToken.setLength( 0 );
+ }
+ }
+ else
+ {
+ aToken = "<";
+ nRet = HtmlTokenId::TEXTTOKEN;
+ bNextCh = false;
+ break;
+ }
+ }
+
+ if( IsParserWorking() )
+ {
+ bNextCh = '>' == nNextCh;
+ switch( nRet )
+ {
+ case HtmlTokenId::TEXTAREA_ON:
+ bReadTextArea = true;
+ break;
+ case HtmlTokenId::TEXTAREA_OFF:
+ bReadTextArea = false;
+ break;
+ case HtmlTokenId::SCRIPT_ON:
+ if( !bReadTextArea )
+ bReadScript = true;
+ break;
+ case HtmlTokenId::SCRIPT_OFF:
+ if( !bReadTextArea )
+ {
+ bReadScript = false;
+ // JavaScript might modify the stream,
+ // thus the last character has to be read again.
+ bReadNextChar = true;
+ bNextCh = false;
+ }
+ break;
+
+ case HtmlTokenId::STYLE_ON:
+ bReadStyle = true;
+ break;
+ case HtmlTokenId::STYLE_OFF:
+ bReadStyle = false;
+ break;
+ default: break;
+ }
+ }
+ }
+ break;
+
+ case sal_Unicode(EOF):
+ if( rInput.eof() )
+ {
+ eState = SvParserState::Accepted;
+ nRet = HtmlTokenId(nNextCh);
+ }
+ else
+ {
+ // Read normal text.
+ goto scan_text;
+ }
+ break;
+
+ case '\f':
+ // form feeds are passed upwards separately
+ nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
+ break;
+
+ case '\n':
+ case '\r':
+ if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
+ {
+ sal_Unicode c = GetNextChar();
+ if( ( '\n' != nNextCh || '\r' != c ) &&
+ ( '\r' != nNextCh || '\n' != c ) )
+ {
+ bNextCh = false;
+ nNextCh = c;
+ }
+ nRet = HtmlTokenId::NEWPARA;
+ break;
+ }
+ [[fallthrough]];
+ case '\t':
+ if( bReadPRE )
+ {
+ nRet = HtmlTokenId::TABCHAR;
+ break;
+ }
+ [[fallthrough]];
+ case ' ':
+ [[fallthrough]];
+ default:
+
+scan_text:
+ // "normal" text to come
+ nRet = ScanText();
+ bNextCh = 0 == aToken.getLength();
+
+ // the text should be processed
+ if( !bNextCh && eState == SvParserState::Pending )
+ {
+ eState = SvParserState::Working;
+ bReadNextChar = true;
+ }
+
+ break;
+ }
+
+ if( bNextCh && SvParserState::Working == eState )
+ {
+ nNextCh = GetNextChar();
+ if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
+ {
+ bReadNextChar = true;
+ eState = SvParserState::Working;
+ }
+ }
+
+ } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
+
+ if( SvParserState::Pending == eState )
+ nRet = HtmlTokenId::INVALID; // s.th. invalid
+
+ return nRet;
+}
+
+/**
+ * Strips escaping backslashes from aToken in place.
+ *
+ * A backslash is dropped unless it directly follows a backslash that was
+ * itself just dropped; in that case it is kept as a literal character.
+ */
+void HTMLParser::UnescapeToken()
+{
+    // True while the character at the current index was preceded by an
+    // escaping backslash that has already been removed.
+    bool bPrevWasEscape = false;
+
+    // The length is re-read on every pass because remove() shrinks the buffer.
+    for( sal_Int32 nIdx = 0; nIdx < aToken.getLength(); )
+    {
+        const bool bIsEscapeChar = !bPrevWasEscape && aToken[nIdx] == '\\';
+        if( bIsEscapeChar )
+            aToken.remove( nIdx, 1 );   // next character slides into nIdx
+        else
+            ++nIdx;
+        bPrevWasEscape = bIsEscapeChar;
+    }
+}
+
+/**
+ * Splits the current token text (aToken) into name[=value] options.
+ *
+ * The result is cached in maOptions: as long as that list is not empty it is
+ * returned unchanged. While parsing, escaping backslashes (and, depending on
+ * the option, CR/LF inside quoted values) are removed from aToken itself.
+ *
+ * @param pNoConvertToken  optional option id whose quoted value keeps its
+ *                         CR/LF characters
+ * @return the options found in the token
+ */
+const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
+{
+    // Options for this token were extracted before: hand them out again.
+    if (!maOptions.empty())
+        return maOptions;
+
+    sal_Int32 nIdx = 0;
+    while( nIdx < aToken.getLength() )
+    {
+        // An option can only begin with an ASCII letter; white space and
+        // unexpected characters are skipped one at a time.
+        if( !rtl::isAsciiAlpha( aToken[nIdx] ) )
+        {
+            ++nIdx;
+            continue;
+        }
+
+        OUString aValue;
+        sal_Int32 nStart = nIdx;
+        sal_Unicode cCur = 0;
+
+        // Option name: strictly only certain characters are allowed, but
+        // like Netscape we merely stop at '=', white space or a
+        // non-printable character (c.f. Mozilla:
+        // PA_FetchRequestedNameValues in libparse/pa_mdl.c).
+        for( ; nIdx < aToken.getLength(); ++nIdx )
+        {
+            cCur = aToken[nIdx];
+            if ( '=' == cCur || !HTML_ISPRINTABLE(cCur) || rtl::isAsciiWhiteSpace(cCur) )
+                break;
+        }
+
+        OUString sName( aToken.subView( nStart, nIdx-nStart ) );
+
+        // PlugIns need the name as written, so only the lookup key is
+        // lower-cased.
+        const HtmlOptionId nToken = GetHTMLOption( sName.toAsciiLowerCase() );
+        SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
+                     "GetOption: unknown HTML option '" << sName << "'" );
+
+        // CR/LF are kept in quoted values of script options and of the
+        // option the caller asked not to convert; everywhere else they go.
+        const bool bIsScriptOption = nToken >= HtmlOptionId::SCRIPT_START &&
+                                     nToken < HtmlOptionId::SCRIPT_END;
+        const bool bIsNoConvert = pNoConvertToken && nToken == *pNoConvertToken;
+        const bool bStripCRLF = !bIsScriptOption && !bIsNoConvert;
+
+        // Skip white space / non-printables between the name and a
+        // possible '='.
+        for( ; nIdx < aToken.getLength(); ++nIdx )
+        {
+            cCur = aToken[nIdx];
+            if ( HTML_ISPRINTABLE(cCur) && !rtl::isAsciiWhiteSpace(cCur) )
+                break;
+        }
+
+        // Option with value?
+        if( nIdx != aToken.getLength() && '=' == cCur )
+        {
+            ++nIdx;
+
+            // Skip blanks, tabs, line ends and non-printables after '='.
+            for( ; nIdx < aToken.getLength(); ++nIdx )
+            {
+                cCur = aToken[nIdx];
+                if ( HTML_ISPRINTABLE(cCur) && ' ' != cCur && '\t' != cCur && '\r' != cCur && '\n' != cCur )
+                    break;
+            }
+
+            if( nIdx != aToken.getLength() )
+            {
+                sal_Int32 nValueLen = 0;
+                bool bEscaped = false;
+                bool bDone = false;
+
+                if( '"' == cCur || '\'' == cCur )
+                {
+                    // Quoted value: runs up to the matching unescaped quote.
+                    const sal_Unicode cQuote = cCur;
+                    nStart = ++nIdx;
+                    while( nIdx < aToken.getLength() && !bDone )
+                    {
+                        const bool bWasEscaped = bEscaped;
+                        bEscaped = false;
+                        const sal_Unicode c = aToken[nIdx];
+
+                        // bDrop: the character is removed from aToken
+                        // instead of becoming part of the value.
+                        bool bDrop = false;
+                        if( '\r' == c || '\n' == c )
+                            bDrop = bStripCRLF;
+                        else if( '\\' == c )
+                        {
+                            // An unescaped backslash escapes what follows.
+                            bDrop = !bWasEscaped;
+                            bEscaped = bDrop;
+                        }
+                        else if( '"' == c || '\'' == c )
+                            bDone = !bWasEscaped && c == cQuote;
+
+                        if( bDone )
+                            ;   // stay on the closing quote
+                        else if( bDrop )
+                            aToken.remove( nIdx, 1 );
+                        else
+                        {
+                            ++nIdx;
+                            ++nValueLen;
+                        }
+                    }
+                    // Step over the closing quote, if there was one.
+                    if( nIdx != aToken.getLength() )
+                        ++nIdx;
+                }
+                else
+                {
+                    // Unquoted value. More liberal than the standard: every
+                    // printable character is accepted.
+                    nStart = nIdx;
+                    while( nIdx < aToken.getLength() && !bDone )
+                    {
+                        const bool bWasEscaped = bEscaped;
+                        bEscaped = false;
+                        const sal_Unicode c = aToken[nIdx];
+
+                        bool bDrop = false;
+                        if( ' ' == c )
+                            bDone = !bWasEscaped;   // an escaped blank belongs to the value
+                        else if( '\t' == c || '\r' == c || '\n' == c )
+                            bDone = true;
+                        else if( '\\' == c )
+                        {
+                            bDrop = !bWasEscaped;
+                            bEscaped = bDrop;
+                        }
+                        else
+                            bDone = !HTML_ISPRINTABLE( c );
+
+                        if( bDone )
+                            ;   // value ends in front of this character
+                        else if( bDrop )
+                            aToken.remove( nIdx, 1 );
+                        else
+                        {
+                            ++nIdx;
+                            ++nValueLen;
+                        }
+                    }
+                }
+
+                if( nValueLen )
+                    aValue = aToken.subView( nStart, nValueLen );
+            }
+        }
+
+        // The option is complete (possibly with an empty value): store it.
+        maOptions.emplace_back(nToken, sName, aValue);
+    }
+
+    return maOptions;
+}
+
+/**
+ * Filters a token that was read while preformatted text is being processed.
+ *
+ * Line breaks reset the tracked line position (nPre_LinePos), tabs are
+ * expanded to blanks up to the next multiple of eight, a fixed list of tokens
+ * passes through unchanged and everything else is reported as an unknown
+ * control. bPre_IgnoreNewPara is always cleared on exit.
+ *
+ * @param nToken  the token as delivered by the tokenizer
+ * @return the token to process instead
+ */
+HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
+{
+    switch( nToken )
+    {
+    // in Netscape a paragraph start only has an effect in non-empty
+    // paragraphs, so it is handled like a line break
+    case HtmlTokenId::PARABREAK_ON:
+        nToken = HtmlTokenId::LINEBREAK;
+        [[fallthrough]];
+    case HtmlTokenId::LINEBREAK:
+    case HtmlTokenId::NEWPARA:
+        if( bPre_IgnoreNewPara )
+            nToken = HtmlTokenId::NONE;
+        nPre_LinePos = 0;
+        break;
+
+    case HtmlTokenId::TABCHAR:
+        {
+            // Number of blanks needed to reach the next tab stop.
+            const sal_Int32 nFill = 8 - (nPre_LinePos % 8);
+            DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
+            if( nFill > aToken.getLength() )
+            {
+                OUStringBuffer aPadded(aToken);
+                aToken = comphelper::string::padToLength(aPadded, nFill, ' ');
+            }
+            nToken = HtmlTokenId::TEXTTOKEN;
+            nPre_LinePos += nFill;
+        }
+        break;
+
+    // Text is kept; it only advances the line position.
+    case HtmlTokenId::TEXTTOKEN:
+        nPre_LinePos += aToken.getLength();
+        break;
+
+    // All of the following tokens are passed through unchanged.
+    case HtmlTokenId::SELECT_ON:
+    case HtmlTokenId::SELECT_OFF:
+    case HtmlTokenId::BODY_ON:
+    case HtmlTokenId::FORM_ON:
+    case HtmlTokenId::FORM_OFF:
+    case HtmlTokenId::INPUT:
+    case HtmlTokenId::OPTION:
+    case HtmlTokenId::TEXTAREA_ON:
+    case HtmlTokenId::TEXTAREA_OFF:
+
+    case HtmlTokenId::IMAGE:
+    case HtmlTokenId::APPLET_ON:
+    case HtmlTokenId::APPLET_OFF:
+    case HtmlTokenId::PARAM:
+    case HtmlTokenId::EMBED:
+
+    case HtmlTokenId::HEAD1_ON:
+    case HtmlTokenId::HEAD1_OFF:
+    case HtmlTokenId::HEAD2_ON:
+    case HtmlTokenId::HEAD2_OFF:
+    case HtmlTokenId::HEAD3_ON:
+    case HtmlTokenId::HEAD3_OFF:
+    case HtmlTokenId::HEAD4_ON:
+    case HtmlTokenId::HEAD4_OFF:
+    case HtmlTokenId::HEAD5_ON:
+    case HtmlTokenId::HEAD5_OFF:
+    case HtmlTokenId::HEAD6_ON:
+    case HtmlTokenId::HEAD6_OFF:
+    case HtmlTokenId::BLOCKQUOTE_ON:
+    case HtmlTokenId::BLOCKQUOTE_OFF:
+    case HtmlTokenId::ADDRESS_ON:
+    case HtmlTokenId::ADDRESS_OFF:
+    case HtmlTokenId::HORZRULE:
+
+    case HtmlTokenId::CENTER_ON:
+    case HtmlTokenId::CENTER_OFF:
+    case HtmlTokenId::DIVISION_ON:
+    case HtmlTokenId::DIVISION_OFF:
+
+    case HtmlTokenId::SCRIPT_ON:
+    case HtmlTokenId::SCRIPT_OFF:
+    case HtmlTokenId::RAWDATA:
+
+    case HtmlTokenId::TABLE_ON:
+    case HtmlTokenId::TABLE_OFF:
+    case HtmlTokenId::CAPTION_ON:
+    case HtmlTokenId::CAPTION_OFF:
+    case HtmlTokenId::COLGROUP_ON:
+    case HtmlTokenId::COLGROUP_OFF:
+    case HtmlTokenId::COL_ON:
+    case HtmlTokenId::COL_OFF:
+    case HtmlTokenId::THEAD_ON:
+    case HtmlTokenId::THEAD_OFF:
+    case HtmlTokenId::TFOOT_ON:
+    case HtmlTokenId::TFOOT_OFF:
+    case HtmlTokenId::TBODY_ON:
+    case HtmlTokenId::TBODY_OFF:
+    case HtmlTokenId::TABLEROW_ON:
+    case HtmlTokenId::TABLEROW_OFF:
+    case HtmlTokenId::TABLEDATA_ON:
+    case HtmlTokenId::TABLEDATA_OFF:
+    case HtmlTokenId::TABLEHEADER_ON:
+    case HtmlTokenId::TABLEHEADER_OFF:
+
+    case HtmlTokenId::ANCHOR_ON:
+    case HtmlTokenId::ANCHOR_OFF:
+    case HtmlTokenId::BOLD_ON:
+    case HtmlTokenId::BOLD_OFF:
+    case HtmlTokenId::ITALIC_ON:
+    case HtmlTokenId::ITALIC_OFF:
+    case HtmlTokenId::STRIKE_ON:
+    case HtmlTokenId::STRIKE_OFF:
+    case HtmlTokenId::STRIKETHROUGH_ON:
+    case HtmlTokenId::STRIKETHROUGH_OFF:
+    case HtmlTokenId::UNDERLINE_ON:
+    case HtmlTokenId::UNDERLINE_OFF:
+    case HtmlTokenId::BASEFONT_ON:
+    case HtmlTokenId::BASEFONT_OFF:
+    case HtmlTokenId::FONT_ON:
+    case HtmlTokenId::FONT_OFF:
+    case HtmlTokenId::BLINK_ON:
+    case HtmlTokenId::BLINK_OFF:
+    case HtmlTokenId::SPAN_ON:
+    case HtmlTokenId::SPAN_OFF:
+    case HtmlTokenId::SUBSCRIPT_ON:
+    case HtmlTokenId::SUBSCRIPT_OFF:
+    case HtmlTokenId::SUPERSCRIPT_ON:
+    case HtmlTokenId::SUPERSCRIPT_OFF:
+    case HtmlTokenId::BIGPRINT_ON:
+    case HtmlTokenId::BIGPRINT_OFF:
+    case HtmlTokenId::SMALLPRINT_OFF:
+    case HtmlTokenId::SMALLPRINT_ON:
+
+    case HtmlTokenId::EMPHASIS_ON:
+    case HtmlTokenId::EMPHASIS_OFF:
+    case HtmlTokenId::CITATION_ON:
+    case HtmlTokenId::CITATION_OFF:
+    case HtmlTokenId::STRONG_ON:
+    case HtmlTokenId::STRONG_OFF:
+    case HtmlTokenId::CODE_ON:
+    case HtmlTokenId::CODE_OFF:
+    case HtmlTokenId::SAMPLE_ON:
+    case HtmlTokenId::SAMPLE_OFF:
+    case HtmlTokenId::KEYBOARD_ON:
+    case HtmlTokenId::KEYBOARD_OFF:
+    case HtmlTokenId::VARIABLE_ON:
+    case HtmlTokenId::VARIABLE_OFF:
+    case HtmlTokenId::DEFINSTANCE_ON:
+    case HtmlTokenId::DEFINSTANCE_OFF:
+    case HtmlTokenId::SHORTQUOTE_ON:
+    case HtmlTokenId::SHORTQUOTE_OFF:
+    case HtmlTokenId::LANGUAGE_ON:
+    case HtmlTokenId::LANGUAGE_OFF:
+    case HtmlTokenId::AUTHOR_ON:
+    case HtmlTokenId::AUTHOR_OFF:
+    case HtmlTokenId::PERSON_ON:
+    case HtmlTokenId::PERSON_OFF:
+    case HtmlTokenId::ACRONYM_ON:
+    case HtmlTokenId::ACRONYM_OFF:
+    case HtmlTokenId::ABBREVIATION_ON:
+    case HtmlTokenId::ABBREVIATION_OFF:
+    case HtmlTokenId::INSERTEDTEXT_ON:
+    case HtmlTokenId::INSERTEDTEXT_OFF:
+    case HtmlTokenId::DELETEDTEXT_ON:
+    case HtmlTokenId::DELETEDTEXT_OFF:
+    case HtmlTokenId::TELETYPE_ON:
+    case HtmlTokenId::TELETYPE_OFF:
+        break;
+
+    // Whatever is left is reported as an unknown control token.
+    default:
+        if( HtmlTokenId::NONE != nToken )
+        {
+            const bool bClosing =
+                (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken);
+            if( bClosing )
+                nToken = HtmlTokenId::UNKNOWNCONTROL_OFF;
+            else
+                nToken = HtmlTokenId::UNKNOWNCONTROL_ON;
+        }
+        break;
+    }
+
+    bPre_IgnoreNewPara = false;
+
+    return nToken;
+}
+
+/**
+ * Filters a token that was read while XMP content is being processed.
+ *
+ * Text, non-breaking spaces and soft hyphens are kept, a new paragraph is
+ * kept unless bPre_IgnoreNewPara is set, and every other token is turned back
+ * into literal text of the form "<name options>" / "</name options>" built
+ * from sSaveToken and aToken. bPre_IgnoreNewPara is always cleared on exit.
+ *
+ * @param nToken  the token as delivered by the tokenizer
+ * @return the token to process instead
+ */
+HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
+{
+    const bool bKeptAsIs = HtmlTokenId::TEXTTOKEN == nToken
+                        || HtmlTokenId::NONBREAKSPACE == nToken
+                        || HtmlTokenId::SOFTHYPH == nToken;
+
+    if( HtmlTokenId::NEWPARA == nToken )
+    {
+        if( bPre_IgnoreNewPara )
+            nToken = HtmlTokenId::NONE;
+    }
+    else if( !bKeptAsIs && HtmlTokenId::NONE != nToken )
+    {
+        // Re-create the tag opener in front of the saved token text.
+        const bool bClosing =
+            (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken);
+        if( bClosing )
+            sSaveToken = "</" + sSaveToken;
+        else
+            sSaveToken = "<" + sSaveToken;
+
+        if( aToken.isEmpty() )
+            aToken = sSaveToken;
+        else
+        {
+            // The remaining token text follows the name after a blank.
+            UnescapeToken();
+            sSaveToken += " ";
+            aToken.insert(0, sSaveToken);
+        }
+        aToken.append( ">" );
+        nToken = HtmlTokenId::TEXTTOKEN;
+    }
+
+    bPre_IgnoreNewPara = false;
+
+    return nToken;
+}
+
+/**
+ * Filters a token that was read while LISTING content is being processed.
+ *
+ * Text, non-breaking spaces and soft hyphens are kept, a new paragraph is
+ * kept unless bPre_IgnoreNewPara is set, and every other token is reported
+ * as an unknown control. bPre_IgnoreNewPara is always cleared on exit.
+ *
+ * @param nToken  the token as delivered by the tokenizer
+ * @return the token to process instead
+ */
+HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
+{
+    const bool bKeptAsIs = HtmlTokenId::TEXTTOKEN == nToken
+                        || HtmlTokenId::NONBREAKSPACE == nToken
+                        || HtmlTokenId::SOFTHYPH == nToken;
+
+    if( HtmlTokenId::NEWPARA == nToken )
+    {
+        if( bPre_IgnoreNewPara )
+            nToken = HtmlTokenId::NONE;
+    }
+    else if( !bKeptAsIs && HtmlTokenId::NONE != nToken )
+    {
+        const bool bClosing =
+            (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken);
+        if( bClosing )
+            nToken = HtmlTokenId::UNKNOWNCONTROL_OFF;
+        else
+            nToken = HtmlTokenId::UNKNOWNCONTROL_ON;
+    }
+
+    bPre_IgnoreNewPara = false;
+
+    return nToken;
+}
+
+/**
+ * Turns an internal-icon image URL into the matching private image URL.
+ *
+ * If rURL starts with OOO_STRING_SVTOOLS_HTML_internal_icon and the remainder
+ * is one of the known icon names, OOO_STRING_SVTOOLS_HTML_private_image is
+ * put in front of the complete original URL.
+ *
+ * @param rURL  in: the URL to check; out: the private URL on success,
+ *              otherwise left untouched
+ * @return true if rURL named a known internal icon and was rewritten
+ */
+bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
+{
+    // startsWith() returns the text after the prefix, so the prefix length
+    // is not duplicated here as a magic number. A URL consisting of nothing
+    // but the prefix has no icon name and must not be indexed below.
+    OUString aName;
+    if( !rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon, &aName )
+        || aName.isEmpty() )
+    {
+        return false;
+    }
+
+    // The first character selects the only candidate worth comparing.
+    bool bFound = false;
+    switch( aName[0] )
+    {
+    case 'b':
+        bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
+        break;
+    case 'd':
+        bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
+        break;
+    case 'e':
+        bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
+        break;
+    case 'i':
+        bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
+        break;
+    case 'n':
+        bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
+        break;
+    }
+    if( !bFound )
+        return false;
+
+    // The private URL is the private-image prefix followed by the whole
+    // original URL (including its internal-icon prefix).
+    const OUString sOriginal( rURL );
+    rURL = OOO_STRING_SVTOOLS_HTML_private_image;
+    rURL += sOriginal;
+
+    return true;
+}
+
+namespace {
+
+// Actions selected by the NAME / HTTP-EQUIV value of a <META> tag.
+// NONE (0) means that no known name was recognized.
+enum class HtmlMeta {
+    NONE           = 0,
+    Author         = 1,
+    Description    = 2,
+    Keywords       = 3,
+    Refresh        = 4,
+    Classification = 5,
+    Created        = 6,
+    ChangedBy      = 7,
+    Changed        = 8,
+    Generator      = 9,
+    SDFootnote     = 10,
+    SDEndnote      = 11,
+    ContentType    = 12
+};
+
+}
+
+// Maps the value of <META NAME=xxx> (also used for HTTP-EQUIV) to the
+// HtmlMeta action handled in ParseMetaOptionsImpl(). The final nullptr entry
+// presumably terminates the table for HTMLOption::GetEnum().
+HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_META_author,         HtmlMeta::Author         },
+    { OOO_STRING_SVTOOLS_HTML_META_changed,        HtmlMeta::Changed        },
+    { OOO_STRING_SVTOOLS_HTML_META_changedby,      HtmlMeta::ChangedBy      },
+    { OOO_STRING_SVTOOLS_HTML_META_classification, HtmlMeta::Classification },
+    { OOO_STRING_SVTOOLS_HTML_META_content_type,   HtmlMeta::ContentType    },
+    { OOO_STRING_SVTOOLS_HTML_META_created,        HtmlMeta::Created        },
+    { OOO_STRING_SVTOOLS_HTML_META_description,    HtmlMeta::Description    },
+    { OOO_STRING_SVTOOLS_HTML_META_keywords,       HtmlMeta::Keywords       },
+    { OOO_STRING_SVTOOLS_HTML_META_generator,      HtmlMeta::Generator      },
+    { OOO_STRING_SVTOOLS_HTML_META_refresh,        HtmlMeta::Refresh        },
+    { OOO_STRING_SVTOOLS_HTML_META_sdendnote,      HtmlMeta::SDEndnote      },
+    { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,     HtmlMeta::SDFootnote     },
+    { nullptr,                                     HtmlMeta::NONE           }
+};
+
+
+// Called by ParseMetaOptionsImpl() with the name of each user-defined
+// property it has just added. This implementation deliberately does nothing
+// (presumably a hook for derived parsers - confirm against the class
+// declaration).
+void HTMLParser::AddMetaUserDefined( OUString const & /*rName*/ )
+{
+    // intentionally empty
+}
+
+/**
+ * Evaluates the options of a <META> tag.
+ *
+ * Known NAME / HTTP-EQUIV values are written to the document properties,
+ * HTTP-EQUIV pairs are appended to the HTTP header list, unknown NAMEs become
+ * user-defined document properties, and a CHARSET option or content-type
+ * value is reported through o_rEnc.
+ *
+ * @param i_xDocProps    document properties to fill; may be empty
+ * @param i_pHTTPHeader  receives HTTP-EQUIV key/value pairs; may be null
+ * @param aOptions       the options of the META tag
+ * @param o_rEnc         set to the encoding named by the tag, if any;
+ *                       otherwise left untouched
+ * @return true if a document property was set
+ */
+bool HTMLParser::ParseMetaOptionsImpl(
+        const uno::Reference<document::XDocumentProperties> & i_xDocProps,
+        SvKeyValueIterator *i_pHTTPHeader,
+        const HTMLOptions& aOptions,
+        rtl_TextEncoding& o_rEnc )
+{
+    OUString aName, aContent;
+    HtmlMeta nAction = HtmlMeta::NONE;
+    bool bHTTPEquiv = false, bChanged = false;
+
+    // The options are visited from the last one to the first one.
+    size_t nRemaining = aOptions.size();
+    while ( nRemaining > 0 )
+    {
+        const HTMLOption& rOption = aOptions[--nRemaining];
+        const auto eId = rOption.GetToken();
+
+        if ( HtmlOptionId::NAME == eId )
+        {
+            aName = rOption.GetString();
+            // NAME only selects the action if nothing selected one before.
+            if ( HtmlMeta::NONE == nAction )
+                rOption.GetEnum( nAction, aHTMLMetaNameTable );
+        }
+        else if ( HtmlOptionId::HTTPEQUIV == eId )
+        {
+            aName = rOption.GetString();
+            rOption.GetEnum( nAction, aHTMLMetaNameTable );
+            bHTTPEquiv = true;
+        }
+        else if ( HtmlOptionId::CONTENT == eId )
+        {
+            aContent = rOption.GetString();
+        }
+        else if ( HtmlOptionId::CHARSET == eId )
+        {
+            const OString sCharset(
+                OUStringToOString(rOption.GetString(), RTL_TEXTENCODING_ASCII_US));
+            o_rEnc = GetExtendedCompatibilityTextEncoding(
+                rtl_getTextEncodingFromMimeCharset(sCharset.getStr()));
+        }
+    }
+
+    // Only a description given by NAME keeps its line breaks (converted to
+    // the system line end); in every other case CRs and LFs are removed.
+    const bool bKeepLineBreaks = !bHTTPEquiv && HtmlMeta::Description == nAction;
+    if ( bKeepLineBreaks )
+        aContent = convertLineEnd(aContent, GetSystemLineEnd());
+    else
+        aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
+
+    if ( bHTTPEquiv && i_pHTTPHeader )
+    {
+        // Netscape seems to just ignore a closing ", so we do too
+        if ( aContent.endsWith("\"") )
+            aContent = aContent.copy( 0, aContent.getLength() - 1 );
+
+        SvKeyValue aHeaderEntry( aName, aContent );
+        i_pHTTPHeader->Append( aHeaderEntry );
+    }
+
+    const bool bHaveProps = i_xDocProps.is();
+
+    switch ( nAction )
+    {
+    case HtmlMeta::Author:
+        if ( !bHaveProps )
+            break;
+        i_xDocProps->setAuthor( aContent );
+        bChanged = true;
+        break;
+
+    case HtmlMeta::Description:
+        if ( !bHaveProps )
+            break;
+        i_xDocProps->setDescription( aContent );
+        bChanged = true;
+        break;
+
+    case HtmlMeta::Keywords:
+        if ( !bHaveProps )
+            break;
+        i_xDocProps->setKeywords(
+            ::comphelper::string::convertCommaSeparated(aContent));
+        bChanged = true;
+        break;
+
+    case HtmlMeta::Classification:
+        if ( !bHaveProps )
+            break;
+        i_xDocProps->setSubject( aContent );
+        bChanged = true;
+        break;
+
+    case HtmlMeta::ChangedBy:
+        if ( !bHaveProps )
+            break;
+        i_xDocProps->setModifiedBy( aContent );
+        bChanged = true;
+        break;
+
+    case HtmlMeta::Created:
+    case HtmlMeta::Changed:
+        {
+            if ( !bHaveProps || aContent.isEmpty() )
+                break;
+
+            ::util::DateTime aUnoStamp;
+            bool bParsed;
+            if ( 2 == comphelper::string::getTokenCount(aContent, ';') )
+            {
+                // Two ';'-separated numbers: a date value and a time value.
+                sal_Int32 nTokenIdx{ 0 };
+                Date aDate(o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nTokenIdx)));
+                auto nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nTokenIdx));
+                // A negative time value is replaced by its (saturated)
+                // negation.
+                if ( nTime < 0 )
+                    nTime = o3tl::saturating_toggle_sign(nTime);
+                tools::Time aTime(nTime);
+                DateTime aDateTime(aDate, aTime);
+                aUnoStamp = aDateTime.GetUNODateTime();
+                bParsed = true;
+            }
+            else
+            {
+                // Otherwise the content has to be an ISO 8601 date/time.
+                bParsed = utl::ISO8601parseDateTime(aContent, aUnoStamp);
+            }
+
+            if ( !bParsed )
+                break;
+
+            bChanged = true;
+            if ( HtmlMeta::Created == nAction )
+                i_xDocProps->setCreationDate(aUnoStamp);
+            else
+                i_xDocProps->setModificationDate(aUnoStamp);
+        }
+        break;
+
+    case HtmlMeta::Refresh:
+        DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
+        break;
+
+    case HtmlMeta::ContentType:
+        if ( !aContent.isEmpty() )
+            o_rEnc = GetEncodingByMIME( aContent );
+        break;
+
+    case HtmlMeta::NONE:
+        // An unknown NAME (not HTTP-EQUIV) becomes a user-defined property.
+        if ( !bHTTPEquiv && bHaveProps )
+        {
+            uno::Reference<beans::XPropertyContainer> xUDProps
+                = i_xDocProps->getUserDefinedProperties();
+            try {
+                xUDProps->addProperty(aName,
+                    beans::PropertyAttribute::REMOVABLE,
+                    uno::Any(aContent));
+                AddMetaUserDefined(aName);
+                bChanged = true;
+            } catch (uno::Exception &) {
+                // ignore: a property that cannot be added is simply skipped
+            }
+        }
+        break;
+
+    default:
+        break;
+    }
+
+    return bChanged;
+}
+
+/**
+ * Evaluates the options of the current <META> token and, if the tag names an
+ * encoding, possibly switches the source encoding.
+ *
+ * @param i_xDocProps  document properties to fill; may be empty
+ * @param i_pHeader    receives HTTP-EQUIV key/value pairs; may be null
+ * @return true if a document property was set
+ */
+bool HTMLParser::ParseMetaOptions(
+        const uno::Reference<document::XDocumentProperties> & i_xDocProps,
+        SvKeyValueIterator *i_pHeader )
+{
+    // The CONTENT option is the one whose value GetOptions() must not
+    // strip CR/LF from.
+    const HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
+    const HTMLOptions& rOptions = GetOptions(&nContentOption);
+
+    rtl_TextEncoding eMetaEnc = RTL_TEXTENCODING_DONTKNOW;
+    const bool bChanged = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
+                                                rOptions, eMetaEnc );
+
+    // An encoding set by a META tag may only replace the current encoding
+    // if both the current and the new one are single-byte (octet)
+    // encodings. Everything else cannot lead to reasonable results.
+    const bool bMayOverride = RTL_TEXTENCODING_DONTKNOW != eMetaEnc
+                           && rtl_isOctetTextEncoding( eMetaEnc )
+                           && rtl_isOctetTextEncoding( GetSrcEncoding() );
+    if ( bMayOverride )
+        SetSrcEncoding( GetExtendedCompatibilityTextEncoding( eMetaEnc ) );
+
+    return bChanged;
+}
+
+/**
+ * Extracts the text encoding from the "charset" parameter of a MIME content
+ * type string.
+ *
+ * @param rMime  the content type, including its parameters
+ * @return the encoding named by the charset parameter, or
+ *         RTL_TEXTENCODING_DONTKNOW if rMime cannot be parsed or has no
+ *         charset parameter
+ */
+rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
+{
+    OUString sType;
+    OUString sSubType;
+    INetContentTypeParameterList aParameters;
+    if (!INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    auto const itCharset = aParameters.find("charset"_ostr);
+    if (itCharset == aParameters.end())
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    const OString sCharset(
+        OUStringToOString(itCharset->second.m_sValue, RTL_TEXTENCODING_ASCII_US));
+    return GetExtendedCompatibilityTextEncoding(
+        rtl_getTextEncodingFromMimeCharset( sCharset.getStr() ) );
+}
+
+/**
+ * Determines the text encoding announced by the content-type entries of an
+ * HTTP header list.
+ *
+ * All entries are visited; if several content-type entries with a non-empty
+ * value exist, the result of the last one wins.
+ *
+ * @param pHTTPHeader  the header entries; may be null
+ * @return the encoding found, or RTL_TEXTENCODING_DONTKNOW
+ */
+rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
+{
+    if( !pHTTPHeader )
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    rtl_TextEncoding eFound = RTL_TEXTENCODING_DONTKNOW;
+    SvKeyValue aEntry;
+    bool bMore = pHTTPHeader->GetFirst( aEntry );
+    while( bMore )
+    {
+        const bool bIsContentType = aEntry.GetKey().equalsIgnoreAsciiCase(
+            OOO_STRING_SVTOOLS_HTML_META_content_type );
+        if( bIsContentType && !aEntry.GetValue().isEmpty() )
+            eFound = HTMLParser::GetEncodingByMIME( aEntry.GetValue() );
+
+        bMore = pHTTPHeader->GetNext( aEntry );
+    }
+    return eFound;
+}
+
+/**
+ * Sets the source encoding from the content-type entry of an HTTP header
+ * list, if that entry names a known encoding.
+ *
+ * @param pHTTPHeader  the header entries; may be null
+ * @return true if the source encoding was set
+ */
+bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
+{
+    const rtl_TextEncoding eHeaderEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
+    if( RTL_TEXTENCODING_DONTKNOW == eHeaderEnc )
+        return false;
+
+    SetSrcEncoding( eHeaderEnc );
+    return true;
+}
+
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */