Adding upstream version 4:24.2.0.upstream/4%24.2.0

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:54:39 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:54:39 +0000
commit: 267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree: 358c9467650e1d0a1d7227a21dac2e3d08b622b2 /svtools/source/svhtml/parhtml.cxx
parent: Initial commit. (diff)
download: libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
1 files changed, 2213 insertions, 0 deletions
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
new file mode 100644
index 0000000000..1f87456744
--- /dev/null
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -0,0 +1,2213 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <comphelper/string.hxx>
+#include <o3tl/safeint.hxx>
+#include <o3tl/string_view.hxx>
+#include <tools/stream.hxx>
+#include <tools/debug.hxx>
+#include <tools/color.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <rtl/character.hxx>
+#include <rtl/tencinfo.h>
+#include <sal/log.hxx>
+#include <tools/tenccvt.hxx>
+#include <tools/datetime.hxx>
+#include <unotools/datetime.hxx>
+#include <svl/inettype.hxx>
+#include <svl/lngmisc.hxx>
+#include <com/sun/star/beans/PropertyAttribute.hpp>
+#include <com/sun/star/document/XDocumentProperties.hpp>
+
+#include <svtools/parhtml.hxx>
+#include <svtools/htmltokn.h>
+#include <svtools/htmlkywd.hxx>
+
+#include <utility>
+
+using namespace ::com::sun::star;
+
+
+const sal_Int32 MAX_LEN( 1024 ); // size passed to the scan buffers; also the limit for the tag name collected in GetNextRawToken()
+
+const sal_Int32 MAX_ENTITY_LEN( 8 ); // upper bound for the number of characters read as an entity name in ScanText()
+
+
+// Tables that map the keyword strings of option values to their enum values
+
+// <INPUT TYPE=xxx>: keyword -> HTMLInputType, looked up by HTMLOption::GetInputType()
+HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_IT_text,      HTMLInputType::Text        },
+    { OOO_STRING_SVTOOLS_HTML_IT_password,  HTMLInputType::Password    },
+    { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTMLInputType::Checkbox    },
+    { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTMLInputType::Radio       },
+    { OOO_STRING_SVTOOLS_HTML_IT_range,     HTMLInputType::Range       },
+    { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTMLInputType::Scribble    },
+    { OOO_STRING_SVTOOLS_HTML_IT_file,      HTMLInputType::File        },
+    { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTMLInputType::Hidden      },
+    { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTMLInputType::Submit      },
+    { OOO_STRING_SVTOOLS_HTML_IT_image,     HTMLInputType::Image       },
+    { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTMLInputType::Reset       },
+    { OOO_STRING_SVTOOLS_HTML_IT_button,    HTMLInputType::Button      },
+    { nullptr,                              HTMLInputType(0)    } // null keyword closes the table
+};
+
+// <TABLE FRAME=xxx>: keyword -> HTMLTableFrame, looked up by HTMLOption::GetTableFrame()
+HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_TF_void,    HTMLTableFrame::Void    },
+    { OOO_STRING_SVTOOLS_HTML_TF_above,   HTMLTableFrame::Above   },
+    { OOO_STRING_SVTOOLS_HTML_TF_below,   HTMLTableFrame::Below   },
+    { OOO_STRING_SVTOOLS_HTML_TF_hsides,  HTMLTableFrame::HSides  },
+    { OOO_STRING_SVTOOLS_HTML_TF_lhs,     HTMLTableFrame::LHS     },
+    { OOO_STRING_SVTOOLS_HTML_TF_rhs,     HTMLTableFrame::RHS     },
+    { OOO_STRING_SVTOOLS_HTML_TF_vsides,  HTMLTableFrame::VSides  },
+    { OOO_STRING_SVTOOLS_HTML_TF_box,     HTMLTableFrame::Box     },
+    { OOO_STRING_SVTOOLS_HTML_TF_border,  HTMLTableFrame::Box     }, // second keyword that also yields Box
+    { nullptr,                            HTMLTableFrame(0) } // null keyword closes the table
+};
+
+// <TABLE RULES=xxx>: keyword -> HTMLTableRules, looked up by HTMLOption::GetTableRules()
+HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_TR_none,   HTMLTableRules::NONE      },
+    { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups    },
+    { OOO_STRING_SVTOOLS_HTML_TR_rows,   HTMLTableRules::Rows      },
+    { OOO_STRING_SVTOOLS_HTML_TR_cols,   HTMLTableRules::Cols      },
+    { OOO_STRING_SVTOOLS_HTML_TR_all,    HTMLTableRules::All       },
+    { nullptr,                           HTMLTableRules(0) } // null keyword closes the table
+};
+
+
+HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken, // nTok is stored in nToken, _aToken in aToken
+                        OUString _aValue )                   // _aValue is stored in aValue
+    : aValue(std::move(_aValue)) // both strings are taken by value and moved into the members
+    , aToken(std::move(_aToken))
+    , nToken( nTok )
+{
+    DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END, // debug check: id must lie in [BOOL_START, END)
+        "HTMLOption: unknown token" );
+}
+
+// Returns the option value as an unsigned number; a negative value yields 0.
+sal_uInt32 HTMLOption::GetNumber() const
+{
+    DBG_ASSERT( nToken==HtmlOptionId::VALUE ||
+                (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) ||
+                (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END),
+        "GetNumber: Option not numerical" );
+    const sal_Int32 nParsed = OUString(comphelper::string::stripStart(aValue, ' ')).toInt32();
+    if (nParsed < 0)
+        return 0;
+    return static_cast<sal_uInt32>(nParsed);
+}
+
+// Signed counterpart of GetNumber(): parses the value as a number and keeps its sign.
+sal_Int32 HTMLOption::GetSNumber() const
+{
+    DBG_ASSERT( (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END) ||
+                (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END),
+        "GetSNumber: Option not numerical" );
+    return OUString(comphelper::string::stripStart(aValue, ' ')).toInt32();
+}
+
+// Extracts every run of decimal digits found in the option value and
+// stores the resulting numbers in rNumbers, whose previous content is
+// dropped. All other characters merely separate the numbers.
+void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
+{
+    rNumbers.clear();
+
+    const sal_Int32 nLen = aValue.getLength();
+    sal_Int32 nIdx = 0;
+    while( nIdx < nLen )
+    {
+        // Step over anything that is not a digit.
+        if( aValue[nIdx] < '0' || aValue[nIdx] > '9' )
+        {
+            ++nIdx;
+            continue;
+        }
+
+        // Consume one complete run of digits.
+        sal_uInt32 nValue = 0;
+        do
+        {
+            nValue = nValue * 10 + (aValue[nIdx] - '0');
+            ++nIdx;
+        }
+        while( nIdx < nLen && aValue[nIdx] >= '0' && aValue[nIdx] <= '9' );
+        rNumbers.push_back( nValue );
+    }
+}
+
+// Converts the option value into an RGB colour and stores it in rColor. A value not
+// starting with '#' is first looked up by name; otherwise six hex digits are read.
+void HTMLOption::GetColor( Color& rColor ) const
+{
+    DBG_ASSERT( nToken==HtmlOptionId::SIZE || (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END),
+        "GetColor: Option is not a color." );
+
+    const OUString aLower(aValue.toAsciiLowerCase());
+    sal_uInt32 nRGB = SAL_MAX_UINT32;           // SAL_MAX_UINT32: nothing found by name
+    if (aLower.getLength() > 0 && aLower[0] != '#')
+        nRGB = GetHTMLColor(aLower);
+
+    if( nRGB == SAL_MAX_UINT32 )
+    {
+        // Hands out the characters one by one, and '0' once they are used up.
+        sal_Int32 nIdx = 0;
+        const auto nextChar = [&]() -> sal_Unicode { return nIdx < aLower.getLength() ? aLower[nIdx++] : sal_Unicode('0'); };
+        nRGB = 0;
+        for (int nDigit = 0; nDigit < 6; ++nDigit)
+        {
+            // Whatever Netscape does to get color values: up to two
+            // characters < '0' are skipped in front of each digit.
+            sal_Unicode c = nextChar();
+            for (int nSkipped = 0; nSkipped < 2 && c < '0'; ++nSkipped)
+                c = nextChar();
+            nRGB <<= 4;
+            if( c >= '0' && c <= '9' )
+                nRGB += (c - '0');
+            else if( c >= 'a' && c <= 'f' )
+                nRGB += (c + 0xa - 'a');
+        }
+    }
+
+    rColor.SetRed(   static_cast<sal_uInt8>((nRGB >> 16) & 0xff) );
+    rColor.SetGreen( static_cast<sal_uInt8>((nRGB >> 8) & 0xff) );
+    rColor.SetBlue(  static_cast<sal_uInt8>(nRGB & 0xff) );
+}
+
+HTMLInputType HTMLOption::GetInputType() const // translates the TYPE value through aInputTypeOptEnums
+{
+    DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
+    return GetEnum( aInputTypeOptEnums, HTMLInputType::Text ); // Text is presumably the fallback for unknown keywords -- GetEnum() is defined elsewhere, confirm there
+}
+
+HTMLTableFrame HTMLOption::GetTableFrame() const // translates the FRAME value through aTableFrameOptEnums
+{
+    DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
+    return GetEnum( aTableFrameOptEnums ); // no explicit fallback is passed here, unlike in GetInputType()
+}
+
+HTMLTableRules HTMLOption::GetTableRules() const // translates the RULES value through aTableRulesOptEnums
+{
+    DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
+    return GetEnum( aTableRulesOptEnums ); // no explicit fallback is passed here, unlike in GetInputType()
+}
+
+HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) : // rIn is handed to the SvParser base; bReadNewDoc is kept in bNewDoc
+    SvParser<HtmlTokenId>( rIn ),
+    bNewDoc(bReadNewDoc),
+    bIsInHeader(true), // starts in header state; FilterToken() updates it on HEAD/BODY/FRAMESET tokens
+    bReadListing(false),
+    bReadXMP(false),
+    bReadPRE(false),
+    bReadTextArea(false),
+    bReadScript(false),
+    bReadStyle(false),
+    bEndTokenFound(false),
+    bPre_IgnoreNewPara(false),
+    bReadNextChar(false),
+    bReadComment(false),
+    nPre_LinePos(0),
+    mnPendingOffToken(HtmlTokenId::NONE)
+{
+    // #i76649: default to UTF-8 for HTML unless we know differently
+    SetSrcEncoding(RTL_TEXTENCODING_UTF8);
+}
+
+HTMLParser::~HTMLParser() // empty body: no explicit cleanup is performed here
+{
+}
+
+void HTMLParser::SetNamespace(std::u16string_view rNamespace) // rNamespace: the alias without the colon
+{
+    // Store the namespace alias as a prefix by appending ':', e.g. "ns" becomes "ns:".
+    maNamespace = OUString::Concat(rNamespace) + ":";
+}
+
+namespace
+{
+    // Scope guard for one parser run: calls AddFirstRef() when constructed and
+    // ReleaseRef() when destroyed, unless the parser state is then still Pending.
+    class RefGuard
+    {
+        HTMLParser& m_rParser;
+    public:
+        explicit RefGuard(HTMLParser& rParser) : m_rParser(rParser) { m_rParser.AddFirstRef(); }
+        // Not copyable: a copy would call ReleaseRef() a second time.
+        RefGuard(const RefGuard&) = delete;
+        RefGuard& operator=(const RefGuard&) = delete;
+
+        ~RefGuard()
+        {
+            if (m_rParser.GetStatus() != SvParserState::Pending)
+                m_rParser.ReleaseRef(); // Parser not needed anymore
+        }
+    };
+}
+
+SvParserState HTMLParser::CallParser() // starts a parse run and returns the state reached once Continue() comes back
+{
+    eState = SvParserState::Working;
+    nNextCh = GetNextChar(); // read the first character before the token loop starts
+    SaveState( HtmlTokenId::NONE );
+
+    nPre_LinePos = 0;
+    bPre_IgnoreNewPara = false;
+
+    RefGuard aRefGuard(*this); // calls ReleaseRef() on scope exit unless the parser is left pending
+
+    Continue( HtmlTokenId::NONE ); // NONE: Continue() fetches the first token itself
+
+    return eState;
+}
+
+// Token loop of the parser: as long as the parser is working, every token
+// is filtered and - unless it was swallowed - handed to NextToken().
+// nToken is the first token to handle; NONE means it is read from the input.
+void HTMLParser::Continue( HtmlTokenId nToken )
+{
+    if( HtmlTokenId::NONE == nToken )
+        nToken = GetNextToken();
+
+    for( ; IsParserWorking(); nToken = GetNextToken() )
+    {
+        SaveState( nToken );
+
+        const HtmlTokenId nFiltered = FilterToken( nToken );
+        if( HtmlTokenId::NONE != nFiltered )
+            NextToken( nFiltered );
+        if( IsParserWorking() )
+            SaveState( HtmlTokenId::NONE );         // continue with new token
+    }
+}
+
+// Pre-processes a token before it reaches NextToken(): keeps track of
+// the document header and of the <PRE>/<LISTING>/<XMP> reading modes.
+// Returns the token to pass on, or HtmlTokenId::NONE to swallow it.
+HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
+{
+    switch( nToken )
+    {
+    // Tokens that are never passed on.
+    case HtmlTokenId(EOF):
+        return HtmlTokenId::NONE;
+
+    case HtmlTokenId::HTML_OFF:     // HtmlTokenId::ON hasn't been passed either !
+        bReadPRE = bReadListing = bReadXMP = false;
+        return HtmlTokenId::NONE;
+
+    // Tokens that decide whether we are inside the document header.
+    case HtmlTokenId::HEAD_ON:
+        bIsInHeader = true;
+        return nToken;
+
+    case HtmlTokenId::HEAD_OFF:
+    case HtmlTokenId::BODY_ON:
+    case HtmlTokenId::FRAMESET_ON:
+        bIsInHeader = false;
+        return nToken;
+
+    // The end of the body switches all three reading modes off.
+    case HtmlTokenId::BODY_OFF:
+        bReadPRE = bReadListing = bReadXMP = false;
+        return nToken;
+
+    // Start and end of the <PRE>, <LISTING> and <XMP> environments.
+    case HtmlTokenId::PREFORMTXT_ON:
+        StartPRE();
+        return nToken;
+
+    case HtmlTokenId::PREFORMTXT_OFF:
+        FinishPRE();
+        return nToken;
+
+    case HtmlTokenId::LISTING_ON:
+        StartListing();
+        return nToken;
+
+    case HtmlTokenId::LISTING_OFF:
+        FinishListing();
+        return nToken;
+
+    case HtmlTokenId::XMP_ON:
+        StartXMP();
+        return nToken;
+
+    case HtmlTokenId::XMP_OFF:
+        FinishXMP();
+        return nToken;
+
+    default:
+        break;
+    }
+
+    // Any other token runs through the filter of the mode that is
+    // active, if there is one; PRE wins over LISTING, LISTING over XMP.
+    if( bReadPRE )
+        return FilterPRE( nToken );
+    if( bReadListing )
+        return FilterListing( nToken );
+    if( bReadXMP )
+        return FilterXMP( nToken );
+    return nToken;
+}
+
+namespace {
+// Everything from 32 upwards counts as printable, except DEL (127).
+constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return !(c < 32 || c == 127); }
+
+// White space as understood by the scanner: blank, tab, CR, LF and VT (0x0b).
+constexpr bool HTML_ISSPACE(sal_uInt32 c)
+{
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\x0b';
+}
+}
+
+HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak) // cBreak ends the scan when met outside quotes; '>' means the inside of a tag is scanned
+{ // Collects text from the input into aToken, resolving character references (&#n; &#xh; &name;) on the way
+    OUStringBuffer sTmpBuffer( MAX_LEN ); // local collector, appended to aToken before returning
+    bool bContinue = true;
+    bool bEqSignFound = false; // inside a tag: '=' was just seen outside quotes, so a quote may open a value
+    sal_uInt32  cQuote = 0U; // the quote character of the current option value; 0 while outside quotes
+
+    while( bContinue && IsParserWorking() )
+    {
+        bool bNextCh = true; // whether the next character still has to be fetched at the end of this round
+        switch( nNextCh )
+        {
+        case '&':
+            bEqSignFound = false;
+            if( bReadXMP )
+                sTmpBuffer.append( '&' );
+            else
+            {
+                sal_uInt64 nStreamPos = rInput.Tell(); // position right behind the '&', needed to rewind
+                sal_uInt32 nLinePos = GetLinePos();
+
+                sal_uInt32 cChar = 0U;
+                if( '#' == (nNextCh = GetNextChar()) )
+                {
+                    nNextCh = GetNextChar();
+                    const bool bIsHex( 'x' == nNextCh );
+                    const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
+                    if ( bIsDecOrHex )
+                    {
+                        if ( bIsHex )
+                        {
+                            nNextCh = GetNextChar();
+                            while ( rtl::isAsciiHexDigit(nNextCh) )
+                            {
+                                cChar = cChar * 16U +
+                                        ( nNextCh <= '9'
+                                          ? sal_uInt32( nNextCh - '0' )
+                                          : ( nNextCh <= 'F'
+                                              ? sal_uInt32( nNextCh - 'A' + 10 )
+                                              : sal_uInt32( nNextCh - 'a' + 10 ) ) );
+                                nNextCh = GetNextChar();
+                            }
+                        }
+                        else
+                        {
+                            do
+                            {
+                                cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
+                                nNextCh = GetNextChar();
+                            }
+                            while( rtl::isAsciiDigit(nNextCh) );
+                        }
+
+                        if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
+                            RTL_TEXTENCODING_UCS2 != eSrcEnc &&
+                            RTL_TEXTENCODING_UTF8 != eSrcEnc &&
+                            cChar < 256 )
+                        {
+                            const sal_uInt32 convertFlags =
+                                RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
+                                RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
+                                RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
+
+                            char cEncodedChar = static_cast<char>(cChar);
+                            cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
+                            if( 0U == cChar )
+                            {
+                                // If the character could not be
+                                // converted, because a conversion is not
+                                // available, do no conversion at all.
+                                cChar = cEncodedChar; // NOTE(review): if char is signed, a value >= 128 turns negative here and ends up as '?' below -- confirm that is intended
+                            }
+                        }
+                    }
+                    else
+                        nNextCh = 0U;
+
+                    if (!rtl::isUnicodeCodePoint(cChar)
+                        || (linguistic::IsControlChar(cChar)
+                            && cChar != '\r' && cChar != '\n' && cChar != '\t'))
+                    {
+                        cChar = '?';
+                    }
+                }
+                else if( rtl::isAsciiAlpha( nNextCh ) )
+                {
+                    OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
+                    sal_Int32 nPos = 0;
+                    do
+                    {
+                        sEntityBuffer.appendUtf32( nNextCh );
+                        nPos++;
+                        nNextCh = GetNextChar();
+                    }
+                    while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
+                           !rInput.eof() );
+
+                    if( IsParserWorking() && !rInput.eof() )
+                    {
+                        std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
+                        cChar = GetHTMLCharName( sEntity );
+
+                        // not found ( == 0 ): plain text
+                        // or a character which is inserted as attribute
+                        if( 0U == cChar && ';' != nNextCh )
+                        {
+                            DBG_ASSERT( rInput.Tell() - nStreamPos ==
+                                        static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
+                                        "UTF-8 is failing here" );
+                            for( sal_Int32 i = nPos-1; i>1; i-- ) // retry with shorter and shorter prefixes of the name, down to two characters
+                            {
+                                nNextCh = sEntityBuffer[i];
+                                sEntityBuffer.setLength( i );
+                                sEntity = sEntityBuffer.subView(0, i);
+                                cChar = GetHTMLCharName( sEntity );
+                                if( cChar )
+                                {
+                                    rInput.SeekRel( -static_cast<sal_Int64>
+                                            (nPos-i)*GetCharSize() );
+                                    nlLinePos -= sal_uInt32(nPos-i);
+                                    nPos = i;
+                                    ClearTxtConvContext();
+                                    break;
+                                }
+                            }
+                        }
+
+                        if( !cChar )        // unknown character?
+                        {
+                            // back in stream, insert '&'
+                            // and restart with next character
+                            sTmpBuffer.append( '&' );
+
+                            DBG_ASSERT( rInput.Tell()-nStreamPos ==
+                                        static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
+                                        "Wrong stream position" );
+                            DBG_ASSERT( nlLinePos-nLinePos ==
+                                        static_cast<sal_uInt32>(nPos+1),
+                                        "Wrong line position" );
+                            rInput.Seek( nStreamPos );
+                            nlLinePos = nLinePos;
+                            ClearTxtConvContext();
+                            break;
+                        }
+
+                        assert(cChar != 0);
+
+                        // 1 == Non Breaking Space
+                        // 2 == SoftHyphen
+
+                        if (cChar == 1 || cChar == 2)
+                        {
+                            if( '>' == cBreak )
+                            {
+                                // When reading the content of a tag we have
+                                // to change it to ' ' or '-'
+                                if( 1U == cChar )
+                                    cChar = ' ';
+                                else //2U
+                                    cChar = '-';
+                            }
+                            else
+                            {
+                                // If not scanning a tag return token
+                                aToken.append( sTmpBuffer );
+                                sTmpBuffer.setLength(0);
+
+                                if( !aToken.isEmpty() )
+                                {
+                                    // restart with character
+                                    nNextCh = '&';
+                                    DBG_ASSERT( rInput.Tell()-nStreamPos ==
+                                                static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
+                                                "Wrong stream position" );
+                                    DBG_ASSERT( nlLinePos-nLinePos ==
+                                                static_cast<sal_uInt32>(nPos+1),
+                                                "Wrong line position" );
+                                    rInput.Seek( nStreamPos );
+                                    nlLinePos = nLinePos;
+                                    ClearTxtConvContext();
+                                    return HtmlTokenId::TEXTTOKEN;
+                                }
+
+                                // Hack: _GetNextChar shall not read the
+                                // next character
+                                if( ';' != nNextCh )
+                                    aToken.append( " " );
+                                if( 1U == cChar )
+                                    return HtmlTokenId::NONBREAKSPACE;
+                                else //2U
+                                    return HtmlTokenId::SOFTHYPH;
+                            }
+                        }
+                    }
+                    else
+                        nNextCh = 0U;
+                }
+                // &{...};-JavaScript-Macros are not supported any longer.
+                else if( IsParserWorking() )
+                {
+                    sTmpBuffer.append( '&' );
+                    bNextCh = false;
+                    break;
+                }
+
+                bNextCh = (';' == nNextCh); // a closing ';' belongs to the reference and is skipped; anything else is scanned next
+                if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
+                                    cChar=='\"' || cChar==' ') )
+                {
+                    // ' and " have to be escaped within tags to separate
+                    // them from ' and " enclosing options.
+                    // \ has to be escaped as well.
+                    // Space is protected because it's not a delimiter between
+                    // options.
+                    sTmpBuffer.append( '\\' );
+                }
+                if( IsParserWorking() )
+                {
+                    if( cChar )
+                        sTmpBuffer.appendUtf32( cChar );
+                }
+                else if( SvParserState::Pending==eState && '>'!=cBreak )
+                {
+                    // Restart with '&', the remainder is returned as
+                    // text token.
+                    if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
+                    {
+                        // _GetNextChar() returns the previous text and
+                        // during the next execution a new character is read.
+                        // Thus we have to position in front of the '&'.
+                        nNextCh = 0U;
+                        rInput.Seek( nStreamPos - GetCharSize() );
+                        nlLinePos = nLinePos-1;
+                        ClearTxtConvContext();
+                        bReadNextChar = true;
+                    }
+                    bNextCh = false;
+                }
+            }
+            break;
+        case '=':
+            if( '>'==cBreak && !cQuote )
+                bEqSignFound = true;
+            sTmpBuffer.appendUtf32( nNextCh );
+            break;
+
+        case '\\':
+            if( '>'==cBreak )
+            {
+                // mark within tags
+                sTmpBuffer.append( '\\' );
+            }
+            sTmpBuffer.append( '\\' );
+            break;
+
+        case '\"':
+        case '\'':
+            if( '>'==cBreak )
+            {
+                if( bEqSignFound )
+                    cQuote = nNextCh;
+                else if( cQuote && (cQuote==nNextCh ) )
+                    cQuote = 0U;
+            }
+            sTmpBuffer.appendUtf32( nNextCh );
+            bEqSignFound = false;
+            break;
+
+        case sal_Unicode(EOF):
+            if( rInput.eof() )
+            {
+                bContinue = false;
+            }
+            // else: ignore, not a valid code point
+            break;
+
+        case '<':
+            bEqSignFound = false;
+            if( '>'==cBreak )
+                sTmpBuffer.appendUtf32( nNextCh );
+            else
+                bContinue = false;      // stop here, the text collected so far is complete
+            break;
+
+        case '\f':
+            if( '>' == cBreak )
+            {
+                // If scanning options treat it like a space, ...
+                sTmpBuffer.append( ' ' );
+            }
+            else
+            {
+                // otherwise it's a separate token.
+                bContinue = false;
+            }
+            break;
+
+        case '\r':
+        case '\n':
+            if( '>'==cBreak )
+            {
+                // cr/lf in tag is handled in GetNextToken_()
+                sTmpBuffer.appendUtf32( nNextCh );
+                break;
+            }
+            else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
+            {
+                bContinue = false;
+                break;
+            }
+            // Reduce sequence of CR/LF/BLANK/TAB to a single blank
+            [[fallthrough]];
+        case '\t':
+            if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
+            {
+                // Pass Tabs up in <PRE>
+                bContinue = false;
+                break;
+            }
+            [[fallthrough]];
+        case '\x0b':
+            if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
+                '>'!=cBreak )
+            {
+                break;
+            }
+            if (!m_bPreserveSpaces)
+                nNextCh = ' ';
+            [[fallthrough]];
+        case ' ':
+            if (!m_bPreserveSpaces)
+            {
+                sTmpBuffer.appendUtf32(nNextCh);
+                if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
+                {
+                    // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
+                    do
+                    {
+                        nNextCh = GetNextChar();
+                        if (sal_Unicode(EOF) == nNextCh && rInput.eof())
+                        {
+                            if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
+                            {
+                                // Have we seen something besides blanks?
+                                aToken.append(sTmpBuffer);
+                                sTmpBuffer.setLength(0);
+                                return HtmlTokenId::TEXTTOKEN;
+                            }
+                            else
+                                // Only read blanks: no text must be returned
+                                // and GetNextToken_ has to read until EOF
+                                return HtmlTokenId::NONE;
+                        }
+                    } while (HTML_ISSPACE(nNextCh));
+                    bNextCh = false;
+                }
+                break;
+            }
+            [[fallthrough]];
+        default:
+            bEqSignFound = false;
+            if (nNextCh == cBreak && !cQuote)
+                bContinue = false;
+            else
+            {
+                do {
+                    if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
+                    {
+                    // All remaining characters make their way into the text.
+                        sTmpBuffer.appendUtf32( nNextCh );
+                    }
+
+                    nNextCh = GetNextChar();
+                    if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
+                        !IsParserWorking() )
+                    {
+                        if( !sTmpBuffer.isEmpty() )
+                            aToken.append( sTmpBuffer );
+                        return HtmlTokenId::TEXTTOKEN;
+                    }
+                } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
+                bNextCh = false;
+            }
+        }
+
+        if( bContinue && bNextCh )
+            nNextCh = GetNextChar();
+    }
+
+    if( !sTmpBuffer.isEmpty() )
+        aToken.append( sTmpBuffer );
+
+    return HtmlTokenId::TEXTTOKEN;
+}
+
+HtmlTokenId HTMLParser::GetNextRawToken()
+{
+    OUStringBuffer sTmpBuffer( MAX_LEN );
+
+    if( bEndTokenFound )
+    {
+        // During the last execution we already found the end token,
+        // thus we don't have to search it again.
+        bReadScript = false;
+        bReadStyle = false;
+        aEndToken.clear();
+        bEndTokenFound = false;
+
+        return HtmlTokenId::NONE;
+    }
+
+    // Default return value: HtmlTokenId::RAWDATA
+    bool bContinue = true;
+    HtmlTokenId nToken = HtmlTokenId::RAWDATA;
+    SaveState( HtmlTokenId::NONE );
+    while( bContinue && IsParserWorking() )
+    {
+        bool bNextCh = true;
+        switch( nNextCh )
+        {
+        case '<':
+            {
+                // Maybe we've reached the end.
+
+                // Save what we have read previously...
+                aToken.append( sTmpBuffer );
+                sTmpBuffer.setLength(0);
+
+                // and remember position in stream.
+                sal_uInt64 nStreamPos = rInput.Tell();
+                sal_uInt32 nLineNr = GetLineNr();
+                sal_uInt32 nLinePos = GetLinePos();
+
+                // Start of an end token?
+                bool bOffState = false;
+                if( '/' == (nNextCh = GetNextChar()) )
+                {
+                    bOffState = true;
+                    nNextCh = GetNextChar();
+                }
+                else if( '!' == nNextCh )
+                {
+                    sTmpBuffer.appendUtf32( nNextCh );
+                    nNextCh = GetNextChar();
+                }
+
+                // Read following letters
+                while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
+                       IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
+                {
+                    sTmpBuffer.appendUtf32( nNextCh );
+                    nNextCh = GetNextChar();
+                }
+
+                OUString aTok( sTmpBuffer.toString() );
+                aTok = aTok.toAsciiLowerCase();
+                bool bDone = false;
+                if( bReadScript || !aEndToken.isEmpty() )
+                {
+                    if( !bReadComment )
+                    {
+                        if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
+                        {
+                            bReadComment = true;
+                        }
+                        else
+                        {
+                            // A script has to end with "</SCRIPT>". But
+                            // ">" is optional for security reasons
+                            bDone = bOffState &&
+                            ( bReadScript
+                                ? aTok == OOO_STRING_SVTOOLS_HTML_script
+                                : aTok == aEndToken );
+                        }
+                    }
+                    if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
+                    {
+                        // End of comment of style <!----->
+                        bReadComment = false;
+                    }
+                }
+                else
+                {
+                    // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
+                    if( bOffState )
+                        bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
+                                aTok == OOO_STRING_SVTOOLS_HTML_head;
+                    else
+                        bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
+                }
+
+                if( bDone )
+                {
+                    // Done! Return the previously read string (if requested)
+                    // and continue.
+
+                    bContinue = false;
+
+                    // nToken==0 means, GetNextToken_ continues to read
+                    if( aToken.isEmpty() && (bReadStyle || bReadScript) )
+                    {
+                        // Immediately close environment (or context?)
+                        // and parse the end token
+                        bReadScript = false;
+                        bReadStyle = false;
+                        aEndToken.clear();
+                        nToken = HtmlTokenId::NONE;
+                    }
+                    else
+                    {
+                        // Keep bReadScript/bReadStyle alive
+                        // and parse end token during next execution
+                        bEndTokenFound = true;
+                    }
+
+                    // Move backwards in stream to '<'
+                    rInput.Seek( nStreamPos );
+                    SetLineNr( nLineNr );
+                    SetLinePos( nLinePos );
+                    ClearTxtConvContext();
+                    nNextCh = '<';
+
+                    // Don't append string to token.
+                    sTmpBuffer.setLength( 0 );
+                }
+                else
+                {
+                    // remember "</" , everything else we find in the buffer
+                    aToken.append( "<" );
+                    if( bOffState )
+                        aToken.append( "/" );
+
+                    bNextCh = false;
+                }
+            }
+            break;
+        case '-':
+            sTmpBuffer.appendUtf32( nNextCh );
+            if( bReadComment )
+            {
+                bool bTwoMinus = false;
+                nNextCh = GetNextChar();
+                while( '-' == nNextCh && IsParserWorking() )
+                {
+                    bTwoMinus = true;
+                    sTmpBuffer.appendUtf32( nNextCh );
+                    nNextCh = GetNextChar();
+                }
+
+                if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
+                    bReadComment = false;
+
+                bNextCh = false;
+            }
+            break;
+
+        case '\r':
+            // \r\n? closes the current text token (even if it's empty)
+            nNextCh = GetNextChar();
+            if( nNextCh=='\n' )
+                nNextCh = GetNextChar();
+            bContinue = false;
+            break;
+        case '\n':
+            // \n closes the current text token (even if it's empty)
+            nNextCh = GetNextChar();
+            bContinue = false;
+            break;
+        case sal_Unicode(EOF):
+            // eof closes the current text token and behaves like having read
+            // an end token
+            if( rInput.eof() )
+            {
+                bContinue = false;
+                if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
+                {
+                    bEndTokenFound = true;
+                }
+                else
+                {
+                    bReadScript = false;
+                    bReadStyle = false;
+                    aEndToken.clear();
+                    nToken = HtmlTokenId::NONE;
+                }
+            }
+            break;
+        default:
+            if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
+            {
+                // all remaining characters are appended to the buffer
+                sTmpBuffer.appendUtf32( nNextCh );
+            }
+            break;
+        }
+
+        if( !bContinue && !sTmpBuffer.isEmpty() )
+        {
+            aToken.append( sTmpBuffer );
+            sTmpBuffer.setLength(0);
+        }
+
+        if( bContinue && bNextCh )
+            nNextCh = GetNextChar();
+    }
+
+    if( IsParserWorking() )
+        SaveState( HtmlTokenId::NONE );
+    else
+        nToken = HtmlTokenId::NONE;
+
+    return nToken;
+}
+
+/* Scan next token.
+ *
+ * Reads from the input and returns the id of the next HTML token. A queued
+ * mnPendingOffToken (set below when a tag was closed with "/>") is returned
+ * first. While bReadScript, bReadStyle or a non-empty aEndToken is active the
+ * work is first handed to GetNextRawToken(). Otherwise the current character
+ * nNextCh selects the handling: a "<" starts a tag, comment, CDATA section or
+ * "<%" block, everything else is line break / tab handling or plain text
+ * read by ScanText().
+ *
+ * Returns HtmlTokenId::NONE right away if the parser is not working on
+ * entry, and HtmlTokenId::INVALID if the parser state is
+ * SvParserState::Pending once the scan loop has ended.
+ */
+HtmlTokenId HTMLParser::GetNextToken_()
+{
+    HtmlTokenId nRet = HtmlTokenId::NONE;
+    sSaveToken.clear();
+
+    if (mnPendingOffToken != HtmlTokenId::NONE)
+    {
+        // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
+        nRet = mnPendingOffToken;
+        mnPendingOffToken = HtmlTokenId::NONE;
+        aToken.setLength( 0 );
+        return nRet;
+    }
+
+    // Delete options
+    maOptions.clear();
+
+    if( !IsParserWorking() )        // Don't continue if already an error occurred
+        return HtmlTokenId::NONE;
+
+    bool bReadNextCharSave = bReadNextChar; // restored below when the state turns to Pending
+    if( bReadNextChar )
+    {
+        DBG_ASSERT( !bEndTokenFound,
+                    "Read a character despite </SCRIPT> was read?" );
+        nNextCh = GetNextChar();
+        if( !IsParserWorking() )        // Don't continue if already an error occurred
+            return HtmlTokenId::NONE;
+        bReadNextChar = false;
+    }
+
+    if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
+    {
+        nRet = GetNextRawToken(); // raw scanning while one of these modes is active
+        if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
+            return nRet;
+    }
+
+    do {
+        bool bNextCh = true; // whether another character is read after the switch
+        switch( nNextCh )
+        {
+        case '<':
+            {
+                sal_uInt64 nStreamPos = rInput.Tell(); // position to return to if the tag turns out to be plain text
+                sal_uInt32 nLineNr = GetLineNr();
+                sal_uInt32 nLinePos = GetLinePos();
+
+                bool bOffState = false; // set for an end tag, i.e. "</"
+                if( '/' == (nNextCh = GetNextChar()) )
+                {
+                    bOffState = true;
+                    nNextCh = GetNextChar();
+                }
+                // Assume '<?' is a start of an XML declaration, ignore it.
+                if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
+                {
+                    OUStringBuffer sTmpBuffer;
+                    do {
+                        sTmpBuffer.appendUtf32( nNextCh );
+                        nNextCh = GetNextChar();
+                        if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
+                            break; // the CDATA marker ends the token name even without a delimiter
+                        if (bFuzzing && sTmpBuffer.getLength() > 1024)
+                        {
+                            SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
+                            eState = SvParserState::Error;
+                            break;
+                        }
+                    } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
+                            !linguistic::IsControlChar(nNextCh) &&
+                             IsParserWorking() && !rInput.eof() );
+
+                    if( !sTmpBuffer.isEmpty() )
+                    {
+                        aToken.append( sTmpBuffer );
+                        sTmpBuffer.setLength(0);
+                    }
+
+                    // Skip blanks
+                    while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
+                        nNextCh = GetNextChar();
+
+                    if( !IsParserWorking() )
+                    {
+                        if( SvParserState::Pending == eState )
+                            bReadNextChar = bReadNextCharSave;
+                        break;
+                    }
+
+                    // Search token in table:
+                    sSaveToken = aToken; // keeps the original spelling, aToken is lower-cased next
+                    aToken = aToken.toString().toAsciiLowerCase();
+
+                    if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
+                        aToken.remove( 0, maNamespace.getLength()); // drop the namespace prefix
+
+                    if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
+                        // Unknown control
+                        nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
+
+                    // If it's a token which can be switched off...
+                    if( bOffState )
+                    {
+                         if( nRet >= HtmlTokenId::ONOFF_START )
+                         {
+                            // and there is an off token, return off token instead
+                            nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
+                         }
+                         else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
+                         {
+                            // and there is no off token, return unknown token.
+                            // (except for </BR>, that is treated like <BR>)
+                            // No exception for XHTML, though.
+                            nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
+                         }
+                    }
+
+                    if( nRet == HtmlTokenId::COMMENT )
+                    {
+                        // fix: due to being case sensitive use sSaveToken as start of comment
+                        //      and append a blank.
+                        aToken = sSaveToken;
+                        if( '>'!=nNextCh )
+                            aToken.append( " " );
+                        sal_uInt64 nCStreamPos = 0; // 0 doubles as the marker for no restart point recorded yet
+                        sal_uInt32 nCLineNr = 0;
+                        sal_uInt32 nCLinePos = 0;
+                        sal_Int32 nCStrLen = 0;
+
+                        bool bDone = false;
+                        // Read until closing -->. If not found restart at first >
+                        sTmpBuffer = aToken;
+                        while( !bDone && !rInput.eof() && IsParserWorking() )
+                        {
+                            if( '>'==nNextCh )
+                            {
+                                if( !nCStreamPos ) // remember the first closing bracket as restart point
+                                {
+                                    nCStreamPos = rInput.Tell();
+                                    nCStrLen = sTmpBuffer.getLength();
+                                    nCLineNr = GetLineNr();
+                                    nCLinePos = GetLinePos();
+                                }
+                                bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
+                                if( !bDone )
+                                    sTmpBuffer.appendUtf32(nNextCh);
+                            }
+                            else if (!linguistic::IsControlChar(nNextCh)
+                                || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
+                            {
+                                sTmpBuffer.appendUtf32(nNextCh);
+                            }
+                            if( !bDone )
+                                nNextCh = GetNextChar();
+                        }
+                        aToken = sTmpBuffer;
+                        sTmpBuffer.setLength(0);
+                        if( !bDone && IsParserWorking() && nCStreamPos ) // no closing marker found: end the comment at the first closing bracket
+                        {
+                            rInput.Seek( nCStreamPos );
+                            SetLineNr( nCLineNr );
+                            SetLinePos( nCLinePos );
+                            ClearTxtConvContext();
+                            aToken.truncate(nCStrLen);
+                            nNextCh = '>';
+                        }
+                    }
+                    else if (nRet == HtmlTokenId::CDATA)
+                    {
+                        // Read until the closing ]]>.
+                        bool bDone = false;
+                        while (!bDone && !rInput.eof() && IsParserWorking())
+                        {
+                            if (nNextCh == '>')
+                            {
+                                if (sTmpBuffer.getLength() >= 2)
+                                {
+                                    bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
+                                            && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
+                                    if (bDone)
+                                    {
+                                        // Ignore ]] at the end.
+                                        sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
+                                    }
+                                }
+                                if (!bDone)
+                                {
+                                    sTmpBuffer.appendUtf32(nNextCh);
+                                }
+                            }
+                            else if (!linguistic::IsControlChar(nNextCh))
+                            {
+                                sTmpBuffer.appendUtf32(nNextCh);
+                            }
+                            if (!bDone)
+                            {
+                                nNextCh = GetNextChar();
+                            }
+                        }
+                        aToken = sTmpBuffer; // the token text is the CDATA content only
+                        sTmpBuffer.setLength(0);
+                    }
+                    else
+                    {
+                        // TokenString not needed anymore
+                        aToken.setLength( 0 );
+                    }
+
+                    // Read until closing '>'
+                    if( '>' != nNextCh && IsParserWorking() )
+                    {
+                        ScanText( '>' );
+
+                        // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
+                        // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
+                        // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
+                        // which lead to fdo#56772.
+                        if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
+                        {
+                            mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);       // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
+                            aToken.setLength( aToken.getLength()-1 );   // remove trailing '/'
+                        }
+                        if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
+                        {
+                            // Move back in front of < and restart there.
+                            // Return < as text.
+                            rInput.Seek( nStreamPos );
+                            SetLineNr( nLineNr );
+                            SetLinePos( nLinePos );
+                            ClearTxtConvContext();
+
+                            aToken = "<";
+                            nRet = HtmlTokenId::TEXTTOKEN;
+                            nNextCh = GetNextChar();
+                            bNextCh = false;
+                            break;
+                        }
+                    }
+                    if( SvParserState::Pending == eState )
+                        bReadNextChar = bReadNextCharSave;
+                }
+                else
+                {
+                    if( bOffState )
+                    {
+                        // simply throw away everything
+                        ScanText( '>' );
+                        if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
+                        {
+                            // Move back in front of < and restart there.
+                            // Return < as text.
+                            rInput.Seek( nStreamPos );
+                            SetLineNr( nLineNr );
+                            SetLinePos( nLinePos );
+                            ClearTxtConvContext();
+
+                            aToken = "<";
+                            nRet = HtmlTokenId::TEXTTOKEN;
+                            nNextCh = GetNextChar();
+                            bNextCh = false;
+                            break;
+                        }
+                        if( SvParserState::Pending == eState )
+                            bReadNextChar = bReadNextCharSave;
+                        aToken.setLength( 0 );
+                    }
+                    else if( '%' == nNextCh )
+                    {
+                        nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
+
+                        sal_uInt64 nCStreamPos = rInput.Tell();
+                        sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
+
+                        bool bDone = false;
+                        // Read until closing %>. If not found restart at first >.
+                        sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
+                        OUStringBuffer aTmpBuffer(aToken);
+                        while( !bDone && !rInput.eof() && IsParserWorking() )
+                        {
+                            bDone = '>'==nNextCh && nLastTokenChar == '%';
+                            if( !bDone )
+                            {
+                                aTmpBuffer.appendUtf32(nNextCh);
+                                nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
+                                nNextCh = GetNextChar();
+                            }
+                        }
+                        if( !bDone && IsParserWorking() ) // unterminated block: seek back and return the two opening characters as text
+                        {
+                            rInput.Seek( nCStreamPos );
+                            SetLineNr( nCLineNr );
+                            SetLinePos( nCLinePos );
+                            ClearTxtConvContext();
+                            aToken = "<%";
+                            nRet = HtmlTokenId::TEXTTOKEN;
+                            break;
+                        }
+                        aToken = aTmpBuffer;
+                        aTmpBuffer.setLength(0);
+                        if( IsParserWorking() )
+                        {
+                            sSaveToken = aToken; // the block text is handed on in sSaveToken only
+                            aToken.setLength( 0 );
+                        }
+                    }
+                    else
+                    {
+                        aToken = "<";
+                        nRet = HtmlTokenId::TEXTTOKEN;
+                        bNextCh = false;
+                        break;
+                    }
+                }
+
+                if( IsParserWorking() )
+                {
+                    bNextCh = '>' == nNextCh; // the closing bracket is consumed after the switch
+                    switch( nRet ) // keep the textarea/script/style flags in sync with the token
+                    {
+                    case HtmlTokenId::TEXTAREA_ON:
+                        bReadTextArea = true;
+                        break;
+                    case HtmlTokenId::TEXTAREA_OFF:
+                        bReadTextArea = false;
+                        break;
+                    case HtmlTokenId::SCRIPT_ON:
+                        if( !bReadTextArea )
+                            bReadScript = true;
+                        break;
+                    case HtmlTokenId::SCRIPT_OFF:
+                        if( !bReadTextArea )
+                        {
+                            bReadScript = false;
+                            // JavaScript might modify the stream,
+                            // thus the last character has to be read again.
+                            bReadNextChar = true;
+                            bNextCh = false;
+                        }
+                        break;
+
+                    case HtmlTokenId::STYLE_ON:
+                        bReadStyle = true;
+                        break;
+                    case HtmlTokenId::STYLE_OFF:
+                        bReadStyle = false;
+                        break;
+                    default: break;
+                    }
+                }
+            }
+            break;
+
+        case sal_Unicode(EOF):
+            if( rInput.eof() )
+            {
+                eState = SvParserState::Accepted;
+                nRet = HtmlTokenId(nNextCh);
+            }
+            else
+            {
+                // Read normal text.
+                goto scan_text;
+            }
+            break;
+
+        case '\f':
+            // form feeds are passed upwards separately
+            nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
+            break;
+
+        case '\n':
+        case '\r':
+            if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
+            {
+                sal_Unicode c = GetNextChar(); // look ahead so that a CR LF or LF CR pair counts as one line break
+                if( ( '\n' != nNextCh || '\r' != c ) &&
+                    ( '\r' != nNextCh || '\n' != c ) )
+                {
+                    bNextCh = false;
+                    nNextCh = c;
+                }
+                nRet = HtmlTokenId::NEWPARA;
+                break;
+            }
+            [[fallthrough]];
+        case '\t':
+            if( bReadPRE )
+            {
+                nRet = HtmlTokenId::TABCHAR;
+                break;
+            }
+            [[fallthrough]];
+        case ' ':
+            [[fallthrough]];
+        default:
+
+scan_text:
+            // "normal" text to come
+            nRet = ScanText();
+            bNextCh = 0 == aToken.getLength(); // only read on if no text was collected
+
+            // the text should be processed
+            if( !bNextCh && eState == SvParserState::Pending )
+            {
+                eState = SvParserState::Working;
+                bReadNextChar = true;
+            }
+
+            break;
+        }
+
+        if( bNextCh && SvParserState::Working == eState )
+        {
+            nNextCh = GetNextChar();
+            if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
+            {
+                bReadNextChar = true; // the token is complete: deliver it and read the character on the next call
+                eState = SvParserState::Working;
+            }
+        }
+
+    } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
+
+    if( SvParserState::Pending == eState )
+        nRet = HtmlTokenId::INVALID;      // s.th. invalid
+
+    return nRet;
+}
+
+void HTMLParser::UnescapeToken()
+{
+    // Strips the escaping backslashes from aToken in place. A backslash is
+    // removed unless it directly follows a backslash that was just removed,
+    // so an escaped backslash survives as a single backslash.
+    bool bPrevRemoved = false;
+    for( sal_Int32 nIdx = 0; nIdx < aToken.getLength(); )
+    {
+        const bool bEscaped = bPrevRemoved;
+        bPrevRemoved = false;
+
+        if( bEscaped || aToken[nIdx] != '\\' )
+        {
+            // ordinary or escaped character: keep it and move on
+            ++nIdx;
+            continue;
+        }
+
+        // unescaped backslash: drop it; nIdx now addresses the character
+        // that followed it, which is kept on the next round
+        aToken.remove( nIdx, 1 );
+        bPrevRemoved = true;
+    }
+}
+
+const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
+{
+    /* Splits the option list held in aToken into name/value entries, stores
+     * them in maOptions and returns maOptions. An option starts at an ASCII
+     * letter; its value may be quoted with single or double quotes, or be
+     * unquoted. aToken itself is modified while scanning: escaping
+     * backslashes and, where stripping applies, CR/LF inside quoted values
+     * are removed from it.
+     *
+     * pNoConvertToken may be null. If set, CR/LF inside a quoted value of
+     * the option it points to are kept. */
+    // Options already parsed for the current token are returned once again.
+    if (!maOptions.empty())
+        return maOptions;
+
+    sal_Int32 nPos = 0;
+    while( nPos < aToken.getLength() )
+    {
+        // A letter? Option beginning here.
+        if( rtl::isAsciiAlpha( aToken[nPos] ) )
+        {
+            HtmlOptionId nToken;
+            OUString aValue;
+            sal_Int32 nStt = nPos;
+            sal_Unicode cChar = 0;
+
+            // Actually only certain characters allowed.
+            // Netscape only looks for "=" and white space (c.f.
+            // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
+            while( nPos < aToken.getLength() )
+            {
+                cChar = aToken[nPos];
+                if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
+                    break;
+                nPos++;
+            }
+
+            OUString sName( aToken.subView( nStt, nPos-nStt ) );
+
+            // PlugIns require original token name. Convert to lower case only for searching.
+            nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
+            SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
+                        "GetOption: unknown HTML option '" << sName << "'" );
+            bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
+                               nToken >= HtmlOptionId::SCRIPT_END) &&
+                              (!pNoConvertToken || nToken != *pNoConvertToken); // options in [SCRIPT_START, SCRIPT_END) and *pNoConvertToken keep CR/LF
+
+            while( nPos < aToken.getLength() ) // skip white space and non-printable characters after the name
+            {
+                cChar = aToken[nPos];
+                if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
+                    break;
+                nPos++;
+            }
+
+            // Option with value?
+            if( nPos!=aToken.getLength() && '='==cChar )
+            {
+                nPos++; // step over the equals sign
+
+                while( nPos < aToken.getLength() ) // skip blanks, line ends and non-printable characters before the value
+                {
+                    cChar = aToken[nPos];
+                    if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
+                        break;
+                    nPos++;
+                }
+
+                if( nPos != aToken.getLength() )
+                {
+                    sal_Int32 nLen = 0; // length of the value, counted from nStt
+                    nStt = nPos;
+                    if( ('"'==cChar) || '\''==cChar )
+                    {
+                        sal_Unicode cEnd = cChar; // the value ends at the same kind of quote it started with
+                        nPos++; nStt++; // step over the opening quote
+                        bool bDone = false;
+                        bool bEscape = false;
+                        while( nPos < aToken.getLength() && !bDone )
+                        {
+                            bool bOldEscape = bEscape;
+                            bEscape = false;
+                            cChar = aToken[nPos];
+                            switch( cChar )
+                            {
+                            case '\r':
+                            case '\n':
+                                if( bStripCRLF )
+                                    aToken.remove( nPos, 1 ); // the line end is dropped from aToken itself
+                                else
+                                {
+                                    nPos++;
+                                    nLen++;
+                                }
+                                break;
+                            case '\\':
+                                if( bOldEscape )
+                                {
+                                    nPos++;
+                                    nLen++;
+                                }
+                                else
+                                {
+                                    aToken.remove( nPos, 1 ); // drop the backslash; it escapes a following backslash or quote
+                                    bEscape = true;
+                                }
+                                break;
+                            case '"':
+                            case '\'':
+                                bDone = !bOldEscape && cChar==cEnd;
+                                if( !bDone )
+                                {
+                                    nPos++;
+                                    nLen++;
+                                }
+                                break;
+                            default:
+                                nPos++;
+                                nLen++;
+                                break;
+                            }
+                        }
+                        if( nPos!=aToken.getLength() )
+                            nPos++; // step over the closing quote
+                    }
+                    else
+                    {
+                        // More liberal than the standard: allow all printable characters
+                        bool bEscape = false;
+                        bool bDone = false;
+                        while( nPos < aToken.getLength() && !bDone )
+                        {
+                            bool bOldEscape = bEscape;
+                            bEscape = false;
+                            sal_Unicode c = aToken[nPos];
+                            switch( c )
+                            {
+                            case ' ':
+                                bDone = !bOldEscape; // a blank ends the value unless it was escaped
+                                if( !bDone )
+                                {
+                                    nPos++;
+                                    nLen++;
+                                }
+                                break;
+
+                            case '\t':
+                            case '\r':
+                            case '\n':
+                                bDone = true;
+                                break;
+
+                            case '\\':
+                                if( bOldEscape )
+                                {
+                                    nPos++;
+                                    nLen++;
+                                }
+                                else
+                                {
+                                    aToken.remove( nPos, 1 ); // drop the backslash; it escapes a following backslash or blank
+                                    bEscape = true;
+                                }
+                                break;
+
+                            default:
+                                if( HTML_ISPRINTABLE( c ) )
+                                {
+                                    nPos++;
+                                    nLen++;
+                                }
+                                else
+                                    bDone = true;
+                                break;
+                            }
+                        }
+                    }
+
+                    if( nLen )
+                        aValue = aToken.subView( nStt, nLen );
+                }
+            }
+
+            // Save the option; it is stored even if nToken is HtmlOptionId::UNKNOWN
+            maOptions.emplace_back(nToken, sName, aValue);
+
+        }
+        else
+            // Ignore white space and unexpected characters
+            nPos++;
+    }
+
+    return maOptions;
+}
+/**
+ * Filters a token for the FilterPRE mode of the parser.
+ *
+ * - PARABREAK_ON is replaced by LINEBREAK. LINEBREAK and NEWPARA reset the
+ *   tracked column nPre_LinePos to 0 and are replaced by NONE while
+ *   bPre_IgnoreNewPara is set.
+ * - TABCHAR becomes a TEXTTOKEN; aToken is padded with spaces up to the
+ *   distance to the next multiple-of-8 column.
+ * - TEXTTOKEN advances nPre_LinePos by the length of aToken.
+ * - The tokens of the long case list are returned unchanged.
+ * - Every other token except NONE is mapped to UNKNOWNCONTROL_OFF (for
+ *   tokens at or above ONOFF_START for which isOffToken() is true) or to
+ *   UNKNOWNCONTROL_ON.
+ *
+ * bPre_IgnoreNewPara is cleared at the end of every call.
+ *
+ * @param nToken  the token to filter
+ * @return the token to process, possibly replaced as described above
+ */
+HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
+{
+    switch( nToken )
+    {
+    // in Netscape they only have impact in not empty paragraphs
+    case HtmlTokenId::PARABREAK_ON:
+        nToken = HtmlTokenId::LINEBREAK; // from here on handled like a line break
+        [[fallthrough]];
+    case HtmlTokenId::LINEBREAK:
+    case HtmlTokenId::NEWPARA:
+        nPre_LinePos = 0; // a new line starts at column 0
+        if( bPre_IgnoreNewPara )
+            nToken = HtmlTokenId::NONE; // the break is dropped
+        break;
+
+    case HtmlTokenId::TABCHAR:
+        {
+            sal_Int32 nSpaces = 8 - (nPre_LinePos % 8); // distance to the next multiple of 8
+            DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
+            if (aToken.getLength() < nSpaces)
+            {
+                using comphelper::string::padToLength;
+                OUStringBuffer aBuf(aToken);
+                aToken = padToLength(aBuf, nSpaces, ' '); // fill up with blanks
+            }
+            nPre_LinePos += nSpaces;
+            nToken = HtmlTokenId::TEXTTOKEN; // the tab is passed on as text
+        }
+        break;
+    // Keep those
+    case HtmlTokenId::TEXTTOKEN:
+        nPre_LinePos += aToken.getLength(); // keep the column counter up to date
+        break;
+
+    case HtmlTokenId::SELECT_ON:
+    case HtmlTokenId::SELECT_OFF:
+    case HtmlTokenId::BODY_ON:
+    case HtmlTokenId::FORM_ON:
+    case HtmlTokenId::FORM_OFF:
+    case HtmlTokenId::INPUT:
+    case HtmlTokenId::OPTION:
+    case HtmlTokenId::TEXTAREA_ON:
+    case HtmlTokenId::TEXTAREA_OFF:
+
+    case HtmlTokenId::IMAGE:
+    case HtmlTokenId::APPLET_ON:
+    case HtmlTokenId::APPLET_OFF:
+    case HtmlTokenId::PARAM:
+    case HtmlTokenId::EMBED:
+
+    case HtmlTokenId::HEAD1_ON:
+    case HtmlTokenId::HEAD1_OFF:
+    case HtmlTokenId::HEAD2_ON:
+    case HtmlTokenId::HEAD2_OFF:
+    case HtmlTokenId::HEAD3_ON:
+    case HtmlTokenId::HEAD3_OFF:
+    case HtmlTokenId::HEAD4_ON:
+    case HtmlTokenId::HEAD4_OFF:
+    case HtmlTokenId::HEAD5_ON:
+    case HtmlTokenId::HEAD5_OFF:
+    case HtmlTokenId::HEAD6_ON:
+    case HtmlTokenId::HEAD6_OFF:
+    case HtmlTokenId::BLOCKQUOTE_ON:
+    case HtmlTokenId::BLOCKQUOTE_OFF:
+    case HtmlTokenId::ADDRESS_ON:
+    case HtmlTokenId::ADDRESS_OFF:
+    case HtmlTokenId::HORZRULE:
+
+    case HtmlTokenId::CENTER_ON:
+    case HtmlTokenId::CENTER_OFF:
+    case HtmlTokenId::DIVISION_ON:
+    case HtmlTokenId::DIVISION_OFF:
+
+    case HtmlTokenId::SCRIPT_ON:
+    case HtmlTokenId::SCRIPT_OFF:
+    case HtmlTokenId::RAWDATA:
+
+    case HtmlTokenId::TABLE_ON:
+    case HtmlTokenId::TABLE_OFF:
+    case HtmlTokenId::CAPTION_ON:
+    case HtmlTokenId::CAPTION_OFF:
+    case HtmlTokenId::COLGROUP_ON:
+    case HtmlTokenId::COLGROUP_OFF:
+    case HtmlTokenId::COL_ON:
+    case HtmlTokenId::COL_OFF:
+    case HtmlTokenId::THEAD_ON:
+    case HtmlTokenId::THEAD_OFF:
+    case HtmlTokenId::TFOOT_ON:
+    case HtmlTokenId::TFOOT_OFF:
+    case HtmlTokenId::TBODY_ON:
+    case HtmlTokenId::TBODY_OFF:
+    case HtmlTokenId::TABLEROW_ON:
+    case HtmlTokenId::TABLEROW_OFF:
+    case HtmlTokenId::TABLEDATA_ON:
+    case HtmlTokenId::TABLEDATA_OFF:
+    case HtmlTokenId::TABLEHEADER_ON:
+    case HtmlTokenId::TABLEHEADER_OFF:
+
+    case HtmlTokenId::ANCHOR_ON:
+    case HtmlTokenId::ANCHOR_OFF:
+    case HtmlTokenId::BOLD_ON:
+    case HtmlTokenId::BOLD_OFF:
+    case HtmlTokenId::ITALIC_ON:
+    case HtmlTokenId::ITALIC_OFF:
+    case HtmlTokenId::STRIKE_ON:
+    case HtmlTokenId::STRIKE_OFF:
+    case HtmlTokenId::STRIKETHROUGH_ON:
+    case HtmlTokenId::STRIKETHROUGH_OFF:
+    case HtmlTokenId::UNDERLINE_ON:
+    case HtmlTokenId::UNDERLINE_OFF:
+    case HtmlTokenId::BASEFONT_ON:
+    case HtmlTokenId::BASEFONT_OFF:
+    case HtmlTokenId::FONT_ON:
+    case HtmlTokenId::FONT_OFF:
+    case HtmlTokenId::BLINK_ON:
+    case HtmlTokenId::BLINK_OFF:
+    case HtmlTokenId::SPAN_ON:
+    case HtmlTokenId::SPAN_OFF:
+    case HtmlTokenId::SUBSCRIPT_ON:
+    case HtmlTokenId::SUBSCRIPT_OFF:
+    case HtmlTokenId::SUPERSCRIPT_ON:
+    case HtmlTokenId::SUPERSCRIPT_OFF:
+    case HtmlTokenId::BIGPRINT_ON:
+    case HtmlTokenId::BIGPRINT_OFF:
+    case HtmlTokenId::SMALLPRINT_OFF:
+    case HtmlTokenId::SMALLPRINT_ON:
+
+    case HtmlTokenId::EMPHASIS_ON:
+    case HtmlTokenId::EMPHASIS_OFF:
+    case HtmlTokenId::CITATION_ON:
+    case HtmlTokenId::CITATION_OFF:
+    case HtmlTokenId::STRONG_ON:
+    case HtmlTokenId::STRONG_OFF:
+    case HtmlTokenId::CODE_ON:
+    case HtmlTokenId::CODE_OFF:
+    case HtmlTokenId::SAMPLE_ON:
+    case HtmlTokenId::SAMPLE_OFF:
+    case HtmlTokenId::KEYBOARD_ON:
+    case HtmlTokenId::KEYBOARD_OFF:
+    case HtmlTokenId::VARIABLE_ON:
+    case HtmlTokenId::VARIABLE_OFF:
+    case HtmlTokenId::DEFINSTANCE_ON:
+    case HtmlTokenId::DEFINSTANCE_OFF:
+    case HtmlTokenId::SHORTQUOTE_ON:
+    case HtmlTokenId::SHORTQUOTE_OFF:
+    case HtmlTokenId::LANGUAGE_ON:
+    case HtmlTokenId::LANGUAGE_OFF:
+    case HtmlTokenId::AUTHOR_ON:
+    case HtmlTokenId::AUTHOR_OFF:
+    case HtmlTokenId::PERSON_ON:
+    case HtmlTokenId::PERSON_OFF:
+    case HtmlTokenId::ACRONYM_ON:
+    case HtmlTokenId::ACRONYM_OFF:
+    case HtmlTokenId::ABBREVIATION_ON:
+    case HtmlTokenId::ABBREVIATION_OFF:
+    case HtmlTokenId::INSERTEDTEXT_ON:
+    case HtmlTokenId::INSERTEDTEXT_OFF:
+    case HtmlTokenId::DELETEDTEXT_ON:
+    case HtmlTokenId::DELETEDTEXT_OFF:
+    case HtmlTokenId::TELETYPE_ON:
+    case HtmlTokenId::TELETYPE_OFF:
+
+        break; // all tokens listed above pass through unchanged
+
+    // The remainder is treated as an unknown token.
+    default:
+        if( nToken != HtmlTokenId::NONE ) // NONE itself stays NONE
+        {
+            nToken =
+                ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
+                    ? HtmlTokenId::UNKNOWNCONTROL_OFF
+                    : HtmlTokenId::UNKNOWNCONTROL_ON );
+        }
+        break;
+    }
+
+    bPre_IgnoreNewPara = false; // the flag never survives a call of this filter
+
+    return nToken;
+}
+
+// Token filter of the XMP mode: NEWPARA (unless it is to be ignored), text,
+// non-breaking space and soft hyphen tokens are kept. Every other token is
+// converted back into the text of its tag and returned as a TEXTTOKEN.
+HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
+{
+    // tokens that leave this filter exactly as they came in
+    const bool bKeepUnchanged = HtmlTokenId::TEXTTOKEN == nToken
+                             || HtmlTokenId::NONBREAKSPACE == nToken
+                             || HtmlTokenId::SOFTHYPH == nToken
+                             || HtmlTokenId::NONE == nToken;
+
+    if( HtmlTokenId::NEWPARA == nToken )
+    {
+        // dropped only while the ignore flag is set
+        if( bPre_IgnoreNewPara )
+            nToken = HtmlTokenId::NONE;
+    }
+    else if( !bKeepUnchanged )
+    {
+        // start of the tag text: "<name" or "</name"
+        const bool bIsOffTag =
+            (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken);
+        if( bIsOffTag )
+            sSaveToken = "</" + sSaveToken;
+        else
+            sSaveToken = "<" + sSaveToken;
+
+        // followed by the remaining token text, if there is any
+        if( aToken.isEmpty() )
+            aToken = sSaveToken;
+        else
+        {
+            UnescapeToken();
+            sSaveToken += " ";
+            aToken.insert(0, sSaveToken);
+        }
+
+        // and closed again
+        aToken.append( ">" );
+        nToken = HtmlTokenId::TEXTTOKEN;
+    }
+
+    bPre_IgnoreNewPara = false;
+
+    return nToken;
+}
+
+// Token filter of the LISTING mode: NEWPARA (unless it is to be ignored),
+// text, non-breaking space and soft hyphen tokens are kept. Every other token
+// except NONE is reported as an unknown control token.
+HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
+{
+    if( HtmlTokenId::NEWPARA == nToken )
+    {
+        // dropped only while the ignore flag is set
+        if( bPre_IgnoreNewPara )
+            nToken = HtmlTokenId::NONE;
+    }
+    else if( HtmlTokenId::TEXTTOKEN != nToken
+          && HtmlTokenId::NONBREAKSPACE != nToken
+          && HtmlTokenId::SOFTHYPH != nToken
+          && HtmlTokenId::NONE != nToken )
+    {
+        // not one of the kept tokens: only remember whether it was an
+        // opening or a closing one
+        if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
+            nToken = HtmlTokenId::UNKNOWNCONTROL_OFF;
+        else
+            nToken = HtmlTokenId::UNKNOWNCONTROL_ON;
+    }
+
+    bPre_IgnoreNewPara = false;
+
+    return nToken;
+}
+
+// Converts the URL of an internal icon into a private image URL.
+//
+// If rURL starts with OOO_STRING_SVTOOLS_HTML_internal_icon and the remainder
+// is one of the known icon names, OOO_STRING_SVTOOLS_HTML_private_image is
+// prepended to the complete original URL.
+//
+// rURL    in/out: the URL to check; only modified if true is returned
+// return  true if rURL named a known internal icon and has been converted
+bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
+{
+    bool bFound = false;
+
+    // startsWith() hands back the part behind the prefix, so no hard-coded
+    // prefix length is needed. A URL consisting of the prefix only has no
+    // icon name; it must not reach aName[0], which requires a valid index.
+    OUString aName;
+    if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon, &aName )
+        && !aName.isEmpty() )
+    {
+        // the first character selects the only candidate to compare with
+        switch( aName[0] )
+        {
+        case 'b':
+            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
+            break;
+        case 'd':
+            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
+            break;
+        case 'e':
+            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
+            break;
+        case 'i':
+            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
+            break;
+        case 'n':
+            bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
+            break;
+        }
+    }
+    if( bFound )
+    {
+        // the private URL keeps the complete original URL as its tail
+        OUString sTmp ( rURL );
+        rURL =  OOO_STRING_SVTOOLS_HTML_private_image;
+        rURL += sTmp;
+    }
+
+    return bFound;
+}
+/* Types that are only used in this file for the evaluation of META tags. */
+namespace {
+/** Action selected by the NAME or HTTP-EQUIV value of a META tag; looked up
+ *  via aHTMLMetaNameTable and evaluated in HTMLParser::ParseMetaOptionsImpl().
+ *  The enumerator order defines the numeric values; NONE is explicitly 0,
+ *  which is also the value of the last table entry (HtmlMeta(0)). */
+enum class HtmlMeta {
+    NONE = 0,       // no action; a NAME without action becomes a user-defined property
+    Author,         // content is passed to setAuthor()
+    Description,    // content is passed to setDescription()
+    Keywords,       // content is split at commas and passed to setKeywords()
+    Refresh,        // only checked by a debug assertion
+    Classification, // content is passed to setSubject()
+    Created,        // content is parsed as date/time for setCreationDate()
+    ChangedBy,      // content is passed to setModifiedBy()
+    Changed,        // content is parsed as date/time for setModificationDate()
+    Generator,      // no handling in ParseMetaOptionsImpl()
+    SDFootnote,     // no handling in ParseMetaOptionsImpl()
+    SDEndnote,      // no handling in ParseMetaOptionsImpl()
+    ContentType     // content is evaluated with GetEncodingByMIME()
+};
+
+}
+
+/* <META NAME=xxx>
+ * Table that maps the names known for the NAME and HTTP-EQUIV options of a
+ * META tag to their HtmlMeta action. It is passed to HTMLOption::GetEnum()
+ * in ParseMetaOptionsImpl().
+ * NOTE(review): the last entry (nullptr) presumably marks the end of the
+ * table for GetEnum(), which is defined elsewhere - confirm before
+ * reordering or appending entries.
+ */
+HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
+{
+    { OOO_STRING_SVTOOLS_HTML_META_author,        HtmlMeta::Author        },
+    { OOO_STRING_SVTOOLS_HTML_META_changed,       HtmlMeta::Changed       },
+    { OOO_STRING_SVTOOLS_HTML_META_changedby,     HtmlMeta::ChangedBy     },
+    { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
+    { OOO_STRING_SVTOOLS_HTML_META_content_type,  HtmlMeta::ContentType   },
+    { OOO_STRING_SVTOOLS_HTML_META_created,       HtmlMeta::Created       },
+    { OOO_STRING_SVTOOLS_HTML_META_description,   HtmlMeta::Description   },
+    { OOO_STRING_SVTOOLS_HTML_META_keywords,      HtmlMeta::Keywords      },
+    { OOO_STRING_SVTOOLS_HTML_META_generator,     HtmlMeta::Generator     },
+    { OOO_STRING_SVTOOLS_HTML_META_refresh,       HtmlMeta::Refresh       },
+    { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HtmlMeta::SDEndnote     },
+    { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HtmlMeta::SDFootnote    },
+    { nullptr,                                    HtmlMeta(0)             } // HtmlMeta(0) == HtmlMeta::NONE
+};
+
+/**
+ * Called by ParseMetaOptionsImpl() with the name of a META tag after its
+ * content has been added as a user-defined document property.
+ * This implementation ignores the name and does nothing.
+ * NOTE(review): presumably meant to be overridden by derived parsers -
+ * confirm against the class declaration.
+ */
+void HTMLParser::AddMetaUserDefined( OUString const & )
+{
+}
+/**
+ * Evaluates the options of one META tag.
+ *
+ * Reads NAME / HTTP-EQUIV, CONTENT and CHARSET from aOptions, appends an
+ * HTTP-EQUIV name/content pair to i_pHTTPHeader (if given) and transfers the
+ * content of known names to i_xDocProps (if set). A NAME without a known
+ * action is added as a user-defined document property.
+ *
+ * @param i_xDocProps    document properties to fill; may be an empty reference
+ * @param i_pHTTPHeader  receives HTTP-EQUIV pairs; may be null
+ * @param aOptions       the options of the META tag
+ * @param o_rEnc         written when a CHARSET option is present or when a
+ *                       non-empty content-type value is evaluated; otherwise
+ *                       left untouched
+ * @return true if a document property has been set or added
+ */
+bool HTMLParser::ParseMetaOptionsImpl(
+        const uno::Reference<document::XDocumentProperties> & i_xDocProps,
+        SvKeyValueIterator *i_pHTTPHeader,
+        const HTMLOptions& aOptions,
+        rtl_TextEncoding& o_rEnc )
+{
+    OUString aName, aContent;
+    HtmlMeta nAction = HtmlMeta::NONE;
+    bool bHTTPEquiv = false, bChanged = false;
+
+    for ( size_t i = aOptions.size(); i; ) // the options are visited back to front
+    {
+        const HTMLOption& aOption = aOptions[--i];
+        switch ( aOption.GetToken() )
+        {
+            case HtmlOptionId::NAME:
+                aName = aOption.GetString();
+                if ( HtmlMeta::NONE==nAction ) // keep an action that was already determined
+                {
+                    aOption.GetEnum( nAction, aHTMLMetaNameTable );
+                }
+                break;
+            case HtmlOptionId::HTTPEQUIV:
+                aName = aOption.GetString();
+                aOption.GetEnum( nAction, aHTMLMetaNameTable ); // unlike NAME: looked up unconditionally
+                bHTTPEquiv = true;
+                break;
+            case HtmlOptionId::CONTENT:
+                aContent = aOption.GetString();
+                break;
+            case HtmlOptionId::CHARSET:
+            {
+                OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
+                o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
+                break;
+            }
+            default: break;
+        }
+    }
+
+    if ( bHTTPEquiv || HtmlMeta::Description != nAction )
+    {
+        // if it is not a Description, remove CRs and LFs from CONTENT
+        aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
+    }
+    else
+    {
+        // convert line endings for Description
+        aContent = convertLineEnd(aContent, GetSystemLineEnd());
+    }
+
+    if ( bHTTPEquiv && i_pHTTPHeader )
+    {
+        // Netscape seems to just ignore a closing ", so we do too
+        if ( aContent.endsWith("\"") )
+        {
+            aContent = aContent.copy( 0, aContent.getLength() - 1 );
+        }
+        SvKeyValue aKeyValue( aName, aContent );
+        i_pHTTPHeader->Append( aKeyValue );
+    }
+
+    switch ( nAction ) // transfer the content according to the action found
+    {
+        case HtmlMeta::Author:
+            if (i_xDocProps.is()) {
+                i_xDocProps->setAuthor( aContent );
+                bChanged = true;
+            }
+            break;
+        case HtmlMeta::Description:
+            if (i_xDocProps.is()) {
+                i_xDocProps->setDescription( aContent );
+                bChanged = true;
+            }
+            break;
+        case HtmlMeta::Keywords:
+            if (i_xDocProps.is()) {
+                i_xDocProps->setKeywords(
+                    ::comphelper::string::convertCommaSeparated(aContent));
+                bChanged = true;
+            }
+            break;
+        case HtmlMeta::Classification:
+            if (i_xDocProps.is()) {
+                i_xDocProps->setSubject( aContent );
+                bChanged = true;
+            }
+            break;
+
+        case HtmlMeta::ChangedBy:
+            if (i_xDocProps.is()) {
+                i_xDocProps->setModifiedBy( aContent );
+                bChanged = true;
+            }
+            break;
+
+        case HtmlMeta::Created:
+        case HtmlMeta::Changed:
+            if (i_xDocProps.is() && !aContent.isEmpty())
+            {
+                ::util::DateTime uDT;
+                bool valid = false;
+                if (comphelper::string::getTokenCount(aContent, ';') == 2) // two numbers "date;time"
+                {
+                    sal_Int32 nIdx{ 0 };
+                    Date aDate(o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx)));
+                    auto nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
+                    if (nTime < 0) // a negative time value is used with its sign toggled
+                        nTime = o3tl::saturating_toggle_sign(nTime);
+                    tools::Time aTime(nTime);
+                    DateTime aDateTime(aDate, aTime);
+                    uDT = aDateTime.GetUNODateTime();
+                    valid = true;
+                }
+                else if (utl::ISO8601parseDateTime(aContent, uDT)) // otherwise try ISO 8601
+                    valid = true;
+
+                if (valid)
+                {
+                    bChanged = true;
+                    if (HtmlMeta::Created == nAction)
+                        i_xDocProps->setCreationDate(uDT);
+                    else
+                        i_xDocProps->setModificationDate(uDT);
+                }
+            }
+            break;
+
+        case HtmlMeta::Refresh: // nothing is stored here, only checked in debug builds
+            DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
+            break;
+
+        case HtmlMeta::ContentType:
+            if ( !aContent.isEmpty() )
+            {
+                o_rEnc = GetEncodingByMIME( aContent );
+            }
+            break;
+
+        case HtmlMeta::NONE:
+            if ( !bHTTPEquiv ) // a NAME without known action: keep it as user-defined property
+            {
+                if (i_xDocProps.is())
+                {
+                    uno::Reference<beans::XPropertyContainer> xUDProps
+                        = i_xDocProps->getUserDefinedProperties();
+                    try {
+                        xUDProps->addProperty(aName,
+                            beans::PropertyAttribute::REMOVABLE,
+                            uno::Any(aContent));
+                        AddMetaUserDefined(aName);
+                        bChanged = true; // only reached if addProperty() did not throw
+                    } catch (uno::Exception &) {
+                        // ignore; NOTE(review): a tag without NAME (aName empty, e.g. only CHARSET) also gets here and presumably fails in addProperty() - confirm
+                    }
+                }
+            }
+            break;
+        default:
+            break;
+    }
+
+    return bChanged;
+}
+
+// Evaluates the options of the current META tag via ParseMetaOptionsImpl()
+// and, if the tag named an encoding, possibly switches the source encoding.
+// Returns what ParseMetaOptionsImpl() returned.
+bool HTMLParser::ParseMetaOptions(
+        const uno::Reference<document::XDocumentProperties> & i_xDocProps,
+        SvKeyValueIterator *i_pHeader )
+{
+    rtl_TextEncoding eMetaEnc = RTL_TEXTENCODING_DONTKNOW;
+    HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
+    const HTMLOptions& rOptions = GetOptions(&nContentOption);
+
+    const bool bChanged
+        = ParseMetaOptionsImpl( i_xDocProps, i_pHeader, rOptions, eMetaEnc );
+
+    // An encoding given by a META tag may only replace the current encoding
+    // if both the current and the new one are single-byte (octet) encodings.
+    // Anything else cannot lead to reasonable results.
+    const bool bSwitchEncoding = RTL_TEXTENCODING_DONTKNOW != eMetaEnc
+                              && rtl_isOctetTextEncoding( eMetaEnc )
+                              && rtl_isOctetTextEncoding( GetSrcEncoding() );
+    if (bSwitchEncoding)
+        SetSrcEncoding( GetExtendedCompatibilityTextEncoding( eMetaEnc ) );
+
+    return bChanged;
+}
+
+// Determines the text encoding named by the "charset" parameter of a MIME
+// content type string. Returns RTL_TEXTENCODING_DONTKNOW if the string
+// cannot be parsed or has no such parameter.
+rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
+{
+    OUString sType;
+    OUString sSubType;
+    INetContentTypeParameterList aParameters;
+    if (!INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    auto const iter = aParameters.find("charset"_ostr);
+    if (iter == aParameters.end())
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    // the lookup of the charset name works on an 8 bit string
+    const OString sCharsetName(
+        OUStringToOString(iter->second.m_sValue, RTL_TEXTENCODING_ASCII_US));
+    const rtl_TextEncoding eMimeEnc
+        = rtl_getTextEncodingFromMimeCharset( sCharsetName.getStr() );
+    return GetExtendedCompatibilityTextEncoding( eMimeEnc );
+}
+
+// Searches the given header entries for content-type entries and returns the
+// encoding of the last one that has a non-empty value, as determined by
+// GetEncodingByMIME(). Returns RTL_TEXTENCODING_DONTKNOW if pHTTPHeader is
+// null or no such entry exists.
+rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
+{
+    if( !pHTTPHeader )
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    rtl_TextEncoding eFound = RTL_TEXTENCODING_DONTKNOW;
+    SvKeyValue aEntry;
+    bool bHasEntry = pHTTPHeader->GetFirst( aEntry );
+    while( bHasEntry )
+    {
+        // no early exit: a later matching entry replaces an earlier result
+        if( aEntry.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type )
+            && !aEntry.GetValue().isEmpty() )
+        {
+            eFound = HTMLParser::GetEncodingByMIME( aEntry.GetValue() );
+        }
+        bHasEntry = pHTTPHeader->GetNext( aEntry );
+    }
+    return eFound;
+}
+
+// Sets the source encoding to the one found by GetEncodingByHttpHeader().
+// Returns false, leaving the source encoding alone, if none was found.
+bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
+{
+    const rtl_TextEncoding eHeaderEnc
+        = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
+    if( RTL_TEXTENCODING_DONTKNOW == eHeaderEnc )
+        return false;
+
+    SetSrcEncoding( eHeaderEnc );
+    return true;
+}
+
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:54:39 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:54:39 +0000
commit	267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree	358c9467650e1d0a1d7227a21dac2e3d08b622b2 /svtools/source/svhtml/parhtml.cxx
parent	Initial commit. (diff)
download	libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip