summaryrefslogtreecommitdiffstats
path: root/sax/source/expatwrap
diff options
context:
space:
mode:
Diffstat (limited to 'sax/source/expatwrap')
-rw-r--r--sax/source/expatwrap/expwrap.component38
-rw-r--r--sax/source/expatwrap/sax_expat.cxx957
-rw-r--r--sax/source/expatwrap/saxwriter.cxx1464
-rw-r--r--sax/source/expatwrap/xml2utf.cxx519
4 files changed, 2978 insertions, 0 deletions
diff --git a/sax/source/expatwrap/expwrap.component b/sax/source/expatwrap/expwrap.component
new file mode 100644
index 0000000000..1f72eccf31
--- /dev/null
+++ b/sax/source/expatwrap/expwrap.component
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ -->
+
+<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@"
+ xmlns="http://openoffice.org/2010/uno-components">
+ <implementation name="com.sun.star.comp.extensions.xml.sax.ParserExpat"
+ constructor="com_sun_star_comp_extensions_xml_sax_ParserExpat_get_implementation">
+ <service name="com.sun.star.xml.sax.Parser"/>
+ </implementation>
+ <implementation name="com.sun.star.extensions.xml.sax.Writer"
+ constructor="com_sun_star_extensions_xml_sax_Writer_get_implementation">
+ <service name="com.sun.star.xml.sax.Writer"/>
+ </implementation>
+ <implementation name="com.sun.star.comp.extensions.xml.sax.FastParser"
+ constructor="com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation">
+ <service name="com.sun.star.xml.sax.FastParser"/>
+ </implementation>
+ <implementation name="com.sun.star.comp.extensions.xml.sax.LegacyFastParser"
+ constructor="com_sun_star_comp_extensions_xml_sax_LegacyFastParser_get_implementation">
+ <service name="com.sun.star.xml.sax.LegacyFastParser"/>
+ </implementation>
+</component>
diff --git a/sax/source/expatwrap/sax_expat.cxx b/sax/source/expatwrap/sax_expat.cxx
new file mode 100644
index 0000000000..9a82b87036
--- /dev/null
+++ b/sax/source/expatwrap/sax_expat.cxx
@@ -0,0 +1,957 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <string.h>
+#include <cassert>
+#include <memory>
+#include <mutex>
+#include <utility>
+#include <string_view>
+#include <vector>
+
+
+#include <com/sun/star/lang/XServiceInfo.hpp>
+#include <com/sun/star/lang/XInitialization.hpp>
+#include <com/sun/star/uno/XComponentContext.hpp>
+#include <com/sun/star/xml/sax/XExtendedDocumentHandler.hpp>
+#include <com/sun/star/xml/sax/XParser.hpp>
+#include <com/sun/star/xml/sax/SAXParseException.hpp>
+#include <com/sun/star/io/IOException.hpp>
+#include <com/sun/star/io/XSeekable.hpp>
+#include <com/sun/star/lang/WrappedTargetRuntimeException.hpp>
+
+#include <comphelper/attributelist.hxx>
+#include <cppuhelper/weak.hxx>
+#include <cppuhelper/implbase.hxx>
+#include <cppuhelper/supportsservice.hxx>
+#include <rtl/ref.hxx>
+#include <sal/log.hxx>
+
+#include <expat.h>
+#include <xml2utf.hxx>
+
+using namespace ::osl;
+using namespace ::cppu;
+using namespace ::com::sun::star::lang;
+using namespace ::com::sun::star::xml::sax;
+using namespace ::com::sun::star::io;
+
+
+namespace {
+
+#define XML_CHAR_TO_OUSTRING(x) OUString(x , strlen( x ), RTL_TEXTENCODING_UTF8)
+#define XML_CHAR_N_TO_USTRING(x,n) OUString(x,n, RTL_TEXTENCODING_UTF8 )
+
+
+/*
+* The following macro encapsulates any call to an event handler.
+* It ensures, that exceptions thrown by the event handler are
+* treated properly.
+*/
+#define CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(pThis,call) \
+ if( ! pThis->bExceptionWasThrown ) { \
+ try {\
+ pThis->call;\
+ }\
+ catch( const SAXParseException &e ) {\
+ callErrorHandler( pThis , e );\
+ }\
+ catch( const SAXException &e ) {\
+ callErrorHandler( pThis , SAXParseException(\
+ e.Message, \
+ e.Context, \
+ e.WrappedException,\
+ pThis->rDocumentLocator->getPublicId(),\
+ pThis->rDocumentLocator->getSystemId(),\
+ pThis->rDocumentLocator->getLineNumber(),\
+ pThis->rDocumentLocator->getColumnNumber()\
+ ) );\
+ }\
+ catch( const css::uno::RuntimeException &e ) {\
+ pThis->bExceptionWasThrown = true; \
+ pThis->bRTExceptionWasThrown = true; \
+ pImpl->rtexception = e; \
+ }\
+ catch( const css::uno::Exception &e ) {\
+ pThis->bExceptionWasThrown = true; \
+ pThis->bRTExceptionWasThrown = true; \
+ pImpl->rtexception = WrappedTargetRuntimeException("Non-runtime UNO exception caught during parse", e.Context, css::uno::Any(e)); \
+ }\
+ }\
+ ((void)0)
+
+
+class SaxExpatParser_Impl;
+
+// This class implements the external Parser interface
+class SaxExpatParser
+ : public WeakImplHelper< XInitialization
+ , XServiceInfo
+ , XParser >
+{
+
+public:
+ SaxExpatParser();
+
+ // css::lang::XInitialization:
+ virtual void SAL_CALL initialize(css::uno::Sequence<css::uno::Any> const& rArguments) override;
+
+ // The SAX-Parser-Interface
+ virtual void SAL_CALL parseStream( const InputSource& structSource) override;
+ virtual void SAL_CALL setDocumentHandler(const css::uno::Reference< XDocumentHandler > & xHandler) override;
+
+ virtual void SAL_CALL setErrorHandler(const css::uno::Reference< XErrorHandler > & xHandler) override;
+ virtual void SAL_CALL setDTDHandler(const css::uno::Reference < XDTDHandler > & xHandler) override;
+ virtual void SAL_CALL setEntityResolver(const css::uno::Reference< XEntityResolver >& xResolver) override;
+
+ virtual void SAL_CALL setLocale( const Locale &locale ) override;
+
+public: // XServiceInfo
+ OUString SAL_CALL getImplementationName() override;
+ css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override;
+ sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override;
+
+private:
+ std::unique_ptr<SaxExpatParser_Impl> m_pImpl;
+};
+
+
+// Entity binds all information needed for a single file
+struct Entity
+{
+ InputSource structSource;
+ XML_Parser pParser;
+ sax_expatwrap::XMLFile2UTFConverter converter;
+};
+
+
+class SaxExpatParser_Impl
+{
+public: // module scope
+ std::mutex aMutex;
+ bool m_bEnableDoS; // fdo#60471 thank you Adobe Illustrator
+
+ css::uno::Reference< XDocumentHandler > rDocumentHandler;
+ css::uno::Reference< XExtendedDocumentHandler > rExtendedDocumentHandler;
+
+ css::uno::Reference< XErrorHandler > rErrorHandler;
+ css::uno::Reference< XDTDHandler > rDTDHandler;
+ css::uno::Reference< XEntityResolver > rEntityResolver;
+ css::uno::Reference < XLocator > rDocumentLocator;
+
+
+ rtl::Reference < comphelper::AttributeList > rAttrList;
+
+ // External entity stack
+ std::vector<struct Entity> vecEntity;
+ void pushEntity( Entity &&entity )
+ { vecEntity.push_back( std::move(entity) ); }
+ void popEntity()
+ { vecEntity.pop_back( ); }
+ struct Entity &getEntity()
+ { return vecEntity.back(); }
+
+
+ // Exception cannot be thrown through the C-XmlParser (possible resource leaks),
+ // therefore the exception must be saved somewhere.
+ SAXParseException exception;
+ css::uno::RuntimeException rtexception;
+ bool bExceptionWasThrown;
+ bool bRTExceptionWasThrown;
+
+public:
+ SaxExpatParser_Impl()
+ : m_bEnableDoS(false)
+ , bExceptionWasThrown(false)
+ , bRTExceptionWasThrown(false)
+ {
+ }
+
+ // the C-Callbacks for the expat parser
+ void static callbackStartElement(void *userData, const XML_Char *name , const XML_Char **atts);
+ void static callbackEndElement(void *userData, const XML_Char *name);
+ void static callbackCharacters( void *userData , const XML_Char *s , int nLen );
+ void static callbackProcessingInstruction( void *userData ,
+ const XML_Char *sTarget ,
+ const XML_Char *sData );
+
+ void static callbackEntityDecl( void *userData ,
+ const XML_Char *entityName,
+ int is_parameter_entity,
+ const XML_Char *value,
+ int value_length,
+ const XML_Char *base,
+ const XML_Char *systemId,
+ const XML_Char *publicId,
+ const XML_Char *notationName);
+
+ void static callbackNotationDecl( void *userData,
+ const XML_Char *notationName,
+ const XML_Char *base,
+ const XML_Char *systemId,
+ const XML_Char *publicId);
+
+ bool static callbackExternalEntityRef( XML_Parser parser,
+ const XML_Char *openEntityNames,
+ const XML_Char *base,
+ const XML_Char *systemId,
+ const XML_Char *publicId);
+
+ int static callbackUnknownEncoding(void *encodingHandlerData,
+ const XML_Char *name,
+ XML_Encoding *info);
+
+ void static callbackDefault( void *userData, const XML_Char *s, int len);
+
+ void static callbackStartCDATA( void *userData );
+ void static callbackEndCDATA( void *userData );
+ void static callbackComment( void *userData , const XML_Char *s );
+ void static callErrorHandler( SaxExpatParser_Impl *pImpl , const SAXParseException &e );
+
+public:
+ void parse();
+};
+
+extern "C"
+{
+ static void call_callbackStartElement(void *userData, const XML_Char *name , const XML_Char **atts)
+ {
+ SaxExpatParser_Impl::callbackStartElement(userData,name,atts);
+ }
+ static void call_callbackEndElement(void *userData, const XML_Char *name)
+ {
+ SaxExpatParser_Impl::callbackEndElement(userData,name);
+ }
+ static void call_callbackCharacters( void *userData , const XML_Char *s , int nLen )
+ {
+ SaxExpatParser_Impl::callbackCharacters(userData,s,nLen);
+ }
+ static void call_callbackProcessingInstruction(void *userData,const XML_Char *sTarget,const XML_Char *sData )
+ {
+ SaxExpatParser_Impl::callbackProcessingInstruction(userData,sTarget,sData );
+ }
+ static void call_callbackEntityDecl(void *userData ,
+ const XML_Char *entityName,
+ int is_parameter_entity,
+ const XML_Char *value,
+ int value_length,
+ const XML_Char *base,
+ const XML_Char *systemId,
+ const XML_Char *publicId,
+ const XML_Char *notationName)
+ {
+ SaxExpatParser_Impl::callbackEntityDecl(userData, entityName,
+ is_parameter_entity, value, value_length,
+ base, systemId, publicId, notationName);
+ }
+ static void call_callbackNotationDecl(void *userData,
+ const XML_Char *notationName,
+ const XML_Char *base,
+ const XML_Char *systemId,
+ const XML_Char *publicId)
+ {
+ SaxExpatParser_Impl::callbackNotationDecl(userData,notationName,base,systemId,publicId);
+ }
+ static int call_callbackExternalEntityRef(XML_Parser parser,
+ const XML_Char *openEntityNames,
+ const XML_Char *base,
+ const XML_Char *systemId,
+ const XML_Char *publicId)
+ {
+ return SaxExpatParser_Impl::callbackExternalEntityRef(parser,openEntityNames,base,systemId,publicId);
+ }
+ static int call_callbackUnknownEncoding(void *encodingHandlerData,
+ const XML_Char *name,
+ XML_Encoding *info)
+ {
+ return SaxExpatParser_Impl::callbackUnknownEncoding(encodingHandlerData,name,info);
+ }
+ static void call_callbackDefault( void *userData, const XML_Char *s, int len)
+ {
+ SaxExpatParser_Impl::callbackDefault(userData,s,len);
+ }
+ static void call_callbackStartCDATA( void *userData )
+ {
+ SaxExpatParser_Impl::callbackStartCDATA(userData);
+ }
+ static void call_callbackEndCDATA( void *userData )
+ {
+ SaxExpatParser_Impl::callbackEndCDATA(userData);
+ }
+ static void call_callbackComment( void *userData , const XML_Char *s )
+ {
+ SaxExpatParser_Impl::callbackComment(userData,s);
+ }
+}
+
+
+// LocatorImpl
+
+class LocatorImpl :
+ public WeakImplHelper< XLocator, css::io::XSeekable >
+ // should use a different interface for stream positions!
+{
+public:
+ explicit LocatorImpl(SaxExpatParser_Impl *p)
+ : m_pParser(p)
+ {
+ }
+
+public: //XLocator
+ virtual sal_Int32 SAL_CALL getColumnNumber() override
+ {
+ return XML_GetCurrentColumnNumber( m_pParser->getEntity().pParser );
+ }
+ virtual sal_Int32 SAL_CALL getLineNumber() override
+ {
+ return XML_GetCurrentLineNumber( m_pParser->getEntity().pParser );
+ }
+ virtual OUString SAL_CALL getPublicId() override
+ {
+ return m_pParser->getEntity().structSource.sPublicId;
+ }
+ virtual OUString SAL_CALL getSystemId() override
+ {
+ return m_pParser->getEntity().structSource.sSystemId;
+ }
+
+ // XSeekable (only for getPosition)
+
+ virtual void SAL_CALL seek( sal_Int64 ) override
+ {
+ }
+ virtual sal_Int64 SAL_CALL getPosition() override
+ {
+ return XML_GetCurrentByteIndex( m_pParser->getEntity().pParser );
+ }
+ virtual ::sal_Int64 SAL_CALL getLength() override
+ {
+ return 0;
+ }
+
+private:
+
+ SaxExpatParser_Impl *m_pParser;
+};
+
+
+SaxExpatParser::SaxExpatParser( )
+{
+ m_pImpl.reset( new SaxExpatParser_Impl );
+
+ rtl::Reference<LocatorImpl> pLoc = new LocatorImpl( m_pImpl.get() );
+ m_pImpl->rDocumentLocator = pLoc;
+
+ // Performance-improvement; handing out the same object with every call of
+ // the startElement callback is allowed (see sax-specification):
+ m_pImpl->rAttrList = new comphelper::AttributeList;
+
+ m_pImpl->bExceptionWasThrown = false;
+ m_pImpl->bRTExceptionWasThrown = false;
+}
+
+// css::lang::XInitialization:
+void SAL_CALL
+SaxExpatParser::initialize(css::uno::Sequence< css::uno::Any > const& rArguments)
+{
+ // possible arguments: a string "DoSmeplease"
+ if (rArguments.hasElements())
+ {
+ OUString str;
+ if ((rArguments[0] >>= str) && "DoSmeplease" == str)
+ {
+ std::unique_lock guard( m_pImpl->aMutex );
+ m_pImpl->m_bEnableDoS = true;
+ }
+ }
+}
+
+class ParserCleanup
+{
+private:
+ SaxExpatParser_Impl& m_rParser;
+ XML_Parser m_xmlParser;
+public:
+ ParserCleanup(SaxExpatParser_Impl& rParser, XML_Parser xmlParser)
+ : m_rParser(rParser)
+ , m_xmlParser(xmlParser)
+ {
+ }
+ ~ParserCleanup()
+ {
+ m_rParser.popEntity();
+ //XML_ParserFree accepts a null arg
+ XML_ParserFree(m_xmlParser);
+ }
+};
+
+/***************
+*
+* parseStream does Parser-startup initializations. The SaxExpatParser_Impl::parse() method does
+* the file-specific initialization work. (During a parser run, external files may be opened)
+*
+****************/
+void SaxExpatParser::parseStream( const InputSource& structSource)
+{
+ // Only one text at one time
+ std::unique_lock guard( m_pImpl->aMutex );
+
+
+ struct Entity entity;
+ entity.structSource = structSource;
+
+ if( ! entity.structSource.aInputStream.is() )
+ {
+ throw SAXException("No input source",
+ css::uno::Reference< css::uno::XInterface > () , css::uno::Any() );
+ }
+
+ entity.converter.setInputStream( entity.structSource.aInputStream );
+ if( !entity.structSource.sEncoding.isEmpty() )
+ {
+ entity.converter.setEncoding(
+ OUStringToOString( entity.structSource.sEncoding , RTL_TEXTENCODING_ASCII_US ) );
+ }
+
+ // create parser with proper encoding
+ entity.pParser = XML_ParserCreate( nullptr );
+ if( ! entity.pParser )
+ {
+ throw SAXException("Couldn't create parser",
+ css::uno::Reference< css::uno::XInterface > (), css::uno::Any() );
+ }
+
+ // set all necessary C-Callbacks
+ XML_SetUserData( entity.pParser, m_pImpl.get() );
+ XML_SetElementHandler( entity.pParser ,
+ call_callbackStartElement ,
+ call_callbackEndElement );
+ XML_SetCharacterDataHandler( entity.pParser , call_callbackCharacters );
+ XML_SetProcessingInstructionHandler(entity.pParser ,
+ call_callbackProcessingInstruction );
+ if (!m_pImpl->m_bEnableDoS)
+ {
+ XML_SetEntityDeclHandler(entity.pParser, call_callbackEntityDecl);
+ }
+ XML_SetNotationDeclHandler( entity.pParser, call_callbackNotationDecl );
+ XML_SetExternalEntityRefHandler( entity.pParser,
+ call_callbackExternalEntityRef);
+ XML_SetUnknownEncodingHandler( entity.pParser, call_callbackUnknownEncoding ,nullptr);
+
+ if( m_pImpl->rExtendedDocumentHandler.is() ) {
+
+ // These handlers just delegate calls to the ExtendedHandler. If no extended handler is
+ // given, these callbacks can be ignored
+ XML_SetDefaultHandlerExpand( entity.pParser, call_callbackDefault );
+ XML_SetCommentHandler( entity.pParser, call_callbackComment );
+ XML_SetCdataSectionHandler( entity.pParser ,
+ call_callbackStartCDATA ,
+ call_callbackEndCDATA );
+ }
+
+
+ m_pImpl->exception = SAXParseException();
+ auto const xmlParser = entity.pParser;
+ m_pImpl->pushEntity( std::move(entity) );
+
+ ParserCleanup aEnsureFree(*m_pImpl, xmlParser);
+
+ // start the document
+ if( m_pImpl->rDocumentHandler.is() ) {
+ m_pImpl->rDocumentHandler->setDocumentLocator( m_pImpl->rDocumentLocator );
+ m_pImpl->rDocumentHandler->startDocument();
+ }
+
+ m_pImpl->parse();
+
+ // finish document
+ if( m_pImpl->rDocumentHandler.is() ) {
+ m_pImpl->rDocumentHandler->endDocument();
+ }
+}
+
+void SaxExpatParser::setDocumentHandler(const css::uno::Reference< XDocumentHandler > & xHandler)
+{
+ m_pImpl->rDocumentHandler = xHandler;
+ m_pImpl->rExtendedDocumentHandler =
+ css::uno::Reference< XExtendedDocumentHandler >( xHandler , css::uno::UNO_QUERY );
+}
+
+void SaxExpatParser::setErrorHandler(const css::uno::Reference< XErrorHandler > & xHandler)
+{
+ m_pImpl->rErrorHandler = xHandler;
+}
+
+void SaxExpatParser::setDTDHandler(const css::uno::Reference< XDTDHandler > & xHandler)
+{
+ m_pImpl->rDTDHandler = xHandler;
+}
+
+void SaxExpatParser::setEntityResolver(const css::uno::Reference < XEntityResolver > & xResolver)
+{
+ m_pImpl->rEntityResolver = xResolver;
+}
+
+
+void SaxExpatParser::setLocale( const Locale & )
+{
+ // not implemented
+}
+
+// XServiceInfo
+OUString SaxExpatParser::getImplementationName()
+{
+ return "com.sun.star.comp.extensions.xml.sax.ParserExpat";
+}
+
+// XServiceInfo
+sal_Bool SaxExpatParser::supportsService(const OUString& ServiceName)
+{
+ return cppu::supportsService(this, ServiceName);
+}
+
+// XServiceInfo
+css::uno::Sequence< OUString > SaxExpatParser::getSupportedServiceNames()
+{
+ return { "com.sun.star.xml.sax.Parser" };
+}
+
+
+/*---------------------------------------
+*
+* Helper functions and classes
+*
+*
+*-------------------------------------------*/
+OUString getErrorMessage( XML_Error xmlE, std::u16string_view sSystemId , sal_Int32 nLine )
+{
+ OUString Message;
+ if( XML_ERROR_NONE == xmlE ) {
+ Message = "No";
+ }
+ else if( XML_ERROR_NO_MEMORY == xmlE ) {
+ Message = "no memory";
+ }
+ else if( XML_ERROR_SYNTAX == xmlE ) {
+ Message = "syntax";
+ }
+ else if( XML_ERROR_NO_ELEMENTS == xmlE ) {
+ Message = "no elements";
+ }
+ else if( XML_ERROR_INVALID_TOKEN == xmlE ) {
+ Message = "invalid token";
+ }
+ else if( XML_ERROR_UNCLOSED_TOKEN == xmlE ) {
+ Message = "unclosed token";
+ }
+ else if( XML_ERROR_PARTIAL_CHAR == xmlE ) {
+ Message = "partial char";
+ }
+ else if( XML_ERROR_TAG_MISMATCH == xmlE ) {
+ Message = "tag mismatch";
+ }
+ else if( XML_ERROR_DUPLICATE_ATTRIBUTE == xmlE ) {
+ Message = "duplicate attribute";
+ }
+ else if( XML_ERROR_JUNK_AFTER_DOC_ELEMENT == xmlE ) {
+ Message = "junk after doc element";
+ }
+ else if( XML_ERROR_PARAM_ENTITY_REF == xmlE ) {
+ Message = "parameter entity reference";
+ }
+ else if( XML_ERROR_UNDEFINED_ENTITY == xmlE ) {
+ Message = "undefined entity";
+ }
+ else if( XML_ERROR_RECURSIVE_ENTITY_REF == xmlE ) {
+ Message = "recursive entity reference";
+ }
+ else if( XML_ERROR_ASYNC_ENTITY == xmlE ) {
+ Message = "async entity";
+ }
+ else if( XML_ERROR_BAD_CHAR_REF == xmlE ) {
+ Message = "bad char reference";
+ }
+ else if( XML_ERROR_BINARY_ENTITY_REF == xmlE ) {
+ Message = "binary entity reference";
+ }
+ else if( XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF == xmlE ) {
+ Message = "attribute external entity reference";
+ }
+ else if( XML_ERROR_MISPLACED_XML_PI == xmlE ) {
+ Message = "misplaced xml processing instruction";
+ }
+ else if( XML_ERROR_UNKNOWN_ENCODING == xmlE ) {
+ Message = "unknown encoding";
+ }
+ else if( XML_ERROR_INCORRECT_ENCODING == xmlE ) {
+ Message = "incorrect encoding";
+ }
+ else if( XML_ERROR_UNCLOSED_CDATA_SECTION == xmlE ) {
+ Message = "unclosed cdata section";
+ }
+ else if( XML_ERROR_EXTERNAL_ENTITY_HANDLING == xmlE ) {
+ Message = "external entity reference";
+ }
+ else if( XML_ERROR_NOT_STANDALONE == xmlE ) {
+ Message = "not standalone";
+ }
+
+ OUString str = OUString::Concat("[") +
+ sSystemId +
+ " line " +
+ OUString::number( nLine ) +
+ "]: " +
+ Message +
+ "error";
+
+ return str;
+}
+
+
+// starts parsing with actual parser !
+void SaxExpatParser_Impl::parse( )
+{
+ const int nBufSize = 16*1024;
+
+ int nRead = nBufSize;
+ css::uno::Sequence< sal_Int8 > seqOut(nBufSize);
+
+ while( nRead ) {
+ nRead = getEntity().converter.readAndConvert( seqOut , nBufSize );
+
+ bool bContinue(false);
+
+ if( ! nRead ) {
+ // last call - must return OK
+ XML_Status const ret = XML_Parse( getEntity().pParser,
+ reinterpret_cast<const char *>(seqOut.getConstArray()),
+ 0 ,
+ 1 );
+ if (ret == XML_STATUS_OK) {
+ break;
+ }
+ } else {
+ bContinue = ( XML_Parse( getEntity().pParser,
+ reinterpret_cast<const char *>(seqOut.getConstArray()),
+ nRead,
+ 0 ) != XML_STATUS_ERROR );
+ }
+
+ if( ! bContinue || bExceptionWasThrown ) {
+
+ if ( bRTExceptionWasThrown )
+ throw rtexception;
+
+ // Error during parsing !
+ XML_Error xmlE = XML_GetErrorCode( getEntity().pParser );
+ OUString sSystemId = rDocumentLocator->getSystemId();
+ sal_Int32 nLine = rDocumentLocator->getLineNumber();
+
+ SAXParseException aExcept(
+ getErrorMessage(xmlE , sSystemId, nLine) ,
+ css::uno::Reference< css::uno::XInterface >(),
+ css::uno::Any( &exception , cppu::UnoType<decltype(exception)>::get() ),
+ rDocumentLocator->getPublicId(),
+ rDocumentLocator->getSystemId(),
+ rDocumentLocator->getLineNumber(),
+ rDocumentLocator->getColumnNumber()
+ );
+
+ if( rErrorHandler.is() ) {
+
+ // error handler is set, so the handler may throw the exception
+ css::uno::Any a;
+ a <<= aExcept;
+ rErrorHandler->fatalError( a );
+ }
+
+ // Error handler has not thrown an exception, but parsing cannot go on,
+ // so an exception MUST be thrown.
+ throw aExcept;
+ } // if( ! bContinue )
+ } // while
+}
+
+
+// The C-Callbacks
+
+
+void SaxExpatParser_Impl::callbackStartElement( void *pvThis ,
+ const XML_Char *pwName ,
+ const XML_Char **awAttributes )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+
+ if( !pImpl->rDocumentHandler.is() )
+ return;
+
+ int i = 0;
+ pImpl->rAttrList->Clear();
+
+ while( awAttributes[i] ) {
+ assert(awAttributes[i+1]);
+ pImpl->rAttrList->AddAttribute(
+ XML_CHAR_TO_OUSTRING( awAttributes[i] ) ,
+ XML_CHAR_TO_OUSTRING( awAttributes[i+1] ) );
+ i +=2;
+ }
+
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(
+ pImpl ,
+ rDocumentHandler->startElement( XML_CHAR_TO_OUSTRING( pwName ) ,
+ pImpl->rAttrList ) );
+}
+
+void SaxExpatParser_Impl::callbackEndElement( void *pvThis , const XML_Char *pwName )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+
+ if( pImpl->rDocumentHandler.is() ) {
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl,
+ rDocumentHandler->endElement( XML_CHAR_TO_OUSTRING( pwName ) ) );
+ }
+}
+
+
+void SaxExpatParser_Impl::callbackCharacters( void *pvThis , const XML_Char *s , int nLen )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+
+ if( pImpl->rDocumentHandler.is() ) {
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl ,
+ rDocumentHandler->characters( XML_CHAR_N_TO_USTRING(s,nLen) ) );
+ }
+}
+
+void SaxExpatParser_Impl::callbackProcessingInstruction( void *pvThis,
+ const XML_Char *sTarget ,
+ const XML_Char *sData )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+ if( pImpl->rDocumentHandler.is() ) {
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(
+ pImpl ,
+ rDocumentHandler->processingInstruction( XML_CHAR_TO_OUSTRING( sTarget ),
+ XML_CHAR_TO_OUSTRING( sData ) ) );
+ }
+}
+
+
+void SaxExpatParser_Impl::callbackEntityDecl(
+ void *pvThis, const XML_Char *entityName,
+ SAL_UNUSED_PARAMETER int /*is_parameter_entity*/,
+ const XML_Char *value, SAL_UNUSED_PARAMETER int /*value_length*/,
+ SAL_UNUSED_PARAMETER const XML_Char * /*base*/, const XML_Char *systemId,
+ const XML_Char *publicId, const XML_Char *notationName)
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+ if (value) { // value != 0 means internal entity
+ SAL_INFO("sax","SaxExpatParser: internal entity declaration, stopping");
+ XML_StopParser(pImpl->getEntity().pParser, XML_FALSE);
+ pImpl->exception = SAXParseException(
+ "SaxExpatParser: internal entity declaration, stopping",
+ nullptr, css::uno::Any(),
+ pImpl->rDocumentLocator->getPublicId(),
+ pImpl->rDocumentLocator->getSystemId(),
+ pImpl->rDocumentLocator->getLineNumber(),
+ pImpl->rDocumentLocator->getColumnNumber() );
+ pImpl->bExceptionWasThrown = true;
+ } else {
+ if( pImpl->rDTDHandler.is() ) {
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(
+ pImpl ,
+ rDTDHandler->unparsedEntityDecl(
+ XML_CHAR_TO_OUSTRING( entityName ),
+ XML_CHAR_TO_OUSTRING( publicId ) ,
+ XML_CHAR_TO_OUSTRING( systemId ) ,
+ XML_CHAR_TO_OUSTRING( notationName ) ) );
+ }
+ }
+}
+
+void SaxExpatParser_Impl::callbackNotationDecl(
+ void *pvThis, const XML_Char *notationName,
+ SAL_UNUSED_PARAMETER const XML_Char * /*base*/, const XML_Char *systemId,
+ const XML_Char *publicId)
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+ if( pImpl->rDTDHandler.is() ) {
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl,
+ rDTDHandler->notationDecl( XML_CHAR_TO_OUSTRING( notationName ) ,
+ XML_CHAR_TO_OUSTRING( publicId ) ,
+ XML_CHAR_TO_OUSTRING( systemId ) ) );
+ }
+
+}
+
+
+bool SaxExpatParser_Impl::callbackExternalEntityRef(
+ XML_Parser parser, const XML_Char *context,
+ SAL_UNUSED_PARAMETER const XML_Char * /*base*/, const XML_Char *systemId,
+ const XML_Char *publicId)
+{
+ bool bOK = true;
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(XML_GetUserData( parser ));
+
+ struct Entity entity;
+
+ if( pImpl->rEntityResolver.is() ) {
+ try
+ {
+ entity.structSource = pImpl->rEntityResolver->resolveEntity(
+ XML_CHAR_TO_OUSTRING( publicId ) ,
+ XML_CHAR_TO_OUSTRING( systemId ) );
+ }
+ catch( const SAXParseException & e )
+ {
+ pImpl->exception = e;
+ bOK = false;
+ }
+ catch( const SAXException & e )
+ {
+ pImpl->exception = SAXParseException(
+ e.Message , e.Context , e.WrappedException ,
+ pImpl->rDocumentLocator->getPublicId(),
+ pImpl->rDocumentLocator->getSystemId(),
+ pImpl->rDocumentLocator->getLineNumber(),
+ pImpl->rDocumentLocator->getColumnNumber() );
+ bOK = false;
+ }
+ }
+
+ if( entity.structSource.aInputStream.is() ) {
+ entity.pParser = XML_ExternalEntityParserCreate( parser , context, nullptr );
+ if( ! entity.pParser )
+ {
+ return false;
+ }
+
+ entity.converter.setInputStream( entity.structSource.aInputStream );
+ auto const xmlParser = entity.pParser;
+ pImpl->pushEntity( std::move(entity) );
+ try
+ {
+ pImpl->parse();
+ }
+ catch( const SAXParseException & e )
+ {
+ pImpl->exception = e;
+ bOK = false;
+ }
+ catch( const IOException &e )
+ {
+ pImpl->exception.WrappedException <<= e;
+ bOK = false;
+ }
+ catch( const css::uno::RuntimeException &e )
+ {
+ pImpl->exception.WrappedException <<=e;
+ bOK = false;
+ }
+
+ pImpl->popEntity();
+
+ XML_ParserFree( xmlParser );
+ }
+
+ return bOK;
+}
+
+int SaxExpatParser_Impl::callbackUnknownEncoding(
+ SAL_UNUSED_PARAMETER void * /*encodingHandlerData*/,
+ SAL_UNUSED_PARAMETER const XML_Char * /*name*/,
+ SAL_UNUSED_PARAMETER XML_Encoding * /*info*/)
+{
+ return 0;
+}
+
+void SaxExpatParser_Impl::callbackDefault( void *pvThis, const XML_Char *s, int len)
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl,
+ rExtendedDocumentHandler->unknown( XML_CHAR_N_TO_USTRING( s ,len) ) );
+}
+
+void SaxExpatParser_Impl::callbackComment( void *pvThis , const XML_Char *s )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl,
+ rExtendedDocumentHandler->comment( XML_CHAR_TO_OUSTRING( s ) ) );
+}
+
+void SaxExpatParser_Impl::callbackStartCDATA( void *pvThis )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl, rExtendedDocumentHandler->startCDATA() );
+}
+
+
+void SaxExpatParser_Impl::callErrorHandler( SaxExpatParser_Impl *pImpl ,
+ const SAXParseException & e )
+{
+ try
+ {
+ if( pImpl->rErrorHandler.is() ) {
+ css::uno::Any a;
+ a <<= e;
+ pImpl->rErrorHandler->error( a );
+ }
+ else {
+ pImpl->exception = e;
+ pImpl->bExceptionWasThrown = true;
+ }
+ }
+ catch( const SAXParseException & ex ) {
+ pImpl->exception = ex;
+ pImpl->bExceptionWasThrown = true;
+ }
+ catch( const SAXException & ex ) {
+ pImpl->exception = SAXParseException(
+ ex.Message,
+ ex.Context,
+ ex.WrappedException,
+ pImpl->rDocumentLocator->getPublicId(),
+ pImpl->rDocumentLocator->getSystemId(),
+ pImpl->rDocumentLocator->getLineNumber(),
+ pImpl->rDocumentLocator->getColumnNumber()
+ );
+ pImpl->bExceptionWasThrown = true;
+ }
+}
+
+void SaxExpatParser_Impl::callbackEndCDATA( void *pvThis )
+{
+ SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis);
+
+ CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(pImpl,rExtendedDocumentHandler->endCDATA() );
+}
+
+} // namespace
+
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
+com_sun_star_comp_extensions_xml_sax_ParserExpat_get_implementation(
+ css::uno::XComponentContext *,
+ css::uno::Sequence<css::uno::Any> const &)
+{
+ return cppu::acquire(new SaxExpatParser);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sax/source/expatwrap/saxwriter.cxx b/sax/source/expatwrap/saxwriter.cxx
new file mode 100644
index 0000000000..55608101fa
--- /dev/null
+++ b/sax/source/expatwrap/saxwriter.cxx
@@ -0,0 +1,1464 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <string.h>
+
+#include <cassert>
+#include <set>
+#include <stack>
+#include <vector>
+
+#include <com/sun/star/io/IOException.hpp>
+#include <com/sun/star/lang/WrappedTargetRuntimeException.hpp>
+#include <com/sun/star/lang/XServiceInfo.hpp>
+#include <com/sun/star/uno/XComponentContext.hpp>
+#include <com/sun/star/util/XCloneable.hpp>
+#include <com/sun/star/xml/sax/SAXInvalidCharacterException.hpp>
+#include <com/sun/star/xml/sax/XWriter.hpp>
+
+#include <cppuhelper/exc_hlp.hxx>
+#include <cppuhelper/weak.hxx>
+#include <cppuhelper/implbase.hxx>
+#include <cppuhelper/supportsservice.hxx>
+
+#include <osl/diagnose.h>
+#include <rtl/character.hxx>
+#include <sal/log.hxx>
+
+#include <memory>
+
+using namespace ::osl;
+using namespace ::cppu;
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::lang;
+using namespace ::com::sun::star::xml::sax;
+using namespace ::com::sun::star::util;
+using namespace ::com::sun::star::io;
+
+#define LINEFEED 10
+#define SEQUENCESIZE 1024
+#define MAXCOLUMNCOUNT 72
+
+/******
+*
+*
+* Character conversion functions
+*
+*
+*****/
+
+namespace
+{
+enum SaxInvalidCharacterError
+{
+ SAX_NONE,
+ SAX_WARNING,
+ SAX_ERROR
+};
+
+// Stuff for custom entity names
+struct ReplacementPair
+{
+ OUString name;
+ OUString replacement;
+};
+inline bool operator<(const ReplacementPair& lhs, const ReplacementPair& rhs)
+{
+ return lhs.replacement.compareTo(rhs.replacement) < 0;
+}
+
+class SaxWriterHelper
+{
+#ifdef DBG_UTIL
+public:
+ ::std::stack<OUString> m_DebugStartedElements;
+#endif
+
+private:
+ Reference<XOutputStream> m_out;
+ Sequence<sal_Int8> m_Sequence;
+ sal_Int8* mp_Sequence;
+
+ sal_Int32 nLastLineFeedPos; // is negative after writing a sequence
+ sal_uInt32 nCurrentPos;
+ bool m_bStartElementFinished;
+
+ std::vector<ReplacementPair> m_Replacements;
+
+ /// @throws SAXException
+ sal_uInt32 writeSequence();
+
+ // use only if to insert the bytes more space in the sequence is needed and
+ // so the sequence has to write out and reset rPos to 0
+ // writes sequence only on overflow, sequence could be full on the end (rPos == SEQUENCESIZE)
+ /// @throws SAXException
+ void AddBytes(sal_Int8* pTarget, sal_uInt32& rPos, const sal_Int8* pBytes,
+ sal_uInt32 nBytesCount);
+ /// @throws SAXException
+ bool convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen, bool bDoNormalization,
+ bool bNormalizeWhitespace, sal_Int8* pTarget, sal_uInt32& rPos);
+ /// @throws SAXException
+ void FinishStartElement();
+
+ // Search for the correct replacement
+ const ReplacementPair* findXMLReplacement(const sal_Unicode* pStr, sal_Int32 nStrLen);
+
+public:
+ explicit SaxWriterHelper(Reference<XOutputStream> const& m_TempOut)
+ : m_out(m_TempOut)
+ , m_Sequence(SEQUENCESIZE)
+ , mp_Sequence(nullptr)
+ , nLastLineFeedPos(0)
+ , nCurrentPos(0)
+ , m_bStartElementFinished(true)
+ {
+ OSL_ENSURE(SEQUENCESIZE > 50, "Sequence cache size too small");
+ mp_Sequence = m_Sequence.getArray();
+ }
+ ~SaxWriterHelper()
+ {
+ OSL_ENSURE(!nCurrentPos, "cached Sequence not written");
+ OSL_ENSURE(m_bStartElementFinished, "StartElement not completely written");
+ }
+
+ /// @throws SAXException
+ void insertIndentation(sal_uInt32 m_nLevel);
+
+ // returns whether it works correct or invalid characters were in the string
+ // If there are invalid characters in the string it returns sal_False.
+ // Than the calling method has to throw the needed Exception.
+ /// @throws SAXException
+ bool writeString(const OUString& rWriteOutString, bool bDoNormalization,
+ bool bNormalizeWhitespace);
+
+ sal_uInt32 GetLastColumnCount() const noexcept
+ {
+ return static_cast<sal_uInt32>(nCurrentPos - nLastLineFeedPos);
+ }
+
+ /// @throws SAXException
+ void startDocument();
+
+ // returns whether it works correct or invalid characters were in the strings
+ // If there are invalid characters in one of the strings it returns sal_False.
+ // Than the calling method has to throw the needed Exception.
+ /// @throws SAXException
+ SaxInvalidCharacterError startElement(const OUString& rName,
+ const Reference<XAttributeList>& xAttribs);
+ /// @throws SAXException
+ bool FinishEmptyElement();
+
+ // returns whether it works correct or invalid characters were in the string
+ // If there are invalid characters in the string it returns sal_False.
+ // Than the calling method has to throw the needed Exception.
+ /// @throws SAXException
+ bool endElement(const OUString& rName);
+ /// @throws SAXException
+ void endDocument();
+
+ // returns whether it works correct or invalid characters were in the strings
+ // If there are invalid characters in the string it returns sal_False.
+ // Than the calling method has to throw the needed Exception.
+ /// @throws SAXException
+ bool processingInstruction(const OUString& rTarget, const OUString& rData);
+ /// @throws SAXException
+ void startCDATA();
+ /// @throws SAXException
+ void endCDATA();
+
+ // returns whether it works correct or invalid characters were in the strings
+ // If there are invalid characters in the string it returns sal_False.
+ // Than the calling method has to throw the needed Exception.
+ /// @throws SAXException
+ bool comment(const OUString& rComment);
+
+ /// @throws SAXException
+ void clearBuffer();
+
+ // Use custom entity names
+ void setCustomEntityNames(
+ const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>&
+ replacements);
+
+ // Calculate length for convertToXML
+ sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization,
+ bool bNormalizeWhitespace);
+};
+
+const bool g_bValidCharsBelow32[32] = {
+ // clang-format off
+// 0 1 2 3 4 5 6 7
+ false, false, false, false, false, false, false, false, //0
+ false, true, true, false, false, true, false, false, //8
+ false, false, false, false, false, false, false, false, //16
+ false, false, false, false, false, false, false, false
+ // clang-format on
+};
+
+bool IsInvalidChar(const sal_Unicode aChar)
+{
+ bool bRet(false);
+ // check first for the most common characters
+ if (aChar < 32 || aChar >= 0xd800)
+ bRet = ((aChar < 32 && !g_bValidCharsBelow32[aChar]) || aChar == 0xffff || aChar == 0xfffe);
+ return bRet;
+}
+
+/********
+* write through to the output stream
+*
+*****/
+sal_uInt32 SaxWriterHelper::writeSequence()
+{
+ try
+ {
+ m_out->writeBytes(m_Sequence);
+ }
+ catch (const IOException&)
+ {
+ css::uno::Any anyEx = cppu::getCaughtException();
+ throw SAXException("IO exception during writing", Reference<XInterface>(), anyEx);
+ }
+ nLastLineFeedPos -= SEQUENCESIZE;
+ return 0;
+}
+
+void SaxWriterHelper::AddBytes(sal_Int8* pTarget, sal_uInt32& rPos, const sal_Int8* pBytes,
+ sal_uInt32 nBytesCount)
+{
+ OSL_ENSURE((rPos + nBytesCount) > SEQUENCESIZE, "wrong use of AddBytesMethod");
+ sal_uInt32 nCount(SEQUENCESIZE - rPos);
+ memcpy(&(pTarget[rPos]), pBytes, nCount);
+
+ OSL_ENSURE(rPos + nCount == SEQUENCESIZE, "the position should be the at the end");
+
+ rPos = writeSequence();
+ sal_uInt32 nRestCount(nBytesCount - nCount);
+ if ((rPos + nRestCount) <= SEQUENCESIZE)
+ {
+ memcpy(&(pTarget[rPos]), &pBytes[nCount], nRestCount);
+ rPos += nRestCount;
+ }
+ else
+ AddBytes(pTarget, rPos, &pBytes[nCount], nRestCount);
+}
+
+void SaxWriterHelper::setCustomEntityNames(
+ const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
+{
+ m_Replacements.resize(replacements.size());
+ for (size_t i = 0; i < replacements.size(); ++i)
+ {
+ m_Replacements[i].name = replacements[i].First;
+ m_Replacements[i].replacement = replacements[i].Second;
+ }
+ if (replacements.size() > 1)
+ std::sort(m_Replacements.begin(), m_Replacements.end());
+}
+
+/** Converts a UTF-16 string to UTF-8 and does XML normalization
+
+ @param pTarget
+ Pointer to a piece of memory, to where the output should be written. The caller
+ must call calcXMLByteLength on the same string, to ensure,
+ that there is enough memory for converting.
+ */
+bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen,
+ bool bDoNormalization, bool bNormalizeWhitespace,
+ sal_Int8* pTarget, sal_uInt32& rPos)
+{
+ bool bRet(true);
+ sal_uInt32 nSurrogate = 0;
+
+ for (sal_Int32 i = 0; i < nStrLen; i++)
+ {
+ sal_Unicode c = pStr[i];
+ if (IsInvalidChar(c))
+ bRet = false;
+ else if ((c >= 0x0001) && (c <= 0x007F)) // Deal with ascii
+ {
+ if (bDoNormalization)
+ {
+ switch (c)
+ {
+ case '&': // resemble to &amp;
+ {
+ if ((rPos + 5) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&amp;"), 5);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&amp;", 5);
+ rPos += 5;
+ }
+ }
+ break;
+ case '<':
+ {
+ if ((rPos + 4) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&lt;"), 4);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&lt;", 4);
+ rPos += 4; // &lt;
+ }
+ }
+ break;
+ case '>':
+ {
+ if ((rPos + 4) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&gt;"), 4);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&gt;", 4);
+ rPos += 4; // &gt;
+ }
+ }
+ break;
+ case '\'':
+ {
+ if ((rPos + 6) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&apos;"), 6);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&apos;", 6);
+ rPos += 6; // &apos;
+ }
+ }
+ break;
+ case '"':
+ {
+ if ((rPos + 6) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&quot;"), 6);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&quot;", 6);
+ rPos += 6; // &quot;
+ }
+ }
+ break;
+ case 13:
+ {
+ if ((rPos + 6) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&#x0d;"), 6);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&#x0d;", 6);
+ rPos += 6;
+ }
+ }
+ break;
+ case LINEFEED:
+ {
+ if (bNormalizeWhitespace)
+ {
+ if ((rPos + 6) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&#x0a;"),
+ 6);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&#x0a;", 6);
+ rPos += 6;
+ }
+ }
+ else
+ {
+ pTarget[rPos] = LINEFEED;
+ nLastLineFeedPos = rPos;
+ rPos++;
+ }
+ }
+ break;
+ case 9:
+ {
+ if (bNormalizeWhitespace)
+ {
+ if ((rPos + 6) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&#x09;"),
+ 6);
+ else
+ {
+ memcpy(&(pTarget[rPos]), "&#x09;", 6);
+ rPos += 6;
+ }
+ }
+ else
+ {
+ pTarget[rPos] = 9;
+ rPos++;
+ }
+ }
+ break;
+ default:
+ {
+ pTarget[rPos] = static_cast<sal_Int8>(c);
+ rPos++;
+ }
+ break;
+ }
+ }
+ else
+ {
+ pTarget[rPos] = static_cast<sal_Int8>(c);
+ if (static_cast<sal_Int8>(c) == LINEFEED)
+ nLastLineFeedPos = rPos;
+ rPos++;
+ }
+ }
+ else
+ {
+ // Deal with replacements
+ if (bDoNormalization && !m_Replacements.empty())
+ {
+ // search
+ const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i);
+
+ // replace
+ if (it != nullptr)
+ {
+ OString name = ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8);
+ if (rPos + name.getLength() > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>(name.getStr()),
+ name.getLength());
+ else
+ {
+ memcpy(&(pTarget[rPos]), name.getStr(), name.getLength());
+ rPos += name.getLength();
+ }
+ i += it->replacement.getLength() - 1;
+ continue;
+ }
+ }
+
+ // Deal with other unicode cases
+ if (rtl::isHighSurrogate(c))
+ {
+ // 1. surrogate: save (until 2. surrogate)
+ if (nSurrogate != 0) // left-over lone 1st Unicode surrogate
+ {
+ OSL_FAIL("left-over Unicode surrogate");
+ bRet = false;
+ }
+ nSurrogate = c;
+ }
+ else if (rtl::isLowSurrogate(c))
+ {
+ // 2. surrogate: write as UTF-8
+ if (nSurrogate) // can only be 1st surrogate
+ {
+ nSurrogate = rtl::combineSurrogates(nSurrogate, c);
+ sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)),
+ sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)),
+ sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)),
+ sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) };
+ if ((rPos + 4) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, aBytes, 4);
+ else
+ {
+ pTarget[rPos] = aBytes[0];
+ rPos++;
+ pTarget[rPos] = aBytes[1];
+ rPos++;
+ pTarget[rPos] = aBytes[2];
+ rPos++;
+ pTarget[rPos] = aBytes[3];
+ rPos++;
+ }
+ }
+ else // lone 2nd surrogate
+ {
+ OSL_FAIL("illegal Unicode character");
+ bRet = false;
+ }
+
+ // reset surrogate
+ nSurrogate = 0;
+ }
+ else if (c > 0x07FF)
+ {
+ sal_Int8 aBytes[]
+ = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)),
+ sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
+ if ((rPos + 3) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, aBytes, 3);
+ else
+ {
+ pTarget[rPos] = aBytes[0];
+ rPos++;
+ pTarget[rPos] = aBytes[1];
+ rPos++;
+ pTarget[rPos] = aBytes[2];
+ rPos++;
+ }
+ }
+ else
+ {
+ sal_Int8 aBytes[]
+ = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) };
+ if ((rPos + 2) > SEQUENCESIZE)
+ AddBytes(pTarget, rPos, aBytes, 2);
+ else
+ {
+ pTarget[rPos] = aBytes[0];
+ rPos++;
+ pTarget[rPos] = aBytes[1];
+ rPos++;
+ }
+ }
+ }
+
+ OSL_ENSURE(rPos <= SEQUENCESIZE, "not reset current position");
+ if (rPos == SEQUENCESIZE)
+ rPos = writeSequence();
+
+ // reset left-over surrogate
+ if ((nSurrogate != 0) && !rtl::isHighSurrogate(c))
+ {
+ OSL_FAIL("left-over Unicode surrogate");
+ nSurrogate = 0;
+ bRet = false;
+ }
+ }
+ if (nSurrogate != 0) // trailing lone 1st surrogate
+ {
+ OSL_FAIL("left-over Unicode surrogate");
+ bRet = false;
+ }
+ return bRet;
+}
+
+void SaxWriterHelper::FinishStartElement()
+{
+ if (!m_bStartElementFinished)
+ {
+ mp_Sequence[nCurrentPos] = '>';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ m_bStartElementFinished = true;
+ }
+}
+
+void SaxWriterHelper::insertIndentation(sal_uInt32 m_nLevel)
+{
+ FinishStartElement();
+ if (m_nLevel > 0)
+ {
+ if ((nCurrentPos + m_nLevel + 1) <= SEQUENCESIZE)
+ {
+ mp_Sequence[nCurrentPos] = LINEFEED;
+ nLastLineFeedPos = nCurrentPos;
+ nCurrentPos++;
+ memset(&(mp_Sequence[nCurrentPos]), 32, m_nLevel);
+ nCurrentPos += m_nLevel;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ }
+ else
+ {
+ sal_uInt32 nCount(m_nLevel + 1);
+ std::unique_ptr<sal_Int8[]> pBytes(new sal_Int8[nCount]);
+ pBytes[0] = LINEFEED;
+ memset(&(pBytes[1]), 32, m_nLevel);
+ AddBytes(mp_Sequence, nCurrentPos, pBytes.get(), nCount);
+ pBytes.reset();
+ nLastLineFeedPos = nCurrentPos - nCount;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ }
+ }
+ else
+ {
+ mp_Sequence[nCurrentPos] = LINEFEED;
+ nLastLineFeedPos = nCurrentPos;
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ }
+}
+
+bool SaxWriterHelper::writeString(const OUString& rWriteOutString, bool bDoNormalization,
+ bool bNormalizeWhitespace)
+{
+ FinishStartElement();
+ return convertToXML(rWriteOutString.getStr(), rWriteOutString.getLength(), bDoNormalization,
+ bNormalizeWhitespace, mp_Sequence, nCurrentPos);
+}
+
+void SaxWriterHelper::startDocument()
+{
+ const char pc[] = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+ const int nLen = strlen(pc);
+ if ((nCurrentPos + nLen) <= SEQUENCESIZE)
+ {
+ memcpy(mp_Sequence, pc, nLen);
+ nCurrentPos += nLen;
+ }
+ else
+ {
+ AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const*>(pc), nLen);
+ }
+ OSL_ENSURE(nCurrentPos <= SEQUENCESIZE, "not reset current position");
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = LINEFEED;
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+}
+
+#ifndef NDEBUG
+bool inrange(sal_Unicode c, sal_Unicode start, sal_Unicode end) { return c >= start && c <= end; }
+#endif
+
+void CheckValidName(OUString const& rName)
+{
+#ifdef NDEBUG
+ (void)rName;
+#else
+ assert(!rName.isEmpty());
+ bool hasColon(false);
+ for (sal_Int32 i = 0; i < rName.getLength(); ++i)
+ {
+ auto const c(rName[i]);
+ if (c == ':')
+ {
+ // see https://www.w3.org/TR/REC-xml-names/#ns-qualnames
+ SAL_WARN_IF(hasColon, "sax", "only one colon allowed: " << rName);
+ assert(!hasColon && "only one colon allowed");
+ hasColon = true;
+ }
+ else if (!rtl::isAsciiAlphanumeric(c) && c != '_' && c != '-' && c != '.'
+ && !inrange(c, 0x00C0, 0x00D6) && !inrange(c, 0x00D8, 0x00F6)
+ && !inrange(c, 0x00F8, 0x02FF) && !inrange(c, 0x0370, 0x037D)
+ && !inrange(c, 0x037F, 0x1FFF) && !inrange(c, 0x200C, 0x200D)
+ && !inrange(c, 0x2070, 0x218F) && !inrange(c, 0x2C00, 0x2FEF)
+ && !inrange(c, 0x3001, 0xD7FF) && !inrange(c, 0xF900, 0xFDCF)
+ && !inrange(c, 0xFDF0, 0xFFFD) && c != 0x00B7 && !inrange(c, 0x0300, 0x036F)
+ && !inrange(c, 0x203F, 0x2040))
+ {
+ // https://www.w3.org/TR/xml11/#NT-NameChar
+ // (currently we don't warn about invalid start chars)
+ SAL_WARN("sax", "unexpected character in attribute name: " << rName);
+ assert(!"unexpected character in attribute name");
+ }
+ }
+#endif
+}
+
+SaxInvalidCharacterError SaxWriterHelper::startElement(const OUString& rName,
+ const Reference<XAttributeList>& xAttribs)
+{
+ FinishStartElement();
+
+#ifdef DBG_UTIL
+ m_DebugStartedElements.push(rName);
+ ::std::set<OUString> DebugAttributes;
+#endif
+
+ mp_Sequence[nCurrentPos] = '<';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ SaxInvalidCharacterError eRet(SAX_NONE);
+ CheckValidName(rName);
+ if (!writeString(rName, false, false))
+ eRet = SAX_ERROR;
+
+ sal_Int16 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0;
+ for (sal_Int16 i = 0; i < nAttribCount; i++)
+ {
+ mp_Sequence[nCurrentPos] = ' ';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ OUString const& rAttrName(xAttribs->getNameByIndex(i));
+#ifdef DBG_UTIL
+ // Well-formedness constraint: Unique Att Spec
+ assert(DebugAttributes.find(rAttrName) == DebugAttributes.end());
+ DebugAttributes.insert(rAttrName);
+#endif
+ CheckValidName(rAttrName);
+ if (!writeString(rAttrName, false, false))
+ eRet = SAX_ERROR;
+
+ mp_Sequence[nCurrentPos] = '=';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '"';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ if (!writeString(xAttribs->getValueByIndex(i), true, true) && eRet != SAX_ERROR)
+ eRet = SAX_WARNING;
+
+ mp_Sequence[nCurrentPos] = '"';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ }
+
+ m_bStartElementFinished = false; // because the '>' character is not added,
+ // because it is possible, that the "/>"
+ // characters have to add
+ return eRet;
+}
+
+bool SaxWriterHelper::FinishEmptyElement()
+{
+ if (m_bStartElementFinished)
+ return false;
+
+ mp_Sequence[nCurrentPos] = '/';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '>';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ m_bStartElementFinished = true;
+
+ return true;
+}
+
+bool SaxWriterHelper::endElement(const OUString& rName)
+{
+ FinishStartElement();
+
+ mp_Sequence[nCurrentPos] = '<';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '/';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ CheckValidName(rName);
+ bool bRet(writeString(rName, false, false));
+
+ mp_Sequence[nCurrentPos] = '>';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ return bRet;
+}
+
+void SaxWriterHelper::endDocument()
+{
+ if (nCurrentPos > 0)
+ {
+ m_Sequence.realloc(nCurrentPos);
+ nCurrentPos = writeSequence();
+ //m_Sequence.realloc(SEQUENCESIZE);
+ }
+}
+
+void SaxWriterHelper::clearBuffer()
+{
+ FinishStartElement();
+ if (nCurrentPos > 0)
+ {
+ m_Sequence.realloc(nCurrentPos);
+ nCurrentPos = writeSequence();
+ m_Sequence.realloc(SEQUENCESIZE);
+ // Be sure to update the array pointer after the reallocation.
+ mp_Sequence = m_Sequence.getArray();
+ }
+}
+
+bool SaxWriterHelper::processingInstruction(const OUString& rTarget, const OUString& rData)
+{
+ FinishStartElement();
+ mp_Sequence[nCurrentPos] = '<';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '?';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ bool bRet(writeString(rTarget, false, false));
+
+ mp_Sequence[nCurrentPos] = ' ';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ if (!writeString(rData, false, false))
+ bRet = false;
+
+ mp_Sequence[nCurrentPos] = '?';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '>';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ return bRet;
+}
+
+void SaxWriterHelper::startCDATA()
+{
+ FinishStartElement();
+ if ((nCurrentPos + 9) <= SEQUENCESIZE)
+ {
+ memcpy(&(mp_Sequence[nCurrentPos]), "<![CDATA[", 9);
+ nCurrentPos += 9;
+ }
+ else
+ AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const*>("<![CDATA["), 9);
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+}
+
+void SaxWriterHelper::endCDATA()
+{
+ FinishStartElement();
+ if ((nCurrentPos + 3) <= SEQUENCESIZE)
+ {
+ memcpy(&(mp_Sequence[nCurrentPos]), "]]>", 3);
+ nCurrentPos += 3;
+ }
+ else
+ AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const*>("]]>"), 3);
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+}
+
+bool SaxWriterHelper::comment(const OUString& rComment)
+{
+ FinishStartElement();
+ mp_Sequence[nCurrentPos] = '<';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '!';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '-';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '-';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ bool bRet(writeString(rComment, false, false));
+
+ mp_Sequence[nCurrentPos] = '-';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '-';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+ mp_Sequence[nCurrentPos] = '>';
+ nCurrentPos++;
+ if (nCurrentPos == SEQUENCESIZE)
+ nCurrentPos = writeSequence();
+
+ return bRet;
+}
+
+sal_Int32 SaxWriterHelper::calcXMLByteLength(const OUString& rStr, bool bDoNormalization,
+ bool bNormalizeWhitespace)
+{
+ sal_Int32 nOutputLength = 0;
+ sal_uInt32 nSurrogate = 0;
+
+ const sal_Unicode* pStr = rStr.getStr();
+ sal_Int32 nStrLen = rStr.getLength();
+ for (sal_Int32 i = 0; i < nStrLen; i++)
+ {
+ sal_uInt16 c = pStr[i];
+ if (!IsInvalidChar(c) && (c >= 0x0001) && (c <= 0x007F))
+ {
+ if (bDoNormalization)
+ {
+ switch (c)
+ {
+ case '&': // resemble to &amp;
+ nOutputLength += 5;
+ break;
+ case '<': // &lt;
+ case '>': // &gt;
+ nOutputLength += 4;
+ break;
+ case '\'': // &apos;
+ case '"': // &quot;
+ case 13: // &#x0d;
+ nOutputLength += 6;
+ break;
+
+ case 10: // &#x0a;
+ case 9: // &#x09;
+ if (bNormalizeWhitespace)
+ {
+ nOutputLength += 6;
+ }
+ else
+ {
+ nOutputLength++;
+ }
+ break;
+ default:
+ nOutputLength++;
+ }
+ }
+ else
+ {
+ nOutputLength++;
+ }
+ }
+ else
+ {
+ // Deal with replacements
+ if (bDoNormalization && !m_Replacements.empty())
+ {
+ // search
+ const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i);
+
+ if (it != nullptr)
+ {
+ nOutputLength
+ += ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8).getLength();
+ i += it->replacement.getLength() - 1;
+ continue;
+ }
+ }
+
+ // Deal with other unicode cases
+ if (rtl::isHighSurrogate(c))
+ {
+ // save surrogate
+ nSurrogate = c;
+ }
+ else if (rtl::isLowSurrogate(c))
+ {
+ // 2. surrogate: write as UTF-8 (if range is OK
+ if (nSurrogate)
+ nOutputLength += 4;
+ nSurrogate = 0;
+ }
+ else if (c > 0x07FF)
+ {
+ nOutputLength += 3;
+ }
+ else
+ {
+ nOutputLength += 2;
+ }
+ }
+
+ // surrogate processing
+ if ((nSurrogate != 0) && !rtl::isHighSurrogate(c))
+ nSurrogate = 0;
+ }
+
+ return nOutputLength;
+}
+
+const ReplacementPair* SaxWriterHelper::findXMLReplacement(const sal_Unicode* pStr,
+ sal_Int32 nStrLen)
+{
+ for (size_t iter = 0; iter < m_Replacements.size(); ++iter)
+ {
+ if (m_Replacements[iter].replacement.getLength() > nStrLen)
+ continue;
+ sal_Int32 matches = m_Replacements[iter].replacement.compareTo(
+ std::u16string_view(pStr, m_Replacements[iter].replacement.getLength()));
+ if (matches == 0)
+ return &m_Replacements[iter];
+ if (matches > 0)
+ return nullptr;
+ }
+ return nullptr;
+}
+
+class SAXWriter : public WeakImplHelper<XWriter, XServiceInfo>
+{
+public:
+ SAXWriter()
+ : m_bDocStarted(false)
+ , m_bIsCDATA(false)
+ , m_bForceLineBreak(false)
+ , m_bAllowLineBreak(false)
+ , m_nLevel(0)
+ {
+ }
+
+public: // XActiveDataSource
+ virtual void SAL_CALL setOutputStream(const Reference<XOutputStream>& aStream) override
+ {
+ try
+ {
+ // temporary: set same stream again to clear buffer
+ if (m_out == aStream && m_pSaxWriterHelper && m_bDocStarted)
+ m_pSaxWriterHelper->clearBuffer();
+ else
+ {
+ m_out = aStream;
+ m_pSaxWriterHelper.reset(new SaxWriterHelper(m_out));
+ m_bDocStarted = false;
+ m_nLevel = 0;
+ m_bIsCDATA = false;
+ }
+ }
+ catch (const SAXException& e)
+ {
+ throw css::lang::WrappedTargetRuntimeException(e.Message, getXWeak(),
+ e.WrappedException);
+ }
+ }
+ virtual Reference<XOutputStream> SAL_CALL getOutputStream() override { return m_out; }
+
+public: // XDocumentHandler
+ virtual void SAL_CALL startDocument() override;
+
+ virtual void SAL_CALL endDocument() override;
+
+ virtual void SAL_CALL startElement(const OUString& aName,
+ const Reference<XAttributeList>& xAttribs) override;
+
+ virtual void SAL_CALL endElement(const OUString& aName) override;
+
+ virtual void SAL_CALL characters(const OUString& aChars) override;
+
+ virtual void SAL_CALL ignorableWhitespace(const OUString& aWhitespaces) override;
+ virtual void SAL_CALL processingInstruction(const OUString& aTarget,
+ const OUString& aData) override;
+ virtual void SAL_CALL setDocumentLocator(const Reference<XLocator>& xLocator) override;
+ virtual void SAL_CALL setCustomEntityNames(
+ const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>&
+ replacements) override;
+
+public: // XExtendedDocumentHandler
+ virtual void SAL_CALL startCDATA() override;
+ virtual void SAL_CALL endCDATA() override;
+ virtual void SAL_CALL comment(const OUString& sComment) override;
+ virtual void SAL_CALL unknown(const OUString& sString) override;
+ virtual void SAL_CALL allowLineBreak() override;
+
+public: // XServiceInfo
+ OUString SAL_CALL getImplementationName() override;
+ Sequence<OUString> SAL_CALL getSupportedServiceNames() override;
+ sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override;
+
+private:
+ sal_Int32 getIndentPrefixLength(sal_Int32 nFirstLineBreakOccurrence) noexcept;
+
+ Reference<XOutputStream> m_out;
+ std::unique_ptr<SaxWriterHelper> m_pSaxWriterHelper;
+
+ // Status information
+ bool m_bDocStarted : 1;
+ bool m_bIsCDATA : 1;
+ bool m_bForceLineBreak : 1;
+ bool m_bAllowLineBreak : 1;
+ sal_Int32 m_nLevel;
+};
+
+sal_Int32 SAXWriter::getIndentPrefixLength(sal_Int32 nFirstLineBreakOccurrence) noexcept
+{
+ sal_Int32 nLength = -1;
+ if (m_pSaxWriterHelper)
+ {
+ if (m_bForceLineBreak
+ || (m_bAllowLineBreak
+ && ((nFirstLineBreakOccurrence + m_pSaxWriterHelper->GetLastColumnCount())
+ > MAXCOLUMNCOUNT)))
+ nLength = m_nLevel;
+ }
+ m_bForceLineBreak = false;
+ m_bAllowLineBreak = false;
+ return nLength;
+}
+
+bool isFirstCharWhitespace(const sal_Unicode* p) noexcept { return *p == ' '; }
+
+// XServiceInfo
+OUString SAXWriter::getImplementationName() { return "com.sun.star.extensions.xml.sax.Writer"; }
+
+// XServiceInfo
+sal_Bool SAXWriter::supportsService(const OUString& ServiceName)
+{
+ return cppu::supportsService(this, ServiceName);
+}
+
+// XServiceInfo
+Sequence<OUString> SAXWriter::getSupportedServiceNames()
+{
+ return { "com.sun.star.xml.sax.Writer" };
+}
+
+void SAXWriter::startDocument()
+{
+ if (m_bDocStarted || !m_out.is() || !m_pSaxWriterHelper)
+ {
+ throw SAXException();
+ }
+ m_bDocStarted = true;
+ m_pSaxWriterHelper->startDocument();
+}
+
+void SAXWriter::endDocument()
+{
+ if (!m_bDocStarted)
+ {
+ throw SAXException("endDocument called before startDocument", Reference<XInterface>(),
+ Any());
+ }
+ if (m_nLevel)
+ {
+ throw SAXException("unexpected end of document", Reference<XInterface>(), Any());
+ }
+ m_pSaxWriterHelper->endDocument();
+ try
+ {
+ m_out->closeOutput();
+ }
+ catch (const IOException&)
+ {
+ css::uno::Any anyEx = cppu::getCaughtException();
+ throw SAXException("IO exception during closing the IO Stream", Reference<XInterface>(),
+ anyEx);
+ }
+}
+
+void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeList>& xAttribs)
+{
+ if (!m_bDocStarted)
+ {
+ throw SAXException("startElement called before startDocument", {}, {});
+ }
+ if (m_bIsCDATA)
+ {
+ throw SAXException("startElement call not allowed with CDATA sections", {}, {});
+ }
+
+ sal_Int32 nLength(0);
+ if (m_bAllowLineBreak)
+ {
+ sal_Int32 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0;
+
+ nLength++; // "<"
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); // the tag name
+
+ sal_Int16 n;
+ for (n = 0; n < static_cast<sal_Int16>(nAttribCount); n++)
+ {
+ nLength++; // " "
+ OUString tmp = xAttribs->getNameByIndex(n);
+
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, false, false);
+
+ nLength += 2; // ="
+
+ tmp = xAttribs->getValueByIndex(n);
+
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, true, true);
+
+ nLength += 1; // "
+ }
+
+ nLength++; // '>'
+ }
+
+ // Is there a new indentation necessary ?
+ sal_Int32 nPrefix(getIndentPrefixLength(nLength));
+
+ // write into sequence
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ SaxInvalidCharacterError eRet(m_pSaxWriterHelper->startElement(aName, xAttribs));
+
+ m_nLevel++;
+
+ if (eRet == SAX_WARNING)
+ {
+ throw SAXInvalidCharacterException(
+ "Invalid character during XML-Export in an attribute value", {}, {});
+ }
+ else if (eRet == SAX_ERROR)
+ {
+ throw SAXException("Invalid character during XML-Export", {}, {});
+ }
+}
+
+void SAXWriter::endElement(const OUString& aName)
+{
+ if (!m_bDocStarted)
+ {
+ throw SAXException();
+ }
+ m_nLevel--;
+
+ if (m_nLevel < 0)
+ {
+ throw SAXException();
+ }
+ bool bRet(true);
+
+ // check here because Helper's endElement is not always called
+#ifdef DBG_UTIL
+ assert(!m_pSaxWriterHelper->m_DebugStartedElements.empty());
+ // Well-formedness constraint: Element Type Match
+ assert(aName == m_pSaxWriterHelper->m_DebugStartedElements.top());
+ m_pSaxWriterHelper->m_DebugStartedElements.pop();
+#endif
+
+ if (m_pSaxWriterHelper->FinishEmptyElement())
+ m_bForceLineBreak = false;
+ else
+ {
+ // only ascii chars allowed
+ sal_Int32 nLength(0);
+ if (m_bAllowLineBreak)
+ nLength = 3 + m_pSaxWriterHelper->calcXMLByteLength(aName, false, false);
+ sal_Int32 nPrefix = getIndentPrefixLength(nLength);
+
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ bRet = m_pSaxWriterHelper->endElement(aName);
+ }
+
+ if (!bRet)
+ {
+ throw SAXException("Invalid character during XML-Export", {}, {});
+ }
+}
+
+void SAXWriter::characters(const OUString& aChars)
+{
+ if (!m_bDocStarted)
+ {
+ throw SAXException("characters method called before startDocument", {}, {});
+ }
+
+ bool bThrowException(false);
+ if (!aChars.isEmpty())
+ {
+ if (m_bIsCDATA)
+ bThrowException = !m_pSaxWriterHelper->writeString(aChars, false, false);
+ else
+ {
+ // Note : nFirstLineBreakOccurrence is not exact, because we don't know, how
+ // many 2 and 3 byte chars are inbetween. However this whole stuff
+ // is eitherway for pretty printing only, so it does not need to be exact.
+ sal_Int32 nLength(0);
+ sal_Int32 nIndentPrefix(-1);
+ if (m_bAllowLineBreak)
+ {
+ // returns position of first ascii 10 within the string, -1 when no 10 in string.
+ sal_Int32 nFirstLineBreakOccurrence = aChars.indexOf(LINEFEED);
+
+ nLength = m_pSaxWriterHelper->calcXMLByteLength(aChars, !m_bIsCDATA, false);
+ nIndentPrefix = getIndentPrefixLength(
+ nFirstLineBreakOccurrence >= 0 ? nFirstLineBreakOccurrence : nLength);
+ }
+ else
+ nIndentPrefix = getIndentPrefixLength(nLength);
+
+ // insert indentation
+ if (nIndentPrefix >= 0)
+ {
+ if (isFirstCharWhitespace(aChars.getStr()))
+ m_pSaxWriterHelper->insertIndentation(nIndentPrefix - 1);
+ else
+ m_pSaxWriterHelper->insertIndentation(nIndentPrefix);
+ }
+ bThrowException = !m_pSaxWriterHelper->writeString(aChars, true, false);
+ }
+ }
+ if (bThrowException)
+ {
+ throw SAXInvalidCharacterException("Invalid character during XML-Export", {}, {});
+ }
+}
+
+void SAXWriter::ignorableWhitespace(const OUString&)
+{
+ if (!m_bDocStarted)
+ {
+ throw SAXException();
+ }
+
+ m_bForceLineBreak = true;
+}
+
+void SAXWriter::processingInstruction(const OUString& aTarget, const OUString& aData)
+{
+ if (!m_bDocStarted || m_bIsCDATA)
+ {
+ throw SAXException();
+ }
+
+ sal_Int32 nLength(0);
+ if (m_bAllowLineBreak)
+ {
+ nLength = 2; // "<?"
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(aTarget, false, false);
+
+ nLength += 1; // " "
+
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(aData, false, false);
+
+ nLength += 2; // "?>"
+ }
+
+ sal_Int32 nPrefix = getIndentPrefixLength(nLength);
+
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ if (!m_pSaxWriterHelper->processingInstruction(aTarget, aData))
+ {
+ throw SAXException("Invalid character during XML-Export", {}, {});
+ }
+}
+
+void SAXWriter::setDocumentLocator(const Reference<XLocator>&) {}
+
+void SAXWriter::setCustomEntityNames(
+ const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
+{
+ m_pSaxWriterHelper->setCustomEntityNames(replacements);
+}
+
+void SAXWriter::startCDATA()
+{
+ if (!m_bDocStarted || m_bIsCDATA)
+ {
+ throw SAXException();
+ }
+
+ sal_Int32 nPrefix = getIndentPrefixLength(9);
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ m_pSaxWriterHelper->startCDATA();
+
+ m_bIsCDATA = true;
+}
+
+void SAXWriter::endCDATA()
+{
+ if (!m_bDocStarted || !m_bIsCDATA)
+ {
+ throw SAXException("endCDATA was called without startCDATA", {}, {});
+ }
+
+ sal_Int32 nPrefix = getIndentPrefixLength(3);
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ m_pSaxWriterHelper->endCDATA();
+
+ m_bIsCDATA = false;
+}
+
+void SAXWriter::comment(const OUString& sComment)
+{
+ if (!m_bDocStarted || m_bIsCDATA)
+ {
+ throw SAXException();
+ }
+
+ sal_Int32 nLength(0);
+ if (m_bAllowLineBreak)
+ {
+ nLength = 4; // "<!--"
+ nLength += m_pSaxWriterHelper->calcXMLByteLength(sComment, false, false);
+
+ nLength += 3;
+ }
+
+ sal_Int32 nPrefix = getIndentPrefixLength(nLength);
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ if (!m_pSaxWriterHelper->comment(sComment))
+ {
+ throw SAXException("Invalid character during XML-Export", {}, {});
+ }
+}
+
+void SAXWriter::allowLineBreak()
+{
+ if (!m_bDocStarted || m_bAllowLineBreak)
+ {
+ throw SAXException();
+ }
+
+ m_bAllowLineBreak = true;
+}
+
+void SAXWriter::unknown(const OUString& sString)
+{
+ if (!m_bDocStarted)
+ {
+ throw SAXException();
+ }
+ if (m_bIsCDATA)
+ {
+ throw SAXException();
+ }
+
+ if (sString.startsWith("<?xml"))
+ return;
+
+ sal_Int32 nLength(0);
+ if (m_bAllowLineBreak)
+ nLength = m_pSaxWriterHelper->calcXMLByteLength(sString, false, false);
+
+ sal_Int32 nPrefix = getIndentPrefixLength(nLength);
+ if (nPrefix >= 0)
+ m_pSaxWriterHelper->insertIndentation(nPrefix);
+
+ if (!m_pSaxWriterHelper->writeString(sString, false, false))
+ {
+ throw SAXException("Invalid character during XML-Export", {}, {});
+ }
+}
+
+} // namespace
+
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
+com_sun_star_extensions_xml_sax_Writer_get_implementation(css::uno::XComponentContext*,
+ css::uno::Sequence<css::uno::Any> const&)
+{
+ return cppu::acquire(new SAXWriter);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sax/source/expatwrap/xml2utf.cxx b/sax/source/expatwrap/xml2utf.cxx
new file mode 100644
index 0000000000..5b3e4b9e1c
--- /dev/null
+++ b/sax/source/expatwrap/xml2utf.cxx
@@ -0,0 +1,519 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+#include <string.h>
+
+#include <algorithm>
+
+#include <sal/types.h>
+
+#include <rtl/textenc.h>
+#include <rtl/tencinfo.h>
+#include <com/sun/star/io/NotConnectedException.hpp>
+#include <com/sun/star/io/XInputStream.hpp>
+#include <xml2utf.hxx>
+#include <memory>
+
+
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::io;
+
+
+namespace sax_expatwrap {
+
+sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
+{
+ if( ! m_in.is() ) {
+ throw NotConnectedException();
+ }
+ if( ! m_bStarted ) {
+ // it should be possible to find the encoding attribute
+ // within the first 512 bytes == 128 chars in UCS-4
+ nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
+ }
+
+ sal_Int32 nRead;
+ Sequence< sal_Int8 > seqStart;
+ while( true )
+ {
+ nRead = m_in->readSomeBytes( seq , nMaxToRead );
+
+ if( nRead + seqStart.getLength())
+ {
+ // if nRead is 0, the file is already eof.
+ if( ! m_bStarted && nRead )
+ {
+ // ensure that enough data is available to parse encoding
+ if( seqStart.hasElements() )
+ {
+ // prefix with what we had so far.
+ sal_Int32 nLength = seq.getLength();
+ seq.realloc( seqStart.getLength() + nLength );
+
+ memmove (seq.getArray() + seqStart.getLength(),
+ seq.getConstArray(),
+ nLength);
+ memcpy (seq.getArray(),
+ seqStart.getConstArray(),
+ seqStart.getLength());
+ }
+
+ // autodetection with the first bytes
+ if( ! isEncodingRecognizable( seq ) )
+ {
+ // remember what we have so far.
+ seqStart = seq;
+
+ // read more !
+ continue;
+ }
+ if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
+ // initialize decoding
+ initializeDecoding();
+ }
+ seqStart = Sequence < sal_Int8 > ();
+ }
+
+ // do the encoding
+ if( m_pText2Unicode && m_pUnicode2Text &&
+ m_pText2Unicode->canContinue() ) {
+
+ Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
+ seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
+ }
+
+ if( ! m_bStarted )
+ {
+ // it must now be ensured, that no encoding attribute exist anymore
+ // ( otherwise the expat-Parser will crash )
+ // This must be done after decoding !
+ // ( e.g. Files decoded in ucs-4 cannot be read properly )
+ m_bStarted = true;
+ removeEncoding( seq );
+ }
+ nRead = seq.getLength();
+ }
+
+ break;
+ }
+ return nRead;
+}
+
+void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
+{
+ const sal_Int8 *pSource = seq.getArray();
+ if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))
+ return;
+
+ // scan for encoding
+ OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
+
+ // cut sequence to first line break
+ // find first line break;
+ int nMax = str.indexOf( 10 );
+ if( nMax >= 0 )
+ {
+ str = str.copy( 0 , nMax );
+ }
+
+ int nFound = str.indexOf( " encoding" );
+ if( nFound < 0 ) return;
+
+ int nStop;
+ int nStart = str.indexOf( "\"" , nFound );
+ if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
+ {
+ nStart = str.indexOf( "'" , nFound );
+ nStop = str.indexOf( "'" , nStart +1 );
+ }
+ else
+ {
+ nStop = str.indexOf( "\"" , nStart +1);
+ }
+
+ if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
+ {
+ // remove encoding tag from file
+ memmove( &( seq.getArray()[nFound] ) ,
+ &( seq.getArray()[nStop+1]) ,
+ seq.getLength() - nStop -1);
+ seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
+ }
+}
+
+// Checks, if enough data has been accumulated to recognize the encoding
+bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
+{
+ const sal_Int8 *pSource = seq.getConstArray();
+ bool bCheckIfFirstClosingBracketExists = false;
+
+ if( seq.getLength() < 8 ) {
+ // no recognition possible, when less than 8 bytes are available
+ return false;
+ }
+
+ if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {
+ // scan if the <?xml tag finishes within this buffer
+ bCheckIfFirstClosingBracketExists = true;
+ }
+ else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
+ ('?' == pSource[4] || '?' == pSource[6] ) )
+ {
+ // check for utf-16
+ bCheckIfFirstClosingBracketExists = true;
+ }
+ else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
+ ( '?' == pSource[5] || '?' == pSource[7] ) )
+ {
+ // check for
+ bCheckIfFirstClosingBracketExists = true;
+ }
+
+ if( bCheckIfFirstClosingBracketExists )
+ {
+ // whole <?xml tag is valid
+ return std::find(seq.begin(), seq.end(), '>') != seq.end();
+ }
+
+ // No <? tag in front, no need for a bigger buffer
+ return true;
+}
+
+bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
+{
+ const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
+ bool bReturn = true;
+
+ if( seq.getLength() < 4 ) {
+ // no recognition possible, when less than 4 bytes are available
+ return false;
+ }
+
+ // first level : detect possible file formats
+ if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {
+ // scan for encoding
+ OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
+
+ // cut sequence to first line break
+ //find first line break;
+ int nMax = str.indexOf( 10 );
+ if( nMax >= 0 )
+ {
+ str = str.copy( 0 , nMax );
+ }
+
+ int nFound = str.indexOf( " encoding" );
+ if( nFound >= 0 ) {
+ int nStop;
+ int nStart = str.indexOf( "\"" , nFound );
+ if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
+ {
+ nStart = str.indexOf( "'" , nFound );
+ nStop = str.indexOf( "'" , nStart +1 );
+ }
+ else
+ {
+ nStop = str.indexOf( "\"" , nStart +1);
+ }
+ if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
+ {
+ // encoding found finally
+ m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
+ }
+ }
+ }
+ else if( 0xFE == pSource[0] &&
+ 0xFF == pSource[1] ) {
+ // UTF-16 big endian
+ // conversion is done so that encoding information can be easily extracted
+ m_sEncoding = "utf-16"_ostr;
+ }
+ else if( 0xFF == pSource[0] &&
+ 0xFE == pSource[1] ) {
+ // UTF-16 little endian
+ // conversion is done so that encoding information can be easily extracted
+ m_sEncoding = "utf-16"_ostr;
+ }
+ else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
+ // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
+ // The byte order mark is simply added
+
+ // simply add the byte order mark !
+ seq.realloc( seq.getLength() + 2 );
+ memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
+ reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
+ reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
+
+ m_sEncoding = "utf-16"_ostr;
+ }
+ else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
+ // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
+ // The byte order mark is simply added
+
+ seq.realloc( seq.getLength() + 2 );
+ memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
+ reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
+ reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
+
+ m_sEncoding = "utf-16"_ostr;
+ }
+ else if( 0xEF == pSource[0] &&
+ 0xBB == pSource[1] &&
+ 0xBF == pSource[2] )
+ {
+ // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
+ // The BOM is removed.
+ memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
+ seq.realloc( seq.getLength() - 3 );
+ m_sEncoding = "utf-8"_ostr;
+ }
+ else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
+ // UCS-4 big endian
+ m_sEncoding = "ucs-4"_ostr;
+ }
+ else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
+ // UCS-4 little endian
+ m_sEncoding = "ucs-4"_ostr;
+ }
+/* TODO: no need to test for the moment since we return sal_False like default case anyway
+ else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
+ 0xa7 == static_cast<unsigned char> (pSource[2]) &&
+ 0x94 == static_cast<unsigned char> (pSource[3]) ) {
+ // EBCDIC
+ bReturn = sal_False; // must be extended
+ }
+*/
+ else {
+ // other
+ // UTF8 is directly recognized by the parser.
+ bReturn = false;
+ }
+
+ return bReturn;
+}
+
+void XMLFile2UTFConverter::initializeDecoding()
+{
+
+ if( !m_sEncoding.isEmpty() )
+ {
+ rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
+ if( encoding != RTL_TEXTENCODING_UTF8 )
+ {
+ m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );
+ m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );
+ }
+ }
+}
+
+
+// Text2UnicodeConverter
+
+
+Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
+ : m_convText2Unicode(nullptr)
+ , m_contextText2Unicode(nullptr)
+{
+ rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
+ if( RTL_TEXTENCODING_DONTKNOW == encoding )
+ {
+ m_bCanContinue = false;
+ m_bInitialized = false;
+ }
+ else
+ {
+ init( encoding );
+ }
+}
+
+Text2UnicodeConverter::~Text2UnicodeConverter()
+{
+ if( m_bInitialized )
+ {
+ rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
+ rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
+ }
+}
+
+void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
+{
+ m_bCanContinue = true;
+ m_bInitialized = true;
+
+ m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
+ m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
+}
+
+
+Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
+{
+ sal_uInt32 uiInfo;
+ sal_Size nSrcCvtBytes = 0;
+ sal_Size nTargetCount = 0;
+ sal_Size nSourceCount = 0;
+
+ // the whole source size
+ sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
+ Sequence<sal_Unicode> seqUnicode ( nSourceSize );
+
+ const sal_Int8 *pbSource = seqText.getConstArray();
+ std::unique_ptr<sal_Int8[]> pbTempMem;
+
+ if( m_seqSource.hasElements() ) {
+ // put old rest and new byte sequence into one array
+ pbTempMem.reset(new sal_Int8[ nSourceSize ]);
+ memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
+ memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
+ pbSource = pbTempMem.get();
+
+ // set to zero again
+ m_seqSource = Sequence< sal_Int8 >();
+ }
+
+ while( true ) {
+
+ /* All invalid characters are transformed to the unicode undefined char */
+ nTargetCount += rtl_convertTextToUnicode(
+ m_convText2Unicode,
+ m_contextText2Unicode,
+ reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
+ nSourceSize - nSourceCount ,
+ &( seqUnicode.getArray()[ nTargetCount ] ),
+ seqUnicode.getLength() - nTargetCount,
+ RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
+ RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
+ RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
+ &uiInfo,
+ &nSrcCvtBytes );
+ nSourceCount += nSrcCvtBytes;
+
+ if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) {
+ // save necessary bytes for next conversion
+ seqUnicode.realloc( seqUnicode.getLength() * 2 );
+ continue;
+ }
+ break;
+ }
+ if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) {
+ m_seqSource.realloc( nSourceSize - nSourceCount );
+ memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
+ }
+
+ // set to correct unicode size
+ seqUnicode.realloc( nTargetCount );
+
+ return seqUnicode;
+}
+
+
+// Unicode2TextConverter
+
+
+Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
+{
+ m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
+ m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
+}
+
+
+Unicode2TextConverter::~Unicode2TextConverter()
+{
+ rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
+ rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
+}
+
+
+Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
+{
+ std::unique_ptr<sal_Unicode[]> puTempMem;
+
+ if( m_seqSource.hasElements() ) {
+ // For surrogates !
+ // put old rest and new byte sequence into one array
+ // In general when surrogates are used, they should be rarely
+ // cut off between two convert()-calls. So this code is used
+ // rarely and the extra copy is acceptable.
+ puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
+ memcpy( puTempMem.get() ,
+ m_seqSource.getConstArray() ,
+ m_seqSource.getLength() * sizeof( sal_Unicode ) );
+ memcpy(
+ &(puTempMem[ m_seqSource.getLength() ]) ,
+ puSource ,
+ nSourceSize*sizeof( sal_Unicode ) );
+ puSource = puTempMem.get();
+ nSourceSize += m_seqSource.getLength();
+
+ m_seqSource = Sequence< sal_Unicode > ();
+ }
+
+
+ sal_Size nTargetCount = 0;
+ sal_Size nSourceCount = 0;
+
+ sal_uInt32 uiInfo;
+ sal_Size nSrcCvtChars;
+
+ // take nSourceSize * 3 as preference
+ // this is an upper boundary for converting to utf8,
+ // which most often used as the target.
+ sal_Int32 nSeqSize = nSourceSize * 3;
+
+ Sequence<sal_Int8> seqText( nSeqSize );
+ char *pTarget = reinterpret_cast<char *>(seqText.getArray());
+ while( true ) {
+
+ nTargetCount += rtl_convertUnicodeToText(
+ m_convUnicode2Text,
+ m_contextUnicode2Text,
+ &( puSource[nSourceCount] ),
+ nSourceSize - nSourceCount ,
+ &( pTarget[nTargetCount] ),
+ nSeqSize - nTargetCount,
+ RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
+ RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
+ &uiInfo,
+ &nSrcCvtChars);
+ nSourceCount += nSrcCvtChars;
+
+ if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
+ nSeqSize = nSeqSize *2;
+ seqText.realloc( nSeqSize ); // double array size
+ pTarget = reinterpret_cast<char *>(seqText.getArray());
+ continue;
+ }
+ break;
+ }
+
+ // for surrogates
+ if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
+ m_seqSource.realloc( nSourceSize - nSourceCount );
+ memcpy( m_seqSource.getArray() ,
+ &(puSource[nSourceCount]),
+ (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
+ }
+
+ // reduce the size of the buffer (fast, no copy necessary)
+ seqText.realloc( nTargetCount );
+
+ return seqText;
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */