diff options
Diffstat (limited to 'sax/source/expatwrap')
-rw-r--r-- | sax/source/expatwrap/expwrap.component | 38 | ||||
-rw-r--r-- | sax/source/expatwrap/sax_expat.cxx | 957 | ||||
-rw-r--r-- | sax/source/expatwrap/saxwriter.cxx | 1464 | ||||
-rw-r--r-- | sax/source/expatwrap/xml2utf.cxx | 519 |
4 files changed, 2978 insertions, 0 deletions
diff --git a/sax/source/expatwrap/expwrap.component b/sax/source/expatwrap/expwrap.component new file mode 100644 index 0000000000..1f72eccf31 --- /dev/null +++ b/sax/source/expatwrap/expwrap.component @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + --> + +<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@" + xmlns="http://openoffice.org/2010/uno-components"> + <implementation name="com.sun.star.comp.extensions.xml.sax.ParserExpat" + constructor="com_sun_star_comp_extensions_xml_sax_ParserExpat_get_implementation"> + <service name="com.sun.star.xml.sax.Parser"/> + </implementation> + <implementation name="com.sun.star.extensions.xml.sax.Writer" + constructor="com_sun_star_extensions_xml_sax_Writer_get_implementation"> + <service name="com.sun.star.xml.sax.Writer"/> + </implementation> + <implementation name="com.sun.star.comp.extensions.xml.sax.FastParser" + constructor="com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation"> + <service name="com.sun.star.xml.sax.FastParser"/> + </implementation> + <implementation name="com.sun.star.comp.extensions.xml.sax.LegacyFastParser" + constructor="com_sun_star_comp_extensions_xml_sax_LegacyFastParser_get_implementation"> + <service name="com.sun.star.xml.sax.LegacyFastParser"/> + </implementation> +</component> diff --git a/sax/source/expatwrap/sax_expat.cxx b/sax/source/expatwrap/sax_expat.cxx new file mode 100644 index 0000000000..9a82b87036 --- /dev/null +++ b/sax/source/expatwrap/sax_expat.cxx @@ -0,0 +1,957 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <string.h> +#include <cassert> +#include <memory> +#include <mutex> +#include <utility> +#include <string_view> +#include <vector> + + +#include <com/sun/star/lang/XServiceInfo.hpp> +#include <com/sun/star/lang/XInitialization.hpp> +#include <com/sun/star/uno/XComponentContext.hpp> +#include <com/sun/star/xml/sax/XExtendedDocumentHandler.hpp> +#include <com/sun/star/xml/sax/XParser.hpp> +#include <com/sun/star/xml/sax/SAXParseException.hpp> +#include <com/sun/star/io/IOException.hpp> +#include <com/sun/star/io/XSeekable.hpp> +#include <com/sun/star/lang/WrappedTargetRuntimeException.hpp> + +#include <comphelper/attributelist.hxx> +#include <cppuhelper/weak.hxx> +#include <cppuhelper/implbase.hxx> +#include <cppuhelper/supportsservice.hxx> +#include <rtl/ref.hxx> +#include <sal/log.hxx> + +#include <expat.h> +#include <xml2utf.hxx> + +using namespace ::osl; +using namespace ::cppu; +using namespace ::com::sun::star::lang; +using namespace ::com::sun::star::xml::sax; +using namespace ::com::sun::star::io; + + +namespace { + +#define XML_CHAR_TO_OUSTRING(x) OUString(x , strlen( x ), RTL_TEXTENCODING_UTF8) +#define XML_CHAR_N_TO_USTRING(x,n) OUString(x,n, RTL_TEXTENCODING_UTF8 ) + + +/* +* The following macro encapsulates any call to an event handler. +* It ensures, that exceptions thrown by the event handler are +* treated properly. +*/ +#define CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(pThis,call) \ + if( ! pThis->bExceptionWasThrown ) { \ + try {\ + pThis->call;\ + }\ + catch( const SAXParseException &e ) {\ + callErrorHandler( pThis , e );\ + }\ + catch( const SAXException &e ) {\ + callErrorHandler( pThis , SAXParseException(\ + e.Message, \ + e.Context, \ + e.WrappedException,\ + pThis->rDocumentLocator->getPublicId(),\ + pThis->rDocumentLocator->getSystemId(),\ + pThis->rDocumentLocator->getLineNumber(),\ + pThis->rDocumentLocator->getColumnNumber()\ + ) );\ + }\ + catch( const css::uno::RuntimeException &e ) {\ + pThis->bExceptionWasThrown = true; \ + pThis->bRTExceptionWasThrown = true; \ + pImpl->rtexception = e; \ + }\ + catch( const css::uno::Exception &e ) {\ + pThis->bExceptionWasThrown = true; \ + pThis->bRTExceptionWasThrown = true; \ + pImpl->rtexception = WrappedTargetRuntimeException("Non-runtime UNO exception caught during parse", e.Context, css::uno::Any(e)); \ + }\ + }\ + ((void)0) + + +class SaxExpatParser_Impl; + +// This class implements the external Parser interface +class SaxExpatParser + : public WeakImplHelper< XInitialization + , XServiceInfo + , XParser > +{ + +public: + SaxExpatParser(); + + // css::lang::XInitialization: + virtual void SAL_CALL initialize(css::uno::Sequence<css::uno::Any> const& rArguments) override; + + // The SAX-Parser-Interface + virtual void SAL_CALL parseStream( const InputSource& structSource) override; + virtual void SAL_CALL setDocumentHandler(const css::uno::Reference< XDocumentHandler > & xHandler) override; + + virtual void SAL_CALL setErrorHandler(const css::uno::Reference< XErrorHandler > & xHandler) override; + virtual void SAL_CALL setDTDHandler(const css::uno::Reference < XDTDHandler > & xHandler) override; + virtual void SAL_CALL setEntityResolver(const css::uno::Reference< XEntityResolver >& xResolver) override; + + virtual void SAL_CALL setLocale( const Locale &locale ) override; + +public: // XServiceInfo + OUString SAL_CALL getImplementationName() override; + css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override; + sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override; + +private: + std::unique_ptr<SaxExpatParser_Impl> m_pImpl; +}; + + +// Entity binds all information needed for a single file +struct Entity +{ + InputSource structSource; + XML_Parser pParser; + sax_expatwrap::XMLFile2UTFConverter converter; +}; + + +class SaxExpatParser_Impl +{ +public: // module scope + std::mutex aMutex; + bool m_bEnableDoS; // fdo#60471 thank you Adobe Illustrator + + css::uno::Reference< XDocumentHandler > rDocumentHandler; + css::uno::Reference< XExtendedDocumentHandler > rExtendedDocumentHandler; + + css::uno::Reference< XErrorHandler > rErrorHandler; + css::uno::Reference< XDTDHandler > rDTDHandler; + css::uno::Reference< XEntityResolver > rEntityResolver; + css::uno::Reference < XLocator > rDocumentLocator; + + + rtl::Reference < comphelper::AttributeList > rAttrList; + + // External entity stack + std::vector<struct Entity> vecEntity; + void pushEntity( Entity &&entity ) + { vecEntity.push_back( std::move(entity) ); } + void popEntity() + { vecEntity.pop_back( ); } + struct Entity &getEntity() + { return vecEntity.back(); } + + + // Exception cannot be thrown through the C-XmlParser (possible resource leaks), + // therefore the exception must be saved somewhere. + SAXParseException exception; + css::uno::RuntimeException rtexception; + bool bExceptionWasThrown; + bool bRTExceptionWasThrown; + +public: + SaxExpatParser_Impl() + : m_bEnableDoS(false) + , bExceptionWasThrown(false) + , bRTExceptionWasThrown(false) + { + } + + // the C-Callbacks for the expat parser + void static callbackStartElement(void *userData, const XML_Char *name , const XML_Char **atts); + void static callbackEndElement(void *userData, const XML_Char *name); + void static callbackCharacters( void *userData , const XML_Char *s , int nLen ); + void static callbackProcessingInstruction( void *userData , + const XML_Char *sTarget , + const XML_Char *sData ); + + void static callbackEntityDecl( void *userData , + const XML_Char *entityName, + int is_parameter_entity, + const XML_Char *value, + int value_length, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId, + const XML_Char *notationName); + + void static callbackNotationDecl( void *userData, + const XML_Char *notationName, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId); + + bool static callbackExternalEntityRef( XML_Parser parser, + const XML_Char *openEntityNames, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId); + + int static callbackUnknownEncoding(void *encodingHandlerData, + const XML_Char *name, + XML_Encoding *info); + + void static callbackDefault( void *userData, const XML_Char *s, int len); + + void static callbackStartCDATA( void *userData ); + void static callbackEndCDATA( void *userData ); + void static callbackComment( void *userData , const XML_Char *s ); + void static callErrorHandler( SaxExpatParser_Impl *pImpl , const SAXParseException &e ); + +public: + void parse(); +}; + +extern "C" +{ + static void call_callbackStartElement(void *userData, const XML_Char *name , const XML_Char **atts) + { + SaxExpatParser_Impl::callbackStartElement(userData,name,atts); + } + static void call_callbackEndElement(void *userData, const XML_Char *name) + { + SaxExpatParser_Impl::callbackEndElement(userData,name); + } + static void call_callbackCharacters( void *userData , const XML_Char *s , int nLen ) + { + SaxExpatParser_Impl::callbackCharacters(userData,s,nLen); + } + static void call_callbackProcessingInstruction(void *userData,const XML_Char *sTarget,const XML_Char *sData ) + { + SaxExpatParser_Impl::callbackProcessingInstruction(userData,sTarget,sData ); + } + static void call_callbackEntityDecl(void *userData , + const XML_Char *entityName, + int is_parameter_entity, + const XML_Char *value, + int value_length, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId, + const XML_Char *notationName) + { + SaxExpatParser_Impl::callbackEntityDecl(userData, entityName, + is_parameter_entity, value, value_length, + base, systemId, publicId, notationName); + } + static void call_callbackNotationDecl(void *userData, + const XML_Char *notationName, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId) + { + SaxExpatParser_Impl::callbackNotationDecl(userData,notationName,base,systemId,publicId); + } + static int call_callbackExternalEntityRef(XML_Parser parser, + const XML_Char *openEntityNames, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId) + { + return SaxExpatParser_Impl::callbackExternalEntityRef(parser,openEntityNames,base,systemId,publicId); + } + static int call_callbackUnknownEncoding(void *encodingHandlerData, + const XML_Char *name, + XML_Encoding *info) + { + return SaxExpatParser_Impl::callbackUnknownEncoding(encodingHandlerData,name,info); + } + static void call_callbackDefault( void *userData, const XML_Char *s, int len) + { + SaxExpatParser_Impl::callbackDefault(userData,s,len); + } + static void call_callbackStartCDATA( void *userData ) + { + SaxExpatParser_Impl::callbackStartCDATA(userData); + } + static void call_callbackEndCDATA( void *userData ) + { + SaxExpatParser_Impl::callbackEndCDATA(userData); + } + static void call_callbackComment( void *userData , const XML_Char *s ) + { + SaxExpatParser_Impl::callbackComment(userData,s); + } +} + + +// LocatorImpl + +class LocatorImpl : + public WeakImplHelper< XLocator, css::io::XSeekable > + // should use a different interface for stream positions! +{ +public: + explicit LocatorImpl(SaxExpatParser_Impl *p) + : m_pParser(p) + { + } + +public: //XLocator + virtual sal_Int32 SAL_CALL getColumnNumber() override + { + return XML_GetCurrentColumnNumber( m_pParser->getEntity().pParser ); + } + virtual sal_Int32 SAL_CALL getLineNumber() override + { + return XML_GetCurrentLineNumber( m_pParser->getEntity().pParser ); + } + virtual OUString SAL_CALL getPublicId() override + { + return m_pParser->getEntity().structSource.sPublicId; + } + virtual OUString SAL_CALL getSystemId() override + { + return m_pParser->getEntity().structSource.sSystemId; + } + + // XSeekable (only for getPosition) + + virtual void SAL_CALL seek( sal_Int64 ) override + { + } + virtual sal_Int64 SAL_CALL getPosition() override + { + return XML_GetCurrentByteIndex( m_pParser->getEntity().pParser ); + } + virtual ::sal_Int64 SAL_CALL getLength() override + { + return 0; + } + +private: + + SaxExpatParser_Impl *m_pParser; +}; + + +SaxExpatParser::SaxExpatParser( ) +{ + m_pImpl.reset( new SaxExpatParser_Impl ); + + rtl::Reference<LocatorImpl> pLoc = new LocatorImpl( m_pImpl.get() ); + m_pImpl->rDocumentLocator = pLoc; + + // Performance-improvement; handing out the same object with every call of + // the startElement callback is allowed (see sax-specification): + m_pImpl->rAttrList = new comphelper::AttributeList; + + m_pImpl->bExceptionWasThrown = false; + m_pImpl->bRTExceptionWasThrown = false; +} + +// css::lang::XInitialization: +void SAL_CALL +SaxExpatParser::initialize(css::uno::Sequence< css::uno::Any > const& rArguments) +{ + // possible arguments: a string "DoSmeplease" + if (rArguments.hasElements()) + { + OUString str; + if ((rArguments[0] >>= str) && "DoSmeplease" == str) + { + std::unique_lock guard( m_pImpl->aMutex ); + m_pImpl->m_bEnableDoS = true; + } + } +} + +class ParserCleanup +{ +private: + SaxExpatParser_Impl& m_rParser; + XML_Parser m_xmlParser; +public: + ParserCleanup(SaxExpatParser_Impl& rParser, XML_Parser xmlParser) + : m_rParser(rParser) + , m_xmlParser(xmlParser) + { + } + ~ParserCleanup() + { + m_rParser.popEntity(); + //XML_ParserFree accepts a null arg + XML_ParserFree(m_xmlParser); + } +}; + +/*************** +* +* parseStream does Parser-startup initializations. The SaxExpatParser_Impl::parse() method does +* the file-specific initialization work. (During a parser run, external files may be opened) +* +****************/ +void SaxExpatParser::parseStream( const InputSource& structSource) +{ + // Only one text at one time + std::unique_lock guard( m_pImpl->aMutex ); + + + struct Entity entity; + entity.structSource = structSource; + + if( ! entity.structSource.aInputStream.is() ) + { + throw SAXException("No input source", + css::uno::Reference< css::uno::XInterface > () , css::uno::Any() ); + } + + entity.converter.setInputStream( entity.structSource.aInputStream ); + if( !entity.structSource.sEncoding.isEmpty() ) + { + entity.converter.setEncoding( + OUStringToOString( entity.structSource.sEncoding , RTL_TEXTENCODING_ASCII_US ) ); + } + + // create parser with proper encoding + entity.pParser = XML_ParserCreate( nullptr ); + if( ! entity.pParser ) + { + throw SAXException("Couldn't create parser", + css::uno::Reference< css::uno::XInterface > (), css::uno::Any() ); + } + + // set all necessary C-Callbacks + XML_SetUserData( entity.pParser, m_pImpl.get() ); + XML_SetElementHandler( entity.pParser , + call_callbackStartElement , + call_callbackEndElement ); + XML_SetCharacterDataHandler( entity.pParser , call_callbackCharacters ); + XML_SetProcessingInstructionHandler(entity.pParser , + call_callbackProcessingInstruction ); + if (!m_pImpl->m_bEnableDoS) + { + XML_SetEntityDeclHandler(entity.pParser, call_callbackEntityDecl); + } + XML_SetNotationDeclHandler( entity.pParser, call_callbackNotationDecl ); + XML_SetExternalEntityRefHandler( entity.pParser, + call_callbackExternalEntityRef); + XML_SetUnknownEncodingHandler( entity.pParser, call_callbackUnknownEncoding ,nullptr); + + if( m_pImpl->rExtendedDocumentHandler.is() ) { + + // These handlers just delegate calls to the ExtendedHandler. If no extended handler is + // given, these callbacks can be ignored + XML_SetDefaultHandlerExpand( entity.pParser, call_callbackDefault ); + XML_SetCommentHandler( entity.pParser, call_callbackComment ); + XML_SetCdataSectionHandler( entity.pParser , + call_callbackStartCDATA , + call_callbackEndCDATA ); + } + + + m_pImpl->exception = SAXParseException(); + auto const xmlParser = entity.pParser; + m_pImpl->pushEntity( std::move(entity) ); + + ParserCleanup aEnsureFree(*m_pImpl, xmlParser); + + // start the document + if( m_pImpl->rDocumentHandler.is() ) { + m_pImpl->rDocumentHandler->setDocumentLocator( m_pImpl->rDocumentLocator ); + m_pImpl->rDocumentHandler->startDocument(); + } + + m_pImpl->parse(); + + // finish document + if( m_pImpl->rDocumentHandler.is() ) { + m_pImpl->rDocumentHandler->endDocument(); + } +} + +void SaxExpatParser::setDocumentHandler(const css::uno::Reference< XDocumentHandler > & xHandler) +{ + m_pImpl->rDocumentHandler = xHandler; + m_pImpl->rExtendedDocumentHandler = + css::uno::Reference< XExtendedDocumentHandler >( xHandler , css::uno::UNO_QUERY ); +} + +void SaxExpatParser::setErrorHandler(const css::uno::Reference< XErrorHandler > & xHandler) +{ + m_pImpl->rErrorHandler = xHandler; +} + +void SaxExpatParser::setDTDHandler(const css::uno::Reference< XDTDHandler > & xHandler) +{ + m_pImpl->rDTDHandler = xHandler; +} + +void SaxExpatParser::setEntityResolver(const css::uno::Reference < XEntityResolver > & xResolver) +{ + m_pImpl->rEntityResolver = xResolver; +} + + +void SaxExpatParser::setLocale( const Locale & ) +{ + // not implemented +} + +// XServiceInfo +OUString SaxExpatParser::getImplementationName() +{ + return "com.sun.star.comp.extensions.xml.sax.ParserExpat"; +} + +// XServiceInfo +sal_Bool SaxExpatParser::supportsService(const OUString& ServiceName) +{ + return cppu::supportsService(this, ServiceName); +} + +// XServiceInfo +css::uno::Sequence< OUString > SaxExpatParser::getSupportedServiceNames() +{ + return { "com.sun.star.xml.sax.Parser" }; +} + + +/*--------------------------------------- +* +* Helper functions and classes +* +* +*-------------------------------------------*/ +OUString getErrorMessage( XML_Error xmlE, std::u16string_view sSystemId , sal_Int32 nLine ) +{ + OUString Message; + if( XML_ERROR_NONE == xmlE ) { + Message = "No"; + } + else if( XML_ERROR_NO_MEMORY == xmlE ) { + Message = "no memory"; + } + else if( XML_ERROR_SYNTAX == xmlE ) { + Message = "syntax"; + } + else if( XML_ERROR_NO_ELEMENTS == xmlE ) { + Message = "no elements"; + } + else if( XML_ERROR_INVALID_TOKEN == xmlE ) { + Message = "invalid token"; + } + else if( XML_ERROR_UNCLOSED_TOKEN == xmlE ) { + Message = "unclosed token"; + } + else if( XML_ERROR_PARTIAL_CHAR == xmlE ) { + Message = "partial char"; + } + else if( XML_ERROR_TAG_MISMATCH == xmlE ) { + Message = "tag mismatch"; + } + else if( XML_ERROR_DUPLICATE_ATTRIBUTE == xmlE ) { + Message = "duplicate attribute"; + } + else if( XML_ERROR_JUNK_AFTER_DOC_ELEMENT == xmlE ) { + Message = "junk after doc element"; + } + else if( XML_ERROR_PARAM_ENTITY_REF == xmlE ) { + Message = "parameter entity reference"; + } + else if( XML_ERROR_UNDEFINED_ENTITY == xmlE ) { + Message = "undefined entity"; + } + else if( XML_ERROR_RECURSIVE_ENTITY_REF == xmlE ) { + Message = "recursive entity reference"; + } + else if( XML_ERROR_ASYNC_ENTITY == xmlE ) { + Message = "async entity"; + } + else if( XML_ERROR_BAD_CHAR_REF == xmlE ) { + Message = "bad char reference"; + } + else if( XML_ERROR_BINARY_ENTITY_REF == xmlE ) { + Message = "binary entity reference"; + } + else if( XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF == xmlE ) { + Message = "attribute external entity reference"; + } + else if( XML_ERROR_MISPLACED_XML_PI == xmlE ) { + Message = "misplaced xml processing instruction"; + } + else if( XML_ERROR_UNKNOWN_ENCODING == xmlE ) { + Message = "unknown encoding"; + } + else if( XML_ERROR_INCORRECT_ENCODING == xmlE ) { + Message = "incorrect encoding"; + } + else if( XML_ERROR_UNCLOSED_CDATA_SECTION == xmlE ) { + Message = "unclosed cdata section"; + } + else if( XML_ERROR_EXTERNAL_ENTITY_HANDLING == xmlE ) { + Message = "external entity reference"; + } + else if( XML_ERROR_NOT_STANDALONE == xmlE ) { + Message = "not standalone"; + } + + OUString str = OUString::Concat("[") + + sSystemId + + " line " + + OUString::number( nLine ) + + "]: " + + Message + + "error"; + + return str; +} + + +// starts parsing with actual parser ! +void SaxExpatParser_Impl::parse( ) +{ + const int nBufSize = 16*1024; + + int nRead = nBufSize; + css::uno::Sequence< sal_Int8 > seqOut(nBufSize); + + while( nRead ) { + nRead = getEntity().converter.readAndConvert( seqOut , nBufSize ); + + bool bContinue(false); + + if( ! nRead ) { + // last call - must return OK + XML_Status const ret = XML_Parse( getEntity().pParser, + reinterpret_cast<const char *>(seqOut.getConstArray()), + 0 , + 1 ); + if (ret == XML_STATUS_OK) { + break; + } + } else { + bContinue = ( XML_Parse( getEntity().pParser, + reinterpret_cast<const char *>(seqOut.getConstArray()), + nRead, + 0 ) != XML_STATUS_ERROR ); + } + + if( ! bContinue || bExceptionWasThrown ) { + + if ( bRTExceptionWasThrown ) + throw rtexception; + + // Error during parsing ! + XML_Error xmlE = XML_GetErrorCode( getEntity().pParser ); + OUString sSystemId = rDocumentLocator->getSystemId(); + sal_Int32 nLine = rDocumentLocator->getLineNumber(); + + SAXParseException aExcept( + getErrorMessage(xmlE , sSystemId, nLine) , + css::uno::Reference< css::uno::XInterface >(), + css::uno::Any( &exception , cppu::UnoType<decltype(exception)>::get() ), + rDocumentLocator->getPublicId(), + rDocumentLocator->getSystemId(), + rDocumentLocator->getLineNumber(), + rDocumentLocator->getColumnNumber() + ); + + if( rErrorHandler.is() ) { + + // error handler is set, so the handler may throw the exception + css::uno::Any a; + a <<= aExcept; + rErrorHandler->fatalError( a ); + } + + // Error handler has not thrown an exception, but parsing cannot go on, + // so an exception MUST be thrown. + throw aExcept; + } // if( ! bContinue ) + } // while +} + + +// The C-Callbacks + + +void SaxExpatParser_Impl::callbackStartElement( void *pvThis , + const XML_Char *pwName , + const XML_Char **awAttributes ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + + if( !pImpl->rDocumentHandler.is() ) + return; + + int i = 0; + pImpl->rAttrList->Clear(); + + while( awAttributes[i] ) { + assert(awAttributes[i+1]); + pImpl->rAttrList->AddAttribute( + XML_CHAR_TO_OUSTRING( awAttributes[i] ) , + XML_CHAR_TO_OUSTRING( awAttributes[i+1] ) ); + i +=2; + } + + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( + pImpl , + rDocumentHandler->startElement( XML_CHAR_TO_OUSTRING( pwName ) , + pImpl->rAttrList ) ); +} + +void SaxExpatParser_Impl::callbackEndElement( void *pvThis , const XML_Char *pwName ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + + if( pImpl->rDocumentHandler.is() ) { + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl, + rDocumentHandler->endElement( XML_CHAR_TO_OUSTRING( pwName ) ) ); + } +} + + +void SaxExpatParser_Impl::callbackCharacters( void *pvThis , const XML_Char *s , int nLen ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + + if( pImpl->rDocumentHandler.is() ) { + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl , + rDocumentHandler->characters( XML_CHAR_N_TO_USTRING(s,nLen) ) ); + } +} + +void SaxExpatParser_Impl::callbackProcessingInstruction( void *pvThis, + const XML_Char *sTarget , + const XML_Char *sData ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + if( pImpl->rDocumentHandler.is() ) { + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( + pImpl , + rDocumentHandler->processingInstruction( XML_CHAR_TO_OUSTRING( sTarget ), + XML_CHAR_TO_OUSTRING( sData ) ) ); + } +} + + +void SaxExpatParser_Impl::callbackEntityDecl( + void *pvThis, const XML_Char *entityName, + SAL_UNUSED_PARAMETER int /*is_parameter_entity*/, + const XML_Char *value, SAL_UNUSED_PARAMETER int /*value_length*/, + SAL_UNUSED_PARAMETER const XML_Char * /*base*/, const XML_Char *systemId, + const XML_Char *publicId, const XML_Char *notationName) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + if (value) { // value != 0 means internal entity + SAL_INFO("sax","SaxExpatParser: internal entity declaration, stopping"); + XML_StopParser(pImpl->getEntity().pParser, XML_FALSE); + pImpl->exception = SAXParseException( + "SaxExpatParser: internal entity declaration, stopping", + nullptr, css::uno::Any(), + pImpl->rDocumentLocator->getPublicId(), + pImpl->rDocumentLocator->getSystemId(), + pImpl->rDocumentLocator->getLineNumber(), + pImpl->rDocumentLocator->getColumnNumber() ); + pImpl->bExceptionWasThrown = true; + } else { + if( pImpl->rDTDHandler.is() ) { + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( + pImpl , + rDTDHandler->unparsedEntityDecl( + XML_CHAR_TO_OUSTRING( entityName ), + XML_CHAR_TO_OUSTRING( publicId ) , + XML_CHAR_TO_OUSTRING( systemId ) , + XML_CHAR_TO_OUSTRING( notationName ) ) ); + } + } +} + +void SaxExpatParser_Impl::callbackNotationDecl( + void *pvThis, const XML_Char *notationName, + SAL_UNUSED_PARAMETER const XML_Char * /*base*/, const XML_Char *systemId, + const XML_Char *publicId) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + if( pImpl->rDTDHandler.is() ) { + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl, + rDTDHandler->notationDecl( XML_CHAR_TO_OUSTRING( notationName ) , + XML_CHAR_TO_OUSTRING( publicId ) , + XML_CHAR_TO_OUSTRING( systemId ) ) ); + } + +} + + +bool SaxExpatParser_Impl::callbackExternalEntityRef( + XML_Parser parser, const XML_Char *context, + SAL_UNUSED_PARAMETER const XML_Char * /*base*/, const XML_Char *systemId, + const XML_Char *publicId) +{ + bool bOK = true; + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(XML_GetUserData( parser )); + + struct Entity entity; + + if( pImpl->rEntityResolver.is() ) { + try + { + entity.structSource = pImpl->rEntityResolver->resolveEntity( + XML_CHAR_TO_OUSTRING( publicId ) , + XML_CHAR_TO_OUSTRING( systemId ) ); + } + catch( const SAXParseException & e ) + { + pImpl->exception = e; + bOK = false; + } + catch( const SAXException & e ) + { + pImpl->exception = SAXParseException( + e.Message , e.Context , e.WrappedException , + pImpl->rDocumentLocator->getPublicId(), + pImpl->rDocumentLocator->getSystemId(), + pImpl->rDocumentLocator->getLineNumber(), + pImpl->rDocumentLocator->getColumnNumber() ); + bOK = false; + } + } + + if( entity.structSource.aInputStream.is() ) { + entity.pParser = XML_ExternalEntityParserCreate( parser , context, nullptr ); + if( ! entity.pParser ) + { + return false; + } + + entity.converter.setInputStream( entity.structSource.aInputStream ); + auto const xmlParser = entity.pParser; + pImpl->pushEntity( std::move(entity) ); + try + { + pImpl->parse(); + } + catch( const SAXParseException & e ) + { + pImpl->exception = e; + bOK = false; + } + catch( const IOException &e ) + { + pImpl->exception.WrappedException <<= e; + bOK = false; + } + catch( const css::uno::RuntimeException &e ) + { + pImpl->exception.WrappedException <<=e; + bOK = false; + } + + pImpl->popEntity(); + + XML_ParserFree( xmlParser ); + } + + return bOK; +} + +int SaxExpatParser_Impl::callbackUnknownEncoding( + SAL_UNUSED_PARAMETER void * /*encodingHandlerData*/, + SAL_UNUSED_PARAMETER const XML_Char * /*name*/, + SAL_UNUSED_PARAMETER XML_Encoding * /*info*/) +{ + return 0; +} + +void SaxExpatParser_Impl::callbackDefault( void *pvThis, const XML_Char *s, int len) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl, + rExtendedDocumentHandler->unknown( XML_CHAR_N_TO_USTRING( s ,len) ) ); +} + +void SaxExpatParser_Impl::callbackComment( void *pvThis , const XML_Char *s ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl, + rExtendedDocumentHandler->comment( XML_CHAR_TO_OUSTRING( s ) ) ); +} + +void SaxExpatParser_Impl::callbackStartCDATA( void *pvThis ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS( pImpl, rExtendedDocumentHandler->startCDATA() ); +} + + +void SaxExpatParser_Impl::callErrorHandler( SaxExpatParser_Impl *pImpl , + const SAXParseException & e ) +{ + try + { + if( pImpl->rErrorHandler.is() ) { + css::uno::Any a; + a <<= e; + pImpl->rErrorHandler->error( a ); + } + else { + pImpl->exception = e; + pImpl->bExceptionWasThrown = true; + } + } + catch( const SAXParseException & ex ) { + pImpl->exception = ex; + pImpl->bExceptionWasThrown = true; + } + catch( const SAXException & ex ) { + pImpl->exception = SAXParseException( + ex.Message, + ex.Context, + ex.WrappedException, + pImpl->rDocumentLocator->getPublicId(), + pImpl->rDocumentLocator->getSystemId(), + pImpl->rDocumentLocator->getLineNumber(), + pImpl->rDocumentLocator->getColumnNumber() + ); + pImpl->bExceptionWasThrown = true; + } +} + +void SaxExpatParser_Impl::callbackEndCDATA( void *pvThis ) +{ + SaxExpatParser_Impl *pImpl = static_cast<SaxExpatParser_Impl*>(pvThis); + + CALL_ELEMENT_HANDLER_AND_CARE_FOR_EXCEPTIONS(pImpl,rExtendedDocumentHandler->endCDATA() ); +} + +} // namespace + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * +com_sun_star_comp_extensions_xml_sax_ParserExpat_get_implementation( + css::uno::XComponentContext *, + css::uno::Sequence<css::uno::Any> const &) +{ + return cppu::acquire(new SaxExpatParser); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sax/source/expatwrap/saxwriter.cxx b/sax/source/expatwrap/saxwriter.cxx new file mode 100644 index 0000000000..55608101fa --- /dev/null +++ b/sax/source/expatwrap/saxwriter.cxx @@ -0,0 +1,1464 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <string.h> + +#include <cassert> +#include <set> +#include <stack> +#include <vector> + +#include <com/sun/star/io/IOException.hpp> +#include <com/sun/star/lang/WrappedTargetRuntimeException.hpp> +#include <com/sun/star/lang/XServiceInfo.hpp> +#include <com/sun/star/uno/XComponentContext.hpp> +#include <com/sun/star/util/XCloneable.hpp> +#include <com/sun/star/xml/sax/SAXInvalidCharacterException.hpp> +#include <com/sun/star/xml/sax/XWriter.hpp> + +#include <cppuhelper/exc_hlp.hxx> +#include <cppuhelper/weak.hxx> +#include <cppuhelper/implbase.hxx> +#include <cppuhelper/supportsservice.hxx> + +#include <osl/diagnose.h> +#include <rtl/character.hxx> +#include <sal/log.hxx> + +#include <memory> + +using namespace ::osl; +using namespace ::cppu; +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::lang; +using namespace ::com::sun::star::xml::sax; +using namespace ::com::sun::star::util; +using namespace ::com::sun::star::io; + +#define LINEFEED 10 +#define SEQUENCESIZE 1024 +#define MAXCOLUMNCOUNT 72 + +/****** +* +* +* Character conversion functions +* +* +*****/ + +namespace +{ +enum SaxInvalidCharacterError +{ + SAX_NONE, + SAX_WARNING, + SAX_ERROR +}; + +// Stuff for custom entity names +struct ReplacementPair +{ + OUString name; + OUString replacement; +}; +inline bool operator<(const ReplacementPair& lhs, const ReplacementPair& rhs) +{ + return lhs.replacement.compareTo(rhs.replacement) < 0; +} + +class SaxWriterHelper +{ +#ifdef DBG_UTIL +public: + ::std::stack<OUString> m_DebugStartedElements; +#endif + +private: + Reference<XOutputStream> m_out; + Sequence<sal_Int8> m_Sequence; + sal_Int8* mp_Sequence; + + sal_Int32 nLastLineFeedPos; // is negative after writing a sequence + sal_uInt32 nCurrentPos; + bool m_bStartElementFinished; + + std::vector<ReplacementPair> m_Replacements; + + /// @throws SAXException + sal_uInt32 writeSequence(); + + // use only if to insert the bytes more space in the sequence is needed and + // so the sequence has to write out and reset rPos to 0 + // writes sequence only on overflow, sequence could be full on the end (rPos == SEQUENCESIZE) + /// @throws SAXException + void AddBytes(sal_Int8* pTarget, sal_uInt32& rPos, const sal_Int8* pBytes, + sal_uInt32 nBytesCount); + /// @throws SAXException + bool convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen, bool bDoNormalization, + bool bNormalizeWhitespace, sal_Int8* pTarget, sal_uInt32& rPos); + /// @throws SAXException + void FinishStartElement(); + + // Search for the correct replacement + const ReplacementPair* findXMLReplacement(const sal_Unicode* pStr, sal_Int32 nStrLen); + +public: + explicit SaxWriterHelper(Reference<XOutputStream> const& m_TempOut) + : m_out(m_TempOut) + , m_Sequence(SEQUENCESIZE) + , mp_Sequence(nullptr) + , nLastLineFeedPos(0) + , nCurrentPos(0) + , m_bStartElementFinished(true) + { + OSL_ENSURE(SEQUENCESIZE > 50, "Sequence cache size too small"); + mp_Sequence = m_Sequence.getArray(); + } + ~SaxWriterHelper() + { + OSL_ENSURE(!nCurrentPos, "cached Sequence not written"); + OSL_ENSURE(m_bStartElementFinished, "StartElement not completely written"); + } + + /// @throws SAXException + void insertIndentation(sal_uInt32 m_nLevel); + + // returns whether it works correct or invalid characters were in the string + // If there are invalid characters in the string it returns sal_False. + // Than the calling method has to throw the needed Exception. + /// @throws SAXException + bool writeString(const OUString& rWriteOutString, bool bDoNormalization, + bool bNormalizeWhitespace); + + sal_uInt32 GetLastColumnCount() const noexcept + { + return static_cast<sal_uInt32>(nCurrentPos - nLastLineFeedPos); + } + + /// @throws SAXException + void startDocument(); + + // returns whether it works correct or invalid characters were in the strings + // If there are invalid characters in one of the strings it returns sal_False. + // Than the calling method has to throw the needed Exception. + /// @throws SAXException + SaxInvalidCharacterError startElement(const OUString& rName, + const Reference<XAttributeList>& xAttribs); + /// @throws SAXException + bool FinishEmptyElement(); + + // returns whether it works correct or invalid characters were in the string + // If there are invalid characters in the string it returns sal_False. + // Than the calling method has to throw the needed Exception. + /// @throws SAXException + bool endElement(const OUString& rName); + /// @throws SAXException + void endDocument(); + + // returns whether it works correct or invalid characters were in the strings + // If there are invalid characters in the string it returns sal_False. + // Than the calling method has to throw the needed Exception. + /// @throws SAXException + bool processingInstruction(const OUString& rTarget, const OUString& rData); + /// @throws SAXException + void startCDATA(); + /// @throws SAXException + void endCDATA(); + + // returns whether it works correct or invalid characters were in the strings + // If there are invalid characters in the string it returns sal_False. + // Than the calling method has to throw the needed Exception. + /// @throws SAXException + bool comment(const OUString& rComment); + + /// @throws SAXException + void clearBuffer(); + + // Use custom entity names + void setCustomEntityNames( + const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& + replacements); + + // Calculate length for convertToXML + sal_Int32 calcXMLByteLength(const OUString& rStr, bool bDoNormalization, + bool bNormalizeWhitespace); +}; + +const bool g_bValidCharsBelow32[32] = { + // clang-format off +// 0 1 2 3 4 5 6 7 + false, false, false, false, false, false, false, false, //0 + false, true, true, false, false, true, false, false, //8 + false, false, false, false, false, false, false, false, //16 + false, false, false, false, false, false, false, false + // clang-format on +}; + +bool IsInvalidChar(const sal_Unicode aChar) +{ + bool bRet(false); + // check first for the most common characters + if (aChar < 32 || aChar >= 0xd800) + bRet = ((aChar < 32 && !g_bValidCharsBelow32[aChar]) || aChar == 0xffff || aChar == 0xfffe); + return bRet; +} + +/******** +* write through to the output stream +* +*****/ +sal_uInt32 SaxWriterHelper::writeSequence() +{ + try + { + m_out->writeBytes(m_Sequence); + } + catch (const IOException&) + { + css::uno::Any anyEx = cppu::getCaughtException(); + throw SAXException("IO exception during writing", Reference<XInterface>(), anyEx); + } + nLastLineFeedPos -= SEQUENCESIZE; + return 0; +} + +void SaxWriterHelper::AddBytes(sal_Int8* pTarget, sal_uInt32& rPos, const sal_Int8* pBytes, + sal_uInt32 nBytesCount) +{ + OSL_ENSURE((rPos + nBytesCount) > SEQUENCESIZE, "wrong use of AddBytesMethod"); + sal_uInt32 nCount(SEQUENCESIZE - rPos); + memcpy(&(pTarget[rPos]), pBytes, nCount); + + OSL_ENSURE(rPos + nCount == SEQUENCESIZE, "the position should be the at the end"); + + rPos = writeSequence(); + sal_uInt32 nRestCount(nBytesCount - nCount); + if ((rPos + nRestCount) <= SEQUENCESIZE) + { + memcpy(&(pTarget[rPos]), &pBytes[nCount], nRestCount); + rPos += nRestCount; + } + else + AddBytes(pTarget, rPos, &pBytes[nCount], nRestCount); +} + +void SaxWriterHelper::setCustomEntityNames( + const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements) +{ + m_Replacements.resize(replacements.size()); + for (size_t i = 0; i < replacements.size(); ++i) + { + m_Replacements[i].name = replacements[i].First; + m_Replacements[i].replacement = replacements[i].Second; + } + if (replacements.size() > 1) + std::sort(m_Replacements.begin(), m_Replacements.end()); +} + +/** Converts a UTF-16 string to UTF-8 and does XML normalization + + @param pTarget + Pointer to a piece of memory, to where the output should be written. The caller + must call calcXMLByteLength on the same string, to ensure, + that there is enough memory for converting. + */ +bool SaxWriterHelper::convertToXML(const sal_Unicode* pStr, sal_Int32 nStrLen, + bool bDoNormalization, bool bNormalizeWhitespace, + sal_Int8* pTarget, sal_uInt32& rPos) +{ + bool bRet(true); + sal_uInt32 nSurrogate = 0; + + for (sal_Int32 i = 0; i < nStrLen; i++) + { + sal_Unicode c = pStr[i]; + if (IsInvalidChar(c)) + bRet = false; + else if ((c >= 0x0001) && (c <= 0x007F)) // Deal with ascii + { + if (bDoNormalization) + { + switch (c) + { + case '&': // resemble to & + { + if ((rPos + 5) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("&"), 5); + else + { + memcpy(&(pTarget[rPos]), "&", 5); + rPos += 5; + } + } + break; + case '<': + { + if ((rPos + 4) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("<"), 4); + else + { + memcpy(&(pTarget[rPos]), "<", 4); + rPos += 4; // < + } + } + break; + case '>': + { + if ((rPos + 4) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>(">"), 4); + else + { + memcpy(&(pTarget[rPos]), ">", 4); + rPos += 4; // > + } + } + break; + case '\'': + { + if ((rPos + 6) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("'"), 6); + else + { + memcpy(&(pTarget[rPos]), "'", 6); + rPos += 6; // ' + } + } + break; + case '"': + { + if ((rPos + 6) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("""), 6); + else + { + memcpy(&(pTarget[rPos]), """, 6); + rPos += 6; // " + } + } + break; + case 13: + { + if ((rPos + 6) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("
"), 6); + else + { + memcpy(&(pTarget[rPos]), "
", 6); + rPos += 6; + } + } + break; + case LINEFEED: + { + if (bNormalizeWhitespace) + { + if ((rPos + 6) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("
"), + 6); + else + { + memcpy(&(pTarget[rPos]), "
", 6); + rPos += 6; + } + } + else + { + pTarget[rPos] = LINEFEED; + nLastLineFeedPos = rPos; + rPos++; + } + } + break; + case 9: + { + if (bNormalizeWhitespace) + { + if ((rPos + 6) > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>("	"), + 6); + else + { + memcpy(&(pTarget[rPos]), "	", 6); + rPos += 6; + } + } + else + { + pTarget[rPos] = 9; + rPos++; + } + } + break; + default: + { + pTarget[rPos] = static_cast<sal_Int8>(c); + rPos++; + } + break; + } + } + else + { + pTarget[rPos] = static_cast<sal_Int8>(c); + if (static_cast<sal_Int8>(c) == LINEFEED) + nLastLineFeedPos = rPos; + rPos++; + } + } + else + { + // Deal with replacements + if (bDoNormalization && !m_Replacements.empty()) + { + // search + const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i); + + // replace + if (it != nullptr) + { + OString name = ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8); + if (rPos + name.getLength() > SEQUENCESIZE) + AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const*>(name.getStr()), + name.getLength()); + else + { + memcpy(&(pTarget[rPos]), name.getStr(), name.getLength()); + rPos += name.getLength(); + } + i += it->replacement.getLength() - 1; + continue; + } + } + + // Deal with other unicode cases + if (rtl::isHighSurrogate(c)) + { + // 1. surrogate: save (until 2. surrogate) + if (nSurrogate != 0) // left-over lone 1st Unicode surrogate + { + OSL_FAIL("left-over Unicode surrogate"); + bRet = false; + } + nSurrogate = c; + } + else if (rtl::isLowSurrogate(c)) + { + // 2. surrogate: write as UTF-8 + if (nSurrogate) // can only be 1st surrogate + { + nSurrogate = rtl::combineSurrogates(nSurrogate, c); + sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)), + sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)), + sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)), + sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) }; + if ((rPos + 4) > SEQUENCESIZE) + AddBytes(pTarget, rPos, aBytes, 4); + else + { + pTarget[rPos] = aBytes[0]; + rPos++; + pTarget[rPos] = aBytes[1]; + rPos++; + pTarget[rPos] = aBytes[2]; + rPos++; + pTarget[rPos] = aBytes[3]; + rPos++; + } + } + else // lone 2nd surrogate + { + OSL_FAIL("illegal Unicode character"); + bRet = false; + } + + // reset surrogate + nSurrogate = 0; + } + else if (c > 0x07FF) + { + sal_Int8 aBytes[] + = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), sal_Int8(0x80 | ((c >> 6) & 0x3F)), + sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; + if ((rPos + 3) > SEQUENCESIZE) + AddBytes(pTarget, rPos, aBytes, 3); + else + { + pTarget[rPos] = aBytes[0]; + rPos++; + pTarget[rPos] = aBytes[1]; + rPos++; + pTarget[rPos] = aBytes[2]; + rPos++; + } + } + else + { + sal_Int8 aBytes[] + = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; + if ((rPos + 2) > SEQUENCESIZE) + AddBytes(pTarget, rPos, aBytes, 2); + else + { + pTarget[rPos] = aBytes[0]; + rPos++; + pTarget[rPos] = aBytes[1]; + rPos++; + } + } + } + + OSL_ENSURE(rPos <= SEQUENCESIZE, "not reset current position"); + if (rPos == SEQUENCESIZE) + rPos = writeSequence(); + + // reset left-over surrogate + if ((nSurrogate != 0) && !rtl::isHighSurrogate(c)) + { + OSL_FAIL("left-over Unicode surrogate"); + nSurrogate = 0; + bRet = false; + } + } + if (nSurrogate != 0) // trailing lone 1st surrogate + { + OSL_FAIL("left-over Unicode surrogate"); + bRet = false; + } + return bRet; +} + +void SaxWriterHelper::FinishStartElement() +{ + if (!m_bStartElementFinished) + { + mp_Sequence[nCurrentPos] = '>'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + m_bStartElementFinished = true; + } +} + +void SaxWriterHelper::insertIndentation(sal_uInt32 m_nLevel) +{ + FinishStartElement(); + if (m_nLevel > 0) + { + if ((nCurrentPos + m_nLevel + 1) <= SEQUENCESIZE) + { + mp_Sequence[nCurrentPos] = LINEFEED; + nLastLineFeedPos = nCurrentPos; + nCurrentPos++; + memset(&(mp_Sequence[nCurrentPos]), 32, m_nLevel); + nCurrentPos += m_nLevel; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + } + else + { + sal_uInt32 nCount(m_nLevel + 1); + std::unique_ptr<sal_Int8[]> pBytes(new sal_Int8[nCount]); + pBytes[0] = LINEFEED; + memset(&(pBytes[1]), 32, m_nLevel); + AddBytes(mp_Sequence, nCurrentPos, pBytes.get(), nCount); + pBytes.reset(); + nLastLineFeedPos = nCurrentPos - nCount; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + } + } + else + { + mp_Sequence[nCurrentPos] = LINEFEED; + nLastLineFeedPos = nCurrentPos; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + } +} + +bool SaxWriterHelper::writeString(const OUString& rWriteOutString, bool bDoNormalization, + bool bNormalizeWhitespace) +{ + FinishStartElement(); + return convertToXML(rWriteOutString.getStr(), rWriteOutString.getLength(), bDoNormalization, + bNormalizeWhitespace, mp_Sequence, nCurrentPos); +} + +void SaxWriterHelper::startDocument() +{ + const char pc[] = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; + const int nLen = strlen(pc); + if ((nCurrentPos + nLen) <= SEQUENCESIZE) + { + memcpy(mp_Sequence, pc, nLen); + nCurrentPos += nLen; + } + else + { + AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const*>(pc), nLen); + } + OSL_ENSURE(nCurrentPos <= SEQUENCESIZE, "not reset current position"); + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = LINEFEED; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); +} + +#ifndef NDEBUG +bool inrange(sal_Unicode c, sal_Unicode start, sal_Unicode end) { return c >= start && c <= end; } +#endif + +void CheckValidName(OUString const& rName) +{ +#ifdef NDEBUG + (void)rName; +#else + assert(!rName.isEmpty()); + bool hasColon(false); + for (sal_Int32 i = 0; i < rName.getLength(); ++i) + { + auto const c(rName[i]); + if (c == ':') + { + // see https://www.w3.org/TR/REC-xml-names/#ns-qualnames + SAL_WARN_IF(hasColon, "sax", "only one colon allowed: " << rName); + assert(!hasColon && "only one colon allowed"); + hasColon = true; + } + else if (!rtl::isAsciiAlphanumeric(c) && c != '_' && c != '-' && c != '.' + && !inrange(c, 0x00C0, 0x00D6) && !inrange(c, 0x00D8, 0x00F6) + && !inrange(c, 0x00F8, 0x02FF) && !inrange(c, 0x0370, 0x037D) + && !inrange(c, 0x037F, 0x1FFF) && !inrange(c, 0x200C, 0x200D) + && !inrange(c, 0x2070, 0x218F) && !inrange(c, 0x2C00, 0x2FEF) + && !inrange(c, 0x3001, 0xD7FF) && !inrange(c, 0xF900, 0xFDCF) + && !inrange(c, 0xFDF0, 0xFFFD) && c != 0x00B7 && !inrange(c, 0x0300, 0x036F) + && !inrange(c, 0x203F, 0x2040)) + { + // https://www.w3.org/TR/xml11/#NT-NameChar + // (currently we don't warn about invalid start chars) + SAL_WARN("sax", "unexpected character in attribute name: " << rName); + assert(!"unexpected character in attribute name"); + } + } +#endif +} + +SaxInvalidCharacterError SaxWriterHelper::startElement(const OUString& rName, + const Reference<XAttributeList>& xAttribs) +{ + FinishStartElement(); + +#ifdef DBG_UTIL + m_DebugStartedElements.push(rName); + ::std::set<OUString> DebugAttributes; +#endif + + mp_Sequence[nCurrentPos] = '<'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + SaxInvalidCharacterError eRet(SAX_NONE); + CheckValidName(rName); + if (!writeString(rName, false, false)) + eRet = SAX_ERROR; + + sal_Int16 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0; + for (sal_Int16 i = 0; i < nAttribCount; i++) + { + mp_Sequence[nCurrentPos] = ' '; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + OUString const& rAttrName(xAttribs->getNameByIndex(i)); +#ifdef DBG_UTIL + // Well-formedness constraint: Unique Att Spec + assert(DebugAttributes.find(rAttrName) == DebugAttributes.end()); + DebugAttributes.insert(rAttrName); +#endif + CheckValidName(rAttrName); + if (!writeString(rAttrName, false, false)) + eRet = SAX_ERROR; + + mp_Sequence[nCurrentPos] = '='; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '"'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + if (!writeString(xAttribs->getValueByIndex(i), true, true) && eRet != SAX_ERROR) + eRet = SAX_WARNING; + + mp_Sequence[nCurrentPos] = '"'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + } + + m_bStartElementFinished = false; // because the '>' character is not added, + // because it is possible, that the "/>" + // characters have to add + return eRet; +} + +bool SaxWriterHelper::FinishEmptyElement() +{ + if (m_bStartElementFinished) + return false; + + mp_Sequence[nCurrentPos] = '/'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '>'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + m_bStartElementFinished = true; + + return true; +} + +bool SaxWriterHelper::endElement(const OUString& rName) +{ + FinishStartElement(); + + mp_Sequence[nCurrentPos] = '<'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '/'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + CheckValidName(rName); + bool bRet(writeString(rName, false, false)); + + mp_Sequence[nCurrentPos] = '>'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + return bRet; +} + +void SaxWriterHelper::endDocument() +{ + if (nCurrentPos > 0) + { + m_Sequence.realloc(nCurrentPos); + nCurrentPos = writeSequence(); + //m_Sequence.realloc(SEQUENCESIZE); + } +} + +void SaxWriterHelper::clearBuffer() +{ + FinishStartElement(); + if (nCurrentPos > 0) + { + m_Sequence.realloc(nCurrentPos); + nCurrentPos = writeSequence(); + m_Sequence.realloc(SEQUENCESIZE); + // Be sure to update the array pointer after the reallocation. + mp_Sequence = m_Sequence.getArray(); + } +} + +bool SaxWriterHelper::processingInstruction(const OUString& rTarget, const OUString& rData) +{ + FinishStartElement(); + mp_Sequence[nCurrentPos] = '<'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '?'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + bool bRet(writeString(rTarget, false, false)); + + mp_Sequence[nCurrentPos] = ' '; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + if (!writeString(rData, false, false)) + bRet = false; + + mp_Sequence[nCurrentPos] = '?'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '>'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + return bRet; +} + +void SaxWriterHelper::startCDATA() +{ + FinishStartElement(); + if ((nCurrentPos + 9) <= SEQUENCESIZE) + { + memcpy(&(mp_Sequence[nCurrentPos]), "<![CDATA[", 9); + nCurrentPos += 9; + } + else + AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const*>("<![CDATA["), 9); + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); +} + +void SaxWriterHelper::endCDATA() +{ + FinishStartElement(); + if ((nCurrentPos + 3) <= SEQUENCESIZE) + { + memcpy(&(mp_Sequence[nCurrentPos]), "]]>", 3); + nCurrentPos += 3; + } + else + AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const*>("]]>"), 3); + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); +} + +bool SaxWriterHelper::comment(const OUString& rComment) +{ + FinishStartElement(); + mp_Sequence[nCurrentPos] = '<'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '!'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '-'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '-'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + bool bRet(writeString(rComment, false, false)); + + mp_Sequence[nCurrentPos] = '-'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '-'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + mp_Sequence[nCurrentPos] = '>'; + nCurrentPos++; + if (nCurrentPos == SEQUENCESIZE) + nCurrentPos = writeSequence(); + + return bRet; +} + +sal_Int32 SaxWriterHelper::calcXMLByteLength(const OUString& rStr, bool bDoNormalization, + bool bNormalizeWhitespace) +{ + sal_Int32 nOutputLength = 0; + sal_uInt32 nSurrogate = 0; + + const sal_Unicode* pStr = rStr.getStr(); + sal_Int32 nStrLen = rStr.getLength(); + for (sal_Int32 i = 0; i < nStrLen; i++) + { + sal_uInt16 c = pStr[i]; + if (!IsInvalidChar(c) && (c >= 0x0001) && (c <= 0x007F)) + { + if (bDoNormalization) + { + switch (c) + { + case '&': // resemble to & + nOutputLength += 5; + break; + case '<': // < + case '>': // > + nOutputLength += 4; + break; + case '\'': // ' + case '"': // " + case 13: // 
 + nOutputLength += 6; + break; + + case 10: // 
 + case 9: // 	 + if (bNormalizeWhitespace) + { + nOutputLength += 6; + } + else + { + nOutputLength++; + } + break; + default: + nOutputLength++; + } + } + else + { + nOutputLength++; + } + } + else + { + // Deal with replacements + if (bDoNormalization && !m_Replacements.empty()) + { + // search + const ReplacementPair* it = findXMLReplacement(&pStr[i], nStrLen - i); + + if (it != nullptr) + { + nOutputLength + += ::rtl::OUStringToOString(it->name, RTL_TEXTENCODING_UTF8).getLength(); + i += it->replacement.getLength() - 1; + continue; + } + } + + // Deal with other unicode cases + if (rtl::isHighSurrogate(c)) + { + // save surrogate + nSurrogate = c; + } + else if (rtl::isLowSurrogate(c)) + { + // 2. surrogate: write as UTF-8 (if range is OK + if (nSurrogate) + nOutputLength += 4; + nSurrogate = 0; + } + else if (c > 0x07FF) + { + nOutputLength += 3; + } + else + { + nOutputLength += 2; + } + } + + // surrogate processing + if ((nSurrogate != 0) && !rtl::isHighSurrogate(c)) + nSurrogate = 0; + } + + return nOutputLength; +} + +const ReplacementPair* SaxWriterHelper::findXMLReplacement(const sal_Unicode* pStr, + sal_Int32 nStrLen) +{ + for (size_t iter = 0; iter < m_Replacements.size(); ++iter) + { + if (m_Replacements[iter].replacement.getLength() > nStrLen) + continue; + sal_Int32 matches = m_Replacements[iter].replacement.compareTo( + std::u16string_view(pStr, m_Replacements[iter].replacement.getLength())); + if (matches == 0) + return &m_Replacements[iter]; + if (matches > 0) + return nullptr; + } + return nullptr; +} + +class SAXWriter : public WeakImplHelper<XWriter, XServiceInfo> +{ +public: + SAXWriter() + : m_bDocStarted(false) + , m_bIsCDATA(false) + , m_bForceLineBreak(false) + , m_bAllowLineBreak(false) + , m_nLevel(0) + { + } + +public: // XActiveDataSource + virtual void SAL_CALL setOutputStream(const Reference<XOutputStream>& aStream) override + { + try + { + // temporary: set same stream again to clear buffer + if (m_out == aStream && m_pSaxWriterHelper && m_bDocStarted) + m_pSaxWriterHelper->clearBuffer(); + else + { + m_out = aStream; + m_pSaxWriterHelper.reset(new SaxWriterHelper(m_out)); + m_bDocStarted = false; + m_nLevel = 0; + m_bIsCDATA = false; + } + } + catch (const SAXException& e) + { + throw css::lang::WrappedTargetRuntimeException(e.Message, getXWeak(), + e.WrappedException); + } + } + virtual Reference<XOutputStream> SAL_CALL getOutputStream() override { return m_out; } + +public: // XDocumentHandler + virtual void SAL_CALL startDocument() override; + + virtual void SAL_CALL endDocument() override; + + virtual void SAL_CALL startElement(const OUString& aName, + const Reference<XAttributeList>& xAttribs) override; + + virtual void SAL_CALL endElement(const OUString& aName) override; + + virtual void SAL_CALL characters(const OUString& aChars) override; + + virtual void SAL_CALL ignorableWhitespace(const OUString& aWhitespaces) override; + virtual void SAL_CALL processingInstruction(const OUString& aTarget, + const OUString& aData) override; + virtual void SAL_CALL setDocumentLocator(const Reference<XLocator>& xLocator) override; + virtual void SAL_CALL setCustomEntityNames( + const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& + replacements) override; + +public: // XExtendedDocumentHandler + virtual void SAL_CALL startCDATA() override; + virtual void SAL_CALL endCDATA() override; + virtual void SAL_CALL comment(const OUString& sComment) override; + virtual void SAL_CALL unknown(const OUString& sString) override; + virtual void SAL_CALL allowLineBreak() override; + +public: // XServiceInfo + OUString SAL_CALL getImplementationName() override; + Sequence<OUString> SAL_CALL getSupportedServiceNames() override; + sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override; + +private: + sal_Int32 getIndentPrefixLength(sal_Int32 nFirstLineBreakOccurrence) noexcept; + + Reference<XOutputStream> m_out; + std::unique_ptr<SaxWriterHelper> m_pSaxWriterHelper; + + // Status information + bool m_bDocStarted : 1; + bool m_bIsCDATA : 1; + bool m_bForceLineBreak : 1; + bool m_bAllowLineBreak : 1; + sal_Int32 m_nLevel; +}; + +sal_Int32 SAXWriter::getIndentPrefixLength(sal_Int32 nFirstLineBreakOccurrence) noexcept +{ + sal_Int32 nLength = -1; + if (m_pSaxWriterHelper) + { + if (m_bForceLineBreak + || (m_bAllowLineBreak + && ((nFirstLineBreakOccurrence + m_pSaxWriterHelper->GetLastColumnCount()) + > MAXCOLUMNCOUNT))) + nLength = m_nLevel; + } + m_bForceLineBreak = false; + m_bAllowLineBreak = false; + return nLength; +} + +bool isFirstCharWhitespace(const sal_Unicode* p) noexcept { return *p == ' '; } + +// XServiceInfo +OUString SAXWriter::getImplementationName() { return "com.sun.star.extensions.xml.sax.Writer"; } + +// XServiceInfo +sal_Bool SAXWriter::supportsService(const OUString& ServiceName) +{ + return cppu::supportsService(this, ServiceName); +} + +// XServiceInfo +Sequence<OUString> SAXWriter::getSupportedServiceNames() +{ + return { "com.sun.star.xml.sax.Writer" }; +} + +void SAXWriter::startDocument() +{ + if (m_bDocStarted || !m_out.is() || !m_pSaxWriterHelper) + { + throw SAXException(); + } + m_bDocStarted = true; + m_pSaxWriterHelper->startDocument(); +} + +void SAXWriter::endDocument() +{ + if (!m_bDocStarted) + { + throw SAXException("endDocument called before startDocument", Reference<XInterface>(), + Any()); + } + if (m_nLevel) + { + throw SAXException("unexpected end of document", Reference<XInterface>(), Any()); + } + m_pSaxWriterHelper->endDocument(); + try + { + m_out->closeOutput(); + } + catch (const IOException&) + { + css::uno::Any anyEx = cppu::getCaughtException(); + throw SAXException("IO exception during closing the IO Stream", Reference<XInterface>(), + anyEx); + } +} + +void SAXWriter::startElement(const OUString& aName, const Reference<XAttributeList>& xAttribs) +{ + if (!m_bDocStarted) + { + throw SAXException("startElement called before startDocument", {}, {}); + } + if (m_bIsCDATA) + { + throw SAXException("startElement call not allowed with CDATA sections", {}, {}); + } + + sal_Int32 nLength(0); + if (m_bAllowLineBreak) + { + sal_Int32 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0; + + nLength++; // "<" + nLength += m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); // the tag name + + sal_Int16 n; + for (n = 0; n < static_cast<sal_Int16>(nAttribCount); n++) + { + nLength++; // " " + OUString tmp = xAttribs->getNameByIndex(n); + + nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, false, false); + + nLength += 2; // =" + + tmp = xAttribs->getValueByIndex(n); + + nLength += m_pSaxWriterHelper->calcXMLByteLength(tmp, true, true); + + nLength += 1; // " + } + + nLength++; // '>' + } + + // Is there a new indentation necessary ? + sal_Int32 nPrefix(getIndentPrefixLength(nLength)); + + // write into sequence + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + SaxInvalidCharacterError eRet(m_pSaxWriterHelper->startElement(aName, xAttribs)); + + m_nLevel++; + + if (eRet == SAX_WARNING) + { + throw SAXInvalidCharacterException( + "Invalid character during XML-Export in an attribute value", {}, {}); + } + else if (eRet == SAX_ERROR) + { + throw SAXException("Invalid character during XML-Export", {}, {}); + } +} + +void SAXWriter::endElement(const OUString& aName) +{ + if (!m_bDocStarted) + { + throw SAXException(); + } + m_nLevel--; + + if (m_nLevel < 0) + { + throw SAXException(); + } + bool bRet(true); + + // check here because Helper's endElement is not always called +#ifdef DBG_UTIL + assert(!m_pSaxWriterHelper->m_DebugStartedElements.empty()); + // Well-formedness constraint: Element Type Match + assert(aName == m_pSaxWriterHelper->m_DebugStartedElements.top()); + m_pSaxWriterHelper->m_DebugStartedElements.pop(); +#endif + + if (m_pSaxWriterHelper->FinishEmptyElement()) + m_bForceLineBreak = false; + else + { + // only ascii chars allowed + sal_Int32 nLength(0); + if (m_bAllowLineBreak) + nLength = 3 + m_pSaxWriterHelper->calcXMLByteLength(aName, false, false); + sal_Int32 nPrefix = getIndentPrefixLength(nLength); + + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + bRet = m_pSaxWriterHelper->endElement(aName); + } + + if (!bRet) + { + throw SAXException("Invalid character during XML-Export", {}, {}); + } +} + +void SAXWriter::characters(const OUString& aChars) +{ + if (!m_bDocStarted) + { + throw SAXException("characters method called before startDocument", {}, {}); + } + + bool bThrowException(false); + if (!aChars.isEmpty()) + { + if (m_bIsCDATA) + bThrowException = !m_pSaxWriterHelper->writeString(aChars, false, false); + else + { + // Note : nFirstLineBreakOccurrence is not exact, because we don't know, how + // many 2 and 3 byte chars are inbetween. However this whole stuff + // is eitherway for pretty printing only, so it does not need to be exact. + sal_Int32 nLength(0); + sal_Int32 nIndentPrefix(-1); + if (m_bAllowLineBreak) + { + // returns position of first ascii 10 within the string, -1 when no 10 in string. + sal_Int32 nFirstLineBreakOccurrence = aChars.indexOf(LINEFEED); + + nLength = m_pSaxWriterHelper->calcXMLByteLength(aChars, !m_bIsCDATA, false); + nIndentPrefix = getIndentPrefixLength( + nFirstLineBreakOccurrence >= 0 ? nFirstLineBreakOccurrence : nLength); + } + else + nIndentPrefix = getIndentPrefixLength(nLength); + + // insert indentation + if (nIndentPrefix >= 0) + { + if (isFirstCharWhitespace(aChars.getStr())) + m_pSaxWriterHelper->insertIndentation(nIndentPrefix - 1); + else + m_pSaxWriterHelper->insertIndentation(nIndentPrefix); + } + bThrowException = !m_pSaxWriterHelper->writeString(aChars, true, false); + } + } + if (bThrowException) + { + throw SAXInvalidCharacterException("Invalid character during XML-Export", {}, {}); + } +} + +void SAXWriter::ignorableWhitespace(const OUString&) +{ + if (!m_bDocStarted) + { + throw SAXException(); + } + + m_bForceLineBreak = true; +} + +void SAXWriter::processingInstruction(const OUString& aTarget, const OUString& aData) +{ + if (!m_bDocStarted || m_bIsCDATA) + { + throw SAXException(); + } + + sal_Int32 nLength(0); + if (m_bAllowLineBreak) + { + nLength = 2; // "<?" + nLength += m_pSaxWriterHelper->calcXMLByteLength(aTarget, false, false); + + nLength += 1; // " " + + nLength += m_pSaxWriterHelper->calcXMLByteLength(aData, false, false); + + nLength += 2; // "?>" + } + + sal_Int32 nPrefix = getIndentPrefixLength(nLength); + + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + if (!m_pSaxWriterHelper->processingInstruction(aTarget, aData)) + { + throw SAXException("Invalid character during XML-Export", {}, {}); + } +} + +void SAXWriter::setDocumentLocator(const Reference<XLocator>&) {} + +void SAXWriter::setCustomEntityNames( + const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements) +{ + m_pSaxWriterHelper->setCustomEntityNames(replacements); +} + +void SAXWriter::startCDATA() +{ + if (!m_bDocStarted || m_bIsCDATA) + { + throw SAXException(); + } + + sal_Int32 nPrefix = getIndentPrefixLength(9); + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + m_pSaxWriterHelper->startCDATA(); + + m_bIsCDATA = true; +} + +void SAXWriter::endCDATA() +{ + if (!m_bDocStarted || !m_bIsCDATA) + { + throw SAXException("endCDATA was called without startCDATA", {}, {}); + } + + sal_Int32 nPrefix = getIndentPrefixLength(3); + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + m_pSaxWriterHelper->endCDATA(); + + m_bIsCDATA = false; +} + +void SAXWriter::comment(const OUString& sComment) +{ + if (!m_bDocStarted || m_bIsCDATA) + { + throw SAXException(); + } + + sal_Int32 nLength(0); + if (m_bAllowLineBreak) + { + nLength = 4; // "<!--" + nLength += m_pSaxWriterHelper->calcXMLByteLength(sComment, false, false); + + nLength += 3; + } + + sal_Int32 nPrefix = getIndentPrefixLength(nLength); + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + if (!m_pSaxWriterHelper->comment(sComment)) + { + throw SAXException("Invalid character during XML-Export", {}, {}); + } +} + +void SAXWriter::allowLineBreak() +{ + if (!m_bDocStarted || m_bAllowLineBreak) + { + throw SAXException(); + } + + m_bAllowLineBreak = true; +} + +void SAXWriter::unknown(const OUString& sString) +{ + if (!m_bDocStarted) + { + throw SAXException(); + } + if (m_bIsCDATA) + { + throw SAXException(); + } + + if (sString.startsWith("<?xml")) + return; + + sal_Int32 nLength(0); + if (m_bAllowLineBreak) + nLength = m_pSaxWriterHelper->calcXMLByteLength(sString, false, false); + + sal_Int32 nPrefix = getIndentPrefixLength(nLength); + if (nPrefix >= 0) + m_pSaxWriterHelper->insertIndentation(nPrefix); + + if (!m_pSaxWriterHelper->writeString(sString, false, false)) + { + throw SAXException("Invalid character during XML-Export", {}, {}); + } +} + +} // namespace + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface* +com_sun_star_extensions_xml_sax_Writer_get_implementation(css::uno::XComponentContext*, + css::uno::Sequence<css::uno::Any> const&) +{ + return cppu::acquire(new SAXWriter); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sax/source/expatwrap/xml2utf.cxx b/sax/source/expatwrap/xml2utf.cxx new file mode 100644 index 0000000000..5b3e4b9e1c --- /dev/null +++ b/sax/source/expatwrap/xml2utf.cxx @@ -0,0 +1,519 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ +#include <string.h> + +#include <algorithm> + +#include <sal/types.h> + +#include <rtl/textenc.h> +#include <rtl/tencinfo.h> +#include <com/sun/star/io/NotConnectedException.hpp> +#include <com/sun/star/io/XInputStream.hpp> +#include <xml2utf.hxx> +#include <memory> + + +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::io; + + +namespace sax_expatwrap { + +sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) +{ + if( ! m_in.is() ) { + throw NotConnectedException(); + } + if( ! m_bStarted ) { + // it should be possible to find the encoding attribute + // within the first 512 bytes == 128 chars in UCS-4 + nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead ); + } + + sal_Int32 nRead; + Sequence< sal_Int8 > seqStart; + while( true ) + { + nRead = m_in->readSomeBytes( seq , nMaxToRead ); + + if( nRead + seqStart.getLength()) + { + // if nRead is 0, the file is already eof. + if( ! m_bStarted && nRead ) + { + // ensure that enough data is available to parse encoding + if( seqStart.hasElements() ) + { + // prefix with what we had so far. + sal_Int32 nLength = seq.getLength(); + seq.realloc( seqStart.getLength() + nLength ); + + memmove (seq.getArray() + seqStart.getLength(), + seq.getConstArray(), + nLength); + memcpy (seq.getArray(), + seqStart.getConstArray(), + seqStart.getLength()); + } + + // autodetection with the first bytes + if( ! isEncodingRecognizable( seq ) ) + { + // remember what we have so far. + seqStart = seq; + + // read more ! + continue; + } + if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) { + // initialize decoding + initializeDecoding(); + } + seqStart = Sequence < sal_Int8 > (); + } + + // do the encoding + if( m_pText2Unicode && m_pUnicode2Text && + m_pText2Unicode->canContinue() ) { + + Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); + seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); + } + + if( ! m_bStarted ) + { + // it must now be ensured, that no encoding attribute exist anymore + // ( otherwise the expat-Parser will crash ) + // This must be done after decoding ! + // ( e.g. Files decoded in ucs-4 cannot be read properly ) + m_bStarted = true; + removeEncoding( seq ); + } + nRead = seq.getLength(); + } + + break; + } + return nRead; +} + +void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) +{ + const sal_Int8 *pSource = seq.getArray(); + if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) + return; + + // scan for encoding + OString str( reinterpret_cast<char const *>(pSource), seq.getLength() ); + + // cut sequence to first line break + // find first line break; + int nMax = str.indexOf( 10 ); + if( nMax >= 0 ) + { + str = str.copy( 0 , nMax ); + } + + int nFound = str.indexOf( " encoding" ); + if( nFound < 0 ) return; + + int nStop; + int nStart = str.indexOf( "\"" , nFound ); + if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) + { + nStart = str.indexOf( "'" , nFound ); + nStop = str.indexOf( "'" , nStart +1 ); + } + else + { + nStop = str.indexOf( "\"" , nStart +1); + } + + if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) + { + // remove encoding tag from file + memmove( &( seq.getArray()[nFound] ) , + &( seq.getArray()[nStop+1]) , + seq.getLength() - nStop -1); + seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); + } +} + +// Checks, if enough data has been accumulated to recognize the encoding +bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) +{ + const sal_Int8 *pSource = seq.getConstArray(); + bool bCheckIfFirstClosingBracketExists = false; + + if( seq.getLength() < 8 ) { + // no recognition possible, when less than 8 bytes are available + return false; + } + + if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) { + // scan if the <?xml tag finishes within this buffer + bCheckIfFirstClosingBracketExists = true; + } + else if( ('<' == pSource[0] || '<' == pSource[2] ) && + ('?' == pSource[4] || '?' == pSource[6] ) ) + { + // check for utf-16 + bCheckIfFirstClosingBracketExists = true; + } + else if( ( '<' == pSource[1] || '<' == pSource[3] ) && + ( '?' == pSource[5] || '?' == pSource[7] ) ) + { + // check for + bCheckIfFirstClosingBracketExists = true; + } + + if( bCheckIfFirstClosingBracketExists ) + { + // whole <?xml tag is valid + return std::find(seq.begin(), seq.end(), '>') != seq.end(); + } + + // No <? tag in front, no need for a bigger buffer + return true; +} + +bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) +{ + const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); + bool bReturn = true; + + if( seq.getLength() < 4 ) { + // no recognition possible, when less than 4 bytes are available + return false; + } + + // first level : detect possible file formats + if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) { + // scan for encoding + OString str( reinterpret_cast<const char *>(pSource), seq.getLength() ); + + // cut sequence to first line break + //find first line break; + int nMax = str.indexOf( 10 ); + if( nMax >= 0 ) + { + str = str.copy( 0 , nMax ); + } + + int nFound = str.indexOf( " encoding" ); + if( nFound >= 0 ) { + int nStop; + int nStart = str.indexOf( "\"" , nFound ); + if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) + { + nStart = str.indexOf( "'" , nFound ); + nStop = str.indexOf( "'" , nStart +1 ); + } + else + { + nStop = str.indexOf( "\"" , nStart +1); + } + if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) + { + // encoding found finally + m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); + } + } + } + else if( 0xFE == pSource[0] && + 0xFF == pSource[1] ) { + // UTF-16 big endian + // conversion is done so that encoding information can be easily extracted + m_sEncoding = "utf-16"_ostr; + } + else if( 0xFF == pSource[0] && + 0xFE == pSource[1] ) { + // UTF-16 little endian + // conversion is done so that encoding information can be easily extracted + m_sEncoding = "utf-16"_ostr; + } + else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { + // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) + // The byte order mark is simply added + + // simply add the byte order mark ! + seq.realloc( seq.getLength() + 2 ); + memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); + reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE; + reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF; + + m_sEncoding = "utf-16"_ostr; + } + else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { + // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) + // The byte order mark is simply added + + seq.realloc( seq.getLength() + 2 ); + memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); + reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF; + reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE; + + m_sEncoding = "utf-16"_ostr; + } + else if( 0xEF == pSource[0] && + 0xBB == pSource[1] && + 0xBF == pSource[2] ) + { + // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order + // The BOM is removed. + memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); + seq.realloc( seq.getLength() - 3 ); + m_sEncoding = "utf-8"_ostr; + } + else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { + // UCS-4 big endian + m_sEncoding = "ucs-4"_ostr; + } + else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { + // UCS-4 little endian + m_sEncoding = "ucs-4"_ostr; + } +/* TODO: no need to test for the moment since we return sal_False like default case anyway + else if( 0x4c == pSource[0] && 0x6f == pSource[1] && + 0xa7 == static_cast<unsigned char> (pSource[2]) && + 0x94 == static_cast<unsigned char> (pSource[3]) ) { + // EBCDIC + bReturn = sal_False; // must be extended + } +*/ + else { + // other + // UTF8 is directly recognized by the parser. + bReturn = false; + } + + return bReturn; +} + +void XMLFile2UTFConverter::initializeDecoding() +{ + + if( !m_sEncoding.isEmpty() ) + { + rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); + if( encoding != RTL_TEXTENCODING_UTF8 ) + { + m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding ); + m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 ); + } + } +} + + +// Text2UnicodeConverter + + +Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) + : m_convText2Unicode(nullptr) + , m_contextText2Unicode(nullptr) +{ + rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); + if( RTL_TEXTENCODING_DONTKNOW == encoding ) + { + m_bCanContinue = false; + m_bInitialized = false; + } + else + { + init( encoding ); + } +} + +Text2UnicodeConverter::~Text2UnicodeConverter() +{ + if( m_bInitialized ) + { + rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); + rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); + } +} + +void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) +{ + m_bCanContinue = true; + m_bInitialized = true; + + m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); + m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); +} + + +Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) +{ + sal_uInt32 uiInfo; + sal_Size nSrcCvtBytes = 0; + sal_Size nTargetCount = 0; + sal_Size nSourceCount = 0; + + // the whole source size + sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); + Sequence<sal_Unicode> seqUnicode ( nSourceSize ); + + const sal_Int8 *pbSource = seqText.getConstArray(); + std::unique_ptr<sal_Int8[]> pbTempMem; + + if( m_seqSource.hasElements() ) { + // put old rest and new byte sequence into one array + pbTempMem.reset(new sal_Int8[ nSourceSize ]); + memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() ); + memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); + pbSource = pbTempMem.get(); + + // set to zero again + m_seqSource = Sequence< sal_Int8 >(); + } + + while( true ) { + + /* All invalid characters are transformed to the unicode undefined char */ + nTargetCount += rtl_convertTextToUnicode( + m_convText2Unicode, + m_contextText2Unicode, + reinterpret_cast<const char *>(&( pbSource[nSourceCount] )), + nSourceSize - nSourceCount , + &( seqUnicode.getArray()[ nTargetCount ] ), + seqUnicode.getLength() - nTargetCount, + RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | + RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | + RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, + &uiInfo, + &nSrcCvtBytes ); + nSourceCount += nSrcCvtBytes; + + if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) { + // save necessary bytes for next conversion + seqUnicode.realloc( seqUnicode.getLength() * 2 ); + continue; + } + break; + } + if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) { + m_seqSource.realloc( nSourceSize - nSourceCount ); + memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); + } + + // set to correct unicode size + seqUnicode.realloc( nTargetCount ); + + return seqUnicode; +} + + +// Unicode2TextConverter + + +Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) +{ + m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); + m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); +} + + +Unicode2TextConverter::~Unicode2TextConverter() +{ + rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); + rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); +} + + +Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) +{ + std::unique_ptr<sal_Unicode[]> puTempMem; + + if( m_seqSource.hasElements() ) { + // For surrogates ! + // put old rest and new byte sequence into one array + // In general when surrogates are used, they should be rarely + // cut off between two convert()-calls. So this code is used + // rarely and the extra copy is acceptable. + puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]); + memcpy( puTempMem.get() , + m_seqSource.getConstArray() , + m_seqSource.getLength() * sizeof( sal_Unicode ) ); + memcpy( + &(puTempMem[ m_seqSource.getLength() ]) , + puSource , + nSourceSize*sizeof( sal_Unicode ) ); + puSource = puTempMem.get(); + nSourceSize += m_seqSource.getLength(); + + m_seqSource = Sequence< sal_Unicode > (); + } + + + sal_Size nTargetCount = 0; + sal_Size nSourceCount = 0; + + sal_uInt32 uiInfo; + sal_Size nSrcCvtChars; + + // take nSourceSize * 3 as preference + // this is an upper boundary for converting to utf8, + // which most often used as the target. + sal_Int32 nSeqSize = nSourceSize * 3; + + Sequence<sal_Int8> seqText( nSeqSize ); + char *pTarget = reinterpret_cast<char *>(seqText.getArray()); + while( true ) { + + nTargetCount += rtl_convertUnicodeToText( + m_convUnicode2Text, + m_contextUnicode2Text, + &( puSource[nSourceCount] ), + nSourceSize - nSourceCount , + &( pTarget[nTargetCount] ), + nSeqSize - nTargetCount, + RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | + RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , + &uiInfo, + &nSrcCvtChars); + nSourceCount += nSrcCvtChars; + + if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { + nSeqSize = nSeqSize *2; + seqText.realloc( nSeqSize ); // double array size + pTarget = reinterpret_cast<char *>(seqText.getArray()); + continue; + } + break; + } + + // for surrogates + if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { + m_seqSource.realloc( nSourceSize - nSourceCount ); + memcpy( m_seqSource.getArray() , + &(puSource[nSourceCount]), + (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); + } + + // reduce the size of the buffer (fast, no copy necessary) + seqText.realloc( nTargetCount ); + + return seqText; +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |