summary refs log tree commit diff stats
path: root/oox/source/vml/vmlinputstream.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'oox/source/vml/vmlinputstream.cxx')
-rw-r--r-- oox/source/vml/vmlinputstream.cxx 393
1 files changed, 393 insertions, 0 deletions
diff --git a/oox/source/vml/vmlinputstream.cxx b/oox/source/vml/vmlinputstream.cxx
new file mode 100644
index 000000000..4f9420761
--- /dev/null
+++ b/oox/source/vml/vmlinputstream.cxx
@@ -0,0 +1,393 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <oox/vml/vmlinputstream.hxx>
+
+#include <com/sun/star/io/IOException.hpp>
+#include <com/sun/star/io/XTextInputStream2.hpp>
+#include <map>
+#include <string.h>
+#include <rtl/strbuf.hxx>
+#include <osl/diagnose.h>
+#include <oox/helper/textinputstream.hxx>
+
+namespace oox::vml {
+
+using namespace ::com::sun::star::io;
+using namespace ::com::sun::star::uno;
+
+namespace {
+
+/** Searches the character range [pcBeg, pcEnd) for the first occurrence of
+    the character cChar.
+
+    @return  Pointer to the first match, or pcEnd if the search reports a
+        negative index (character not found).
+ */
+const char* lclFindCharacter( const char* pcBeg, const char* pcEnd, char cChar )
+{
+    const sal_Int32 nLength = static_cast< sal_Int32 >( pcEnd - pcBeg );
+    const sal_Int32 nFound = rtl_str_indexOfChar_WithLength( pcBeg, nLength, cChar );
+    if( nFound >= 0 )
+        return pcBeg + nFound;
+    return pcEnd;
+}
+
+/** Returns true for every character code in the range 0...32, i.e. the space
+    character and all codes below it (tab, line feed, NUL, ...). Characters
+    with a negative value (high bytes where char is signed) are never
+    treated as whitespace.
+ */
+bool lclIsWhiteSpace( char cChar )
+{
+    if( cChar < 0 )
+        return false;
+    return cChar <= 32;
+}
+
+/** Returns a pointer to the first whitespace character (see lclIsWhiteSpace)
+    in the range [pcBeg, pcEnd), or pcEnd if the range contains none.
+ */
+const char* lclFindWhiteSpace( const char* pcBeg, const char* pcEnd )
+{
+    const char* pcPos = pcBeg;
+    while( pcPos < pcEnd )
+    {
+        if( lclIsWhiteSpace( *pcPos ) )
+            return pcPos;
+        ++pcPos;
+    }
+    return pcEnd;
+}
+
+/** Returns a pointer to the first character in the range [pcBeg, pcEnd) that
+    is not whitespace (see lclIsWhiteSpace), or pcEnd if the range consists
+    of whitespace only.
+ */
+const char* lclFindNonWhiteSpace( const char* pcBeg, const char* pcEnd )
+{
+    const char* pcPos = pcBeg;
+    while( pcPos < pcEnd )
+    {
+        if( !lclIsWhiteSpace( *pcPos ) )
+            return pcPos;
+        ++pcPos;
+    }
+    return pcEnd;
+}
+
+/** Strips trailing whitespace from the range [pcBeg, pcEnd).
+
+    @return  The new end pointer: it points behind the last non-whitespace
+        character, or equals pcBeg if the range holds whitespace only.
+ */
+const char* lclTrimWhiteSpaceFromEnd( const char* pcBeg, const char* pcEnd )
+{
+    for( ; pcBeg < pcEnd; --pcEnd )
+    {
+        // stop at the first non-whitespace character seen from the back
+        if( !lclIsWhiteSpace( *(pcEnd - 1) ) )
+            break;
+    }
+    return pcEnd;
+}
+
+/** Appends the characters of the range [pcBeg, pcEnd) to the passed buffer. */
+void lclAppendToBuffer( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
+{
+    const sal_Int32 nLength = static_cast< sal_Int32 >( pcEnd - pcBeg );
+    rBuffer.append( pcBeg, nLength );
+}
+
+/** Parses the attribute list of an XML element and appends it to the buffer,
+    dropping repeated attributes.
+
+    The range [pcBeg, pcEnd) is expected to contain a sequence of
+    name="value" or name='value' definitions separated by whitespace. If an
+    attribute name occurs more than once, only its last definition is kept.
+    The kept definitions are written in the order in which they appear in
+    the passed text, each one preceded by a single space character.
+
+    If the text cannot be parsed completely (missing equality sign, empty
+    name, missing or unterminated quote, or no whitespace between two
+    attributes), the complete passed text is appended unchanged, preceded by
+    a single space character. The separator is needed because the caller
+    has already written the element name and has skipped the whitespace in
+    front of the attributes; without it the element name and the attribute
+    text would be fused into one token.
+
+    @param rBuffer  Receives the processed attribute list.
+    @param pcBeg  Points to the first character of the first attribute name.
+    @param pcEnd  Points behind the last character of the attribute text.
+ */
+void lclProcessAttribs( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
+{
+    /*  Maps each attribute name to the start pointer of its most recent
+        definition. Used to detect repeated attribute names; the pointer is
+        the key of the definition in the second map. */
+    ::std::map< OString, const char* > aNameToPos;
+
+    /*  Maps the start pointer of each kept definition to its full text.
+        Iterating this map visits the definitions in pointer order, which is
+        the order in which they appear in the passed text. */
+    ::std::map< const char*, OString > aPosToText;
+
+    bool bOk = true;
+    const char* pcNameBeg = pcBeg;
+    while( bOk && (pcNameBeg < pcEnd) )
+    {
+        // every early 'break' below leaves the loop in the error state
+        bOk = false;
+
+        // pcNameBeg points to the attribute name, find the equality sign
+        const char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
+        if( pcEqualSign >= pcEnd )
+            break;
+
+        // end of the name (whitespace in front of the equality sign is ignored)
+        const char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
+        if( pcNameEnd <= pcNameBeg )
+            break;
+
+        // the value must start with a single or double quote
+        const char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
+        if( (pcValueBeg >= pcEnd) || ((*pcValueBeg != '\'') && (*pcValueBeg != '"')) )
+            break;
+
+        // the value ends at the matching quote character
+        const char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
+        if( pcValueEnd >= pcEnd )
+            break;
+        ++pcValueEnd;   // include the closing quote
+
+        const OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
+        const OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
+
+        // register the name; if it is known already, drop its older definition
+        const auto aResult = aNameToPos.try_emplace( aAttribName, pcNameBeg );
+        if( !aResult.second )
+        {
+            aPosToText.erase( aResult.first->second );
+            aResult.first->second = pcNameBeg;
+        }
+        aPosToText[ pcNameBeg ] = aAttribData;
+
+        // continue behind this attribute; more text must start with whitespace
+        bOk = true;
+        pcNameBeg = pcValueEnd;
+        if( pcNameBeg < pcEnd )
+        {
+            bOk = lclIsWhiteSpace( *pcNameBeg );
+            if( bOk )
+                pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
+        }
+    }
+
+    if( bOk )
+    {
+        // no error has occurred: write the remaining attribute definitions
+        for( const auto& rEntry : aPosToText )
+            rBuffer.append( ' ' ).append( rEntry.second );
+    }
+    else
+    {
+        // on error, pass the complete text through, separated from the element name
+        rBuffer.append( ' ' );
+        lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
+    }
+}
+
+/** Processes the text of one XML element and appends the result to the
+    passed buffer.
+
+    Depending on the form of the passed text, one of the following happens:
+    - empty text: nothing is appended;
+    - text that does not start with '<' and end with '>': appended unchanged;
+    - parser instructions '<![...]>': skipped;
+    - XML prolog or processing instructions '<?...?>': appended unchanged;
+    - '<br>' (optionally with whitespace before '>'): replaced with LF;
+    - end elements '</...>': appended unchanged;
+    - all other (start and empty) elements: the element name is appended,
+      followed by the attributes as processed by lclProcessAttribs.
+
+    @param rBuffer  Receives the processed element text.
+    @param rElement  The element text including the enclosing brackets.
+ */
+void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
+{
+    const sal_Int32 nLength = rElement.getLength();
+    if( nLength == 0 )
+        return;
+
+    const char* const pcFirst = rElement.getStr();
+    const char* const pcLast = pcFirst + nLength - 1;
+
+    // no complete element found: just append all passed characters
+    const bool bIsElement = (pcFirst < pcLast) && (*pcFirst == '<') && (*pcLast == '>');
+    if( !bIsElement )
+    {
+        rBuffer.append( rElement );
+        return;
+    }
+
+    // skip parser instructions: '<![...]>'
+    if( (nLength >= 5) && (pcFirst[ 1 ] == '!') && (pcFirst[ 2 ] == '[') && (pcLast[ -1 ] == ']') )
+        return;
+
+    // just append any xml prolog (text directive) or processing instructions: <?...?>
+    if( (nLength >= 4) && (pcFirst[ 1 ] == '?') && (pcLast[ -1 ] == '?') )
+    {
+        rBuffer.append( rElement );
+        return;
+    }
+
+    // replace '<br>' element with newline
+    if( (nLength >= 4) && (pcFirst[ 1 ] == 'b') && (pcFirst[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcFirst + 3, pcLast ) == pcLast) )
+    {
+        rBuffer.append( '\n' );
+        return;
+    }
+
+    // append end elements without further processing
+    if( pcFirst[ 1 ] == '/' )
+    {
+        rBuffer.append( rElement );
+        return;
+    }
+
+    // start element or empty element: check it for repeated attributes
+
+    // text between the brackets, without the '/' of '<simpleelement/>'
+    const bool bSelfClosing = pcLast[ -1 ] == '/';
+    const char* const pcInnerBeg = pcFirst + 1;
+    const char* const pcInnerEnd = bSelfClosing ? (pcLast - 1) : pcLast;
+
+    // opening bracket and element name
+    const char* const pcNameEnd = lclFindWhiteSpace( pcInnerBeg, pcInnerEnd );
+    lclAppendToBuffer( rBuffer, pcFirst, pcNameEnd );
+
+    // attributes, if there are any
+    const char* const pcAttribs = lclFindNonWhiteSpace( pcNameEnd, pcInnerEnd );
+    if( pcAttribs < pcInnerEnd )
+        lclProcessAttribs( rBuffer, pcAttribs, pcInnerEnd );
+
+    // close the element
+    if( bSelfClosing )
+        rBuffer.append( '/' );
+    rBuffer.append( '>' );
+}
+
+/** Copies character data (the text in front of an XML element) to the passed
+    buffer and normalizes the whitespace it contains.
+
+    MSO stores and handles whitespace in an unusual way. The stream may
+    contain lots of spaces, tabs, and newlines which have to be handled as
+    one single space character. This function performs that reduction:
+    whitespace at the beginning of the text is dropped, and every other run
+    of whitespace characters is replaced with a single space character.
+
+    Conventions of the file format that are related to this, but are not
+    handled in this function:
+
+    - A literal line break in the element text is stored as a <br> tag
+      (without matching </br> element). The input stream wrapper replaces
+      this element with a literal LF character (see lclProcessElement).
+
+    - A single space character on its own is stored as is. Example: The
+      element
+          <font> </font>
+      represents a single space character. The XML parser will ignore this
+      space character completely without issuing a 'characters' event. The
+      VML import filter implementation has to react on this case manually.
+
+    - A single space character following another character is stored
+      literally and must not be stripped away here. Example: The element
+          <font>abc </font>
+      contains the three letters a, b, and c, followed by a space character.
+
+    - Consecutive space characters, or a leading single space character, are
+      stored in a <span> element. If there are N space characters (N > 1),
+      the <span> element contains exactly (N-1) NBSP (non-breaking space)
+      characters, followed by a regular space character. Examples: The
+      element
+          <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
+      represents 4 consecutive space characters. The element
+          <font><span style='mso-spacerun:yes'> abc</span></font>
+      represents a space character followed by the letters a, b, c. These
+      strings have to be handled by the VML import filter implementation.
+
+    @param rBuffer  Receives the normalized character data.
+    @param rChars  The character data, optionally followed by the opening
+        bracket '<' of the next XML element.
+    @return  True, if rChars ends with an opening bracket '<'. The bracket
+        itself is not appended to the buffer.
+ */
+bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
+{
+    const char* const pcBeg = rChars.getStr();
+    const char* pcEnd = pcBeg + rChars.getLength();
+
+    // a trailing opening bracket belongs to the next element, exclude it
+    const bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
+    if( bHasBracket )
+        --pcEnd;
+
+    // start at the first non-whitespace character, then copy word by word
+    for( const char* pcWord = lclFindNonWhiteSpace( pcBeg, pcEnd ); pcWord < pcEnd; )
+    {
+        const char* const pcGap = lclFindWhiteSpace( pcWord + 1, pcEnd );
+        rBuffer.append( pcWord, static_cast< sal_Int32 >( pcGap - pcWord ) );
+        // whitespace follows the word: it collapses to one space character
+        if( pcGap < pcEnd )
+            rBuffer.append( ' ' );
+        pcWord = lclFindNonWhiteSpace( pcGap, pcEnd );
+    }
+
+    return bHasBracket;
+}
+
+} // namespace
+
+constexpr OStringLiteral gaOpeningCData( "<![CDATA[" ); // element text starting with this marker is treated as a CDATA part
+constexpr OStringLiteral gaClosingCData( "]]>" ); // end tag that terminates a CDATA part
+
+/** Creates the stream wrapper on top of the passed binary input stream.
+
+    The input is accessed through a text input stream obtained from
+    TextInputStream::createXTextInputStream(). The delimiter sequences used
+    when reading up to the next opening or closing bracket are prepared
+    here, and the internal buffer starts out empty.
+
+    @throws IOException  if no text input stream could be obtained.
+ */
+InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
+    // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
+    mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
+    maOpeningBracket{ '<' },
+    maClosingBracket{ '>' },
+    mnBufferPos( 0 )
+{
+    const bool bHasTextStream = mxTextStrm.is();
+    if( !bHasTextStream )
+        throw IOException();
+}
+
+/** Nothing to release explicitly; all members clean up after themselves. */
+InputStream::~InputStream() = default;
+
+/** Reads up to nBytesToRead bytes of the processed stream into rData.
+
+    The sequence rData is resized to the requested size first, and shrunk
+    to the number of bytes actually read before returning.
+
+    The loop is driven by the contents of the internal buffer and not by the
+    EOF state of the text stream. The text stream may already be at EOF
+    while the last processed portion is still (partly) unread in maBuffer;
+    these bytes must be delivered too, otherwise they would be lost whenever
+    a previous request ended in the middle of the last portion.
+
+    @param rData  Receives the bytes read.
+    @param nBytesToRead  Number of bytes requested; must not be negative.
+    @return  Number of bytes written to rData. Less than nBytesToRead only
+        if the end of the stream has been reached.
+    @throws IOException  if nBytesToRead is negative.
+ */
+sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
+{
+    if( nBytesToRead < 0 )
+        throw IOException();
+
+    rData.realloc( nBytesToRead );
+    sal_Int8* pcDest = rData.getArray();
+    sal_Int32 nRet = 0;
+    while( nBytesToRead > 0 )
+    {
+        // refills maBuffer if it is used up and the text stream has more data
+        updateBuffer();
+        const sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
+        // buffer still empty after the update: the text stream is at EOF
+        if( nReadSize <= 0 )
+            break;
+
+        memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
+        mnBufferPos += nReadSize;
+        nBytesToRead -= nReadSize;
+        nRet += nReadSize;
+    }
+    if( nRet < rData.getLength() )
+        rData.realloc( nRet );
+    return nRet;
+}
+
+/** Forwards to readBytes(); this implementation does not distinguish between
+    the two read functions.
+
+    @param rData  Receives the bytes read.
+    @param nMaxBytesToRead  Maximum number of bytes to read.
+    @return  The value returned by readBytes().
+ */
+sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
+{
+    const sal_Int32 nRead = readBytes( rData, nMaxBytesToRead );
+    return nRead;
+}
+
+/** Skips up to nBytesToSkip bytes of the processed stream.
+
+    As in readBytes(), the loop is driven by the contents of the internal
+    buffer and not by the EOF state of the text stream, so that bytes of the
+    last processed portion which are still unread in maBuffer are skipped
+    as well after the text stream has reached EOF.
+
+    @param nBytesToSkip  Number of bytes to skip; must not be negative.
+    @throws IOException  if nBytesToSkip is negative.
+ */
+void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
+{
+    if( nBytesToSkip < 0 )
+        throw IOException();
+
+    while( nBytesToSkip > 0 )
+    {
+        // refills maBuffer if it is used up and the text stream has more data
+        updateBuffer();
+        const sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
+        // buffer still empty after the update: the text stream is at EOF
+        if( nSkipSize <= 0 )
+            break;
+
+        mnBufferPos += nSkipSize;
+        nBytesToSkip -= nSkipSize;
+    }
+}
+
+/** Returns the number of unread bytes in the internal buffer. The buffer is
+    updated first, so that the result is non-zero as long as the stream can
+    deliver more data.
+ */
+sal_Int32 SAL_CALL InputStream::available()
+{
+    updateBuffer();
+    const sal_Int32 nUnread = maBuffer.getLength() - mnBufferPos;
+    return nUnread;
+}
+
+/** Closes the wrapped text input stream. */
+void SAL_CALL InputStream::closeInput()
+{
+    mxTextStrm->closeInput();
+}
+
+// private --------------------------------------------------------------------
+
+/** Refills the internal buffer with the next processed portion of the text
+    stream, if all of its current contents have been consumed.
+
+    One portion consists of the character data up to the next opening
+    bracket (processed by lclProcessCharacters), followed by the text of one
+    XML element up to its closing bracket (processed by lclProcessElement).
+    A CDATA part is copied to the buffer without any processing. Portions
+    are read until one of them results in a non-empty buffer, or until the
+    text stream reports EOF.
+ */
+void InputStream::updateBuffer()
+{
+    for( ;; )
+    {
+        // done, if unread data is left or if nothing more can be read
+        const bool bDrained = mnBufferPos >= maBuffer.getLength();
+        if( !bDrained || mxTextStrm->isEOF() )
+            return;
+
+        // the new buffer contents are collected here
+        OStringBuffer aCollected;
+
+        // characters in front of the opening bracket of the next XML element
+        const OString aText = readToElementBegin();
+        const bool bFoundBracket = lclProcessCharacters( aCollected, aText );
+
+        // characters up to (and including) the closing bracket of the element
+        OSL_ENSURE( bFoundBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
+        if( bFoundBracket && !mxTextStrm->isEOF() )
+        {
+            // the opening bracket is not part of the text read now, prepend it
+            OString aElemText = "<" + readToElementEnd();
+            if( !aElemText.match( gaOpeningCData ) )
+            {
+                // regular element - process its contents
+                lclProcessElement( aCollected, aElemText );
+            }
+            else
+            {
+                // CDATA part: continue reading until the text ends with ']]>'
+                for( ;; )
+                {
+                    const bool bTerminated = (aElemText.getLength() >= gaClosingCData.getLength()) && aElemText.endsWith( gaClosingCData );
+                    if( bTerminated || mxTextStrm->isEOF() )
+                        break;
+                    aElemText += readToElementEnd();
+                }
+                // copy the entire CDATA part unchanged
+                aCollected.append( aElemText );
+            }
+        }
+
+        maBuffer = aCollected.makeStringAndClear();
+        mnBufferPos = 0;
+    }
+}
+
+/** Reads text from the text stream using the opening bracket as delimiter
+    and returns it as ISO-8859-1 encoded byte string.
+ */
+OString InputStream::readToElementBegin()
+{
+    const auto aUniText = mxTextStrm->readString( maOpeningBracket, false );
+    return OUStringToOString( aUniText, RTL_TEXTENCODING_ISO_8859_1 );
+}
+
+/** Reads text from the text stream using the closing bracket as delimiter
+    and returns it as ISO-8859-1 encoded byte string. In debug builds it is
+    asserted that the text ends with the closing bracket.
+ */
+OString InputStream::readToElementEnd()
+{
+    const auto aUniText = mxTextStrm->readString( maClosingBracket, false );
+    OString aText = OUStringToOString( aUniText, RTL_TEXTENCODING_ISO_8859_1 );
+    OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
+    return aText;
+}
+
+} // namespace oox::vml
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */