/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 . */ #include // boost using obsolete stuff #if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable:4996) #pragma warning(disable:4503) #endif // workaround windows compiler: do not include multi_pass.hpp #include #include #include #include #include #include #include #include #include #include #include #include #include // disable warnings again because someone along the line has enabled them // (we have included boost headers, what did you expect?) #if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable:4996) #pragma warning(disable:4503) #endif using namespace boost::spirit::classic; using namespace pdfparse; namespace { class StringEmitContext : public EmitContext { OStringBuffer m_aBuf; public: StringEmitContext() : m_aBuf(256) {} virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override { m_aBuf.append( static_cast(pBuf), nLen ); return true; } virtual unsigned int getCurPos() noexcept override { return m_aBuf.getLength(); } virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override { return (nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) ) && write( m_aBuf.getStr() + nOrigOffset, nLen ); } virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override { if( nOrigOffset+nLen < o3tl::make_unsigned(m_aBuf.getLength()) ) { memcpy( pBuf, m_aBuf.getStr()+nOrigOffset, nLen ); return nLen; } return 0; } OString getString() { return m_aBuf.makeStringAndClear(); } }; template< class iteratorT > class PDFGrammar : public grammar< PDFGrammar > { public: explicit PDFGrammar( iteratorT first ) : m_fDouble( 0.0 ), m_aGlobalBegin(std::move( first )) {} ~PDFGrammar() { if( !m_aObjectStack.empty() ) delete m_aObjectStack.front(); } double m_fDouble; std::vector< unsigned int > m_aUIntStack; std::vector< PDFEntry* > m_aObjectStack; OString m_aErrorString; iteratorT m_aGlobalBegin; public: struct pdf_string_parser { typedef nil_t result_t; template std::ptrdiff_t operator()(ScannerT const& scan, result_t&) const { std::ptrdiff_t len = 0; int nBraceLevel = 0; while( ! scan.at_end() ) { char c = *scan; if( c == ')' ) { nBraceLevel--; if( nBraceLevel < 0 ) break; } else if( c == '(' ) nBraceLevel++; else if( c == '\\' ) // ignore escaped braces { ++len; ++scan.first; // tdf#63054: avoid skipping spaces if( scan.first == scan.last ) // tdf#63054: avoid skipping spaces break; } ++len; ++scan; } return scan.at_end() ? -1 : len; } }; template< typename ScannerT > struct definition { explicit definition( const PDFGrammar& rSelf ) { using namespace boost::placeholders; PDFGrammar* pSelf = const_cast< PDFGrammar* >( &rSelf ); // workaround workshop compiler: comment_p doesn't work // comment = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )]; comment = lexeme_d[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ]; boolean = (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)]; // workaround workshop compiler: confix_p doesn't work //stream = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )]; stream = (str_p("stream") >> *(anychar_p - str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )]; name = lexeme_d[ ch_p('/') >> (*(anychar_p-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0'))) [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ]; // workaround workshop compiler: confix_p doesn't work //stringtype = ( confix_p("(",*anychar_p, ")") | // confix_p("<",*xdigit_p, ">") ) // [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)]; stringtype = ( ( ch_p('(') >> functor_parser() >> ch_p(')') ) | ( ch_p('<') >> *xdigit_p >> ch_p('>') ) ) [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)]; null_object = str_p( "null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)]; #ifdef USE_ASSIGN_ACTOR objectref = ( uint_p[push_back_a(pSelf->m_aUIntStack)] >> uint_p[push_back_a(pSelf->m_aUIntStack)] >> ch_p('R') >> eps_p )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)]; #else objectref = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)] >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)] >> ch_p('R') >> eps_p )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)]; #endif #ifdef USE_ASSIGN_ACTOR simple_type = objectref | name | ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p ) [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)] | stringtype | boolean | null_object; #else simple_type = objectref | name | ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p ) [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)] | stringtype | boolean | null_object; #endif dict_begin = str_p( "<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)]; dict_end = str_p( ">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)]; array_begin = str_p("[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)]; array_end = str_p("]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)]; #ifdef USE_ASSIGN_ACTOR object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)] >> uint_p[push_back_a(pSelf->m_aUIntStack)] >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)]; #else object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)] >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)] >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)]; #endif object_end = str_p( "endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)]; xref = str_p( "xref" ) >> uint_p >> uint_p >> lexeme_d[ +( repeat_p(10)[digit_p] >> blank_p >> repeat_p(5)[digit_p] >> blank_p >> ( ch_p('n') | ch_p('f') ) >> repeat_p(2)[space_p] ) ]; dict_element= dict_begin | comment | simple_type | array_begin | array_end | dict_end; object = object_begin >> *dict_element >> !stream >> object_end; trailer = str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)] >> *dict_element >> str_p("startxref") >> uint_p >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)]; #ifdef USE_ASSIGN_ACTOR pdfrule = ! (lexeme_d[ str_p( "%PDF-" ) >> uint_p[push_back_a(pSelf->m_aUIntStack)] >> ch_p('.') >> uint_p[push_back_a(pSelf->m_aUIntStack)] >> *((~ch_p('\r') & ~ch_p('\n'))) >> eol_p ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)] >> *( comment | object | ( xref >> trailer ) ); #else pdfrule = ! (lexeme_d[ str_p( "%PDF-" ) >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)] >> ch_p('.') >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)] >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)] >> *( comment | object | ( xref >> trailer ) ); #endif } rule< ScannerT > comment, stream, boolean, name, stringtype, null_object, simple_type, objectref, array, value, dict_element, dict_begin, dict_end, array_begin, array_end, object, object_begin, object_end, xref, trailer, pdfrule; const rule< ScannerT >& start() const { return pdfrule; } }; #ifndef USE_ASSIGN_ACTOR void push_back_action_uint( unsigned int i ) { m_aUIntStack.push_back( i ); } void assign_action_double( double d ) { m_fDouble = d; } #endif static void parseError( const char* pMessage, iteratorT pLocation ) { throw_( pLocation, pMessage ); } OString iteratorToString( iteratorT first, iteratorT last ) const { OStringBuffer aStr( 32 ); while( first != last ) { aStr.append( *first ); ++first; } return aStr.makeStringAndClear(); } void haveFile( iteratorT pBegin, SAL_UNUSED_PARAMETER iteratorT /*pEnd*/ ) { if( m_aObjectStack.empty() ) { PDFFile* pFile = new PDFFile(); pFile->m_nMinor = m_aUIntStack.back(); m_aUIntStack.pop_back(); pFile->m_nMajor = m_aUIntStack.back(); m_aUIntStack.pop_back(); m_aObjectStack.push_back( pFile ); } else parseError( "found file header in unusual place", pBegin ); } void pushComment( iteratorT first, iteratorT last ) { // add a comment to the current stack element PDFComment* pComment = new PDFComment(iteratorToString(first,last)); if( m_aObjectStack.empty() ) m_aObjectStack.push_back( new PDFPart() ); PDFContainer* pContainer = dynamic_cast(m_aObjectStack.back()); if( pContainer == nullptr ) parseError( "comment without container", first ); pContainer->m_aSubElements.emplace_back( pComment ); } void insertNewValue( std::unique_ptr pNewValue, iteratorT pPos ) { PDFContainer* pContainer = nullptr; const char* pMsg = nullptr; if( ! m_aObjectStack.empty() ) { pContainer = dynamic_cast(m_aObjectStack.back()); if (pContainer) { if( dynamic_cast(pContainer) == nullptr && dynamic_cast(pContainer) == nullptr ) { PDFObject* pObj = dynamic_cast(pContainer); if( pObj ) { if( pObj->m_pObject == nullptr ) pObj->m_pObject = pNewValue.get(); else { pMsg = "second value for object"; pContainer = nullptr; } } else if( dynamic_cast(pNewValue.get()) ) { PDFTrailer* pTrailer = dynamic_cast(pContainer); if( pTrailer ) { if( pTrailer->m_pDict == nullptr ) pTrailer->m_pDict = dynamic_cast(pNewValue.get()); else pContainer = nullptr; } else pContainer = nullptr; } else pContainer = nullptr; } } } if( pContainer ) pContainer->m_aSubElements.emplace_back( std::move(pNewValue) ); else { if( ! pMsg ) { if( dynamic_cast(pNewValue.get()) ) pMsg = "array without container"; else pMsg = "value without container"; } parseError( pMsg, pPos ); } } void pushName( iteratorT first, iteratorT last ) { insertNewValue( std::make_unique(iteratorToString(first,last)), first ); } void pushDouble( iteratorT first, SAL_UNUSED_PARAMETER iteratorT /*last*/ ) { insertNewValue( std::make_unique(m_fDouble), first ); } void pushString( iteratorT first, iteratorT last ) { insertNewValue( std::make_unique(iteratorToString(first,last)), first ); } void pushBool( iteratorT first, iteratorT last ) { insertNewValue( std::make_unique( last-first == 4 ), first ); } void pushNull( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { insertNewValue( std::make_unique(), first ); } void beginObject( iteratorT first, SAL_UNUSED_PARAMETER iteratorT /*last*/ ) { if( m_aObjectStack.empty() ) m_aObjectStack.push_back( new PDFPart() ); unsigned int nGeneration = m_aUIntStack.back(); m_aUIntStack.pop_back(); unsigned int nObject = m_aUIntStack.back(); m_aUIntStack.pop_back(); PDFObject* pObj = new PDFObject( nObject, nGeneration ); pObj->m_nOffset = first - m_aGlobalBegin; PDFContainer* pContainer = dynamic_cast(m_aObjectStack.back()); if( pContainer && ( dynamic_cast(pContainer) || dynamic_cast(pContainer) ) ) { pContainer->m_aSubElements.emplace_back( pObj ); m_aObjectStack.push_back( pObj ); } else parseError( "object in wrong place", first ); } void endObject( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { if( m_aObjectStack.empty() ) parseError( "endobj without obj", first ); else if( dynamic_cast(m_aObjectStack.back()) == nullptr ) parseError( "spurious endobj", first ); else m_aObjectStack.pop_back(); } void pushObjectRef( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { unsigned int nGeneration = m_aUIntStack.back(); m_aUIntStack.pop_back(); unsigned int nObject = m_aUIntStack.back(); m_aUIntStack.pop_back(); insertNewValue( std::make_unique(nObject,nGeneration), first ); } void beginDict( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { PDFDict* pDict = new PDFDict(); pDict->m_nOffset = first - m_aGlobalBegin; insertNewValue( std::unique_ptr(pDict), first ); // will not come here if insertion fails (exception) m_aObjectStack.push_back( pDict ); } void endDict( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { PDFDict* pDict = nullptr; if( m_aObjectStack.empty() ) parseError( "dictionary end without begin", first ); else if( (pDict = dynamic_cast(m_aObjectStack.back())) == nullptr ) parseError( "spurious dictionary end", first ); else m_aObjectStack.pop_back(); PDFEntry* pOffender = pDict->buildMap(); if( pOffender ) { StringEmitContext aCtx; aCtx.write( "offending dictionary element: ", 30 ); pOffender->emit( aCtx ); m_aErrorString = aCtx.getString(); parseError( m_aErrorString.getStr(), first ); } } void beginArray( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { PDFArray* pArray = new PDFArray(); pArray->m_nOffset = first - m_aGlobalBegin; insertNewValue( std::unique_ptr(pArray), first ); // will not come here if insertion fails (exception) m_aObjectStack.push_back( pArray ); } void endArray( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { if( m_aObjectStack.empty() ) parseError( "array end without begin", first ); else if( dynamic_cast(m_aObjectStack.back()) == nullptr ) parseError( "spurious array end", first ); else m_aObjectStack.pop_back(); } void emitStream( iteratorT first, iteratorT last ) { if( m_aObjectStack.empty() ) parseError( "stream without object", first ); PDFObject* pObj = dynamic_cast(m_aObjectStack.back()); if( pObj && pObj->m_pObject ) { if( pObj->m_pStream ) parseError( "multiple streams in object", first ); PDFDict* pDict = dynamic_cast(pObj->m_pObject); if( pDict ) { PDFStream* pStream = new PDFStream( first - m_aGlobalBegin, last - m_aGlobalBegin, pDict ); pObj->m_pStream = pStream; pObj->m_aSubElements.emplace_back( pStream ); } } else parseError( "stream without object", first ); } void beginTrailer( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { if( m_aObjectStack.empty() ) m_aObjectStack.push_back( new PDFPart() ); PDFTrailer* pTrailer = new PDFTrailer(); pTrailer->m_nOffset = first - m_aGlobalBegin; PDFContainer* pContainer = dynamic_cast(m_aObjectStack.back()); if( pContainer && ( dynamic_cast(pContainer) || dynamic_cast(pContainer) ) ) { pContainer->m_aSubElements.emplace_back( pTrailer ); m_aObjectStack.push_back( pTrailer ); } else parseError( "trailer in wrong place", first ); } void endTrailer( iteratorT first, SAL_UNUSED_PARAMETER iteratorT ) { if( m_aObjectStack.empty() ) parseError( "%%EOF without trailer", first ); else if( dynamic_cast(m_aObjectStack.back()) == nullptr ) parseError( "spurious %%EOF", first ); else m_aObjectStack.pop_back(); } }; } std::unique_ptr PDFReader::read(std::u16string_view aFileName) { #ifdef _WIN32 file_iterator<> file_start(std::wstring(o3tl::toW(aFileName))); #else file_iterator<> file_start( std::string(OUStringToOString(aFileName, osl_getThreadTextEncoding()))); #endif if( ! file_start ) return nullptr; file_iterator<> file_end = file_start.make_end(); PDFGrammar< file_iterator<> > aGrammar( file_start ); try { #if OSL_DEBUG_LEVEL > 0 boost::spirit::classic::parse_info< file_iterator<> > aInfo = #endif boost::spirit::classic::parse( file_start, file_end, aGrammar, boost::spirit::classic::space_p ); #if OSL_DEBUG_LEVEL > 0 SAL_INFO("sdext.pdfimport.pdfparse", "parseinfo: stop at offset = " << aInfo.stop - file_start << ", hit = " << (aInfo.hit ? "true" : "false") << ", full = " << (aInfo.full ? "true" : "false") << ", length = " << aInfo.length); #endif } catch( const parser_error< const char*, file_iterator<> >& rError ) { SAL_WARN("sdext.pdfimport.pdfparse", "parse error: " << rError.descriptor << " at buffer pos " << rError.where - file_start); #if OSL_DEBUG_LEVEL > 0 OUStringBuffer aTmp; unsigned int nElem = aGrammar.m_aObjectStack.size(); for( unsigned int i = 0; i < nElem; i++ ) { aTmp.append(" "); aTmp.appendAscii(typeid( *(aGrammar.m_aObjectStack[i]) ).name()); } SAL_WARN("sdext.pdfimport.pdfparse", "parse error object stack: " << aTmp.makeStringAndClear()); #endif } std::unique_ptr pRet; unsigned int nEntries = aGrammar.m_aObjectStack.size(); if( nEntries == 1 ) { pRet.reset(aGrammar.m_aObjectStack.back()); aGrammar.m_aObjectStack.pop_back(); } else if( nEntries > 1 ) { // It is possible that there are multiple trailers, which is OK. // But still keep the warnings, just in case. SAL_WARN("sdext.pdfimport.pdfparse", "error got " << nEntries << " stack objects in parse"); for (;;) { PDFEntry* pEntry = aGrammar.m_aObjectStack.back(); aGrammar.m_aObjectStack.pop_back(); SAL_WARN("sdext.pdfimport.pdfparse", typeid(*pEntry).name()); PDFObject* pObj = dynamic_cast(pEntry); if( pObj ) SAL_WARN("sdext.pdfimport.pdfparse", " -> object " << pObj->m_nNumber << " generation " << pObj->m_nGeneration); if (aGrammar.m_aObjectStack.empty()) { pRet.reset(pEntry); // The first entry references all others - see PDFGrammar dtor break; } } } return pRet; } #if defined(_MSC_VER) #pragma warning(pop) #endif /* vim:set shiftwidth=4 softtabstop=4 expandtab: */