diff options
Diffstat (limited to 'filter/source/textfilterdetect/filterdetect.cxx')
-rw-r--r-- | filter/source/textfilterdetect/filterdetect.cxx | 247 |
1 files changed, 247 insertions, 0 deletions
diff --git a/filter/source/textfilterdetect/filterdetect.cxx b/filter/source/textfilterdetect/filterdetect.cxx new file mode 100644 index 000000000..9d25e289e --- /dev/null +++ b/filter/source/textfilterdetect/filterdetect.cxx @@ -0,0 +1,247 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include "filterdetect.hxx" + +#include <svtools/htmltokn.h> +#include <tools/urlobj.hxx> +#include <tools/zcodec.hxx> +#include <ucbhelper/content.hxx> +#include <unotools/mediadescriptor.hxx> +#include <unotools/streamwrap.hxx> +#include <unotools/ucbstreamhelper.hxx> + +#include <com/sun/star/io/XInputStream.hpp> +#include <cppuhelper/supportsservice.hxx> +#include <memory> + +constexpr OUStringLiteral WRITER_TEXT_FILTER = u"Text"; +constexpr OUStringLiteral CALC_TEXT_FILTER = u"Text - txt - csv (StarCalc)"; + +constexpr OUStringLiteral WEB_HTML_FILTER = u"HTML"; +constexpr OUStringLiteral WRITER_HTML_FILTER = u"HTML (StarWriter)"; +constexpr OUStringLiteral CALC_HTML_FILTER = u"calc_HTML_WebQuery"; + +constexpr OUStringLiteral WRITER_DOCSERVICE = u"com.sun.star.text.TextDocument"; +constexpr OUStringLiteral CALC_DOCSERVICE = u"com.sun.star.sheet.SpreadsheetDocument"; + +using namespace ::com::sun::star; +using utl::MediaDescriptor; + +namespace { + +bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream ) +{ + std::unique_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); + if ( !pInStream || pInStream->GetError() ) + // No stream + return false; + + // Read the stream header + pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + const sal_uInt64 nUniPos = pInStream->Tell(); + const sal_uInt16 nSize = 4096; + + OString sHeader; + if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode + sHeader = read_uInt8s_ToOString( *pInStream, nSize ); + else // UTF-16 (nUniPos = 2) + sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); + + // Now check whether the stream begins with a known HTML tag. + enum DetectPhase { BeforeTag, TagOpened, InTagName }; + DetectPhase dp = BeforeTag; + /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration. + enum DeclarationPhase + { + BeforeDeclaration, + DeclarationOpened + }; + DeclarationPhase eDeclaration = BeforeDeclaration; + + const char* pHeader = sHeader.getStr(); + const int nLength = sHeader.getLength(); + int i = 0, nStartOfTagIndex = 0; + + for ( i = 0; i < nLength; ++i, ++pHeader ) + { + char c = *pHeader; + if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f') + && eDeclaration == BeforeDeclaration) + { + if ( dp == TagOpened ) + return false; // Invalid: Should start with a tag name + else if ( dp == InTagName ) + break; // End of tag name reached + } + else if ( c == '<' ) + { + if ( dp == BeforeTag ) + dp = TagOpened; + else + return false; // Invalid: Nested '<' + } + else if ( c == '>' ) + { + if ( dp == InTagName ) + break; // End of tag name reached + else if (eDeclaration == DeclarationOpened) + { + dp = BeforeTag; + eDeclaration = BeforeDeclaration; + } + else + return false; // Invalid: Empty tag or before '<' + } + else if ( c == '!' ) + { + if ( dp == TagOpened ) + return true; // "<!" - DOCTYPE or comments block + else + return false; // Invalid: '!' before '<' or inside tag name + } + else + { + if ( dp == BeforeTag ) + return false; // Invalid: Should start with a tag + else if ( dp == TagOpened ) + { + if (c == '?' && eDeclaration == BeforeDeclaration) + eDeclaration = DeclarationOpened; + else if (eDeclaration == BeforeDeclaration) + { + nStartOfTagIndex = i; + dp = InTagName; + } + } + } + } + + // The string following '<' has to be a known HTML token. + OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex ); + return GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != HtmlTokenId::NONE; +} +} + +PlainTextFilterDetect::PlainTextFilterDetect() {} + +PlainTextFilterDetect::~PlainTextFilterDetect() {} + +OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) +{ + MediaDescriptor aMediaDesc(lDescriptor); + + OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME, OUString() ); + OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE, OUString() ); + + if ((aType == "generic_HTML") || (aType == "calc_HTML")) + { + uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY); + if (!xInStream.is() || !IsHTMLStream(xInStream)) + return OUString(); + + if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML")) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_HTML_FILTER); + else if (aDocService == WRITER_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_HTML_FILTER); + else + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WEB_HTML_FILTER); + } + + else if (aType == "generic_Text") + { + uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM], uno::UNO_QUERY); + uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY); + if (xStream.is() || xInStream.is()) + { + ZCodec aCodecGZ; + std::unique_ptr<SvStream> pInStream; + if (xStream.is()) + pInStream = utl::UcbStreamHelper::CreateStream(xStream); + else + pInStream = utl::UcbStreamHelper::CreateStream(xInStream); + std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream()); + if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream)) + { + uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream))); + aMediaDesc[MediaDescriptor::PROP_STREAM] <<= xStreamDecompressed; + aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM] <<= xStreamDecompressed->getInputStream(); + OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() ); + sal_Int32 nIdx = aURL.lastIndexOf(".gz"); + if (nIdx != -1) + aMediaDesc[MediaDescriptor::PROP_URL] <<= aURL.copy(0, nIdx); + } + } + // Get the file name extension. + INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() ) ); + OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DecodeMechanism::WithCharset); + aExt = aExt.toAsciiLowerCase(); + OUString aName = aParser.getName().toAsciiLowerCase(); + + // Decide which filter to use based on the document service first, + // then on extension if that's not available. + + if (aDocService == CALC_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_TEXT_FILTER); + else if (aDocService == WRITER_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_TEXT_FILTER); + else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz")) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_TEXT_FILTER); + else + aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_TEXT_FILTER); + } + + else + // Nothing to detect. + return OUString(); + + aMediaDesc >> lDescriptor; + return aType; +} + +// XInitialization + +void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/) +{ +} + +OUString PlainTextFilterDetect_getImplementationName() +{ + return "com.sun.star.comp.filters.PlainTextFilterDetect"; +} + +uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames() +{ + return { "com.sun.star.document.ExtendedTypeDetection", "com.sun.star.comp.filters.PlainTextFilterDetect" }; +} + +// XServiceInfo +OUString SAL_CALL PlainTextFilterDetect::getImplementationName() +{ + return PlainTextFilterDetect_getImplementationName(); +} + +sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName) +{ + return cppu::supportsService(this, rServiceName); +} + +uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames() +{ + return PlainTextFilterDetect_getSupportedServiceNames(); +} + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface* +com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext* , + css::uno::Sequence<css::uno::Any> const &) +{ + return cppu::acquire(new PlainTextFilterDetect); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |