summaryrefslogtreecommitdiffstats
path: root/filter/source/textfilterdetect/filterdetect.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'filter/source/textfilterdetect/filterdetect.cxx')
-rw-r--r--filter/source/textfilterdetect/filterdetect.cxx247
1 files changed, 247 insertions, 0 deletions
diff --git a/filter/source/textfilterdetect/filterdetect.cxx b/filter/source/textfilterdetect/filterdetect.cxx
new file mode 100644
index 000000000..9d25e289e
--- /dev/null
+++ b/filter/source/textfilterdetect/filterdetect.cxx
@@ -0,0 +1,247 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "filterdetect.hxx"
+
+#include <svtools/htmltokn.h>
+#include <tools/urlobj.hxx>
+#include <tools/zcodec.hxx>
+#include <ucbhelper/content.hxx>
+#include <unotools/mediadescriptor.hxx>
+#include <unotools/streamwrap.hxx>
+#include <unotools/ucbstreamhelper.hxx>
+
+#include <com/sun/star/io/XInputStream.hpp>
+#include <cppuhelper/supportsservice.hxx>
+#include <memory>
+
+constexpr OUStringLiteral WRITER_TEXT_FILTER = u"Text";
+constexpr OUStringLiteral CALC_TEXT_FILTER = u"Text - txt - csv (StarCalc)";
+
+constexpr OUStringLiteral WEB_HTML_FILTER = u"HTML";
+constexpr OUStringLiteral WRITER_HTML_FILTER = u"HTML (StarWriter)";
+constexpr OUStringLiteral CALC_HTML_FILTER = u"calc_HTML_WebQuery";
+
+constexpr OUStringLiteral WRITER_DOCSERVICE = u"com.sun.star.text.TextDocument";
+constexpr OUStringLiteral CALC_DOCSERVICE = u"com.sun.star.sheet.SpreadsheetDocument";
+
+using namespace ::com::sun::star;
+using utl::MediaDescriptor;
+
+namespace {
+
+bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
+{
+ std::unique_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
+ if ( !pInStream || pInStream->GetError() )
+ // No stream
+ return false;
+
+ // Read the stream header
+ pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+ const sal_uInt64 nUniPos = pInStream->Tell();
+ const sal_uInt16 nSize = 4096;
+
+ OString sHeader;
+ if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
+ sHeader = read_uInt8s_ToOString( *pInStream, nSize );
+ else // UTF-16 (nUniPos = 2)
+ sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
+
+ // Now check whether the stream begins with a known HTML tag.
+ enum DetectPhase { BeforeTag, TagOpened, InTagName };
+ DetectPhase dp = BeforeTag;
+ /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration.
+ enum DeclarationPhase
+ {
+ BeforeDeclaration,
+ DeclarationOpened
+ };
+ DeclarationPhase eDeclaration = BeforeDeclaration;
+
+ const char* pHeader = sHeader.getStr();
+ const int nLength = sHeader.getLength();
+ int i = 0, nStartOfTagIndex = 0;
+
+ for ( i = 0; i < nLength; ++i, ++pHeader )
+ {
+ char c = *pHeader;
+ if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f')
+ && eDeclaration == BeforeDeclaration)
+ {
+ if ( dp == TagOpened )
+ return false; // Invalid: Should start with a tag name
+ else if ( dp == InTagName )
+ break; // End of tag name reached
+ }
+ else if ( c == '<' )
+ {
+ if ( dp == BeforeTag )
+ dp = TagOpened;
+ else
+ return false; // Invalid: Nested '<'
+ }
+ else if ( c == '>' )
+ {
+ if ( dp == InTagName )
+ break; // End of tag name reached
+ else if (eDeclaration == DeclarationOpened)
+ {
+ dp = BeforeTag;
+ eDeclaration = BeforeDeclaration;
+ }
+ else
+ return false; // Invalid: Empty tag or before '<'
+ }
+ else if ( c == '!' )
+ {
+ if ( dp == TagOpened )
+ return true; // "<!" - DOCTYPE or comments block
+ else
+ return false; // Invalid: '!' before '<' or inside tag name
+ }
+ else
+ {
+ if ( dp == BeforeTag )
+ return false; // Invalid: Should start with a tag
+ else if ( dp == TagOpened )
+ {
+ if (c == '?' && eDeclaration == BeforeDeclaration)
+ eDeclaration = DeclarationOpened;
+ else if (eDeclaration == BeforeDeclaration)
+ {
+ nStartOfTagIndex = i;
+ dp = InTagName;
+ }
+ }
+ }
+ }
+
+ // The string following '<' has to be a known HTML token.
+ OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
+ return GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != HtmlTokenId::NONE;
+}
+}
+
+PlainTextFilterDetect::PlainTextFilterDetect() {}
+
+PlainTextFilterDetect::~PlainTextFilterDetect() {}
+
+OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor)
+{
+ MediaDescriptor aMediaDesc(lDescriptor);
+
+ OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME, OUString() );
+ OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE, OUString() );
+
+ if ((aType == "generic_HTML") || (aType == "calc_HTML"))
+ {
+ uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
+ if (!xInStream.is() || !IsHTMLStream(xInStream))
+ return OUString();
+
+ if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_HTML_FILTER);
+ else if (aDocService == WRITER_DOCSERVICE)
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_HTML_FILTER);
+ else
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WEB_HTML_FILTER);
+ }
+
+ else if (aType == "generic_Text")
+ {
+ uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM], uno::UNO_QUERY);
+ uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
+ if (xStream.is() || xInStream.is())
+ {
+ ZCodec aCodecGZ;
+ std::unique_ptr<SvStream> pInStream;
+ if (xStream.is())
+ pInStream = utl::UcbStreamHelper::CreateStream(xStream);
+ else
+ pInStream = utl::UcbStreamHelper::CreateStream(xInStream);
+ std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream());
+ if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream))
+ {
+ uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream)));
+ aMediaDesc[MediaDescriptor::PROP_STREAM] <<= xStreamDecompressed;
+ aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM] <<= xStreamDecompressed->getInputStream();
+ OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() );
+ sal_Int32 nIdx = aURL.lastIndexOf(".gz");
+ if (nIdx != -1)
+ aMediaDesc[MediaDescriptor::PROP_URL] <<= aURL.copy(0, nIdx);
+ }
+ }
+ // Get the file name extension.
+ INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() ) );
+ OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DecodeMechanism::WithCharset);
+ aExt = aExt.toAsciiLowerCase();
+ OUString aName = aParser.getName().toAsciiLowerCase();
+
+ // Decide which filter to use based on the document service first,
+ // then on extension if that's not available.
+
+ if (aDocService == CALC_DOCSERVICE)
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_TEXT_FILTER);
+ else if (aDocService == WRITER_DOCSERVICE)
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_TEXT_FILTER);
+ else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz"))
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_TEXT_FILTER);
+ else
+ aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_TEXT_FILTER);
+ }
+
+ else
+ // Nothing to detect.
+ return OUString();
+
+ aMediaDesc >> lDescriptor;
+ return aType;
+}
+
+// XInitialization
+
+void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
+{
+}
+
+OUString PlainTextFilterDetect_getImplementationName()
+{
+ return "com.sun.star.comp.filters.PlainTextFilterDetect";
+}
+
+uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
+{
+ return { "com.sun.star.document.ExtendedTypeDetection", "com.sun.star.comp.filters.PlainTextFilterDetect" };
+}
+
+// XServiceInfo
+OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
+{
+ return PlainTextFilterDetect_getImplementationName();
+}
+
+sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
+{
+ return cppu::supportsService(this, rServiceName);
+}
+
+uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
+{
+ return PlainTextFilterDetect_getSupportedServiceNames();
+}
+
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
+com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext* ,
+ css::uno::Sequence<css::uno::Any> const &)
+{
+ return cppu::acquire(new PlainTextFilterDetect);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */