From ed5640d8b587fbcfed7dd7967f3de04b37a76f26 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 11:06:44 +0200 Subject: Adding upstream version 4:7.4.7. Signed-off-by: Daniel Baumann --- sc/source/ui/dataprovider/htmldataprovider.cxx | 280 +++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 sc/source/ui/dataprovider/htmldataprovider.cxx (limited to 'sc/source/ui/dataprovider/htmldataprovider.cxx') diff --git a/sc/source/ui/dataprovider/htmldataprovider.cxx b/sc/source/ui/dataprovider/htmldataprovider.cxx new file mode 100644 index 000000000..8b241ddfe --- /dev/null +++ b/sc/source/ui/dataprovider/htmldataprovider.cxx @@ -0,0 +1,280 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include "htmldataprovider.hxx" +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace sc { + +class HTMLFetchThread : public salhelper::Thread +{ + ScDocument& mrDocument; + OUString maURL; + OUString maID; + const std::vector> maDataTransformations; + std::function maImportFinishedHdl; + + void handleTable(xmlNodePtr pTable); + void handleRow(xmlNodePtr pRow, SCROW nRow); + void skipHeadBody(xmlNodePtr pSkip, SCROW& rRow); + void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol); + +public: + HTMLFetchThread(ScDocument& rDoc, const OUString&, const OUString& rID, std::function aImportFinishedHdl, + std::vector>&& rTransformations); + + virtual void execute() override; +}; + +HTMLFetchThread::HTMLFetchThread( + ScDocument& rDoc, const OUString& rURL, const OUString& rID, + std::function aImportFinishedHdl, + std::vector>&& rTransformations) + : salhelper::Thread("HTML Fetch Thread") + , mrDocument(rDoc) + , maURL(rURL) + , maID(rID) + , maDataTransformations(std::move(rTransformations)) + , maImportFinishedHdl(std::move(aImportFinishedHdl)) +{ +} + +namespace { + +OString toString(const xmlChar* pStr) +{ + return OString(reinterpret_cast(pStr), xmlStrlen(pStr)); +} + +OUString trim_string(const OUString& aStr) +{ + OUString aOldString; + OUString aString = aStr; + do + { + aOldString = aString; + aString = comphelper::string::strip(aString, ' '); + aString = comphelper::string::strip(aString, '\n'); + aString = comphelper::string::strip(aString, '\r'); + aString = comphelper::string::strip(aString, '\t'); + } + while (aOldString != aString); + + return aString; +} + +OUString get_node_str(xmlNodePtr pNode) +{ + OUStringBuffer aStr; + for (xmlNodePtr cur_node = pNode->children; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_TEXT_NODE) + { + OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8); + aStr.append(trim_string(aString)); + } + else if (cur_node->type == XML_ELEMENT_NODE) + { + aStr.append(get_node_str(cur_node)); + } + } + + return aStr.makeStringAndClear(); +} + +} + +void HTMLFetchThread::handleCell(xmlNodePtr pCellNode, SCROW nRow, SCCOL nCol) +{ + OUStringBuffer aStr; + for (xmlNodePtr cur_node = pCellNode->children; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_TEXT_NODE) + { + OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8); + aStr.append(trim_string(aString)); + } + else if (cur_node->type == XML_ELEMENT_NODE) + { + aStr.append(get_node_str(cur_node)); + } + } + + if (!aStr.isEmpty()) + { + OUString aCellStr = aStr.makeStringAndClear(); + mrDocument.SetString(nCol, nRow, 0, aCellStr); + } +} + +void HTMLFetchThread::handleRow(xmlNodePtr pRowNode, SCROW nRow) +{ + sal_Int32 nCol = 0; + for (xmlNodePtr cur_node = pRowNode->children; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_ELEMENT_NODE) + { + OString aNodeName = toString(cur_node->name); + if (aNodeName == "td" || aNodeName == "th") + { + handleCell(cur_node, nRow, nCol); + ++nCol; + } + } + } +} + +void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement, SCROW& rRow) +{ + for (xmlNodePtr cur_node = pSkipElement->children; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_ELEMENT_NODE) + { + OString aNodeName = toString(cur_node->name); + if (aNodeName == "tr") + { + handleRow(cur_node, rRow); + ++rRow; + } + + } + } +} + +void HTMLFetchThread::handleTable(xmlNodePtr pTable) +{ + sal_Int32 nRow = 0; + for (xmlNodePtr cur_node = pTable->children; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_ELEMENT_NODE) + { + OString aNodeName = toString(cur_node->name); + if (aNodeName == "tr") + { + handleRow(cur_node, nRow); + ++nRow; + } + else if (aNodeName == "thead" || aNodeName == "tbody") + { + skipHeadBody(cur_node, nRow); + } + } + } +} + +void HTMLFetchThread::execute() +{ + OStringBuffer aBuffer(64000); + DataProvider::FetchStreamFromURL(maURL, aBuffer); + + if (aBuffer.isEmpty()) + return; + + htmlDocPtr pHtmlPtr = htmlParseDoc(reinterpret_cast(const_cast(aBuffer.getStr())), nullptr); + + OString aID = OUStringToOString(maID, RTL_TEXTENCODING_UTF8); + xmlXPathContextPtr pXmlXpathCtx = xmlXPathNewContext(pHtmlPtr); + xmlXPathObjectPtr pXmlXpathObj = xmlXPathEvalExpression(BAD_CAST(aID.getStr()), pXmlXpathCtx); + + if (!pXmlXpathObj) + { + xmlXPathFreeContext(pXmlXpathCtx); + return; + } + xmlNodeSetPtr pXmlNodes = pXmlXpathObj->nodesetval; + + if (!pXmlNodes) + { + xmlXPathFreeNodeSetList(pXmlXpathObj); + xmlXPathFreeContext(pXmlXpathCtx); + return; + } + + if (pXmlNodes->nodeNr == 0) + { + xmlXPathFreeNodeSet(pXmlNodes); + xmlXPathFreeNodeSetList(pXmlXpathObj); + xmlXPathFreeContext(pXmlXpathCtx); + return; + } + + xmlNodePtr pNode = pXmlNodes->nodeTab[0]; + handleTable(pNode); + + xmlXPathFreeNodeSet(pXmlNodes); + xmlXPathFreeNodeSetList(pXmlXpathObj); + xmlXPathFreeContext(pXmlXpathCtx); + + for (auto& itr : maDataTransformations) + { + itr->Transform(mrDocument); + } + + SolarMutexGuard aGuard; + maImportFinishedHdl(); +} + +HTMLDataProvider::HTMLDataProvider(ScDocument* pDoc, sc::ExternalDataSource& rDataSource): + DataProvider(rDataSource), + mpDocument(pDoc) +{ +} + +HTMLDataProvider::~HTMLDataProvider() +{ + if (mxHTMLFetchThread.is()) + { + SolarMutexReleaser aReleaser; + mxHTMLFetchThread->join(); + } +} + +void HTMLDataProvider::Import() +{ + // already importing data + if (mpDoc) + return; + + mpDoc.reset(new ScDocument(SCDOCMODE_CLIP)); + mpDoc->ResetClip(mpDocument, SCTAB(0)); + mxHTMLFetchThread = new HTMLFetchThread(*mpDoc, mrDataSource.getURL(), mrDataSource.getID(), + std::bind(&HTMLDataProvider::ImportFinished, this), std::vector(mrDataSource.getDataTransformation())); + mxHTMLFetchThread->launch(); + + if (mbDeterministic) + { + SolarMutexReleaser aReleaser; + mxHTMLFetchThread->join(); + } +} + +void HTMLDataProvider::ImportFinished() +{ + mrDataSource.getDBManager()->WriteToDoc(*mpDoc); +} + +const OUString& HTMLDataProvider::GetURL() const +{ + return mrDataSource.getURL(); +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ -- cgit v1.2.3