summaryrefslogtreecommitdiffstats
path: root/sc/source/ui/dataprovider/htmldataprovider.cxx
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 16:51:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 16:51:28 +0000
commit940b4d1848e8c70ab7642901a68594e8016caffc (patch)
treeeb72f344ee6c3d9b80a7ecc079ea79e9fba8676d /sc/source/ui/dataprovider/htmldataprovider.cxx
parentInitial commit. (diff)
downloadlibreoffice-upstream.tar.xz
libreoffice-upstream.zip
Adding upstream version 1:7.0.4.upstream/1%7.0.4upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sc/source/ui/dataprovider/htmldataprovider.cxx')
-rw-r--r--sc/source/ui/dataprovider/htmldataprovider.cxx282
1 files changed, 282 insertions, 0 deletions
diff --git a/sc/source/ui/dataprovider/htmldataprovider.cxx b/sc/source/ui/dataprovider/htmldataprovider.cxx
new file mode 100644
index 000000000..fbd1daaa3
--- /dev/null
+++ b/sc/source/ui/dataprovider/htmldataprovider.cxx
@@ -0,0 +1,282 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "htmldataprovider.hxx"
+#include <datamapper.hxx>
+#include <datatransformation.hxx>
+#include <salhelper/thread.hxx>
+#include <vcl/svapp.hxx>
+#include <tools/stream.hxx>
+
+#include <libxml/HTMLparser.h>
+
+#include <libxml/xpath.h>
+
+#include <comphelper/string.hxx>
+#include <utility>
+
+namespace sc {
+
+class HTMLFetchThread : public salhelper::Thread
+{
+ ScDocument& mrDocument;
+ OUString maURL;
+ OUString maID;
+ const std::vector<std::shared_ptr<sc::DataTransformation>> maDataTransformations;
+ std::function<void()> maImportFinishedHdl;
+
+ void handleTable(xmlNodePtr pTable);
+ void handleRow(xmlNodePtr pRow, SCROW nRow);
+ void skipHeadBody(xmlNodePtr pSkip, SCROW& rRow);
+ void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol);
+
+public:
+ HTMLFetchThread(ScDocument& rDoc, const OUString&, const OUString& rID, std::function<void()> aImportFinishedHdl,
+ const std::vector<std::shared_ptr<sc::DataTransformation>>& rTransformations);
+
+ virtual void execute() override;
+};
+
+HTMLFetchThread::HTMLFetchThread(
+ ScDocument& rDoc, const OUString& rURL, const OUString& rID,
+ std::function<void()> aImportFinishedHdl,
+ const std::vector<std::shared_ptr<sc::DataTransformation>>& rTransformations)
+ : salhelper::Thread("HTML Fetch Thread")
+ , mrDocument(rDoc)
+ , maURL(rURL)
+ , maID(rID)
+ , maDataTransformations(rTransformations)
+ , maImportFinishedHdl(std::move(aImportFinishedHdl))
+{
+}
+
+namespace {
+
+OString toString(const xmlChar* pStr)
+{
+ return OString(reinterpret_cast<const char*>(pStr), xmlStrlen(pStr));
+}
+
+OUString trim_string(const OUString& aStr)
+{
+ OUString aOldString;
+ OUString aString = aStr;
+ do
+ {
+ aOldString = aString;
+ aString = comphelper::string::strip(aString, ' ');
+ aString = comphelper::string::strip(aString, '\n');
+ aString = comphelper::string::strip(aString, '\r');
+ aString = comphelper::string::strip(aString, '\t');
+ }
+ while (aOldString != aString);
+
+ return aString;
+}
+
+OUString get_node_str(xmlNodePtr pNode)
+{
+ OUStringBuffer aStr;
+ for (xmlNodePtr cur_node = pNode->children; cur_node; cur_node = cur_node->next)
+ {
+ if (cur_node->type == XML_TEXT_NODE)
+ {
+ OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
+ aStr.append(trim_string(aString));
+ }
+ else if (cur_node->type == XML_ELEMENT_NODE)
+ {
+ aStr.append(get_node_str(cur_node));
+ }
+ }
+
+ return aStr.makeStringAndClear();
+}
+
+}
+
+void HTMLFetchThread::handleCell(xmlNodePtr pCellNode, SCROW nRow, SCCOL nCol)
+{
+ OUStringBuffer aStr;
+ for (xmlNodePtr cur_node = pCellNode->children; cur_node; cur_node = cur_node->next)
+ {
+ if (cur_node->type == XML_TEXT_NODE)
+ {
+ OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
+ aStr.append(trim_string(aString));
+ }
+ else if (cur_node->type == XML_ELEMENT_NODE)
+ {
+ aStr.append(get_node_str(cur_node));
+ }
+ }
+
+ if (!aStr.isEmpty())
+ {
+ OUString aCellStr = aStr.makeStringAndClear();
+ mrDocument.SetString(nCol, nRow, 0, aCellStr);
+ }
+}
+
+void HTMLFetchThread::handleRow(xmlNodePtr pRowNode, SCROW nRow)
+{
+ sal_Int32 nCol = 0;
+ for (xmlNodePtr cur_node = pRowNode->children; cur_node; cur_node = cur_node->next)
+ {
+ if (cur_node->type == XML_ELEMENT_NODE)
+ {
+ OString aNodeName = toString(cur_node->name);
+ if (aNodeName == "td" || aNodeName == "th")
+ {
+ handleCell(cur_node, nRow, nCol);
+ ++nCol;
+ }
+ }
+ }
+}
+
+void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement, SCROW& rRow)
+{
+ for (xmlNodePtr cur_node = pSkipElement->children; cur_node; cur_node = cur_node->next)
+ {
+ if (cur_node->type == XML_ELEMENT_NODE)
+ {
+ OString aNodeName = toString(cur_node->name);
+ if (aNodeName == "tr")
+ {
+ handleRow(cur_node, rRow);
+ ++rRow;
+ }
+
+ }
+ }
+}
+
+void HTMLFetchThread::handleTable(xmlNodePtr pTable)
+{
+ sal_Int32 nRow = 0;
+ for (xmlNodePtr cur_node = pTable->children; cur_node; cur_node = cur_node->next)
+ {
+ if (cur_node->type == XML_ELEMENT_NODE)
+ {
+ OString aNodeName = toString(cur_node->name);
+ if (aNodeName == "tr")
+ {
+ handleRow(cur_node, nRow);
+ ++nRow;
+ }
+ else if (aNodeName == "thead" || aNodeName == "tbody")
+ {
+ skipHeadBody(cur_node, nRow);
+ }
+ }
+ }
+}
+
+void HTMLFetchThread::execute()
+{
+ OStringBuffer aBuffer(64000);
+ DataProvider::FetchStreamFromURL(maURL, aBuffer);
+
+ if (aBuffer.isEmpty())
+ return;
+
+ htmlDocPtr pHtmlPtr = htmlParseDoc(reinterpret_cast<xmlChar*>(const_cast<char*>(aBuffer.getStr())), nullptr);
+
+ OString aID = OUStringToOString(maID, RTL_TEXTENCODING_UTF8);
+ xmlXPathContextPtr pXmlXpathCtx = xmlXPathNewContext(pHtmlPtr);
+ xmlXPathObjectPtr pXmlXpathObj = xmlXPathEvalExpression(BAD_CAST(aID.getStr()), pXmlXpathCtx);
+
+ if (!pXmlXpathObj)
+ {
+ xmlXPathFreeContext(pXmlXpathCtx);
+ return;
+ }
+ xmlNodeSetPtr pXmlNodes = pXmlXpathObj->nodesetval;
+
+ if (!pXmlNodes)
+ {
+ xmlXPathFreeNodeSetList(pXmlXpathObj);
+ xmlXPathFreeContext(pXmlXpathCtx);
+ return;
+ }
+
+ if (pXmlNodes->nodeNr == 0)
+ {
+ xmlXPathFreeNodeSet(pXmlNodes);
+ xmlXPathFreeNodeSetList(pXmlXpathObj);
+ xmlXPathFreeContext(pXmlXpathCtx);
+ return;
+ }
+
+ xmlNodePtr pNode = pXmlNodes->nodeTab[0];
+ handleTable(pNode);
+
+ xmlXPathFreeNodeSet(pXmlNodes);
+ xmlXPathFreeNodeSetList(pXmlXpathObj);
+ xmlXPathFreeContext(pXmlXpathCtx);
+
+ for (auto& itr : maDataTransformations)
+ {
+ itr->Transform(mrDocument);
+ }
+
+ SolarMutexGuard aGuard;
+ maImportFinishedHdl();
+}
+
+HTMLDataProvider::HTMLDataProvider(ScDocument* pDoc, sc::ExternalDataSource& rDataSource):
+ DataProvider(rDataSource),
+ mpDocument(pDoc)
+{
+}
+
+HTMLDataProvider::~HTMLDataProvider()
+{
+ if (mxHTMLFetchThread.is())
+ {
+ SolarMutexReleaser aReleaser;
+ mxHTMLFetchThread->join();
+ }
+}
+
+void HTMLDataProvider::Import()
+{
+ // already importing data
+ if (mpDoc)
+ return;
+
+ mpDoc.reset(new ScDocument(SCDOCMODE_CLIP));
+ mpDoc->ResetClip(mpDocument, SCTAB(0));
+ mxHTMLFetchThread = new HTMLFetchThread(*mpDoc, mrDataSource.getURL(), mrDataSource.getID(),
+ std::bind(&HTMLDataProvider::ImportFinished, this), mrDataSource.getDataTransformation());
+ mxHTMLFetchThread->launch();
+
+ if (mbDeterministic)
+ {
+ SolarMutexReleaser aReleaser;
+ mxHTMLFetchThread->join();
+ }
+}
+
+void HTMLDataProvider::ImportFinished()
+{
+ mrDataSource.getDBManager()->WriteToDoc(*mpDoc);
+ mxHTMLFetchThread.clear();
+ mpDoc.reset();
+}
+
+const OUString& HTMLDataProvider::GetURL() const
+{
+ return mrDataSource.getURL();
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */