1
0
Fork 0
libreoffice/sc/source/ui/dataprovider/htmldataprovider.cxx
Daniel Baumann 8e63e14cf6
Adding upstream version 4:25.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
2025-06-22 16:20:04 +02:00

280 lines
7.4 KiB
C++

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include "htmldataprovider.hxx"
#include <datamapper.hxx>
#include <datatransformation.hxx>
#include <salhelper/thread.hxx>
#include <utility>
#include <vcl/svapp.hxx>
#include <tools/stream.hxx>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <comphelper/string.hxx>
namespace sc {
// Worker thread that fetches an HTML document from a URL, locates a node in it
// with an XPath expression and copies the table rows/cells found beneath that
// node into a ScDocument as strings.
class HTMLFetchThread : public salhelper::Thread
{
// Document that receives the cell strings (always written to sheet index 0).
ScDocument& mrDocument;
// Location handed to DataProvider::FetchStreamFromURL() in execute().
OUString maURL;
// Evaluated as an XPath expression in execute(); the first matching node is imported.
OUString maID;
// Applied to mrDocument, in container order, after the table has been read.
const std::vector<std::shared_ptr<sc::DataTransformation>> maDataTransformations;
// Called at the end of execute() while the SolarMutex is held.
std::function<void()> maImportFinishedHdl;
// Imports every <tr> found directly under pTable or inside its <thead>/<tbody> children.
void handleTable(xmlNodePtr pTable);
// Imports the <td>/<th> children of pRow into document row nRow.
void handleRow(xmlNodePtr pRow, SCROW nRow);
// Imports the <tr> children of pSkip, advancing rRow once per imported row.
void skipHeadBody(xmlNodePtr pSkip, SCROW& rRow);
// Writes the collected text of pCell to (nCol, nRow) unless that text is empty.
void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol);
public:
// The second parameter (unnamed here) is the URL; see the constructor definition.
HTMLFetchThread(ScDocument& rDoc, const OUString&, const OUString& rID, std::function<void()> aImportFinishedHdl,
std::vector<std::shared_ptr<sc::DataTransformation>>&& rTransformations);
// Thread body: fetch, parse, import, transform, then notify via maImportFinishedHdl.
virtual void execute() override;
};
// Stores everything the thread needs later in execute(); no work is done here.
//
// rDoc               document the imported cells are written to (kept by reference)
// rURL               location of the HTML document (copied)
// rID                XPath expression selecting the node to import (copied)
// aImportFinishedHdl callback invoked at the end of execute() (moved in)
// rTransformations   transformations applied after the import (moved in)
HTMLFetchThread::HTMLFetchThread(
ScDocument& rDoc, const OUString& rURL, const OUString& rID,
std::function<void()> aImportFinishedHdl,
std::vector<std::shared_ptr<sc::DataTransformation>>&& rTransformations)
: salhelper::Thread("HTML Fetch Thread")
, mrDocument(rDoc)
, maURL(rURL)
, maID(rID)
, maDataTransformations(std::move(rTransformations))
, maImportFinishedHdl(std::move(aImportFinishedHdl))
{
}
namespace {
// Copies a libxml2 string into an OString. The byte count comes from
// xmlStrlen(), so the result holds exactly the bytes up to the terminator.
OString toString(const xmlChar* pStr)
{
    const char* const pBytes = reinterpret_cast<const char*>(pStr);
    const int nByteCount = xmlStrlen(pStr);
    return OString(pBytes, nByteCount);
}
// Removes leading and trailing spaces, line feeds, carriage returns and tabs.
// Each character is stripped separately, so the passes are repeated until a
// full round leaves the string unchanged (e.g. " \n x" needs more than one round).
OUString trim_string(const OUString& aStr)
{
    static constexpr sal_Unicode aStripChars[] = { u' ', u'\n', u'\r', u'\t' };

    OUString aResult = aStr;
    for (;;)
    {
        OUString aPass = aResult;
        for (const sal_Unicode cStrip : aStripChars)
            aPass = comphelper::string::strip(aPass, cStrip);

        // Fixpoint reached: nothing left to strip at either end.
        if (aPass == aResult)
            return aResult;

        aResult = aPass;
    }
}
// Collects the text below pNode, depth first. Every text node is decoded as
// UTF-8 and trimmed on its own before being appended, and nothing is inserted
// between pieces, so whitespace separating adjacent nodes is not preserved.
// Node types other than text and element are ignored.
OUString get_node_str(xmlNodePtr pNode)
{
    OUStringBuffer aCollected;
    xmlNodePtr pChild = pNode->children;
    while (pChild)
    {
        switch (pChild->type)
        {
            case XML_TEXT_NODE:
            {
                const OString aUtf8 = toString(pChild->content);
                aCollected.append(trim_string(OStringToOUString(aUtf8, RTL_TEXTENCODING_UTF8)));
                break;
            }
            case XML_ELEMENT_NODE:
                // Nested markup: recurse and append whatever text it contains.
                aCollected.append(get_node_str(pChild));
                break;
            default:
                break;
        }
        pChild = pChild->next;
    }
    return aCollected.makeStringAndClear();
}
}
// Writes the text content of one table cell into the document.
//
// pCellNode  the <td>/<th> element whose descendants supply the text
// nRow/nCol  target position; the sheet index is always 0
//
// The text is gathered by get_node_str(), which performs exactly the traversal
// this function used to duplicate inline. Cells whose collected text is empty
// are skipped, so nothing is written for them.
void HTMLFetchThread::handleCell(xmlNodePtr pCellNode, SCROW nRow, SCCOL nCol)
{
    const OUString aCellStr = get_node_str(pCellNode);
    if (aCellStr.isEmpty())
        return;

    mrDocument.SetString(nCol, nRow, 0, aCellStr);
}
// Imports one table row: every <td> or <th> element child of pRowNode goes to
// the next free column of row nRow, starting at column 0. Other children are
// ignored and do not consume a column. Cell attributes are not consulted.
void HTMLFetchThread::handleRow(xmlNodePtr pRowNode, SCROW nRow)
{
    sal_Int32 nNextCol = 0;
    xmlNodePtr pChild = pRowNode->children;
    for (; pChild; pChild = pChild->next)
    {
        if (pChild->type != XML_ELEMENT_NODE)
            continue;

        const OString aTag = toString(pChild->name);
        const bool bIsCell = (aTag == "td") || (aTag == "th");
        if (!bIsCell)
            continue;

        // The column advances even when the cell turns out to be empty.
        handleCell(pChild, nRow, nNextCol++);
    }
}
// Descends into a row container (handleTable passes <thead>/<tbody> elements)
// and imports each <tr> element child. rRow is the caller's running row index
// and is advanced by one for every row imported here.
void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement, SCROW& rRow)
{
    xmlNodePtr pChild = pSkipElement->children;
    while (pChild)
    {
        const bool bIsRow = pChild->type == XML_ELEMENT_NODE
                            && toString(pChild->name) == "tr";
        if (bIsRow)
        {
            handleRow(pChild, rRow);
            ++rRow;
        }
        pChild = pChild->next;
    }
}
// Imports a table node. Rows are numbered from 0 in document order; <tr>
// elements directly below pTable and those inside <thead>/<tbody> children
// share the same counter. Any other element child is ignored.
// NOTE(review): <tfoot> is not visited, so its rows are dropped - confirm intended.
void HTMLFetchThread::handleTable(xmlNodePtr pTable)
{
    sal_Int32 nNextRow = 0;
    for (xmlNodePtr pChild = pTable->children; pChild; pChild = pChild->next)
    {
        if (pChild->type != XML_ELEMENT_NODE)
            continue;

        const OString aTag = toString(pChild->name);
        if (aTag == "thead" || aTag == "tbody")
        {
            // The helper advances nNextRow itself for each row it imports.
            skipHeadBody(pChild, nNextRow);
            continue;
        }

        if (aTag == "tr")
            handleRow(pChild, nNextRow++);
    }
}
void HTMLFetchThread::execute()
{
OStringBuffer aBuffer(64000);
DataProvider::FetchStreamFromURL(maURL, aBuffer);
if (aBuffer.isEmpty())
return;
htmlDocPtr pHtmlPtr = htmlParseDoc(reinterpret_cast<xmlChar*>(const_cast<char*>(aBuffer.getStr())), nullptr);
OString aID = OUStringToOString(maID, RTL_TEXTENCODING_UTF8);
xmlXPathContextPtr pXmlXpathCtx = xmlXPathNewContext(pHtmlPtr);
xmlXPathObjectPtr pXmlXpathObj = xmlXPathEvalExpression(BAD_CAST(aID.getStr()), pXmlXpathCtx);
if (!pXmlXpathObj)
{
xmlXPathFreeContext(pXmlXpathCtx);
return;
}
xmlNodeSetPtr pXmlNodes = pXmlXpathObj->nodesetval;
if (!pXmlNodes)
{
xmlXPathFreeNodeSetList(pXmlXpathObj);
xmlXPathFreeContext(pXmlXpathCtx);
return;
}
if (pXmlNodes->nodeNr == 0)
{
xmlXPathFreeNodeSet(pXmlNodes);
xmlXPathFreeNodeSetList(pXmlXpathObj);
xmlXPathFreeContext(pXmlXpathCtx);
return;
}
xmlNodePtr pNode = pXmlNodes->nodeTab[0];
handleTable(pNode);
xmlXPathFreeNodeSet(pXmlNodes);
xmlXPathFreeNodeSetList(pXmlXpathObj);
xmlXPathFreeContext(pXmlXpathCtx);
for (auto& itr : maDataTransformations)
{
itr->Transform(mrDocument);
}
SolarMutexGuard aGuard;
maImportFinishedHdl();
}
// Remembers the target document and forwards the data source to the base class.
// pDoc is stored as-is in mpDocument; Import() later passes it to ResetClip().
HTMLDataProvider::HTMLDataProvider(ScDocument* pDoc, sc::ExternalDataSource& rDataSource):
DataProvider(rDataSource),
mpDocument(pDoc)
{
}
// Waits for a fetch thread, if one was ever created, before the provider goes
// away. The SolarMutex is released for the duration of the join because the
// thread acquires it before calling the import-finished handler.
HTMLDataProvider::~HTMLDataProvider()
{
    if (!mxHTMLFetchThread.is())
        return;

    SolarMutexReleaser aReleaser;
    mxHTMLFetchThread->join();
}
void HTMLDataProvider::Import()
{
// already importing data
if (mpDoc)
return;
mpDoc.reset(new ScDocument(SCDOCMODE_CLIP));
mpDoc->ResetClip(mpDocument, SCTAB(0));
mxHTMLFetchThread = new HTMLFetchThread(*mpDoc, mrDataSource.getURL(), mrDataSource.getID(),
std::bind(&HTMLDataProvider::ImportFinished, this), std::vector(mrDataSource.getDataTransformation()));
mxHTMLFetchThread->launch();
if (mbDeterministic)
{
SolarMutexReleaser aReleaser;
mxHTMLFetchThread->join();
}
}
// Completion callback bound in Import() and invoked by HTMLFetchThread::execute()
// with the SolarMutex held: hands the filled scratch document to the data
// source's DB manager.
// NOTE(review): mpDoc is not reset here or elsewhere in this file, so the
// "already importing" guard in Import() stays set afterwards - confirm intended.
void HTMLDataProvider::ImportFinished()
{
mrDataSource.getDBManager()->WriteToDoc(*mpDoc);
}
// Returns the URL of the underlying data source (forwarded by reference).
const OUString& HTMLDataProvider::GetURL() const
{
return mrDataSource.getURL();
}
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */