Diffstat (limited to 'xbmc/utils/ScraperUrl.cpp')
-rw-r--r--    xbmc/utils/ScraperUrl.cpp    434
1 files changed, 434 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperUrl.cpp b/xbmc/utils/ScraperUrl.cpp
new file mode 100644
index 0000000..f131b16
--- /dev/null
+++ b/xbmc/utils/ScraperUrl.cpp
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2005-2018 Team Kodi
+ * This file is part of Kodi - https://kodi.tv
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * See LICENSES/README.md for more information.
+ */
+
+#include "ScraperUrl.h"
+
+#include "CharsetConverter.h"
+#include "ServiceBroker.h"
+#include "URIUtils.h"
+#include "URL.h"
+#include "XMLUtils.h"
+#include "filesystem/CurlFile.h"
+#include "filesystem/ZipFile.h"
+#include "settings/AdvancedSettings.h"
+#include "settings/SettingsComponent.h"
+#include "utils/CharsetDetection.h"
+#include "utils/Mime.h"
+#include "utils/StringUtils.h"
+#include "utils/XBMCTinyXML.h"
+#include "utils/log.h"
+
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+
+CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
+{
+}
+
+CScraperUrl::CScraperUrl(const std::string& strUrl) : CScraperUrl()
+{
+ ParseFromData(strUrl);
+}
+
+CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl()
+{
+ ParseAndAppendUrl(element);
+}
+
+CScraperUrl::~CScraperUrl() = default;
+
+void CScraperUrl::Clear()
+{
+ m_urls.clear();
+ m_data.clear();
+ m_relevance = 0.0;
+ m_parsed = false;
+}
+
+void CScraperUrl::SetData(std::string data)
+{
+ m_data = std::move(data);
+ m_parsed = false;
+}
+
+const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const
+{
+ const auto url = std::find_if(m_urls.begin(), m_urls.end(), [type](const SUrlEntry& url) {
+ return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type);
+ });
+ if (url != m_urls.end())
+ return *url;
+
+ return SUrlEntry();
+}
+
+const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const
+{
+ const auto url = std::find_if(m_urls.begin(), m_urls.end(), [season, type](const SUrlEntry& url) {
+ return url.m_type == UrlType::Season && url.m_season == season &&
+ (type.empty() || type == "thumb" || url.m_aspect == type);
+ });
+ if (url != m_urls.end())
+ return *url;
+
+ return SUrlEntry();
+}
+
+unsigned int CScraperUrl::GetMaxSeasonUrl() const
+{
+ unsigned int maxSeason = 0;
+ for (const auto& url : m_urls)
+ {
+ if (url.m_type == UrlType::Season && url.m_season > 0 &&
+ static_cast<unsigned int>(url.m_season) > maxSeason)
+ maxSeason = url.m_season;
+ }
+ return maxSeason;
+}
+
+std::string CScraperUrl::GetFirstThumbUrl() const
+{
+ if (m_urls.empty())
+ return {};
+
+ return GetThumbUrl(m_urls.front());
+}
+
+void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs,
+ const std::string& type,
+ int season,
+ bool unique) const
+{
+ for (const auto& url : m_urls)
+ {
+ if (url.m_aspect == type || type.empty() || url.m_aspect.empty())
+ {
+ if ((url.m_type == CScraperUrl::UrlType::General && season == -1) ||
+ (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season))
+ {
+ std::string thumbUrl = GetThumbUrl(url);
+ if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end())
+ thumbs.push_back(thumbUrl);
+ }
+ }
+ }
+}
+
+bool CScraperUrl::Parse()
+{
+ if (m_parsed)
+ return true;
+
+ auto dataToParse = m_data;
+ m_data.clear();
+ return ParseFromData(dataToParse);
+}
+
+bool CScraperUrl::ParseFromData(const std::string& data)
+{
+ if (data.empty())
+ return false;
+
+ CXBMCTinyXML doc;
+  /* the data string is coming from internal sources (usually generated by a scraper or
+   * read from the database), so it is always in UTF-8 */
+ doc.Parse(data, TIXML_ENCODING_UTF8);
+
+ auto pElement = doc.RootElement();
+ if (pElement == nullptr)
+ {
+ m_urls.emplace_back(data);
+ m_data = data;
+ }
+ else
+ {
+ while (pElement != nullptr)
+ {
+ ParseAndAppendUrl(pElement);
+ pElement = pElement->NextSiblingElement(pElement->Value());
+ }
+ }
+
+ m_parsed = true;
+ return true;
+}
+
+bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element)
+{
+ if (element == nullptr || element->FirstChild() == nullptr ||
+ element->FirstChild()->Value() == nullptr)
+ return false;
+
+ bool wasEmpty = m_data.empty();
+
+ std::stringstream stream;
+ stream << *element;
+ m_data += stream.str();
+
+ SUrlEntry url(element->FirstChild()->ValueStr());
+ url.m_spoof = XMLUtils::GetAttribute(element, "spoof");
+
+ const char* szPost = element->Attribute("post");
+ if (szPost && StringUtils::CompareNoCase(szPost, "yes") == 0)
+ url.m_post = true;
+ else
+ url.m_post = false;
+
+ const char* szIsGz = element->Attribute("gzip");
+ if (szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0)
+ url.m_isgz = true;
+ else
+ url.m_isgz = false;
+
+ url.m_cache = XMLUtils::GetAttribute(element, "cache");
+
+ const char* szType = element->Attribute("type");
+ if (szType && StringUtils::CompareNoCase(szType, "season") == 0)
+ {
+ url.m_type = UrlType::Season;
+ const char* szSeason = element->Attribute("season");
+ if (szSeason)
+ url.m_season = atoi(szSeason);
+ }
+
+ url.m_aspect = XMLUtils::GetAttribute(element, "aspect");
+ url.m_preview = XMLUtils::GetAttribute(element, "preview");
+
+ m_urls.push_back(url);
+
+ if (wasEmpty)
+ m_parsed = true;
+
+ return true;
+}
+
+// The XML format of the episode guide data is:
+// <TAG><url>...</url>...</TAG> or <url>...</url> (each parsed by ParseAndAppendUrl)
+bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string& episodeGuide)
+{
+ if (episodeGuide.empty())
+ return false;
+
+ // ok, now parse the xml file
+ CXBMCTinyXML doc;
+  /* the episode guide data is coming from internal sources, so it is always in UTF-8 */
+ doc.Parse(episodeGuide, TIXML_ENCODING_UTF8);
+ if (doc.RootElement() == nullptr)
+ return false;
+
+ bool wasEmpty = m_data.empty();
+
+ TiXmlHandle docHandle(&doc);
+ auto link = docHandle.FirstChild("episodeguide").Element();
+ if (link->FirstChildElement("url"))
+ {
+ for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
+ ParseAndAppendUrl(link);
+ }
+ else if (link->FirstChild() && link->FirstChild()->Value())
+ ParseAndAppendUrl(link);
+
+ if (wasEmpty)
+ m_parsed = true;
+
+ return true;
+}
+
+void CScraperUrl::AddParsedUrl(const std::string& url,
+ const std::string& aspect,
+ const std::string& preview,
+ const std::string& referrer,
+ const std::string& cache,
+ bool post,
+ bool isgz,
+ int season)
+{
+ bool wasEmpty = m_data.empty();
+
+ TiXmlElement thumb("thumb");
+ thumb.SetAttribute("spoof", referrer);
+ thumb.SetAttribute("cache", cache);
+ if (post)
+ thumb.SetAttribute("post", "yes");
+ if (isgz)
+ thumb.SetAttribute("gzip", "yes");
+ if (season >= 0)
+ {
+ thumb.SetAttribute("season", std::to_string(season));
+ thumb.SetAttribute("type", "season");
+ }
+ thumb.SetAttribute("aspect", aspect);
+ thumb.SetAttribute("preview", preview);
+ TiXmlText text(url);
+ thumb.InsertEndChild(text);
+
+ m_data << thumb;
+
+ SUrlEntry nUrl(url);
+ nUrl.m_spoof = referrer;
+ nUrl.m_post = post;
+ nUrl.m_isgz = isgz;
+ nUrl.m_cache = cache;
+ nUrl.m_preview = preview;
+ if (season >= 0)
+ {
+ nUrl.m_type = UrlType::Season;
+ nUrl.m_season = season;
+ }
+ nUrl.m_aspect = aspect;
+
+ m_urls.push_back(nUrl);
+
+ if (wasEmpty)
+ m_parsed = true;
+}
+
+std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry)
+{
+ if (entry.m_spoof.empty())
+ return entry.m_url;
+
+ return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
+}
+
+bool CScraperUrl::Get(const SUrlEntry& scrURL,
+ std::string& strHTML,
+ XFILE::CCurlFile& http,
+ const std::string& cacheContext)
+{
+ CURL url(scrURL.m_url);
+ http.SetReferer(scrURL.m_spoof);
+ std::string strCachePath;
+
+ if (!scrURL.m_cache.empty())
+ {
+ strCachePath = URIUtils::AddFileToFolder(
+ CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
+ cacheContext, scrURL.m_cache);
+ if (XFILE::CFile::Exists(strCachePath))
+ {
+ XFILE::CFile file;
+ std::vector<uint8_t> buffer;
+ if (file.LoadFile(strCachePath, buffer) > 0)
+ {
+ strHTML.assign(reinterpret_cast<char*>(buffer.data()), buffer.size());
+ return true;
+ }
+ }
+ }
+
+ auto strHTML1 = strHTML;
+
+ if (scrURL.m_post)
+ {
+ std::string strOptions = url.GetOptions();
+ strOptions = strOptions.substr(1);
+ url.SetOptions("");
+
+ if (!http.Post(url.Get(), strOptions, strHTML1))
+ return false;
+ }
+ else if (!http.Get(url.Get(), strHTML1))
+ return false;
+
+ strHTML = strHTML1;
+
+ const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE);
+ CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
+ if (ftype == CMime::FileTypeUnknown)
+ ftype = CMime::GetFileTypeFromContent(strHTML);
+
+ if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
+ {
+ XFILE::CZipFile file;
+ std::string strBuffer;
+ auto iSize = file.UnpackFromMemory(
+ strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
+ if (iSize > 0)
+ {
+ strHTML = strBuffer;
+ CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url);
+ }
+ else
+ CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__,
+ scrURL.m_url);
+ }
+
+ const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET);
+ if (ftype == CMime::FileTypeHtml)
+ {
+ std::string realHtmlCharset, converted;
+ if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
+ CLog::Log(LOGWARNING,
+ "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
+ __FUNCTION__, scrURL.m_url, realHtmlCharset);
+ else
+ CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset,
+ scrURL.m_url);
+
+ strHTML = converted;
+ }
+ else if (ftype == CMime::FileTypeXml)
+ {
+ CXBMCTinyXML xmlDoc;
+ xmlDoc.Parse(strHTML, reportedCharset);
+
+ const auto realXmlCharset = xmlDoc.GetUsedCharset();
+ if (!realXmlCharset.empty())
+ {
+ CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset,
+ scrURL.m_url);
+ std::string converted;
+ g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
+ strHTML = converted;
+ }
+ }
+ else if (ftype == CMime::FileTypePlainText ||
+ StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/"))
+ {
+ std::string realTextCharset;
+ std::string converted;
+ CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
+ strHTML = converted;
+ if (reportedCharset != realTextCharset)
+ CLog::Log(LOGWARNING,
+ "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
+ "charset",
+ __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset);
+ else
+ CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__,
+ realTextCharset, scrURL.m_url);
+ }
+ else if (!reportedCharset.empty())
+ {
+ CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset,
+ scrURL.m_url);
+ if (reportedCharset != "UTF-8")
+ {
+ std::string converted;
+ g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
+ strHTML = converted;
+ }
+ }
+ else
+ CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
+ __FUNCTION__, scrURL.m_url);
+
+ if (!scrURL.m_cache.empty())
+ {
+ const auto strCachePath = URIUtils::AddFileToFolder(
+ CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
+ cacheContext, scrURL.m_cache);
+ XFILE::CFile file;
+ if (!file.OpenForWrite(strCachePath, true) ||
+ file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size()))
+ return false;
+ }
+ return true;
+}
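
For reference, a minimal usage sketch of the class added above, assuming the header path "utils/ScraperUrl.h" and the member functions shown in this diff; the <thumb> markup and the URLs are invented for illustration and only mirror the attribute names read by ParseAndAppendUrl():

// Illustrative caller, not part of the commit above.
#include "utils/ScraperUrl.h"

#include <string>

std::string FirstPosterUrl()
{
  // A <thumb> element carrying some of the attributes ParseAndAppendUrl() reads
  // (aspect, spoof, cache); values are examples only.
  const std::string xml =
      "<thumb aspect=\"poster\" spoof=\"https://example.org/\" cache=\"poster.jpg\">"
      "https://example.org/poster.jpg</thumb>";

  CScraperUrl scraperUrl(xml); // the constructor runs ParseFromData()

  // Because a spoof referrer is present, GetFirstThumbUrl() returns the URL with
  // a "|Referer=<url-encoded referrer>" suffix, as built in GetThumbUrl().
  return scraperUrl.GetFirstThumbUrl();
}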
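A similar sketch, under the same assumptions, of the wrapped form consumed by ParseAndAppendUrlsFromEpisodeGuide(); the endpoint and attribute values are made up:

#include "utils/ScraperUrl.h"

#include <string>

bool LoadEpisodeGuide(CScraperUrl& scraperUrl)
{
  const std::string guide =
      "<episodeguide>"
      "<url post=\"yes\" cache=\"guide.xml\">https://example.org/api/episodes?id=123</url>"
      "</episodeguide>";

  // Each <url> child is appended via ParseAndAppendUrl(); post="yes" sets
  // SUrlEntry::m_post, so CScraperUrl::Get() later strips the query string and
  // sends it as the POST body for that entry.
  return scraperUrl.ParseAndAppendUrlsFromEpisodeGuide(guide);
}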
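And a sketch of the non-XML path: registering an already-parsed season artwork URL with AddParsedUrl() and reading it back with GetSeasonUrl(); again, the header path and the concrete values are assumptions:

#include "utils/ScraperUrl.h"

#include <string>

std::string SeasonPosterUrl(CScraperUrl& scraperUrl)
{
  // Appends a <thumb type="season" season="2" aspect="poster"> element to m_data
  // and a matching SUrlEntry with UrlType::Season to m_urls.
  scraperUrl.AddParsedUrl("https://example.org/s2-poster.jpg", "poster",
                          /*preview*/ "", /*referrer*/ "", /*cache*/ "",
                          /*post*/ false, /*isgz*/ false, /*season*/ 2);

  // No referrer was supplied, so GetThumbUrl() returns the plain URL.
  return scraperUrl.GetThumbUrl(scraperUrl.GetSeasonUrl(2, "poster"));
}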