diff options
Diffstat (limited to 'xbmc/utils/ScraperUrl.cpp')
-rw-r--r-- | xbmc/utils/ScraperUrl.cpp | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperUrl.cpp b/xbmc/utils/ScraperUrl.cpp new file mode 100644 index 0000000..f131b16 --- /dev/null +++ b/xbmc/utils/ScraperUrl.cpp @@ -0,0 +1,434 @@ +/* + * Copyright (C) 2005-2018 Team Kodi + * This file is part of Kodi - https://kodi.tv + * + * SPDX-License-Identifier: GPL-2.0-or-later + * See LICENSES/README.md for more information. + */ + +#include "ScraperUrl.h" + +#include "CharsetConverter.h" +#include "ServiceBroker.h" +#include "URIUtils.h" +#include "URL.h" +#include "XMLUtils.h" +#include "filesystem/CurlFile.h" +#include "filesystem/ZipFile.h" +#include "settings/AdvancedSettings.h" +#include "settings/SettingsComponent.h" +#include "utils/CharsetDetection.h" +#include "utils/Mime.h" +#include "utils/StringUtils.h" +#include "utils/XBMCTinyXML.h" +#include "utils/log.h" + +#include <algorithm> +#include <cstring> +#include <sstream> + +CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false) +{ +} + +CScraperUrl::CScraperUrl(const std::string& strUrl) : CScraperUrl() +{ + ParseFromData(strUrl); +} + +CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl() +{ + ParseAndAppendUrl(element); +} + +CScraperUrl::~CScraperUrl() = default; + +void CScraperUrl::Clear() +{ + m_urls.clear(); + m_data.clear(); + m_relevance = 0.0; + m_parsed = false; +} + +void CScraperUrl::SetData(std::string data) +{ + m_data = std::move(data); + m_parsed = false; +} + +const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const +{ + const auto url = std::find_if(m_urls.begin(), m_urls.end(), [type](const SUrlEntry& url) { + return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type); + }); + if (url != m_urls.end()) + return *url; + + return SUrlEntry(); +} + +const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const +{ + const auto url = std::find_if(m_urls.begin(), m_urls.end(), [season, type](const SUrlEntry& url) { + return url.m_type == UrlType::Season && url.m_season == season && + (type.empty() || type == "thumb" || url.m_aspect == type); + }); + if (url != m_urls.end()) + return *url; + + return SUrlEntry(); +} + +unsigned int CScraperUrl::GetMaxSeasonUrl() const +{ + unsigned int maxSeason = 0; + for (const auto& url : m_urls) + { + if (url.m_type == UrlType::Season && url.m_season > 0 && + static_cast<unsigned int>(url.m_season) > maxSeason) + maxSeason = url.m_season; + } + return maxSeason; +} + +std::string CScraperUrl::GetFirstThumbUrl() const +{ + if (m_urls.empty()) + return {}; + + return GetThumbUrl(m_urls.front()); +} + +void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs, + const std::string& type, + int season, + bool unique) const +{ + for (const auto& url : m_urls) + { + if (url.m_aspect == type || type.empty() || url.m_aspect.empty()) + { + if ((url.m_type == CScraperUrl::UrlType::General && season == -1) || + (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season)) + { + std::string thumbUrl = GetThumbUrl(url); + if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end()) + thumbs.push_back(thumbUrl); + } + } + } +} + +bool CScraperUrl::Parse() +{ + if (m_parsed) + return true; + + auto dataToParse = m_data; + m_data.clear(); + return ParseFromData(dataToParse); +} + +bool CScraperUrl::ParseFromData(const std::string& data) +{ + if (data.empty()) + return false; + + CXBMCTinyXML doc; + /* strUrl is coming from internal sources (usually generated by scraper or from database) + * so strUrl is always in UTF-8 */ + doc.Parse(data, TIXML_ENCODING_UTF8); + + auto pElement = doc.RootElement(); + if (pElement == nullptr) + { + m_urls.emplace_back(data); + m_data = data; + } + else + { + while (pElement != nullptr) + { + ParseAndAppendUrl(pElement); + pElement = pElement->NextSiblingElement(pElement->Value()); + } + } + + m_parsed = true; + return true; +} + +bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element) +{ + if (element == nullptr || element->FirstChild() == nullptr || + element->FirstChild()->Value() == nullptr) + return false; + + bool wasEmpty = m_data.empty(); + + std::stringstream stream; + stream << *element; + m_data += stream.str(); + + SUrlEntry url(element->FirstChild()->ValueStr()); + url.m_spoof = XMLUtils::GetAttribute(element, "spoof"); + + const char* szPost = element->Attribute("post"); + if (szPost && StringUtils::CompareNoCase(szPost, "yes") == 0) + url.m_post = true; + else + url.m_post = false; + + const char* szIsGz = element->Attribute("gzip"); + if (szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0) + url.m_isgz = true; + else + url.m_isgz = false; + + url.m_cache = XMLUtils::GetAttribute(element, "cache"); + + const char* szType = element->Attribute("type"); + if (szType && StringUtils::CompareNoCase(szType, "season") == 0) + { + url.m_type = UrlType::Season; + const char* szSeason = element->Attribute("season"); + if (szSeason) + url.m_season = atoi(szSeason); + } + + url.m_aspect = XMLUtils::GetAttribute(element, "aspect"); + url.m_preview = XMLUtils::GetAttribute(element, "preview"); + + m_urls.push_back(url); + + if (wasEmpty) + m_parsed = true; + + return true; +} + +// XML format is of strUrls is: +// <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto) +bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string& episodeGuide) +{ + if (episodeGuide.empty()) + return false; + + // ok, now parse the xml file + CXBMCTinyXML doc; + /* strUrls is coming from internal sources so strUrls is always in UTF-8 */ + doc.Parse(episodeGuide, TIXML_ENCODING_UTF8); + if (doc.RootElement() == nullptr) + return false; + + bool wasEmpty = m_data.empty(); + + TiXmlHandle docHandle(&doc); + auto link = docHandle.FirstChild("episodeguide").Element(); + if (link->FirstChildElement("url")) + { + for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url")) + ParseAndAppendUrl(link); + } + else if (link->FirstChild() && link->FirstChild()->Value()) + ParseAndAppendUrl(link); + + if (wasEmpty) + m_parsed = true; + + return true; +} + +void CScraperUrl::AddParsedUrl(const std::string& url, + const std::string& aspect, + const std::string& preview, + const std::string& referrer, + const std::string& cache, + bool post, + bool isgz, + int season) +{ + bool wasEmpty = m_data.empty(); + + TiXmlElement thumb("thumb"); + thumb.SetAttribute("spoof", referrer); + thumb.SetAttribute("cache", cache); + if (post) + thumb.SetAttribute("post", "yes"); + if (isgz) + thumb.SetAttribute("gzip", "yes"); + if (season >= 0) + { + thumb.SetAttribute("season", std::to_string(season)); + thumb.SetAttribute("type", "season"); + } + thumb.SetAttribute("aspect", aspect); + thumb.SetAttribute("preview", preview); + TiXmlText text(url); + thumb.InsertEndChild(text); + + m_data << thumb; + + SUrlEntry nUrl(url); + nUrl.m_spoof = referrer; + nUrl.m_post = post; + nUrl.m_isgz = isgz; + nUrl.m_cache = cache; + nUrl.m_preview = preview; + if (season >= 0) + { + nUrl.m_type = UrlType::Season; + nUrl.m_season = season; + } + nUrl.m_aspect = aspect; + + m_urls.push_back(nUrl); + + if (wasEmpty) + m_parsed = true; +} + +std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry) +{ + if (entry.m_spoof.empty()) + return entry.m_url; + + return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof); +} + +bool CScraperUrl::Get(const SUrlEntry& scrURL, + std::string& strHTML, + XFILE::CCurlFile& http, + const std::string& cacheContext) +{ + CURL url(scrURL.m_url); + http.SetReferer(scrURL.m_spoof); + std::string strCachePath; + + if (!scrURL.m_cache.empty()) + { + strCachePath = URIUtils::AddFileToFolder( + CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers", + cacheContext, scrURL.m_cache); + if (XFILE::CFile::Exists(strCachePath)) + { + XFILE::CFile file; + std::vector<uint8_t> buffer; + if (file.LoadFile(strCachePath, buffer) > 0) + { + strHTML.assign(reinterpret_cast<char*>(buffer.data()), buffer.size()); + return true; + } + } + } + + auto strHTML1 = strHTML; + + if (scrURL.m_post) + { + std::string strOptions = url.GetOptions(); + strOptions = strOptions.substr(1); + url.SetOptions(""); + + if (!http.Post(url.Get(), strOptions, strHTML1)) + return false; + } + else if (!http.Get(url.Get(), strHTML1)) + return false; + + strHTML = strHTML1; + + const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE); + CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType); + if (ftype == CMime::FileTypeUnknown) + ftype = CMime::GetFileTypeFromContent(strHTML); + + if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip) + { + XFILE::CZipFile file; + std::string strBuffer; + auto iSize = file.UnpackFromMemory( + strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz? + if (iSize > 0) + { + strHTML = strBuffer; + CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url); + } + else + CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__, + scrURL.m_url); + } + + const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET); + if (ftype == CMime::FileTypeHtml) + { + std::string realHtmlCharset, converted; + if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset)) + CLog::Log(LOGWARNING, + "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback", + __FUNCTION__, scrURL.m_url, realHtmlCharset); + else + CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset, + scrURL.m_url); + + strHTML = converted; + } + else if (ftype == CMime::FileTypeXml) + { + CXBMCTinyXML xmlDoc; + xmlDoc.Parse(strHTML, reportedCharset); + + const auto realXmlCharset = xmlDoc.GetUsedCharset(); + if (!realXmlCharset.empty()) + { + CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset, + scrURL.m_url); + std::string converted; + g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted); + strHTML = converted; + } + } + else if (ftype == CMime::FileTypePlainText || + StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/")) + { + std::string realTextCharset; + std::string converted; + CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset); + strHTML = converted; + if (reportedCharset != realTextCharset) + CLog::Log(LOGWARNING, + "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" " + "charset", + __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset); + else + CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__, + realTextCharset, scrURL.m_url); + } + else if (!reportedCharset.empty()) + { + CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset, + scrURL.m_url); + if (reportedCharset != "UTF-8") + { + std::string converted; + g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted); + strHTML = converted; + } + } + else + CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset", + __FUNCTION__, scrURL.m_url); + + if (!scrURL.m_cache.empty()) + { + const auto strCachePath = URIUtils::AddFileToFolder( + CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers", + cacheContext, scrURL.m_cache); + XFILE::CFile file; + if (!file.OpenForWrite(strCachePath, true) || + file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size())) + return false; + } + return true; +} |