/*
 *  Copyright (C) 2005-2018 Team Kodi
 *  This file is part of Kodi - https://kodi.tv
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 *  See LICENSES/README.md for more information.
 */

#include "ScraperUrl.h"

#include "CharsetConverter.h"
#include "ServiceBroker.h"
#include "URIUtils.h"
#include "URL.h"
#include "XMLUtils.h"
#include "filesystem/CurlFile.h"
#include "filesystem/ZipFile.h"
#include "settings/AdvancedSettings.h"
#include "settings/SettingsComponent.h"
#include "utils/CharsetDetection.h"
#include "utils/Mime.h"
#include "utils/StringUtils.h"
#include "utils/XBMCTinyXML.h"
#include "utils/log.h"

#include <algorithm>
#include <cstring>
#include <sstream>

// Creates an empty, unparsed URL collection.
CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
{
}

// Creates a collection from either an XML fragment or a plain URL string.
CScraperUrl::CScraperUrl(const std::string& strUrl) : CScraperUrl()
{
  ParseFromData(strUrl);
}

// Creates a collection holding the single URL described by an XML element.
CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl()
{
  ParseAndAppendUrl(element);
}

CScraperUrl::~CScraperUrl() = default;

// Drops all entries and the backing data and resets the parse state.
void CScraperUrl::Clear()
{
  m_urls.clear();
  m_data.clear();
  m_relevance = 0.0;
  m_parsed = false;
}

// Replaces the backing data; the entry list is not rebuilt until Parse() runs.
void CScraperUrl::SetData(std::string data)
{
  m_data = std::move(data);
  m_parsed = false;
}

// Returns the first General entry whose aspect equals `type` (any aspect when
// `type` is empty), or a default-constructed entry when nothing matches.
const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const
{
  // The predicate never outlives this call, so capturing by reference is safe
  // and avoids copying the string.
  const auto url = std::find_if(m_urls.begin(), m_urls.end(), [&type](const SUrlEntry& url) {
    return url.m_type == UrlType::General && (type.empty() || url.m_aspect == type);
  });

  if (url != m_urls.end())
    return *url;

  return SUrlEntry();
}

// Returns the first Season entry for `season` whose aspect equals `type`.
// An empty `type` or the literal "thumb" matches any aspect. Returns a
// default-constructed entry when nothing matches.
const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const
{
  const auto url =
      std::find_if(m_urls.begin(), m_urls.end(), [season, &type](const SUrlEntry& url) {
        return url.m_type == UrlType::Season && url.m_season == season &&
               (type.empty() || type == "thumb" || url.m_aspect == type);
      });

  if (url != m_urls.end())
    return *url;

  return SUrlEntry();
}

// Returns the highest positive season number among Season entries, or 0 when
// there is none.
unsigned int CScraperUrl::GetMaxSeasonUrl() const
{
  unsigned int maxSeason = 0;
  for (const auto& url : m_urls)
  {
    // m_season > 0 is checked first so the unsigned conversion is always safe.
    if (url.m_type == UrlType::Season && url.m_season > 0 &&
        static_cast<unsigned int>(url.m_season) > maxSeason)
      maxSeason = url.m_season;
  }
  return maxSeason;
}

// Returns the thumb URL of the first entry, or an empty string when there are
// no entries.
std::string CScraperUrl::GetFirstThumbUrl() const
{
  if (m_urls.empty())
    return {};

  return GetThumbUrl(m_urls.front());
}

// Appends matching thumb URLs to `thumbs`.
// An entry matches when its aspect equals `type`, `type` is empty, or the
// entry has no aspect; and it is either a General entry with season == -1 or
// a Season entry for `season`. With `unique` set, URLs already present in
// `thumbs` are skipped.
void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs,
                               const std::string& type,
                               int season,
                               bool unique) const
{
  for (const auto& url : m_urls)
  {
    if (url.m_aspect == type || type.empty() || url.m_aspect.empty())
    {
      if ((url.m_type == CScraperUrl::UrlType::General && season == -1) ||
          (url.m_type == CScraperUrl::UrlType::Season && url.m_season == season))
      {
        std::string thumbUrl = GetThumbUrl(url);
        if (!unique || std::find(thumbs.begin(), thumbs.end(), thumbUrl) == thumbs.end())
          thumbs.push_back(thumbUrl);
      }
    }
  }
}

// Parses the stored data into entries if that has not happened yet.
// Returns true when already parsed or when parsing succeeds.
bool CScraperUrl::Parse()
{
  if (m_parsed)
    return true;

  // ParseFromData() rebuilds m_data while appending entries, so hand it the
  // old contents and start from an empty string.
  auto dataToParse = std::move(m_data);
  m_data.clear();
  return ParseFromData(dataToParse);
}

// Parses `data` as XML and appends one entry per root-level element sharing
// the first root element's name. If `data` is not XML, it is stored as a
// single plain URL. Returns false only when `data` is empty.
bool CScraperUrl::ParseFromData(const std::string& data)
{
  if (data.empty())
    return false;

  CXBMCTinyXML doc;
  /* data is coming from internal sources (usually generated by scraper or from database)
   * so data is always in UTF-8 */
  doc.Parse(data, TIXML_ENCODING_UTF8);

  auto pElement = doc.RootElement();
  if (pElement == nullptr)
  {
    // Not XML: treat the whole string as one URL.
    m_urls.emplace_back(data);
    m_data = data;
  }
  else
  {
    while (pElement != nullptr)
    {
      ParseAndAppendUrl(pElement);
      pElement = pElement->NextSiblingElement(pElement->Value());
    }
  }

  m_parsed = true;
  return true;
}

// Appends the URL described by `element` (text child plus the attributes
// spoof, post, gzip, cache, type, season, aspect and preview) to the entry
// list and appends the element's serialized XML to m_data.
// Returns false when the element is null or has no first child with a value.
bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element)
{
  if (element == nullptr || element->FirstChild() == nullptr ||
      element->FirstChild()->Value() == nullptr)
    return false;

  bool wasEmpty = m_data.empty();

  std::stringstream stream;
  stream << *element;
  m_data += stream.str();

  SUrlEntry url(element->FirstChild()->ValueStr());
  url.m_spoof = XMLUtils::GetAttribute(element, "spoof");

  const char* szPost = element->Attribute("post");
  url.m_post = szPost && StringUtils::CompareNoCase(szPost, "yes") == 0;

  const char* szIsGz = element->Attribute("gzip");
  url.m_isgz = szIsGz && StringUtils::CompareNoCase(szIsGz, "yes") == 0;

  url.m_cache = XMLUtils::GetAttribute(element, "cache");

  const char* szType = element->Attribute("type");
  if (szType && StringUtils::CompareNoCase(szType, "season") == 0)
  {
    url.m_type = UrlType::Season;
    const char* szSeason = element->Attribute("season");
    if (szSeason)
      url.m_season = atoi(szSeason);
  }

  url.m_aspect = XMLUtils::GetAttribute(element, "aspect");
  url.m_preview = XMLUtils::GetAttribute(element, "preview");

  m_urls.push_back(url);

  // Data that started out empty now mirrors the entry list exactly.
  if (wasEmpty)
    m_parsed = true;

  return true;
}

// Appends the URLs found in an episode guide document.
// Accepted forms, as handled below:
//   <episodeguide><url>...</url>...</episodeguide>  (one entry per <url>)
//   <episodeguide>...</episodeguide>                (the element itself is the entry)
// Returns false when the input is empty, is not XML, or has no root-level
// <episodeguide> element.
bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string& episodeGuide)
{
  if (episodeGuide.empty())
    return false;

  // ok, now parse the xml file
  CXBMCTinyXML doc;
  /* episodeGuide is coming from internal sources so it is always in UTF-8 */
  doc.Parse(episodeGuide, TIXML_ENCODING_UTF8);
  if (doc.RootElement() == nullptr)
    return false;

  bool wasEmpty = m_data.empty();

  TiXmlHandle docHandle(&doc);
  auto link = docHandle.FirstChild("episodeguide").Element();
  // The handle yields a null element when the document's root is not
  // <episodeguide>; bail out instead of dereferencing it.
  if (link == nullptr)
    return false;

  if (link->FirstChildElement("url"))
  {
    for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
      ParseAndAppendUrl(link);
  }
  else if (link->FirstChild() && link->FirstChild()->Value())
    ParseAndAppendUrl(link);

  if (wasEmpty)
    m_parsed = true;

  return true;
}

// Appends an entry built from the given fields, and appends an equivalent
// <thumb> element to m_data so data and entries stay in step.
// A non-negative `season` makes it a Season entry.
void CScraperUrl::AddParsedUrl(const std::string& url,
                               const std::string& aspect,
                               const std::string& preview,
                               const std::string& referrer,
                               const std::string& cache,
                               bool post,
                               bool isgz,
                               int season)
{
  bool wasEmpty = m_data.empty();

  TiXmlElement thumb("thumb");
  thumb.SetAttribute("spoof", referrer);
  thumb.SetAttribute("cache", cache);
  if (post)
    thumb.SetAttribute("post", "yes");
  if (isgz)
    thumb.SetAttribute("gzip", "yes");
  if (season >= 0)
  {
    thumb.SetAttribute("season", std::to_string(season));
    thumb.SetAttribute("type", "season");
  }
  thumb.SetAttribute("aspect", aspect);
  thumb.SetAttribute("preview", preview);
  TiXmlText text(url);
  thumb.InsertEndChild(text);

  m_data << thumb;

  SUrlEntry nUrl(url);
  nUrl.m_spoof = referrer;
  nUrl.m_post = post;
  nUrl.m_isgz = isgz;
  nUrl.m_cache = cache;
  nUrl.m_preview = preview;
  if (season >= 0)
  {
    nUrl.m_type = UrlType::Season;
    nUrl.m_season = season;
  }
  nUrl.m_aspect = aspect;

  m_urls.push_back(nUrl);

  if (wasEmpty)
    m_parsed = true;
}

// Returns the entry's URL, with "|Referer=<encoded spoof>" appended when a
// spoof value is set.
std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry)
{
  if (entry.m_spoof.empty())
    return entry.m_url;

  return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
}

// Fetches the content behind `scrURL` into `strHTML`.
// Steps: serve from the scraper cache file if one exists; otherwise GET or
// POST via `http`; unpack zip/gzip payloads; convert the text to UTF-8
// according to the detected file type; finally write the result to the cache
// file when the entry names one.
// Returns false when the HTTP request fails or the cache file cannot be
// written. `strHTML` is left untouched when the HTTP request fails.
bool CScraperUrl::Get(const SUrlEntry& scrURL,
                      std::string& strHTML,
                      XFILE::CCurlFile& http,
                      const std::string& cacheContext)
{
  CURL url(scrURL.m_url);
  http.SetReferer(scrURL.m_spoof);

  // Only set when the entry names a cache file; reused for the write below.
  std::string strCachePath;
  if (!scrURL.m_cache.empty())
  {
    strCachePath = URIUtils::AddFileToFolder(
        CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
        cacheContext, scrURL.m_cache);
    if (XFILE::CFile::Exists(strCachePath))
    {
      XFILE::CFile file;
      std::vector<uint8_t> buffer;
      if (file.LoadFile(strCachePath, buffer) > 0)
      {
        strHTML.assign(reinterpret_cast<char*>(buffer.data()), buffer.size());
        return true;
      }
    }
  }

  // Download into a scratch copy so the caller's string survives a failure.
  auto strHTML1 = strHTML;

  if (scrURL.m_post)
  {
    // The POST body is the URL's option string without its first character
    // (presumably the leading '?' - confirm against CURL::GetOptions).
    // An empty option string is posted as an empty body; calling substr(1)
    // on it would throw std::out_of_range.
    std::string strOptions = url.GetOptions();
    if (!strOptions.empty())
      strOptions.erase(0, 1);
    url.SetOptions("");

    if (!http.Post(url.Get(), strOptions, strHTML1))
      return false;
  }
  else if (!http.Get(url.Get(), strHTML1))
    return false;

  strHTML = std::move(strHTML1);

  const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE);
  CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
  if (ftype == CMime::FileTypeUnknown)
    ftype = CMime::GetFileTypeFromContent(strHTML);

  if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
  {
    XFILE::CZipFile file;
    std::string strBuffer;
    auto iSize = file.UnpackFromMemory(
        strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
    if (iSize > 0)
    {
      strHTML = std::move(strBuffer);
      CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url);
    }
    else
      CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__,
                scrURL.m_url);
  }

  const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET);
  if (ftype == CMime::FileTypeHtml)
  {
    std::string realHtmlCharset, converted;
    if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset,
                                              realHtmlCharset))
      CLog::Log(LOGWARNING,
                "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
                __FUNCTION__, scrURL.m_url, realHtmlCharset);
    else
      CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__,
                realHtmlCharset, scrURL.m_url);

    // The converted text is used whether or not the charset was precise.
    strHTML = std::move(converted);
  }
  else if (ftype == CMime::FileTypeXml)
  {
    CXBMCTinyXML xmlDoc;
    xmlDoc.Parse(strHTML, reportedCharset);

    const auto realXmlCharset = xmlDoc.GetUsedCharset();
    if (!realXmlCharset.empty())
    {
      CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset,
                scrURL.m_url);
      std::string converted;
      g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
      strHTML = std::move(converted);
    }
  }
  else if (ftype == CMime::FileTypePlainText ||
           StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/"))
  {
    std::string realTextCharset;
    std::string converted;
    CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset,
                                              realTextCharset);
    strHTML = std::move(converted);
    if (reportedCharset != realTextCharset)
      CLog::Log(LOGWARNING,
                "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
                "charset",
                __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset);
    else
      CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__,
                realTextCharset, scrURL.m_url);
  }
  else if (!reportedCharset.empty())
  {
    CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset,
              scrURL.m_url);
    if (reportedCharset != "UTF-8")
    {
      std::string converted;
      g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
      strHTML = std::move(converted);
    }
  }
  else
    CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
              __FUNCTION__, scrURL.m_url);

  if (!scrURL.m_cache.empty())
  {
    // strCachePath was computed above under the same condition.
    XFILE::CFile file;
    if (!file.OpenForWrite(strCachePath, true) ||
        file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size()))
      return false;
  }

  return true;
}