1 files changed, 434 insertions, 0 deletions
diff --git a/xbmc/utils/ScraperUrl.cpp b/xbmc/utils/ScraperUrl.cpp
new file mode 100644
index 0000000..f131b16
--- /dev/null
+++ b/xbmc/utils/ScraperUrl.cpp
@@ -0,0 +1,434 @@
+/*
+ *  Copyright (C) 2005-2018 Team Kodi
+ *  This file is part of Kodi - https://kodi.tv
+ *
+ *  SPDX-License-Identifier: GPL-2.0-or-later
+ *  See LICENSES/README.md for more information.
+ */
+
+#include "ScraperUrl.h"
+
+#include "CharsetConverter.h"
+#include "ServiceBroker.h"
+#include "URIUtils.h"
+#include "URL.h"
+#include "XMLUtils.h"
+#include "filesystem/CurlFile.h"
+#include "filesystem/ZipFile.h"
+#include "settings/AdvancedSettings.h"
+#include "settings/SettingsComponent.h"
+#include "utils/CharsetDetection.h"
+#include "utils/Mime.h"
+#include "utils/StringUtils.h"
+#include "utils/XBMCTinyXML.h"
+#include "utils/log.h"
+
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+
+CScraperUrl::CScraperUrl() : m_relevance(0.0), m_parsed(false)
+{
+}
+
+// Constructs from a string: default-initialises the object, then hands the
+// string to ParseFromData(). The parse result is not checked here.
+CScraperUrl::CScraperUrl(const std::string& strUrl) : CScraperUrl()
+{
+  this->ParseFromData(strUrl);
+}
+
+// Constructs from an XML element: default-initialises the object, then appends
+// the element via ParseAndAppendUrl(). The result of that call is not checked.
+CScraperUrl::CScraperUrl(const TiXmlElement* element) : CScraperUrl()
+{
+  this->ParseAndAppendUrl(element);
+}
+
+// Uses the compiler-generated destructor.
+CScraperUrl::~CScraperUrl() = default;
+
+// Resets the object: marks it as not parsed, zeroes the relevance and drops
+// both the raw data string and all URL entries.
+void CScraperUrl::Clear()
+{
+  m_parsed = false;
+  m_relevance = 0.0;
+  m_data.clear();
+  m_urls.clear();
+}
+
+// Replaces the raw data string and clears the "parsed" flag so that the next
+// Parse() call processes the new data. Existing URL entries are left as-is.
+void CScraperUrl::SetData(std::string data)
+{
+  m_parsed = false;
+  m_data = std::move(data);
+}
+
+// Returns a copy of the first entry of type UrlType::General whose aspect
+// equals `type`; an empty `type` matches any aspect. Returns a
+// default-constructed SUrlEntry when no entry matches.
+const CScraperUrl::SUrlEntry CScraperUrl::GetFirstUrlByType(const std::string& type) const
+{
+  // `type` outlives the find_if call, so capture it by reference instead of
+  // copying the string into the closure.
+  const auto match = std::find_if(m_urls.begin(), m_urls.end(), [&type](const SUrlEntry& entry) {
+    return entry.m_type == UrlType::General && (type.empty() || entry.m_aspect == type);
+  });
+  if (match != m_urls.end())
+    return *match;
+
+  return SUrlEntry();
+}
+
+// Returns a copy of the first entry of type UrlType::Season for the given
+// season number. The aspect filter is skipped when `type` is empty or equals
+// "thumb"; otherwise the entry's aspect must equal `type`. Returns a
+// default-constructed SUrlEntry when no entry matches.
+const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonUrl(int season, const std::string& type) const
+{
+  // `type` outlives the find_if call, so capture it by reference instead of
+  // copying the string into the closure.
+  const auto match =
+      std::find_if(m_urls.begin(), m_urls.end(), [season, &type](const SUrlEntry& entry) {
+        return entry.m_type == UrlType::Season && entry.m_season == season &&
+               (type.empty() || type == "thumb" || entry.m_aspect == type);
+      });
+  if (match != m_urls.end())
+    return *match;
+
+  return SUrlEntry();
+}
+
+// Returns the highest positive season number found among entries of type
+// UrlType::Season, or 0 when there is none.
+unsigned int CScraperUrl::GetMaxSeasonUrl() const
+{
+  unsigned int highest = 0;
+  for (const auto& entry : m_urls)
+  {
+    // only season entries with a strictly positive season number take part
+    if (entry.m_type != UrlType::Season || entry.m_season <= 0)
+      continue;
+
+    const auto candidate = static_cast<unsigned int>(entry.m_season);
+    if (candidate > highest)
+      highest = candidate;
+  }
+  return highest;
+}
+
+// Returns the thumb URL (see GetThumbUrl) of the first stored entry, or an
+// empty string when there are no entries.
+std::string CScraperUrl::GetFirstThumbUrl() const
+{
+  return m_urls.empty() ? std::string{} : GetThumbUrl(m_urls.front());
+}
+
+// Appends the thumb URLs of all matching entries to `thumbs`.
+// An entry matches when
+//  - its aspect equals `type`, or `type` is empty, or the entry has no aspect, and
+//  - it is a General entry and `season` is -1, or it is a Season entry whose
+//    season number equals `season`.
+// With `unique` set, a URL already present in `thumbs` is not appended again.
+void CScraperUrl::GetThumbUrls(std::vector<std::string>& thumbs,
+                               const std::string& type,
+                               int season,
+                               bool unique) const
+{
+  for (const auto& entry : m_urls)
+  {
+    const bool aspectMatches =
+        entry.m_aspect == type || type.empty() || entry.m_aspect.empty();
+    if (!aspectMatches)
+      continue;
+
+    const bool isWantedGeneral = entry.m_type == CScraperUrl::UrlType::General && season == -1;
+    const bool isWantedSeason =
+        entry.m_type == CScraperUrl::UrlType::Season && entry.m_season == season;
+    if (!isWantedGeneral && !isWantedSeason)
+      continue;
+
+    std::string thumbUrl = GetThumbUrl(entry);
+    if (unique && std::find(thumbs.begin(), thumbs.end(), thumbUrl) != thumbs.end())
+      continue;
+
+    thumbs.push_back(thumbUrl);
+  }
+}
+
+// Parses the stored raw data if that has not been done yet.
+// Returns true when the object is already parsed; otherwise returns the
+// result of ParseFromData() on the stored data. The stored data is emptied
+// before parsing; ParseFromData()/ParseAndAppendUrl() fill m_data again.
+bool CScraperUrl::Parse()
+{
+  if (m_parsed)
+    return true;
+
+  // Move the data out instead of copying it: m_data is cleared right away.
+  std::string dataToParse = std::move(m_data);
+  // A moved-from string is valid but unspecified, so empty it explicitly.
+  m_data.clear();
+  return ParseFromData(dataToParse);
+}
+
+// Parses `data` and appends the resulting URL entries.
+// If the XML parse yields no root element, the whole string is stored as a
+// single URL entry and also becomes the raw data. Otherwise the root element
+// and every following sibling element with the same name are appended via
+// ParseAndAppendUrl().
+// Returns false only for empty input; in all other cases the object is marked
+// as parsed and true is returned.
+bool CScraperUrl::ParseFromData(const std::string& data)
+{
+  if (data.empty())
+    return false;
+
+  CXBMCTinyXML xml;
+  /* data is coming from internal sources (usually generated by scraper or from database)
+   * so data is always in UTF-8 */
+  xml.Parse(data, TIXML_ENCODING_UTF8);
+
+  auto root = xml.RootElement();
+  if (root == nullptr)
+  {
+    // no root element: keep the input verbatim as one URL
+    m_urls.emplace_back(data);
+    m_data = data;
+    m_parsed = true;
+    return true;
+  }
+
+  // walk the root and all of its same-named siblings
+  for (auto current = root; current != nullptr;
+       current = current->NextSiblingElement(current->Value()))
+  {
+    ParseAndAppendUrl(current);
+  }
+
+  m_parsed = true;
+  return true;
+}
+
+// Appends one URL entry built from an XML element.
+// The URL itself is the value of the element's first child; the attributes
+// "spoof", "post", "gzip", "cache", "type", "season", "aspect" and "preview"
+// fill the remaining entry fields. The serialised element is appended to the
+// raw data string as well.
+// Returns false (and changes nothing) when the element is null or has no first
+// child with a value; true otherwise. If the raw data was empty before the
+// call, the object is marked as parsed.
+bool CScraperUrl::ParseAndAppendUrl(const TiXmlElement* element)
+{
+  if (element == nullptr)
+    return false;
+
+  const auto firstChild = element->FirstChild();
+  if (firstChild == nullptr || firstChild->Value() == nullptr)
+    return false;
+
+  const bool dataWasEmpty = m_data.empty();
+
+  // keep the raw data in sync with the entry list
+  std::stringstream serialized;
+  serialized << *element;
+  m_data += serialized.str();
+
+  // true only when the named attribute exists and equals "yes" (case-insensitive)
+  const auto isYes = [element](const char* attributeName) {
+    const char* value = element->Attribute(attributeName);
+    return value != nullptr && StringUtils::CompareNoCase(value, "yes") == 0;
+  };
+
+  SUrlEntry entry(firstChild->ValueStr());
+  entry.m_spoof = XMLUtils::GetAttribute(element, "spoof");
+  entry.m_post = isYes("post");
+  entry.m_isgz = isYes("gzip");
+  entry.m_cache = XMLUtils::GetAttribute(element, "cache");
+
+  const char* typeAttr = element->Attribute("type");
+  if (typeAttr != nullptr && StringUtils::CompareNoCase(typeAttr, "season") == 0)
+  {
+    entry.m_type = UrlType::Season;
+    // the season number is only read for season-typed entries
+    const char* seasonAttr = element->Attribute("season");
+    if (seasonAttr != nullptr)
+      entry.m_season = atoi(seasonAttr);
+  }
+
+  entry.m_aspect = XMLUtils::GetAttribute(element, "aspect");
+  entry.m_preview = XMLUtils::GetAttribute(element, "preview");
+
+  m_urls.push_back(entry);
+
+  if (dataWasEmpty)
+    m_parsed = true;
+
+  return true;
+}
+
+// Parses an episode guide XML string and appends every URL found in it.
+// The URLs are read from the <episodeguide> element, in one of two layouts:
+//   <episodeguide><url>...</url>...</episodeguide>  - one entry per <url> child
+//   <episodeguide>...</episodeguide>                - the element itself is passed
+//                                                     to ParseAndAppendUrl()
+// Returns false when the string is empty, the parse yields no root element, or
+// no <episodeguide> element is found; true otherwise. If the raw data was
+// empty before the call, the object is marked as parsed.
+bool CScraperUrl::ParseAndAppendUrlsFromEpisodeGuide(const std::string& episodeGuide)
+{
+  if (episodeGuide.empty())
+    return false;
+
+  // ok, now parse the xml file
+  CXBMCTinyXML doc;
+  /* episodeGuide is coming from internal sources so it is always in UTF-8 */
+  doc.Parse(episodeGuide, TIXML_ENCODING_UTF8);
+  if (doc.RootElement() == nullptr)
+    return false;
+
+  bool wasEmpty = m_data.empty();
+
+  TiXmlHandle docHandle(&doc);
+  auto link = docHandle.FirstChild("episodeguide").Element();
+  // Element() yields nullptr when there is no <episodeguide> element (e.g. the
+  // document root has a different name); bail out instead of dereferencing it.
+  if (link == nullptr)
+    return false;
+
+  if (link->FirstChildElement("url"))
+  {
+    for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
+      ParseAndAppendUrl(link);
+  }
+  else if (link->FirstChild() && link->FirstChild()->Value())
+    ParseAndAppendUrl(link);
+
+  if (wasEmpty)
+    m_parsed = true;
+
+  return true;
+}
+
+// Appends an already-known URL together with its properties.
+// A new entry is added to the URL list and an equivalent <thumb> XML element
+// is appended to the raw data string. A `season` of 0 or more makes the entry
+// a Season entry; a negative value leaves the entry type at its default.
+// If the raw data was empty before the call, the object is marked as parsed.
+void CScraperUrl::AddParsedUrl(const std::string& url,
+                               const std::string& aspect,
+                               const std::string& preview,
+                               const std::string& referrer,
+                               const std::string& cache,
+                               bool post,
+                               bool isgz,
+                               int season)
+{
+  const bool dataWasEmpty = m_data.empty();
+  const bool isSeasonUrl = season >= 0;
+
+  // in-memory entry
+  SUrlEntry entry(url);
+  entry.m_spoof = referrer;
+  entry.m_post = post;
+  entry.m_isgz = isgz;
+  entry.m_cache = cache;
+  entry.m_preview = preview;
+  if (isSeasonUrl)
+  {
+    entry.m_type = UrlType::Season;
+    entry.m_season = season;
+  }
+  entry.m_aspect = aspect;
+  m_urls.push_back(entry);
+
+  // matching XML representation; the attribute order below is kept on purpose
+  // because it determines the serialised text
+  TiXmlElement element("thumb");
+  element.SetAttribute("spoof", referrer);
+  element.SetAttribute("cache", cache);
+  if (post)
+    element.SetAttribute("post", "yes");
+  if (isgz)
+    element.SetAttribute("gzip", "yes");
+  if (isSeasonUrl)
+  {
+    element.SetAttribute("season", std::to_string(season));
+    element.SetAttribute("type", "season");
+  }
+  element.SetAttribute("aspect", aspect);
+  element.SetAttribute("preview", preview);
+  TiXmlText urlText(url);
+  element.InsertEndChild(urlText);
+
+  m_data << element;
+
+  if (dataWasEmpty)
+    m_parsed = true;
+}
+
+// Returns the URL of an entry in the form used for thumbs: the plain URL, with
+// "|Referer=<encoded spoof>" appended when the entry has a spoof value.
+std::string CScraperUrl::GetThumbUrl(const CScraperUrl::SUrlEntry& entry)
+{
+  std::string thumbUrl = entry.m_url;
+  if (!entry.m_spoof.empty())
+    thumbUrl = entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
+
+  return thumbUrl;
+}
+
+// Fetches the content behind a URL entry into strHTML.
+//
+// Steps:
+//  1. If the entry names a cache file and that file exists and loads with a
+//     non-zero size, its content is returned directly.
+//  2. Otherwise the URL is requested through `http` (POST when the entry's
+//     m_post flag is set, GET otherwise) with the entry's spoof value as referer.
+//  3. Content detected as zip/gzip is unpacked in memory.
+//  4. The content is converted to UTF-8 depending on the detected file type
+//     and the charset reported by the server.
+//  5. If the entry names a cache file, the result is written to it.
+//
+// \param scrURL       entry to fetch
+// \param strHTML      receives the content; left untouched when the request fails
+// \param http         curl file object used for the request
+// \param cacheContext sub folder name used when building the cache path
+// \return false when the request fails or the cache file cannot be written
+//         (in the latter case strHTML already holds the fetched content),
+//         true otherwise
+bool CScraperUrl::Get(const SUrlEntry& scrURL,
+                      std::string& strHTML,
+                      XFILE::CCurlFile& http,
+                      const std::string& cacheContext)
+{
+  CURL url(scrURL.m_url);
+  http.SetReferer(scrURL.m_spoof);
+  std::string strCachePath;
+
+  if (!scrURL.m_cache.empty())
+  {
+    strCachePath = URIUtils::AddFileToFolder(
+        CServiceBroker::GetSettingsComponent()->GetAdvancedSettings()->m_cachePath, "scrapers",
+        cacheContext, scrURL.m_cache);
+    if (XFILE::CFile::Exists(strCachePath))
+    {
+      XFILE::CFile file;
+      std::vector<uint8_t> buffer;
+      if (file.LoadFile(strCachePath, buffer) > 0)
+      {
+        strHTML.assign(reinterpret_cast<char*>(buffer.data()), buffer.size());
+        return true;
+      }
+    }
+  }
+
+  // work on a copy so that strHTML keeps its value when the request fails
+  auto strHTML1 = strHTML;
+
+  if (scrURL.m_post)
+  {
+    // The URL options become the POST body without their leading character
+    // (presumably the '?' separator). Only strip it when there are options:
+    // substr(1) on an empty string would throw std::out_of_range.
+    std::string strOptions = url.GetOptions();
+    if (!strOptions.empty())
+      strOptions.erase(0, 1);
+    url.SetOptions("");
+
+    if (!http.Post(url.Get(), strOptions, strHTML1))
+      return false;
+  }
+  else if (!http.Get(url.Get(), strHTML1))
+    return false;
+
+  strHTML = std::move(strHTML1);
+
+  // prefer the reported mime type, fall back to sniffing the content
+  const auto mimeType = http.GetProperty(XFILE::FILE_PROPERTY_MIME_TYPE);
+  CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
+  if (ftype == CMime::FileTypeUnknown)
+    ftype = CMime::GetFileTypeFromContent(strHTML);
+
+  if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
+  {
+    XFILE::CZipFile file;
+    std::string strBuffer;
+    auto iSize = file.UnpackFromMemory(
+        strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
+    if (iSize > 0)
+    {
+      strHTML = strBuffer;
+      CLog::Log(LOGDEBUG, "{}: Archive \"{}\" was unpacked in memory", __FUNCTION__, scrURL.m_url);
+    }
+    else
+      CLog::Log(LOGWARNING, "{}: \"{}\" looks like archive but cannot be unpacked", __FUNCTION__,
+                scrURL.m_url);
+  }
+
+  // charset handling depends on the file type detected before unpacking
+  const auto reportedCharset = http.GetProperty(XFILE::FILE_PROPERTY_CONTENT_CHARSET);
+  if (ftype == CMime::FileTypeHtml)
+  {
+    std::string realHtmlCharset, converted;
+    if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
+      CLog::Log(LOGWARNING,
+                "{}: Can't find precise charset for HTML \"{}\", using \"{}\" as fallback",
+                __FUNCTION__, scrURL.m_url, realHtmlCharset);
+    else
+      CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for HTML \"{}\"", __FUNCTION__, realHtmlCharset,
+                scrURL.m_url);
+
+    // the converted text is used in both cases
+    strHTML = converted;
+  }
+  else if (ftype == CMime::FileTypeXml)
+  {
+    // parse once only to learn which charset the XML parser actually used
+    CXBMCTinyXML xmlDoc;
+    xmlDoc.Parse(strHTML, reportedCharset);
+
+    const auto realXmlCharset = xmlDoc.GetUsedCharset();
+    if (!realXmlCharset.empty())
+    {
+      CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for XML \"{}\"", __FUNCTION__, realXmlCharset,
+                scrURL.m_url);
+      std::string converted;
+      g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
+      strHTML = converted;
+    }
+  }
+  else if (ftype == CMime::FileTypePlainText ||
+           StringUtils::EqualsNoCase(mimeType.substr(0, 5), "text/"))
+  {
+    std::string realTextCharset;
+    std::string converted;
+    CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
+    strHTML = converted;
+    if (reportedCharset != realTextCharset)
+      CLog::Log(LOGWARNING,
+                "{}: Using \"{}\" charset for plain text \"{}\" instead of server reported \"{}\" "
+                "charset",
+                __FUNCTION__, realTextCharset, scrURL.m_url, reportedCharset);
+    else
+      CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for plain text \"{}\"", __FUNCTION__,
+                realTextCharset, scrURL.m_url);
+  }
+  else if (!reportedCharset.empty())
+  {
+    CLog::Log(LOGDEBUG, "{}: Using \"{}\" charset for \"{}\"", __FUNCTION__, reportedCharset,
+              scrURL.m_url);
+    if (reportedCharset != "UTF-8")
+    {
+      std::string converted;
+      g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
+      strHTML = converted;
+    }
+  }
+  else
+    CLog::Log(LOGDEBUG, "{}: Using content of \"{}\" as binary or text with \"UTF-8\" charset",
+              __FUNCTION__, scrURL.m_url);
+
+  if (!scrURL.m_cache.empty())
+  {
+    // strCachePath was already built above under the same condition
+    XFILE::CFile file;
+    if (!file.OpenForWrite(strCachePath, true) ||
+        file.Write(strHTML.data(), strHTML.size()) != static_cast<ssize_t>(strHTML.size()))
+      return false;
+  }
+  return true;
+}