Adding upstream version 2:20.4+dfsg. (tag: upstream/2%20.4+dfsg)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 18:07:22 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 18:07:22 +0000
commit: c04dcc2e7d834218ef2d4194331e383402495ae1 (patch)
tree: 7333e38d10d75386e60f336b80c2443c1166031d /xbmc/utils/CharsetDetection.h
parent: Initial commit. (diff)
download: kodi-c04dcc2e7d834218ef2d4194331e383402495ae1.tar.xz
kodi-c04dcc2e7d834218ef2d4194331e383402495ae1.zip
1 files changed, 94 insertions, 0 deletions
diff --git a/xbmc/utils/CharsetDetection.h b/xbmc/utils/CharsetDetection.h
new file mode 100644
index 0000000..d1b9ba9
--- /dev/null
+++ b/xbmc/utils/CharsetDetection.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (C) 2013-2018 Team Kodi
+ *  This file is part of Kodi - https://kodi.tv
+ *
+ *  SPDX-License-Identifier: GPL-2.0-or-later
+ *  See LICENSES/README.md for more information.
+ */
+
+#pragma once
+
+#include <string>
+
+
+class CCharsetDetection // static-only helpers: charset detection (BOM, XML, HTML) and conversion to UTF-8
+{
+public:
+  /**
+   * Detect text encoding by Byte Order Mark (BOM).
+   * Names of multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE).
+   * @param content        pointer to the text to analyze
+   * @param contentLength  length of the text
+   * @return detected encoding, or an empty string if no BOM was detected
+   */
+  static std::string GetBomEncoding(const char* const content, const size_t contentLength);
+  /**
+   * Detect text encoding by Byte Order Mark (BOM).
+   * Names of multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE).
+   * @param content     the text to analyze
+   * @return detected encoding, or an empty string if no BOM was detected
+   */
+  static inline std::string GetBomEncoding(const std::string& content)
+  { return GetBomEncoding(content.c_str(), content.length()); } // forwards to the pointer/length overload
+
+  static inline bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding)
+  { return DetectXmlEncoding(xmlContent.c_str(), xmlContent.length(), detectedEncoding); } // forwards to the pointer/length overload
+
+  static bool DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding);
+
+  /**
+   * Detect the HTML charset and convert the HTML to UTF-8.
+   * @param htmlContent content of HTML file
+   * @param converted   receives the result of the conversion
+   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
+   * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed
+   */
+  static inline bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset = "")
+  {
+    std::string usedHtmlCharset; // discarded: this overload does not report the charset that was used
+    return ConvertHtmlToUtf8(htmlContent, converted, serverReportedCharset, usedHtmlCharset);
+  }
+  /**
+   * Detect the HTML charset and convert the HTML to UTF-8.
+   * @param htmlContent content of HTML file
+   * @param converted   receives the result of the conversion
+   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
+   * @param usedHtmlCharset       receives the charset used for the conversion
+   * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed
+   */
+  static bool ConvertHtmlToUtf8(const std::string& htmlContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedHtmlCharset);
+
+  /**
+   * Try to convert plain text to UTF-8 using the most suitable charset.
+   * @param textContent text to convert
+   * @param converted   receives the result of the conversion
+   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
+   * @param usedCharset           receives the charset used for the conversion
+   * @return true if converted without errors, false otherwise
+   */
+  static bool ConvertPlainTextToUtf8(const std::string& textContent, std::string& converted, const std::string& serverReportedCharset, std::string& usedCharset);
+
+private:
+  static bool GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding);
+  /**
+   * Try to guess the text encoding by searching for the '<?xml' mark in different encodings.
+   * Names of multibyte encodings (UTF/UCS) always end with explicit endianness (LE/BE).
+   * @param xmlContent        pointer to the text to analyze
+   * @param contentLength     length of the text
+   * @param supposedEncoding  reference to the variable that receives the supposed encoding
+   * @return true if an encoding could be supposed, false otherwise
+   */
+  static bool GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding);
+
+  static std::string GetHtmlEncodingFromHead(const std::string& htmlContent);
+  static size_t GetHtmlAttribute(const std::string& htmlContent, size_t pos, std::string& atrName, std::string& strValue);
+  static std::string ExtractEncodingFromHtmlMeta(const std::string& metaContent, size_t pos = 0);
+
+  static bool checkConversion(const std::string& srcCharset, const std::string& src, std::string& dst);
+  static void appendCharAsAsciiUpperCase(std::string& str, const char chr);
+
+  static const size_t m_XmlDeclarationMaxLength;
+  static const size_t m_HtmlCharsetEndSearchPos;
+
+  static const std::string m_HtmlWhitespaceChars;
+};
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 18:07:22 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 18:07:22 +0000
commit	c04dcc2e7d834218ef2d4194331e383402495ae1 (patch)
tree	7333e38d10d75386e60f336b80c2443c1166031d /xbmc/utils/CharsetDetection.h
parent	Initial commit. (diff)
download	kodi-c04dcc2e7d834218ef2d4194331e383402495ae1.tar.xz kodi-c04dcc2e7d834218ef2d4194331e383402495ae1.zip