summaryrefslogtreecommitdiffstats
path: root/xbmc/utils/CharsetDetection.h
blob: d1b9ba9d9041c14103226447dd8c2c0ee9edcf5b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*
 *  Copyright (C) 2013-2018 Team Kodi
 *  This file is part of Kodi - https://kodi.tv
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 *  See LICENSES/README.md for more information.
 */

#pragma once

#include <string>


class CCharsetDetection
{
public:
  /**
   * Identify the text encoding from its Byte Order Mark (BOM).
   * Names of multibyte encodings (UTF-16/32) always end with an explicit
   * endianness suffix (LE/BE).
   * @param content        pointer to the text to analyze
   * @param contentLength  length of the text
   * @return the detected encoding, or an empty string if no BOM was detected
   */
  static std::string GetBomEncoding(const char* const content, const size_t contentLength);

  /**
   * Identify the text encoding from its Byte Order Mark (BOM).
   * Convenience overload that forwards the string's buffer and size to the
   * pointer/length overload.
   * Names of multibyte encodings (UTF-16/32) always end with an explicit
   * endianness suffix (LE/BE).
   * @param content  the text to analyze
   * @return the detected encoding, or an empty string if no BOM was detected
   */
  static std::string GetBomEncoding(const std::string& content)
  {
    return GetBomEncoding(content.data(), content.size());
  }

  /**
   * Detect the encoding of XML content.
   * NOTE(review): the detection strategy and the exact meaning of the return
   * value are defined by the implementation, which is not part of this header.
   * @param xmlContent        pointer to the XML text to analyze
   * @param contentLength     length of the XML text
   * @param detectedEncoding  receives the detected encoding
   * @return presumably true when an encoding was detected - confirm against the implementation
   */
  static bool DetectXmlEncoding(const char* const xmlContent,
                                const size_t contentLength,
                                std::string& detectedEncoding);

  /**
   * Detect the encoding of XML content.
   * Convenience overload that forwards the string's buffer and size to the
   * pointer/length overload and returns its result unchanged.
   * @param xmlContent        the XML text to analyze
   * @param detectedEncoding  receives the detected encoding
   * @return the result of the pointer/length overload
   */
  static bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding)
  {
    return DetectXmlEncoding(xmlContent.data(), xmlContent.size(), detectedEncoding);
  }

  /**
   * Detect the charset of HTML content and convert the HTML to UTF-8.
   * Convenience overload for callers that do not need to know which charset
   * was used for the conversion.
   * @param htmlContent            content of the HTML file
   * @param converted              receives the result of the conversion
   * @param serverReportedCharset  charset from the HTTP header or another out-of-band source,
   *                               empty if unknown or unset
   * @return true if the charset is properly detected and the HTML is correctly converted,
   *         false if the charset is only guessed
   */
  static bool ConvertHtmlToUtf8(const std::string& htmlContent,
                                std::string& converted,
                                const std::string& serverReportedCharset = "")
  {
    // The four-argument overload requires somewhere to store the charset it used;
    // this overload simply discards that value.
    std::string discardedCharset;
    return ConvertHtmlToUtf8(htmlContent, converted, serverReportedCharset, discardedCharset);
  }

  /**
   * Detect the charset of HTML content and convert the HTML to UTF-8.
   * @param htmlContent            content of the HTML file
   * @param converted              receives the result of the conversion
   * @param serverReportedCharset  charset from the HTTP header or another out-of-band source,
   *                               empty if unknown or unset
   * @param usedHtmlCharset        receives the charset used for the conversion
   * @return true if the charset is properly detected and the HTML is correctly converted,
   *         false if the charset is only guessed
   */
  static bool ConvertHtmlToUtf8(const std::string& htmlContent,
                                std::string& converted,
                                const std::string& serverReportedCharset,
                                std::string& usedHtmlCharset);

  /**
   * Try to convert plain text to UTF-8 using the most suitable charset.
   * @param textContent            text to convert
   * @param converted              receives the result of the conversion
   * @param serverReportedCharset  charset from the HTTP header or another out-of-band source,
   *                               empty if unknown or unset
   * @param usedCharset            receives the charset used for the conversion
   * @return true if converted without errors, false otherwise
   */
  static bool ConvertPlainTextToUtf8(const std::string& textContent,
                                     std::string& converted,
                                     const std::string& serverReportedCharset,
                                     std::string& usedCharset);

private:
  /**
   * NOTE(review): judging by the name, this reads the encoding named in the
   * XML declaration - confirm against the implementation.
   * @param xmlContent        pointer to the XML text to analyze
   * @param contentLength     length of the XML text
   * @param declaredEncoding  presumably receives the declared encoding
   */
  static bool GetXmlEncodingFromDeclaration(const char* const xmlContent,
                                            const size_t contentLength,
                                            std::string& declaredEncoding);

  /**
   * Try to guess the text encoding by searching for the '<?xml' mark in different encodings.
   * Names of multibyte encodings (UTF/UCS) always end with an explicit
   * endianness suffix (LE/BE).
   * @param xmlContent        pointer to the text to analyze
   * @param contentLength     length of the text
   * @param supposedEncoding  receives the supposed encoding
   * @return true if any encoding is supposed, false otherwise
   */
  static bool GuessXmlEncoding(const char* const xmlContent,
                               const size_t contentLength,
                               std::string& supposedEncoding);

  // HTML helpers. NOTE(review): their behaviour is defined in the implementation
  // file; the names suggest charset lookup in the HTML head, attribute parsing
  // and extraction of the encoding from a meta tag - confirm before relying on it.
  static std::string GetHtmlEncodingFromHead(const std::string& htmlContent);
  static size_t GetHtmlAttribute(const std::string& htmlContent,
                                 size_t pos,
                                 std::string& atrName,
                                 std::string& strValue);
  static std::string ExtractEncodingFromHtmlMeta(const std::string& metaContent, size_t pos = 0);

  // Conversion helpers. NOTE(review): semantics are defined in the implementation file.
  static bool checkConversion(const std::string& srcCharset,
                              const std::string& src,
                              std::string& dst);
  static void appendCharAsAsciiUpperCase(std::string& str, const char chr);

  // Constants that are only declared here; their values are defined outside this header.
  static const size_t m_XmlDeclarationMaxLength;
  static const size_t m_HtmlCharsetEndSearchPos;

  static const std::string m_HtmlWhitespaceChars;
};