1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
/*
* Copyright (C) 2013-2018 Team Kodi
* This file is part of Kodi - https://kodi.tv
*
* SPDX-License-Identifier: GPL-2.0-or-later
* See LICENSES/README.md for more information.
*/
#pragma once
#include <string>
/**
 * Collection of static helpers for detecting the character encoding of
 * text / XML / HTML content and for converting HTML or plain text to UTF-8.
 * Every member is static; only the short forwarding overloads are defined
 * here, everything else is declared here and defined out of class.
 */
class CCharsetDetection
{
public:
  /**
   * Detect text encoding by Byte Order Mark
   * Multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE)
   * @param content pointer to text to analyze
   * @param contentLength length of text
   * @return detected encoding or empty string if BOM not detected
   */
  static std::string GetBomEncoding(const char* content, size_t contentLength);

  /**
   * Detect text encoding by Byte Order Mark
   * Multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE)
   * Convenience overload that forwards the string's buffer and length to the
   * pointer/length overload.
   * @param content the text to analyze
   * @return detected encoding or empty string if BOM not detected
   */
  static std::string GetBomEncoding(const std::string& content)
  {
    return GetBomEncoding(content.c_str(), content.length());
  }

  /**
   * Detect encoding of XML content
   * Convenience overload that forwards the string's buffer and length to the
   * pointer/length overload.
   * @param xmlContent the XML text to analyze
   * @param detectedEncoding receives the detected encoding
   * @return result of the pointer/length overload
   *         (NOTE(review): presumably true when an encoding was detected -
   *         confirm against the implementation)
   */
  static bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding)
  {
    return DetectXmlEncoding(xmlContent.c_str(), xmlContent.length(), detectedEncoding);
  }

  /**
   * Detect encoding of XML content
   * @param xmlContent pointer to XML text to analyze
   * @param contentLength length of XML text
   * @param detectedEncoding receives the detected encoding
   * @return NOTE(review): presumably true when an encoding was detected,
   *         false otherwise - confirm against the implementation
   */
  static bool DetectXmlEncoding(const char* xmlContent,
                                size_t contentLength,
                                std::string& detectedEncoding);

  /**
   * Detect HTML charset and convert HTML to UTF-8
   * Same as the four-argument overload, but the charset that was actually
   * used for the conversion is discarded.
   * @param htmlContent content of HTML file
   * @param converted receive result of conversion
   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
   * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed
   */
  static bool ConvertHtmlToUtf8(const std::string& htmlContent,
                                std::string& converted,
                                const std::string& serverReportedCharset = "")
  {
    // The caller is not interested in the charset, so collect it in a throw-away local.
    std::string usedHtmlCharset;
    return ConvertHtmlToUtf8(htmlContent, converted, serverReportedCharset, usedHtmlCharset);
  }

  /**
   * Detect HTML charset and convert HTML to UTF-8
   * @param htmlContent content of HTML file
   * @param converted receive result of conversion
   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
   * @param usedHtmlCharset receive charset used for conversion
   * @return true if charset is properly detected and HTML is correctly converted, false if charset is only guessed
   */
  static bool ConvertHtmlToUtf8(const std::string& htmlContent,
                                std::string& converted,
                                const std::string& serverReportedCharset,
                                std::string& usedHtmlCharset);

  /**
   * Try to convert plain text to UTF-8 using best suitable charset
   * @param textContent text to convert
   * @param converted receive result of conversion
   * @param serverReportedCharset charset from HTTP header or from other out-of-band source, empty if unknown or unset
   * @param usedCharset receive charset used for conversion
   * @return true if converted without errors, false otherwise
   */
  static bool ConvertPlainTextToUtf8(const std::string& textContent,
                                     std::string& converted,
                                     const std::string& serverReportedCharset,
                                     std::string& usedCharset);

private:
  // NOTE(review): judging by the name, this reads the encoding out of the
  // XML declaration; the exact contract lives in the implementation file.
  static bool GetXmlEncodingFromDeclaration(const char* xmlContent,
                                            size_t contentLength,
                                            std::string& declaredEncoding);

  /**
   * Try to guess text encoding by searching for '<?xml' mark in different encodings
   * Multibyte encodings (UTF/UCS) always end with explicit endianness (LE/BE)
   * @param xmlContent pointer to text to analyze
   * @param contentLength length of text
   * @param supposedEncoding reference to variable that receives supposed encoding
   * @return true if any encoding supposed, false otherwise
   */
  static bool GuessXmlEncoding(const char* xmlContent,
                               size_t contentLength,
                               std::string& supposedEncoding);

  // Internal HTML parsing / conversion helpers. They are only declared here;
  // their behavior is determined by the out-of-class definitions.
  static std::string GetHtmlEncodingFromHead(const std::string& htmlContent);
  static size_t GetHtmlAttribute(const std::string& htmlContent,
                                 size_t pos,
                                 std::string& atrName,
                                 std::string& strValue);
  static std::string ExtractEncodingFromHtmlMeta(const std::string& metaContent, size_t pos = 0);
  static bool checkConversion(const std::string& srcCharset,
                              const std::string& src,
                              std::string& dst);
  static void appendCharAsAsciiUpperCase(std::string& str, char chr);

  // Class-wide constants; declared here without initializers, so their
  // values come from the out-of-class definitions.
  static const size_t m_XmlDeclarationMaxLength;
  static const size_t m_HtmlCharsetEndSearchPos;
  static const std::string m_HtmlWhitespaceChars;
};
|