diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /contrib/google-ced/util/encodings | |
parent | Initial commit. (diff) | |
download | rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip |
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/google-ced/util/encodings')
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.cc | 891 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.h | 299 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings.pb.h | 181 | ||||
-rw-r--r-- | contrib/google-ced/util/encodings/encodings_unittest.cc | 34 |
4 files changed, 1405 insertions, 0 deletions
diff --git a/contrib/google-ced/util/encodings/encodings.cc b/contrib/google-ced/util/encodings/encodings.cc new file mode 100644 index 0000000..b5f8dc5 --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.cc @@ -0,0 +1,891 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "util/encodings/encodings.h" + +#include <string.h> // for strcasecmp +#include <unordered_map> +#include <utility> // for pair + +#include "util/basictypes.h" +#include "util/string_util.h" +#include "util/case_insensitive_hash.h" + +struct EncodingInfo { + // The standard name for this encoding. + // + const char* encoding_name_; + + // The "preferred MIME name" of an encoding as specified by the IANA at: + // http://www.iana.org/assignments/character-sets + // + // Note that the preferred MIME name may differ slightly from the + // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987 + // + const char* mime_encoding_name_; + + // It is an internal policy that if an encoding has an IANA name, + // then encoding_name_ and mime_encoding_name_ must be the same string. + // + // However, there can be exceptions if there are compelling reasons. + // For example, Japanese mobile handsets require the name + // "Shift_JIS" in charset=... parameter in Content-Type headers to + // process emoji (emoticons) in their private encodings. In that + // case, mime_encoding_name_ should be "Shift_JIS", despite + // encoding_name_ actually is "X-KDDI-Shift_JIS". + + // Some multi-byte encodings use byte values that coincide with the + // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE + // can misinterpret these, as indicated in an external XSS report from + // 2007-02-15. Here, we map these dangerous encodings to safer ones. We + // also use UTF8 instead of encodings that we don't support in our + // output, and we generally try to be conservative in what we send out. + // Where the client asks for single- or double-byte encodings that are + // not as common, we substitute a more common single- or double-byte + // encoding, if there is one, thereby preserving the client's intent + // to use less space than UTF-8. This also means that characters + // outside the destination set will be converted to HTML NCRs (&#NNN;) + // if requested. + + Encoding preferred_web_output_encoding_; +}; + +static const EncodingInfo kEncodingInfoTable[] = { + { "ASCII", "ISO-8859-1", ISO_8859_1}, + { "Latin2", "ISO-8859-2", ISO_8859_2}, + { "Latin3", "ISO-8859-3", UTF8}, + // MSIE 6 does not support ISO-8859-3 (XSS issue) + { "Latin4", "ISO-8859-4", ISO_8859_4}, + { "ISO-8859-5", "ISO-8859-5", ISO_8859_5}, + { "Arabic", "ISO-8859-6", ISO_8859_6}, + { "Greek", "ISO-8859-7", ISO_8859_7}, + { "Hebrew", "ISO-8859-8", MSFT_CP1255}, + // we do not endorse the visual order + { "Latin5", "ISO-8859-9", ISO_8859_9}, + { "Latin6", "ISO-8859-10", UTF8}, + // MSIE does not support ISO-8859-10 (XSS issue) + { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP}, + { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, + // due to potential confusion with HTML syntax chars + { "BIG5", "Big5", CHINESE_BIG5}, + { "GB", "GB2312", CHINESE_GB}, + { "EUC-CN", + "EUC-CN", + // Misnamed. Should be EUC-TW. + CHINESE_BIG5}, + // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW, + // and EUC-TW is rare, so we prefer Big5 for output. + { "KSC", "EUC-KR", KOREAN_EUC_KR}, + { "Unicode", + "UTF-16LE", + // Internet Explorer doesn't recognize "ISO-10646-UCS-2" + UTF8 + // due to potential confusion with HTML syntax chars + }, + { "EUC", + "EUC", // Misnamed. Should be EUC-TW. + CHINESE_BIG5 + // MSIE does not recognize "EUC" (XSS issue), + // and EUC-TW is rare, so we prefer Big5 for output. + }, + { "CNS", + "CNS", // Misnamed. Should be EUC-TW. + CHINESE_BIG5}, + // MSIE does not recognize "CNS" (XSS issue), + // and EUC-TW is rare, so we prefer Big5 for output. + { "BIG5-CP950", + "BIG5-CP950", // Not an IANA name + CHINESE_BIG5 + // MSIE does not recognize "BIG5-CP950" (XSS issue) + }, + { "CP932", "CP932", // Not an IANA name + JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue) + { "UTF8", "UTF-8", UTF8}, + { "Unknown", + "x-unknown", // Not an IANA name + UTF8}, // UTF-8 is our default output encoding + { "ASCII-7-bit", "US-ASCII", ASCII_7BIT}, + { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R}, + { "CP1251", "windows-1251", RUSSIAN_CP1251}, + { "CP1252", "windows-1252", MSFT_CP1252}, + { "KOI8U", + "KOI8-U", + ISO_8859_5}, // because koi8-u is not as common + { "CP1250", "windows-1250", MSFT_CP1250}, + { "ISO-8859-15", "ISO-8859-15", ISO_8859_15}, + { "CP1254", "windows-1254", MSFT_CP1254}, + { "CP1257", "windows-1257", MSFT_CP1257}, + { "ISO-8859-11", "ISO-8859-11", ISO_8859_11}, + { "CP874", "windows-874", MSFT_CP874}, + { "CP1256", "windows-1256", MSFT_CP1256}, + { "CP1255", "windows-1255", MSFT_CP1255}, + { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255}, + // Java does not support iso-8859-8-i + { "VISUAL", "ISO-8859-8", MSFT_CP1255}, + // we do not endorse the visual order + { "CP852", "cp852", MSFT_CP1250}, + // because cp852 is not as common + { "CSN_369103", "csn_369103", MSFT_CP1250}, + // MSIE does not recognize "csn_369103" (XSS issue) + { "CP1253", "windows-1253", MSFT_CP1253}, + { "CP866", "IBM866", RUSSIAN_CP1251}, + // because cp866 is not as common + { "ISO-8859-13", "ISO-8859-13", UTF8}, + // because iso-8859-13 is not widely supported + { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR}, + // due to potential confusion with HTML syntax chars + { "GBK", "GBK", GBK}, + { "GB18030", "GB18030", GBK}, + // because gb18030 is not widely supported + { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5}, + // because Big5-HKSCS is not widely supported + { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB}, + // due to potential confusion with HTML syntax chars + { "TSCII", "tscii", UTF8}, + // we do not have an output converter for this font encoding + { "TAM", "tam", UTF8}, + // we do not have an output converter for this font encoding + { "TAB", "tab", UTF8}, + // we do not have an output converter for this font encoding + { "JAGRAN", "jagran", UTF8}, + // we do not have an output converter for this font encoding + { "MACINTOSH", "MACINTOSH", ISO_8859_1}, + // because macintosh is relatively uncommon + { "UTF7", "UTF-7", + UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated + { "BHASKAR", "bhaskar", + UTF8}, // we do not have an output converter for this font encoding + { "HTCHANAKYA", "htchanakya", // not an IANA charset name. + UTF8}, // we do not have an output converter for this font encoding + { "UTF-16BE", "UTF-16BE", + UTF8}, // due to potential confusion with HTML syntax chars + { "UTF-16LE", "UTF-16LE", + UTF8}, // due to potential confusion with HTML syntax chars + { "UTF-32BE", "UTF-32BE", + UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web + { "UTF-32LE", "UTF-32LE", + UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web + { "X-BINARYENC", "x-binaryenc", // Not an IANA name + UTF8}, // because this one is not intended for output (just input) + { "HZ-GB-2312", "HZ-GB-2312", + CHINESE_GB}, // due to potential confusion with HTML syntax chars + { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name + UTF8}, // because this one is not intended for output (just input) + { "X-TAM-ELANGO", "x-tam-elango", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-LTTMBARANI", "x-tam-lttmbarani", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-SHREE", "x-tam-shree", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-TBOOMIS", "x-tam-tboomis", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-TMNEWS", "x-tam-tmnews", + UTF8}, // we do not have an output converter for this font encoding + { "X-TAM-WEBTAMIL", "x-tam-webtamil", + UTF8}, // we do not have an output converter for this font encoding + + { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + // KDDI version of Shift_JIS with Google Emoji PUA mappings. + // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses + // "Shift_JIS" in HTTP headers and email messages. + + { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + // DoCoMo version of Shift_JIS with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + + { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, + // SoftBank version of Shift_JIS with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + + { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, + // KDDI version of ISO-2022-JP with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + // The preferred Web encoding is due to potential confusion with + // HTML syntax chars. + + { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, + // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings. + // See the comment at KDDI_SHIFT_JIS for other issues. + // The preferred Web encoding is due to potential confusion with + // HTML syntax chars. + + // Please refer to NOTE: section in the comments in the definition + // of "struct I18NInfoByEncoding", before adding new encodings. + +}; + + + +COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS, + kEncodingInfoTable_has_incorrect_size); + +Encoding default_encoding() {return LATIN1;} + +// ************************************************************* +// Encoding predicates +// IsValidEncoding() +// IsEncEncCompatible +// IsEncodingWithSupportedLanguage +// IsSupersetOfAscii7Bit +// Is8BitEncoding +// IsCJKEncoding +// IsHebrewEncoding +// IsRightToLeftEncoding +// IsLogicalRightToLeftEncoding +// IsVisualRightToLeftEncoding +// IsIso2022Encoding +// IsIso2022JpOrVariant +// IsShiftJisOrVariant +// IsJapaneseCellPhoneCarrierSpecificEncoding +// ************************************************************* + +bool IsValidEncoding(Encoding enc) { + return ((enc >= 0) && (enc < kNumEncodings)); +} + +bool IsEncEncCompatible(const Encoding from, const Encoding to) { + // Tests compatibility between the "from" and "to" encodings; in + // the typical case -- when both are valid known encodings -- this + // returns true iff converting from first to second is a no-op. + if (!IsValidEncoding(from) || !IsValidEncoding(to)) { + return false; // we only work with valid encodings... + } else if (to == from) { + return true; // the trivial common case + } + + if (to == UNKNOWN_ENCODING) { + return true; // all valid encodings are compatible with the unknown + } + + if (from == UNKNOWN_ENCODING) { + return false; // no unknown encoding is compatible with one that is + } + + if (from == ASCII_7BIT) { + return IsSupersetOfAscii7Bit(to); + } + + return (from == ISO_8859_1 && to == MSFT_CP1252) || + (from == ISO_8859_8 && to == HEBREW_VISUAL) || + (from == HEBREW_VISUAL && to == ISO_8859_8) || + (from == ISO_8859_9 && to == MSFT_CP1254) || + (from == ISO_8859_11 && to == MSFT_CP874) || + (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) || + (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) || + (from == CHINESE_GB && to == GBK) || + (from == CHINESE_GB && to == GB18030) || + (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) || + (from == CHINESE_EUC_CN && to == CHINESE_CNS) || + (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) || + (from == CHINESE_EUC_DEC && to == CHINESE_CNS) || + (from == CHINESE_CNS && to == CHINESE_EUC_CN) || + (from == CHINESE_CNS && to == CHINESE_EUC_DEC); +} + +// To be a superset of 7-bit Ascii means that bytes 0...127 in the given +// encoding represent the same characters as they do in ISO_8859_1. + +// TODO: This list could be expanded. Many other encodings are supersets +// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two +// encodings that I know for a fact should *not* be in this list. +bool IsSupersetOfAscii7Bit(Encoding e) { + switch (e) { + case ISO_8859_1: + case ISO_8859_2: + case ISO_8859_3: + case ISO_8859_4: + case ISO_8859_5: + case ISO_8859_6: + case ISO_8859_7: + case ISO_8859_8: + case ISO_8859_9: + case ISO_8859_10: + case JAPANESE_EUC_JP: + case JAPANESE_SHIFT_JIS: + case CHINESE_BIG5: + case CHINESE_GB: + case CHINESE_EUC_CN: + case KOREAN_EUC_KR: + case CHINESE_EUC_DEC: + case CHINESE_CNS: + case CHINESE_BIG5_CP950: + case JAPANESE_CP932: + case UTF8: + case UNKNOWN_ENCODING: + case ASCII_7BIT: + case RUSSIAN_KOI8_R: + case RUSSIAN_CP1251: + case MSFT_CP1252: + case RUSSIAN_KOI8_RU: + case MSFT_CP1250: + case ISO_8859_15: + case MSFT_CP1254: + case MSFT_CP1257: + case ISO_8859_11: + case MSFT_CP874: + case MSFT_CP1256: + case MSFT_CP1255: + case ISO_8859_8_I: + case HEBREW_VISUAL: + case CZECH_CP852: + case MSFT_CP1253: + case RUSSIAN_CP866: + case ISO_8859_13: + case GBK: + case GB18030: + case BIG5_HKSCS: + case MACINTOSH_ROMAN: + return true; + default: + return false; + } +} + +// To be an 8-bit encoding means that there are fewer than 256 symbols. +// Each byte determines a new character; there are no multi-byte sequences. + +// TODO: This list could maybe be expanded. Other encodings may be 8-bit. +bool Is8BitEncoding(Encoding e) { + switch (e) { + case ASCII_7BIT: + case ISO_8859_1: + case ISO_8859_2: + case ISO_8859_3: + case ISO_8859_4: + case ISO_8859_5: + case ISO_8859_6: + case ISO_8859_7: + case ISO_8859_8: + case ISO_8859_8_I: + case ISO_8859_9: + case ISO_8859_10: + case ISO_8859_11: + case ISO_8859_13: + case ISO_8859_15: + case MSFT_CP1252: + case MSFT_CP1253: + case MSFT_CP1254: + case MSFT_CP1255: + case MSFT_CP1256: + case MSFT_CP1257: + case RUSSIAN_KOI8_R: + case RUSSIAN_KOI8_RU: + case RUSSIAN_CP866: + return true; + default: + return false; + } +} + +bool IsCJKEncoding(Encoding e) { + switch (e) { + case JAPANESE_EUC_JP: + case JAPANESE_SHIFT_JIS: + case JAPANESE_JIS: + case CHINESE_BIG5: + case CHINESE_GB: + case CHINESE_EUC_CN: + case KOREAN_EUC_KR: + case CHINESE_EUC_DEC: + case CHINESE_CNS: + case CHINESE_BIG5_CP950: + case JAPANESE_CP932: + case ISO_2022_KR: + case GBK: + case GB18030: + case BIG5_HKSCS: + case ISO_2022_CN: + case HZ_GB_2312: + return true; + default: + return false; + } +} + +bool IsHebrewEncoding(Encoding e) { + return (e == ISO_8859_8 || + e == ISO_8859_8_I || + e == MSFT_CP1255 || + e == HEBREW_VISUAL); +} + + + +bool IsRightToLeftEncoding(Encoding enc) { + switch (enc) { + case MSFT_CP1255: + case MSFT_CP1256: + case ARABIC_ENCODING: + case HEBREW_ENCODING: + case ISO_8859_8_I: + case HEBREW_VISUAL: + return true; + default: + return false; + } +} + +bool IsLogicalRightToLeftEncoding(Encoding enc) { + return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc); +} + +// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6) +// is NOT visual. +bool IsVisualRightToLeftEncoding(Encoding enc) { + switch (enc) { + case HEBREW_ENCODING: + case HEBREW_VISUAL: + return true; + default: + return false; + } +} + + + + + +bool IsIso2022Encoding(Encoding enc) { + return (IsIso2022JpOrVariant(enc) || + enc == ISO_2022_KR || + enc == ISO_2022_CN); +} + +bool IsIso2022JpOrVariant(Encoding enc) { + return (enc == JAPANESE_JIS || + enc == KDDI_ISO_2022_JP || + enc == SOFTBANK_ISO_2022_JP); +} + +bool IsShiftJisOrVariant(Encoding enc) { + return (enc == JAPANESE_SHIFT_JIS || + enc == JAPANESE_CP932 || + enc == KDDI_SHIFT_JIS || + enc == DOCOMO_SHIFT_JIS || + enc == SOFTBANK_SHIFT_JIS); +} + +bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) { + return (enc == KDDI_ISO_2022_JP || + enc == KDDI_SHIFT_JIS || + enc == DOCOMO_SHIFT_JIS || + enc == SOFTBANK_SHIFT_JIS || + enc == SOFTBANK_ISO_2022_JP); +} + + +// ************************************************************* +// ENCODING NAMES +// EncodingName() [Encoding to name] +// MimeEncodingName() [Encoding to name] +// EncodingFromName() [name to Encoding] +// EncodingNameAliasToEncoding() [name to Encoding] +// default_encoding_name() +// invalid_encoding_name() +// ************************************************************* + +const char * EncodingName(const Encoding enc) { + if ( (enc < 0) || (enc >= kNumEncodings) ) + return invalid_encoding_name(); + return kEncodingInfoTable[enc].encoding_name_; +} + +// TODO: Unify MimeEncodingName and EncodingName, or determine why +// such a unification is not possible. + +const char * MimeEncodingName(Encoding enc) { + if ( (enc < 0) || (enc >= kNumEncodings) ) + return ""; // TODO: Should this be invalid_encoding_name()? + return kEncodingInfoTable[enc].mime_encoding_name_; +} + +bool EncodingFromName(const char* enc_name, Encoding *encoding) { + *encoding = UNKNOWN_ENCODING; + if ( enc_name == NULL ) return false; + + for ( int i = 0; i < kNumEncodings; i++ ) { + if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) { + *encoding = static_cast<Encoding>(i); + return true; + } + } + return false; +} + +// The encoding_map maps standard and non-standard encoding-names +// (strings) to Encoding enums. It is used only by +// EncodingNameAliasToEncoding. Note that the map uses +// case-insensitive hash and comparison functions. + +typedef std::unordered_map<const char *, Encoding, + CStringAlnumCaseHash, + CStringAlnumCaseEqual> EncodingMap; + +static const EncodingMap& GetEncodingMap() { + static EncodingMap encoding_map; + if (!encoding_map.empty()) { + // Already initialized + return encoding_map; + } + + // Initialize the map with all the "standard" encoding names, + // i.e., the ones returned by EncodingName and MimeEncodingName. + // + // First, add internal encoding names returned by EncodingName(). + for (int i = 0; i < NUM_ENCODINGS; ++i) { + Encoding e = static_cast<Encoding>(i); + // Internal encoding names must be unique. + // The internal names are guaranteed to be unique by the CHECK_EQ. + const char *encoding_name = EncodingName(e); + // CHECK_EQ(0, encoding_map.count(encoding_name)) + // << "Duplicate found for " << encoding_name; + encoding_map[encoding_name] = e; + } + // Then, add mime encoding names returned by MimeEncodingName(). + // We don't override existing entries, to give precedence to entries + // added earlier. + for (int i = 0; i < NUM_ENCODINGS; ++i) { + Encoding e = static_cast<Encoding>(i); + // Note that MimeEncodingName() can return the same mime encoding + // name for different encoding enums like JAPANESE_SHIFT_JIS and + // KDDI_SHIFT_JIS. In that case, the encoding enum first seen + // will be the value for the encoding name in the map. + const char *mime_encoding_name = MimeEncodingName(e); + if (encoding_map.count(mime_encoding_name) == 0) { + encoding_map[mime_encoding_name] = e; + } + } + + // Add some non-standard names: alternate spellings, common typos, + // etc. (It does no harm to add names already in the map.) Note + // that although the map is case-insensitive, by convention the + // keys are written here in lower case. For ease of maintenance, + // they are listed in alphabetical order. + encoding_map["5601"] = KOREAN_EUC_KR; + encoding_map["646"] = ASCII_7BIT; + encoding_map["852"] = CZECH_CP852; + encoding_map["866"] = RUSSIAN_CP866; + encoding_map["8859-1"] = ISO_8859_1; + encoding_map["ansi-1251"] = RUSSIAN_CP1251; + encoding_map["ansi_x3.4-1968"] = ASCII_7BIT; + encoding_map["arabic"] = ISO_8859_6; + encoding_map["ascii"] = ISO_8859_1; + encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard + encoding_map["asmo-708"] = ISO_8859_6; + encoding_map["bhaskar"] = BHASKAR; + encoding_map["big5"] = CHINESE_BIG5; + encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard + encoding_map["big5-hkscs"] = BIG5_HKSCS; + encoding_map["chinese"] = CHINESE_GB; + encoding_map["cns"] = CHINESE_CNS; // not iana standard + encoding_map["cns11643"] = CHINESE_CNS; + encoding_map["cp1250"] = MSFT_CP1250; // not iana standard + encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard + encoding_map["cp1252"] = MSFT_CP1252; // not iana standard + encoding_map["cp1253"] = MSFT_CP1253; // not iana standard + encoding_map["cp1254"] = MSFT_CP1254; // not iana standard + encoding_map["cp1255"] = MSFT_CP1255; + encoding_map["cp1256"] = MSFT_CP1256; + encoding_map["cp1257"] = MSFT_CP1257; // not iana standard + encoding_map["cp819"] = ISO_8859_1; + encoding_map["cp852"] = CZECH_CP852; + encoding_map["cp866"] = RUSSIAN_CP866; + encoding_map["cp-866"] = RUSSIAN_CP866; + encoding_map["cp874"] = MSFT_CP874; + encoding_map["cp932"] = JAPANESE_CP932; // not iana standard + encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard + encoding_map["csbig5"] = CHINESE_BIG5; + encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP; + encoding_map["cseuckr"] = KOREAN_EUC_KR; + encoding_map["csgb2312"] = CHINESE_GB; + encoding_map["csibm852"] = CZECH_CP852; + encoding_map["csibm866"] = RUSSIAN_CP866; + encoding_map["csiso2022jp"] = JAPANESE_JIS; + encoding_map["csiso2022kr"] = ISO_2022_KR; + encoding_map["csiso58gb231280"] = CHINESE_GB; + encoding_map["csiso88598i"] = ISO_8859_8_I; + encoding_map["csisolatin1"] = ISO_8859_1; + encoding_map["csisolatin2"] = ISO_8859_2; + encoding_map["csisolatin3"] = ISO_8859_3; + encoding_map["csisolatin4"] = ISO_8859_4; + encoding_map["csisolatin5"] = ISO_8859_9; + encoding_map["csisolatin6"] = ISO_8859_10; + encoding_map["csisolatinarabic"] = ISO_8859_6; + encoding_map["csisolatincyrillic"] = ISO_8859_5; + encoding_map["csisolatingreek"] = ISO_8859_7; + encoding_map["csisolatinhebrew"] = ISO_8859_8; + encoding_map["csksc56011987"] = KOREAN_EUC_KR; + encoding_map["csmacintosh"] = MACINTOSH_ROMAN; + encoding_map["csn-369103"] = CZECH_CSN_369103; + encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS; + encoding_map["csunicode"] = UTF16BE; + encoding_map["csunicode11"] = UTF16BE; + encoding_map["csunicode11utf7"] = UTF7; + encoding_map["csunicodeascii"] = UTF16BE; + encoding_map["csunicodelatin1"] = UTF16BE; + encoding_map["cyrillic"] = ISO_8859_5; + encoding_map["ecma-114"] = ISO_8859_6; + encoding_map["ecma-118"] = ISO_8859_7; + encoding_map["elot_928"] = ISO_8859_7; + encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard + encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard + encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard + encoding_map["euc-jp"] = JAPANESE_EUC_JP; + encoding_map["euc-kr"] = KOREAN_EUC_KR; + encoding_map["eucgb2312_cn"] = CHINESE_GB; + encoding_map["gb"] = CHINESE_GB; // not iana standard + encoding_map["gb18030"] = GB18030; + encoding_map["gb2132"] = CHINESE_GB; // common typo + encoding_map["gb2312"] = CHINESE_GB; + encoding_map["gb_2312-80"] = CHINESE_GB; + encoding_map["gbk"] = GBK; + encoding_map["greek"] = ISO_8859_7; + encoding_map["greek8"] = ISO_8859_7; + encoding_map["hebrew"] = ISO_8859_8; + encoding_map["htchanakya"] = HTCHANAKYA; + encoding_map["hz-gb-2312"] = HZ_GB_2312; + encoding_map["ibm819"] = ISO_8859_1; + encoding_map["ibm852"] = CZECH_CP852; + encoding_map["ibm874"] = MSFT_CP874; + encoding_map["iso-10646"] = UTF16BE; + encoding_map["iso-10646-j-1"] = UTF16BE; + encoding_map["iso-10646-ucs-2"] = UNICODE; + encoding_map["iso-10646-ucs-4"] = UTF32BE; + encoding_map["iso-10646-ucs-basic"] = UTF16BE; + encoding_map["iso-10646-unicode-latin1"] = UTF16BE; + encoding_map["iso-2022-cn"] = ISO_2022_CN; + encoding_map["iso-2022-jp"] = JAPANESE_JIS; + encoding_map["iso-2022-kr"] = ISO_2022_KR; + encoding_map["iso-8559-1"] = ISO_8859_1; // common typo + encoding_map["iso-874"] = MSFT_CP874; + encoding_map["iso-8858-1"] = ISO_8859_1; // common typo + // iso-8859-0 was a temporary name, eventually renamed iso-8859-15 + encoding_map["iso-8859-0"] = ISO_8859_15; + encoding_map["iso-8859-1"] = ISO_8859_1; + encoding_map["iso-8859-10"] = ISO_8859_10; + encoding_map["iso-8859-11"] = ISO_8859_11; + encoding_map["iso-8859-13"] = ISO_8859_13; + encoding_map["iso-8859-15"] = ISO_8859_15; + encoding_map["iso-8859-2"] = ISO_8859_2; + encoding_map["iso-8859-3"] = ISO_8859_3; + encoding_map["iso-8859-4"] = ISO_8859_4; + encoding_map["iso-8859-5"] = ISO_8859_5; + encoding_map["iso-8859-6"] = ISO_8859_6; + encoding_map["iso-8859-7"] = ISO_8859_7; + encoding_map["iso-8859-8"] = ISO_8859_8; + encoding_map["iso-8859-8-i"] = ISO_8859_8_I; + encoding_map["iso-8859-9"] = ISO_8859_9; + encoding_map["iso-9959-1"] = ISO_8859_1; // common typo + encoding_map["iso-ir-100"] = ISO_8859_1; + encoding_map["iso-ir-101"] = ISO_8859_2; + encoding_map["iso-ir-109"] = ISO_8859_3; + encoding_map["iso-ir-110"] = ISO_8859_4; + encoding_map["iso-ir-126"] = ISO_8859_7; + encoding_map["iso-ir-127"] = ISO_8859_6; + encoding_map["iso-ir-138"] = ISO_8859_8; + encoding_map["iso-ir-144"] = ISO_8859_5; + encoding_map["iso-ir-148"] = ISO_8859_9; + encoding_map["iso-ir-149"] = KOREAN_EUC_KR; + encoding_map["iso-ir-157"] = ISO_8859_10; + encoding_map["iso-ir-58"] = CHINESE_GB; + encoding_map["iso-latin-1"] = ISO_8859_1; + encoding_map["iso_2022-cn"] = ISO_2022_CN; + encoding_map["iso_2022-kr"] = ISO_2022_KR; + encoding_map["iso_8859-1"] = ISO_8859_1; + encoding_map["iso_8859-10:1992"] = ISO_8859_10; + encoding_map["iso_8859-11"] = ISO_8859_11; + encoding_map["iso_8859-13"] = ISO_8859_13; + encoding_map["iso_8859-15"] = ISO_8859_15; + encoding_map["iso_8859-1:1987"] = ISO_8859_1; + encoding_map["iso_8859-2"] = ISO_8859_2; + encoding_map["iso_8859-2:1987"] = ISO_8859_2; + encoding_map["iso_8859-3"] = ISO_8859_3; + encoding_map["iso_8859-3:1988"] = ISO_8859_3; + encoding_map["iso_8859-4"] = ISO_8859_4; + encoding_map["iso_8859-4:1988"] = ISO_8859_4; + encoding_map["iso_8859-5"] = ISO_8859_5; + encoding_map["iso_8859-5:1988"] = ISO_8859_5; + encoding_map["iso_8859-6"] = ISO_8859_6; + encoding_map["iso_8859-6:1987"] = ISO_8859_6; + encoding_map["iso_8859-7"] = ISO_8859_7; + encoding_map["iso_8859-7:1987"] = ISO_8859_7; + encoding_map["iso_8859-8"] = ISO_8859_8; + encoding_map["iso_8859-8:1988:"] = ISO_8859_8; + encoding_map["iso_8859-9"] = ISO_8859_9; + encoding_map["iso_8859-9:1989"] = ISO_8859_9; + encoding_map["jagran"] = JAGRAN; + encoding_map["jis"] = JAPANESE_JIS; // not iana standard + encoding_map["koi8-cs"] = CZECH_CSN_369103; + encoding_map["koi8-r"] = RUSSIAN_KOI8_R; + encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard + encoding_map["koi8-u"] = RUSSIAN_KOI8_RU; + encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard + encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard + encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant + encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard + encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard + encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR; + encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard + encoding_map["l1"] = ISO_8859_1; + encoding_map["l2"] = ISO_8859_2; + encoding_map["l3"] = ISO_8859_3; + encoding_map["l4"] = ISO_8859_4; + encoding_map["l5"] = ISO_8859_9; + encoding_map["l6"] = ISO_8859_10; + encoding_map["latin-1"] = ISO_8859_1; // not iana standard + encoding_map["latin1"] = ISO_8859_1; + encoding_map["latin2"] = ISO_8859_2; + encoding_map["latin3"] = ISO_8859_3; + encoding_map["latin4"] = ISO_8859_4; + encoding_map["latin5"] = ISO_8859_9; + encoding_map["latin6"] = ISO_8859_10; + encoding_map["mac"] = MACINTOSH_ROMAN; + encoding_map["macintosh"] = MACINTOSH_ROMAN; + encoding_map["macintosh-roman"] = MACINTOSH_ROMAN; + encoding_map["ms932"] = JAPANESE_CP932; // not iana standard + encoding_map["ms_kanji"] = JAPANESE_CP932; + encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS; + encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS; + encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard + encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard + encoding_map["sun_eu_greek"] = ISO_8859_7; + encoding_map["tab"] = TAMIL_BI; + encoding_map["tam"] = TAMIL_MONO; + encoding_map["tis-620"] = ISO_8859_11; + encoding_map["tscii"] = TSCII; + encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard + encoding_map["unicode"] = UNICODE; // not iana standard + encoding_map["unicode-1-1-utf-7"] = UTF7; + encoding_map["unicode-1-1-utf-8"] = UTF8; + encoding_map["unicode-2-0-utf-7"] = UTF7; + encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard + encoding_map["us"] = ISO_8859_1; + encoding_map["us-ascii"] = ISO_8859_1; + encoding_map["utf-16be"] = UTF16BE; + encoding_map["utf-16le"] = UTF16LE; + encoding_map["utf-32be"] = UTF32BE; + encoding_map["utf-32le"] = UTF32LE; + encoding_map["utf-7"] = UTF7; + encoding_map["utf-8"] = UTF8; + encoding_map["utf7"] = UTF7; + encoding_map["utf8"] = UTF8; // not iana standard + encoding_map["visual"] = HEBREW_VISUAL; + encoding_map["win-1250"] = MSFT_CP1250; // not iana standard + encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard + encoding_map["window-874"] = MSFT_CP874; + encoding_map["windows-1250"] = MSFT_CP1250; + encoding_map["windows-1251"] = RUSSIAN_CP1251; + encoding_map["windows-1252"] = MSFT_CP1252; + encoding_map["windows-1253"] = MSFT_CP1253; + encoding_map["windows-1254"] = MSFT_CP1254; + encoding_map["windows-1255"] = MSFT_CP1255; + encoding_map["windows-1256"] = MSFT_CP1256; + encoding_map["windows-1257"] = MSFT_CP1257; + encoding_map["windows-31j"] = JAPANESE_CP932; + encoding_map["windows-874"] = MSFT_CP874; + encoding_map["windows-936"] = GBK; + encoding_map["x-big5"] = CHINESE_BIG5; + encoding_map["x-binaryenc"] = BINARYENC; // not iana standard + encoding_map["x-cp1250"] = MSFT_CP1250; + encoding_map["x-cp1251"] = RUSSIAN_CP1251; + encoding_map["x-cp1252"] = MSFT_CP1252; + encoding_map["x-cp1253"] = MSFT_CP1253; + encoding_map["x-cp1254"] = MSFT_CP1254; + encoding_map["x-cp1255"] = MSFT_CP1255; + encoding_map["x-cp1256"] = MSFT_CP1256; + encoding_map["x-cp1257"] = MSFT_CP1257; + encoding_map["x-euc-jp"] = JAPANESE_EUC_JP; + encoding_map["x-euc-tw"] = CHINESE_CNS; + encoding_map["x-gbk"] = GBK; + encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE; + encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE; + encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE; + encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE; + encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard + encoding_map["x-mac-roman"] = MACINTOSH_ROMAN; + encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard + encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS; + encoding_map["x-unicode-2-0-utf-7"] = UTF7; + encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard + encoding_map["x-x-big5"] = CHINESE_BIG5; + encoding_map["zh_cn.euc"] = CHINESE_GB; + encoding_map["zh_tw-big5"] = CHINESE_BIG5; + encoding_map["zh_tw-euc"] = CHINESE_CNS; + + // Remove they entry for the empty string, if any. + encoding_map.erase(""); + + return encoding_map; +} + +// ---------------------------------------------------------------------- +// EncodingNameAliasToEncoding() +// +// This function takes an encoding name/alias and returns the Encoding +// enum. The input is case insensitive. It is the union of the common +// IANA standard names, the charset names used in Netscape Navigator, +// and some common names we have been using. +// See: http://www.iana.org/assignments/character-sets +// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html +// +// UNKNOWN_ENCODING is returned if none matches. +// +// TODO: Check if it is possible to remove the non-standard, +// non-netscape-use names. It is because this routine is used for +// encoding detections from html meta info. Non-standard names may +// introduce noise on encoding detection. +// +// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName, +// or determine why such a unification is not possible. +// ---------------------------------------------------------------------- +Encoding EncodingNameAliasToEncoding(const char *encoding_name) { + if (!encoding_name) { + return UNKNOWN_ENCODING; + } + + const EncodingMap& encoding_map = GetEncodingMap(); + + EncodingMap::const_iterator emi = encoding_map.find(encoding_name); + if (emi != encoding_map.end()) { + return emi->second; + } else { + return UNKNOWN_ENCODING; + } +} + +const char* default_encoding_name() { + return kEncodingInfoTable[LATIN1].encoding_name_; +} + +static const char* const kInvalidEncodingName = "invalid_encoding"; + +const char *invalid_encoding_name() { + return kInvalidEncodingName; +} + + + +// ************************************************************* +// Miscellany +// ************************************************************* + + +Encoding PreferredWebOutputEncoding(Encoding enc) { + return IsValidEncoding(enc) + ? kEncodingInfoTable[enc].preferred_web_output_encoding_ + : UTF8; +} diff --git a/contrib/google-ced/util/encodings/encodings.h b/contrib/google-ced/util/encodings/encodings.h new file mode 100644 index 0000000..6477974 --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.h @@ -0,0 +1,299 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_ENCODINGS_ENCODINGS_H_ +#define UTIL_ENCODINGS_ENCODINGS_H_ + +// This interface defines the Encoding enum and various functions that +// depend only on Encoding values. + +// A hash-function for Encoding, hash<Encoding>, is defined in +// i18n/encodings/public/encodings-hash.h + +// On some Windows projects, UNICODE may be defined, which would prevent the +// Encoding enum below from compiling. Note that this is a quick fix that does +// not break any existing projects. The UNICODE enum may someday be changed +// to something more specific and non-colliding, but this involves careful +// testing of changes in many other projects. +#undef UNICODE + +// NOTE: The Encoding enum must always start at 0. This assumption has +// been made and used. + +#ifndef SWIG + +#include "util/encodings/encodings.pb.h" + +#else + +// TODO: Include a SWIG workaround header file. + +#endif + +const int kNumEncodings = NUM_ENCODINGS; + +// some of the popular encoding aliases +// TODO: Make these static const Encoding values instead of macros. +#define LATIN1 ISO_8859_1 +#define LATIN2 ISO_8859_2 +#define LATIN3 ISO_8859_3 +#define LATIN4 ISO_8859_4 +#define CYRILLIC ISO_8859_5 +#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language +#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language +#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language +#define LATIN5 ISO_8859_9 +#define LATIN6 ISO_8859_10 +#define KOREAN_HANGUL KOREAN_EUC_KR + +// The default Encoding (LATIN1). +Encoding default_encoding(); + + + +// ************************************************************* +// Encoding predicates +// IsValidEncoding() +// IsEncEncCompatible +// IsSupersetOfAscii7Bit +// Is8BitEncoding +// IsCJKEncoding +// IsHebrewEncoding +// IsRightToLeftEncoding +// IsLogicalRightToLeftEncoding +// IsVisualRightToLeftEncoding +// IsIso2022Encoding +// IsIso2022JpOrVariant +// IsShiftJisOrVariant +// IsJapaneseCellPhoneCarrierSpecificEncoding +// ************************************************************* + +// IsValidEncoding +// =================================== +// +// Function to check if the input language enum is within range. +// + +bool IsValidEncoding(Encoding enc); + +// +// IsEncEncCompatible +// ------------------ +// +// This function is to determine whether or not converting from the +// first encoding to the second requires any changes to the underlying +// text (e.g. ASCII_7BIT is a subset of UTF8). +// +// TODO: the current implementation is likely incomplete. It would be +// good to consider the full matrix of all pairs of encodings and to fish out +// all compatible pairs. +// +bool IsEncEncCompatible(const Encoding from, const Encoding to); + +// To be a superset of 7-bit Ascii means that bytes 0...127 in the given +// encoding represent the same characters as they do in ISO_8859_1. + +// WARNING: This function does not currently return true for all encodings that +// are supersets of Ascii 7-bit. +bool IsSupersetOfAscii7Bit(Encoding e); + +// To be an 8-bit encoding means that there are fewer than 256 symbols. +// Each byte determines a new character; there are no multi-byte sequences. + +// WARNING: This function does not currently return true for all encodings that +// are 8-bit encodings. +bool Is8BitEncoding(Encoding e); + +// IsCJKEncoding +// ------------- +// +// This function returns true if the encoding is either Chinese +// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not +// considered a CJK encoding. +bool IsCJKEncoding(Encoding e); + +// IsHebrewEncoding +// ------------- +// +// This function returns true if the encoding is a Hebrew specific +// encoding (not UTF8, etc). +bool IsHebrewEncoding(Encoding e); + +// IsRightToLeftEncoding +// --------------------- +// +// Returns true if the encoding is a right-to-left encoding. +// +// Note that the name of this function is somewhat misleading. There is nothing +// "right to left" about these encodings. They merely contain code points for +// characters in RTL languages such as Hebrew and Arabic. But this is also +// true for UTF-8. +// +// TODO: Get rid of this function. The only special-case we +// should need to worry about are visual encodings. Anything we +// need to do for all 'RTL' encodings we need to do for UTF-8 as well. +bool IsRightToLeftEncoding(Encoding enc); + +// IsLogicalRightToLeftEncoding +// ---------------------------- +// +// Returns true if the encoding is a logical right-to-left encoding. +// Logical right-to-left encodings are those that the browser renders +// right-to-left and applies the BiDi algorithm to. Therefore the characters +// appear in reading order in the file, and indexing, snippet generation etc. +// should all just work with no special processing. +// +// TODO: Get rid of this function. The only special-case we +// should need to worry about are visual encodings. +bool IsLogicalRightToLeftEncoding(Encoding enc); + +// IsVisualRightToLeftEncoding +// --------------------------- +// +// Returns true if the encoding is a visual right-to-left encoding. +// Visual right-to-left encodings are those that the browser renders +// left-to-right and does not apply the BiDi algorithm to. Therefore each +// line appears in reverse order in the file, lines are manually wrapped +// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of +// the prehistoric days when browsers couldn't render right-to-left, but +// unfortunately some visual pages persist to this day. These documents require +// special processing so that we don't index or snippet them with each line +// reversed. +bool IsVisualRightToLeftEncoding(Encoding enc); + +// IsIso2022Encoding +// ----------------- +// +// Returns true if the encoding is a kind of ISO 2022 such as +// ISO-2022-JP. +bool IsIso2022Encoding(Encoding enc); + +// IsIso2022JpOrVariant +// -------------------- +// +// Returns true if the encoding is ISO-2022-JP or a variant such as +// KDDI's ISO-2022-JP. +bool IsIso2022JpOrVariant(Encoding enc); + +// IsShiftJisOrVariant +// -------------------- +// +// Returns true if the encoding is Shift_JIS or a variant such as +// KDDI's Shift_JIS. +bool IsShiftJisOrVariant(Encoding enc); + +// IsJapanesCellPhoneCarrierSpecificEncoding +// ----------------------------------------- +// +// Returns true if it's Japanese cell phone carrier specific encoding +// such as KDDI_SHIFT_JIS. +bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc); + + + +// ************************************************************* +// ENCODING NAMES +// +// This interface defines a standard name for each valid encoding, and +// a standard name for invalid encodings. (Some names use all upper +// case, but others use mixed case.) +// +// EncodingName() [Encoding to name] +// MimeEncodingName() [Encoding to name] +// EncodingFromName() [name to Encoding] +// EncodingNameAliasToEncoding() [name to Encoding] +// default_encoding_name() +// invalid_encoding_name() +// ************************************************************* + +// EncodingName +// ------------ +// +// Given the encoding, returns its standard name. +// Return invalid_encoding_name() if the encoding is invalid. +// +const char* EncodingName(Encoding enc); + +// +// MimeEncodingName +// ---------------- +// +// Return the "preferred MIME name" of an encoding. +// +// This name is suitable for using in HTTP headers, HTML tags, +// and as the "charset" parameter of a MIME Content-Type. +const char* MimeEncodingName(Encoding enc); + + +// The maximum length of an encoding name +const int kMaxEncodingNameSize = 50; + +// The standard name of the default encoding. +const char* default_encoding_name(); + +// The name used for an invalid encoding. +const char* invalid_encoding_name(); + +// EncodingFromName +// ---------------- +// +// If enc_name matches the standard name of an Encoding, using a +// case-insensitive comparison, set *encoding to that Encoding and +// return true. Otherwise set *encoding to UNKNOWN_ENCODING and +// return false. +// +// REQUIRES: encoding must not be NULL. +// +bool EncodingFromName(const char* enc_name, Encoding *encoding); + +// +// EncodingNameAliasToEncoding +// --------------------------- +// +// If enc_name matches the standard name or an alias of an Encoding, +// using a case-insensitive comparison, return that +// Encoding. Otherwise, return UNKNOWN_ENCODING. +// +// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for +// GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and +// common variations with hyphens and underscores (e.g., "koi8-u" and +// "koi8u" for RUSSIAN_KOI8_R). + +Encoding EncodingNameAliasToEncoding(const char *enc_name); + +// ************************************************************* +// Miscellany +// ************************************************************* + +// PreferredWebOutputEncoding +// -------------------------- +// +// Some multi-byte encodings use byte values that coincide with the +// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE +// can misinterpret these, as indicated in an external XSS report from +// 2007-02-15. Here, we map these dangerous encodings to safer ones. We +// also use UTF8 instead of encodings that we don't support in our +// output, and we generally try to be conservative in what we send out. +// Where the client asks for single- or double-byte encodings that are +// not as common, we substitute a more common single- or double-byte +// encoding, if there is one, thereby preserving the client's intent +// to use less space than UTF-8. This also means that characters +// outside the destination set will be converted to HTML NCRs (&#NNN;) +// if requested. +Encoding PreferredWebOutputEncoding(Encoding enc); + + +#endif // UTIL_ENCODINGS_ENCODINGS_H_ diff --git a/contrib/google-ced/util/encodings/encodings.pb.h b/contrib/google-ced/util/encodings/encodings.pb.h new file mode 100644 index 0000000..ffbd716 --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings.pb.h @@ -0,0 +1,181 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_ +#define UTIL_ENCODINGS_ENCODINGS_PB_H_ + +enum Encoding { + ISO_8859_1 = 0, // Teragram ASCII + ISO_8859_2 = 1, // Teragram Latin2 + ISO_8859_3 = 2, // in BasisTech but not in Teragram + ISO_8859_4 = 3, // Teragram Latin4 + ISO_8859_5 = 4, // Teragram ISO-8859-5 + ISO_8859_6 = 5, // Teragram Arabic + ISO_8859_7 = 6, // Teragram Greek + ISO_8859_8 = 7, // Teragram Hebrew + ISO_8859_9 = 8, // in BasisTech but not in Teragram + ISO_8859_10 = 9, // in BasisTech but not in Teragram + JAPANESE_EUC_JP = 10, // Teragram EUC_JP + JAPANESE_SHIFT_JIS = 11, // Teragram SJS + JAPANESE_JIS = 12, // Teragram JIS + CHINESE_BIG5 = 13, // Teragram BIG5 + CHINESE_GB = 14, // Teragram GB + CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that Teragram EUC-CN(!) + // See //i18n/basistech/basistech_encodings.h + KOREAN_EUC_KR = 16, // Teragram KSC + UNICODE = 17, // Teragram Unicode + CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that Teragram EUC. + CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that Teragram CNS. + CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950 + JAPANESE_CP932 = 21, // Teragram CP932 + UTF8 = 22, + UNKNOWN_ENCODING = 23, + ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127. + // Should be present only in the crawler + // and in the repository, + // *never* as a result of Document::encoding(). + RUSSIAN_KOI8_R = 25, // Teragram KOI8R + RUSSIAN_CP1251 = 26, // Teragram CP1251 + + //---------------------------------------------------------- + // These are _not_ output from teragram. Instead, they are as + // detected in the headers of usenet articles. + MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii + RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian. + // Misnamed, this is _not_ KOI8-RU but KOI8-U. + // KOI8-U is used much more often than KOI8-RU. + MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european + ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized + //---------------------------------------------------------- + + //---------------------------------------------------------- + // These are in BasisTech but not in Teragram. They are + // needed for new interface languages. Now detected by + // research langid + MSFT_CP1254 = 31, // used for Turkish + MSFT_CP1257 = 32, // used in Baltic countries + //---------------------------------------------------------- + + //---------------------------------------------------------- + //---------------------------------------------------------- + // New encodings detected by Teragram + ISO_8859_11 = 33, // aka TIS-620, used for Thai + MSFT_CP874 = 34, // used for Thai + MSFT_CP1256 = 35, // used for Arabic + + //---------------------------------------------------------- + // Detected as ISO_8859_8 by Teragram, but can be found in META tags + MSFT_CP1255 = 36, // Logical Hebrew Microsoft + ISO_8859_8_I = 37, // Iso Hebrew Logical + HEBREW_VISUAL = 38, // Iso Hebrew Visual + //---------------------------------------------------------- + + //---------------------------------------------------------- + // Detected by research langid + CZECH_CP852 = 39, + CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS + MSFT_CP1253 = 41, // used for Greek + RUSSIAN_CP866 = 42, + //---------------------------------------------------------- + + //---------------------------------------------------------- + // Handled by iconv in glibc + ISO_8859_13 = 43, + ISO_2022_KR = 44, + GBK = 45, + GB18030 = 46, + BIG5_HKSCS = 47, + ISO_2022_CN = 48, + + //----------------------------------------------------------- + // Detected by xin liu's detector + // Handled by transcoder + // (Indic encodings) + + TSCII = 49, + TAMIL_MONO = 50, + TAMIL_BI = 51, + JAGRAN = 52, + + + MACINTOSH_ROMAN = 53, + UTF7 = 54, + BHASKAR = 55, // Indic encoding - Devanagari + HTCHANAKYA = 56, // 56 Indic encoding - Devanagari + + //----------------------------------------------------------- + // These allow a single place (inputconverter and outputconverter) + // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8 + // bulk conversions, with interchange-valid checking on input and + // fallback if needed on ouput. + UTF16BE = 57, // big-endian UTF-16 + UTF16LE = 58, // little-endian UTF-16 + UTF32BE = 59, // big-endian UTF-32 + UTF32LE = 60, // little-endian UTF-32 + //----------------------------------------------------------- + + //----------------------------------------------------------- + // An encoding that means "This is not text, but it may have some + // simple ASCII text embedded". Intended input conversion (not yet + // implemented) is to keep strings of >=4 seven-bit ASCII characters + // (follow each kept string with an ASCII space), delete the rest of + // the bytes. This will pick up and allow indexing of e.g. captions + // in JPEGs. No output conversion needed. + BINARYENC = 61, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Some Web pages allow a mixture of HZ-GB and GB-2312 by using + // ~{ ... ~} for 2-byte pairs, and the browsers support this. + HZ_GB_2312 = 62, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Some external vendors make the common input error of + // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed. + UTF8UTF8 = 63, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Handled by transcoder for tamil language specific font + // encodings without the support for detection at present. + TAM_ELANGO = 64, // Elango - Tamil + TAM_LTTMBARANI = 65, // Barani - Tamil + TAM_SHREE = 66, // Shree - Tamil + TAM_TBOOMIS = 67, // TBoomis - Tamil + TAM_TMNEWS = 68, // TMNews - Tamil + TAM_WEBTAMIL = 69, // Webtamil - Tamil + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Shift_JIS variants used by Japanese cell phone carriers. + KDDI_SHIFT_JIS = 70, + DOCOMO_SHIFT_JIS = 71, + SOFTBANK_SHIFT_JIS = 72, + // ISO-2022-JP variants used by KDDI and SoftBank. + KDDI_ISO_2022_JP = 73, + SOFTBANK_ISO_2022_JP = 74, + //----------------------------------------------------------- + + NUM_ENCODINGS = 75, // Always keep this at the end. It is not a + // valid Encoding enum, it is only used to + // indicate the total number of Encodings. +}; + +#endif // UTIL_ENCODINGS_ENCODINGS_PB_H_ diff --git a/contrib/google-ced/util/encodings/encodings_unittest.cc b/contrib/google-ced/util/encodings/encodings_unittest.cc new file mode 100644 index 0000000..223e3e4 --- /dev/null +++ b/contrib/google-ced/util/encodings/encodings_unittest.cc @@ -0,0 +1,34 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "util/encodings/encodings.h" + +#include "gtest/gtest.h" + +TEST(EncodingsTest, EncodingNameAliasToEncoding) { + // Test that cases, non-alpha-numeric chars are ignored. + EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1")); + EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1")); + + // Test that spaces are ignored. + EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8")); + EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8")); + EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8")); + + // Test alphanumeric differences are counted. + EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7")); + EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp")); +} |