summaryrefslogtreecommitdiffstats
path: root/contrib/google-ced/util/encodings
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit 133a45c109da5310add55824db21af5239951f93 (patch)
tree ba6ac4c0a950a0dda56451944315d66409923918 /contrib/google-ced/util/encodings
parent Initial commit. (diff)
download rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1. upstream/3.8.1 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/google-ced/util/encodings')
-rw-r--r-- contrib/google-ced/util/encodings/encodings.cc 891
-rw-r--r-- contrib/google-ced/util/encodings/encodings.h 299
-rw-r--r-- contrib/google-ced/util/encodings/encodings.pb.h 181
-rw-r--r-- contrib/google-ced/util/encodings/encodings_unittest.cc 34
4 files changed, 1405 insertions, 0 deletions
diff --git a/contrib/google-ced/util/encodings/encodings.cc b/contrib/google-ced/util/encodings/encodings.cc
new file mode 100644
index 0000000..b5f8dc5
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings.cc
@@ -0,0 +1,891 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "util/encodings/encodings.h"
+
+#include <string.h> // for strcasecmp
+#include <unordered_map>
+#include <utility> // for pair
+
+#include "util/basictypes.h"
+#include "util/string_util.h"
+#include "util/case_insensitive_hash.h"
+
+struct EncodingInfo {  // One row of kEncodingInfoTable: names and output policy for one Encoding.
+ // The standard name for this encoding.
+ // Returned by EncodingName() and matched by EncodingFromName().
+ const char* encoding_name_;
+
+ // The "preferred MIME name" of an encoding as specified by the IANA at:
+ // http://www.iana.org/assignments/character-sets
+ //
+ // Note that the preferred MIME name may differ slightly from the
+ // official IANA name: e.g. ISO-8859-1 vs. ISO_8859-1:1987
+ // Returned by MimeEncodingName().
+ const char* mime_encoding_name_;
+
+ // It is an internal policy that if an encoding has an IANA name,
+ // then encoding_name_ and mime_encoding_name_ must be the same string.
+ //
+ // However, there can be exceptions if there are compelling reasons.
+ // For example, Japanese mobile handsets require the name
+ // "Shift_JIS" in charset=... parameter in Content-Type headers to
+ // process emoji (emoticons) in their private encodings. In that
+ // case, mime_encoding_name_ should be "Shift_JIS", even though
+ // encoding_name_ is actually "X-KDDI-Shift_JIS".
+
+ // Some multi-byte encodings use byte values that coincide with the
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
+ // can misinterpret these, as indicated in an external XSS report from
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
+ // also use UTF8 instead of encodings that we don't support in our
+ // output, and we generally try to be conservative in what we send out.
+ // Where the client asks for single- or double-byte encodings that are
+ // not as common, we substitute a more common single- or double-byte
+ // encoding, if there is one, thereby preserving the client's intent
+ // to use less space than UTF-8. This also means that characters
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
+ // if requested.
+ // Returned by PreferredWebOutputEncoding() for this encoding.
+ Encoding preferred_web_output_encoding_;
+};
+// Indexed by Encoding value: row i describes the encoding whose enum value is i.
+static const EncodingInfo kEncodingInfoTable[] = {
+ { "ASCII", "ISO-8859-1", ISO_8859_1},
+ { "Latin2", "ISO-8859-2", ISO_8859_2},
+ { "Latin3", "ISO-8859-3", UTF8},
+ // MSIE 6 does not support ISO-8859-3 (XSS issue)
+ { "Latin4", "ISO-8859-4", ISO_8859_4},
+ { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
+ { "Arabic", "ISO-8859-6", ISO_8859_6},
+ { "Greek", "ISO-8859-7", ISO_8859_7},
+ { "Hebrew", "ISO-8859-8", MSFT_CP1255},
+ // we do not endorse the visual order
+ { "Latin5", "ISO-8859-9", ISO_8859_9},
+ { "Latin6", "ISO-8859-10", UTF8},
+ // MSIE does not support ISO-8859-10 (XSS issue)
+ { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},
+ { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+ { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
+ // due to potential confusion with HTML syntax chars
+ { "BIG5", "Big5", CHINESE_BIG5},
+ { "GB", "GB2312", CHINESE_GB},
+ { "EUC-CN",
+ "EUC-CN",
+ // Misnamed. Should be EUC-TW.
+ CHINESE_BIG5},
+ // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
+ // and EUC-TW is rare, so we prefer Big5 for output.
+ { "KSC", "EUC-KR", KOREAN_EUC_KR},
+ { "Unicode",
+ "UTF-16LE",
+ // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
+ UTF8
+ // due to potential confusion with HTML syntax chars
+ },
+ { "EUC",
+ "EUC", // Misnamed. Should be EUC-TW.
+ CHINESE_BIG5
+ // MSIE does not recognize "EUC" (XSS issue),
+ // and EUC-TW is rare, so we prefer Big5 for output.
+ },
+ { "CNS",
+ "CNS", // Misnamed. Should be EUC-TW.
+ CHINESE_BIG5},
+ // MSIE does not recognize "CNS" (XSS issue),
+ // and EUC-TW is rare, so we prefer Big5 for output.
+ { "BIG5-CP950",
+ "BIG5-CP950", // Not an IANA name
+ CHINESE_BIG5
+ // MSIE does not recognize "BIG5-CP950" (XSS issue)
+ },
+ { "CP932", "CP932", // Not an IANA name
+ JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)
+ { "UTF8", "UTF-8", UTF8},
+ { "Unknown",
+ "x-unknown", // Not an IANA name
+ UTF8}, // UTF-8 is our default output encoding
+ { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
+ { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
+ { "CP1251", "windows-1251", RUSSIAN_CP1251},
+ { "CP1252", "windows-1252", MSFT_CP1252},
+ { "KOI8U",
+ "KOI8-U",
+ ISO_8859_5}, // because koi8-u is not as common
+ { "CP1250", "windows-1250", MSFT_CP1250},
+ { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
+ { "CP1254", "windows-1254", MSFT_CP1254},
+ { "CP1257", "windows-1257", MSFT_CP1257},
+ { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
+ { "CP874", "windows-874", MSFT_CP874},
+ { "CP1256", "windows-1256", MSFT_CP1256},
+ { "CP1255", "windows-1255", MSFT_CP1255},
+ { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
+ // Java does not support iso-8859-8-i
+ { "VISUAL", "ISO-8859-8", MSFT_CP1255},
+ // we do not endorse the visual order
+ { "CP852", "cp852", MSFT_CP1250},
+ // because cp852 is not as common
+ { "CSN_369103", "csn_369103", MSFT_CP1250},
+ // MSIE does not recognize "csn_369103" (XSS issue)
+ { "CP1253", "windows-1253", MSFT_CP1253},
+ { "CP866", "IBM866", RUSSIAN_CP1251},
+ // because cp866 is not as common
+ { "ISO-8859-13", "ISO-8859-13", UTF8},
+ // because iso-8859-13 is not widely supported
+ { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
+ // due to potential confusion with HTML syntax chars
+ { "GBK", "GBK", GBK},
+ { "GB18030", "GB18030", GBK},
+ // because gb18030 is not widely supported
+ { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
+ // because Big5-HKSCS is not widely supported
+ { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
+ // due to potential confusion with HTML syntax chars
+ { "TSCII", "tscii", UTF8},
+ // we do not have an output converter for this font encoding
+ { "TAM", "tam", UTF8},
+ // we do not have an output converter for this font encoding
+ { "TAB", "tab", UTF8},
+ // we do not have an output converter for this font encoding
+ { "JAGRAN", "jagran", UTF8},
+ // we do not have an output converter for this font encoding
+ { "MACINTOSH", "MACINTOSH", ISO_8859_1},
+ // because macintosh is relatively uncommon
+ { "UTF7", "UTF-7",
+ UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated
+ { "BHASKAR", "bhaskar",
+ UTF8}, // we do not have an output converter for this font encoding
+ { "HTCHANAKYA", "htchanakya", // not an IANA charset name.
+ UTF8}, // we do not have an output converter for this font encoding
+ { "UTF-16BE", "UTF-16BE",
+ UTF8}, // due to potential confusion with HTML syntax chars
+ { "UTF-16LE", "UTF-16LE",
+ UTF8}, // due to potential confusion with HTML syntax chars
+ { "UTF-32BE", "UTF-32BE",
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
+ { "UTF-32LE", "UTF-32LE",
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
+ { "X-BINARYENC", "x-binaryenc", // Not an IANA name
+ UTF8}, // because this one is not intended for output (just input)
+ { "HZ-GB-2312", "HZ-GB-2312",
+ CHINESE_GB}, // due to potential confusion with HTML syntax chars
+ { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name
+ UTF8}, // because this one is not intended for output (just input)
+ { "X-TAM-ELANGO", "x-tam-elango",
+ UTF8}, // we do not have an output converter for this font encoding
+ { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
+ UTF8}, // we do not have an output converter for this font encoding
+ { "X-TAM-SHREE", "x-tam-shree",
+ UTF8}, // we do not have an output converter for this font encoding
+ { "X-TAM-TBOOMIS", "x-tam-tboomis",
+ UTF8}, // we do not have an output converter for this font encoding
+ { "X-TAM-TMNEWS", "x-tam-tmnews",
+ UTF8}, // we do not have an output converter for this font encoding
+ { "X-TAM-WEBTAMIL", "x-tam-webtamil",
+ UTF8}, // we do not have an output converter for this font encoding
+
+ { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+ // KDDI version of Shift_JIS with Google Emoji PUA mappings.
+ // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
+ // "Shift_JIS" in HTTP headers and email messages.
+
+ { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+ // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
+ // See the comment at KDDI_SHIFT_JIS for other issues.
+
+ { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
+ // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
+ // See the comment at KDDI_SHIFT_JIS for other issues.
+
+ { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
+ // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
+ // See the comment at KDDI_SHIFT_JIS for other issues.
+ // The preferred Web encoding is due to potential confusion with
+ // HTML syntax chars.
+
+ { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
+ // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
+ // See the comment at KDDI_SHIFT_JIS for other issues.
+ // The preferred Web encoding is due to potential confusion with
+ // HTML syntax chars.
+
+ // Please refer to NOTE: section in the comments in the definition
+ // of "struct I18NInfoByEncoding", before adding new encodings.
+
+};
+
+
+// The table is indexed by Encoding value, so it must have exactly NUM_ENCODINGS rows.
+COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
+ kEncodingInfoTable_has_incorrect_size);
+
+// Returns the library-wide default encoding, LATIN1 (an alias for ISO_8859_1).
+Encoding default_encoding() {
+  return LATIN1;
+}
+
+// *************************************************************
+// Encoding predicates
+// IsValidEncoding()
+// IsEncEncCompatible
+// IsEncodingWithSupportedLanguage
+// IsSupersetOfAscii7Bit
+// Is8BitEncoding
+// IsCJKEncoding
+// IsHebrewEncoding
+// IsRightToLeftEncoding
+// IsLogicalRightToLeftEncoding
+// IsVisualRightToLeftEncoding
+// IsIso2022Encoding
+// IsIso2022JpOrVariant
+// IsShiftJisOrVariant
+// IsJapaneseCellPhoneCarrierSpecificEncoding
+// *************************************************************
+
+// Returns true iff enc lies in the half-open range [0, kNumEncodings).
+bool IsValidEncoding(Encoding enc) {
+  if (enc < 0) {
+    return false;
+  }
+  return enc < kNumEncodings;
+}
+
+// Tests compatibility between the "from" and "to" encodings. When both are
+// valid, known encodings the result is true iff converting text from the
+// first to the second is a no-op.
+bool IsEncEncCompatible(const Encoding from, const Encoding to) {
+  // Anything outside the enum range is incompatible with everything.
+  if (!IsValidEncoding(from) || !IsValidEncoding(to)) return false;
+
+  // Same encoding, or any valid encoding viewed as "unknown".
+  if (from == to || to == UNKNOWN_ENCODING) return true;
+
+  // An unknown source is not compatible with any known target.
+  if (from == UNKNOWN_ENCODING) return false;
+
+  // Remaining cases: the set of accepted targets depends on the source.
+  switch (from) {
+    case ASCII_7BIT:
+      return IsSupersetOfAscii7Bit(to);
+    case ISO_8859_1:
+      return to == MSFT_CP1252;
+    case ISO_8859_8:
+      return to == HEBREW_VISUAL;
+    case HEBREW_VISUAL:
+      return to == ISO_8859_8;
+    case ISO_8859_9:
+      return to == MSFT_CP1254;
+    case ISO_8859_11:
+      return to == MSFT_CP874;
+    case JAPANESE_SHIFT_JIS:
+      return to == JAPANESE_CP932;
+    case CHINESE_BIG5:
+      return to == CHINESE_BIG5_CP950;
+    case CHINESE_GB:
+      return to == GBK || to == GB18030;
+    // EUC-CN, EUC-DEC and CNS are mutually interchangeable.
+    case CHINESE_EUC_CN:
+      return to == CHINESE_EUC_DEC || to == CHINESE_CNS;
+    case CHINESE_EUC_DEC:
+      return to == CHINESE_EUC_CN || to == CHINESE_CNS;
+    case CHINESE_CNS:
+      return to == CHINESE_EUC_CN || to == CHINESE_EUC_DEC;
+    default:
+      return false;
+  }
+}
+
+// An encoding is a superset of 7-bit ASCII when bytes 0...127 in it
+// represent the same characters as they do in ISO_8859_1.
+
+// TODO: This list could be expanded. Many other encodings are supersets
+// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two
+// encodings that I know for a fact should *not* be in this list.
+bool IsSupersetOfAscii7Bit(Encoding e) {
+  bool is_superset = false;
+  switch (e) {
+    // ASCII itself, UTF-8 and the "unknown" placeholder.
+    case ASCII_7BIT: case UTF8: case UNKNOWN_ENCODING:
+    // ISO-8859 family (HEBREW_VISUAL shares the ISO-8859-8 MIME name).
+    case ISO_8859_1: case ISO_8859_2: case ISO_8859_3: case ISO_8859_4:
+    case ISO_8859_5: case ISO_8859_6: case ISO_8859_7: case ISO_8859_8:
+    case ISO_8859_8_I: case ISO_8859_9: case ISO_8859_10: case ISO_8859_11:
+    case ISO_8859_13: case ISO_8859_15: case HEBREW_VISUAL:
+    // Windows, DOS and Macintosh code pages.
+    case MSFT_CP1250: case RUSSIAN_CP1251: case MSFT_CP1252:
+    case MSFT_CP1253: case MSFT_CP1254: case MSFT_CP1255:
+    case MSFT_CP1256: case MSFT_CP1257: case MSFT_CP874:
+    case CZECH_CP852: case RUSSIAN_CP866: case MACINTOSH_ROMAN:
+    // KOI8 variants.
+    case RUSSIAN_KOI8_R: case RUSSIAN_KOI8_RU:
+    // Multi-byte CJK encodings.
+    case JAPANESE_EUC_JP: case JAPANESE_SHIFT_JIS: case JAPANESE_CP932:
+    case CHINESE_BIG5: case CHINESE_BIG5_CP950: case BIG5_HKSCS:
+    case CHINESE_GB: case GBK: case GB18030:
+    case CHINESE_EUC_CN: case CHINESE_EUC_DEC: case CHINESE_CNS:
+    case KOREAN_EUC_KR:
+      is_superset = true;
+      break;
+    default:
+      break;
+  }
+  return is_superset;
+}
+
+// An 8-bit encoding has at most 256 symbols: every byte stands for one
+// character and there are no multi-byte sequences.
+
+// TODO: This list could maybe be expanded. Other encodings may be 8-bit.
+// NOTE(review): MSFT_CP1250, RUSSIAN_CP1251 and MSFT_CP874 look single-byte
+// as well but are not listed here; confirm whether that is intentional.
+bool Is8BitEncoding(Encoding e) {
+  bool is_8bit = false;
+  switch (e) {
+    case ASCII_7BIT:
+    // ISO-8859 family.
+    case ISO_8859_1: case ISO_8859_2: case ISO_8859_3: case ISO_8859_4:
+    case ISO_8859_5: case ISO_8859_6: case ISO_8859_7: case ISO_8859_8:
+    case ISO_8859_8_I: case ISO_8859_9: case ISO_8859_10: case ISO_8859_11:
+    case ISO_8859_13: case ISO_8859_15:
+    // Windows code pages.
+    case MSFT_CP1252: case MSFT_CP1253: case MSFT_CP1254:
+    case MSFT_CP1255: case MSFT_CP1256: case MSFT_CP1257:
+    // Cyrillic.
+    case RUSSIAN_KOI8_R: case RUSSIAN_KOI8_RU: case RUSSIAN_CP866:
+      is_8bit = true;
+      break;
+    default:
+      break;
+  }
+  return is_8bit;
+}
+
+// Returns true for the Chinese, Japanese and Korean encodings listed below.
+bool IsCJKEncoding(Encoding e) {
+  bool is_cjk = false;
+  switch (e) {
+    // Japanese.
+    case JAPANESE_EUC_JP: case JAPANESE_SHIFT_JIS:
+    case JAPANESE_JIS: case JAPANESE_CP932:
+    // Chinese.
+    case CHINESE_BIG5: case CHINESE_BIG5_CP950: case BIG5_HKSCS:
+    case CHINESE_GB: case GBK: case GB18030:
+    case CHINESE_EUC_CN: case CHINESE_EUC_DEC: case CHINESE_CNS:
+    case ISO_2022_CN: case HZ_GB_2312:
+    // Korean.
+    case KOREAN_EUC_KR: case ISO_2022_KR:
+      is_cjk = true;
+      break;
+    default:
+      break;
+  }
+  return is_cjk;
+}
+
+// Returns true for the four Hebrew encodings: ISO-8859-8, ISO-8859-8-I,
+// CP1255 and the visual-order variant.
+bool IsHebrewEncoding(Encoding e) {
+  switch (e) {
+    case ISO_8859_8:
+    case ISO_8859_8_I:
+    case MSFT_CP1255:
+    case HEBREW_VISUAL:
+      return true;
+    default:
+      return false;
+  }
+}
+
+
+
+// Returns true for the Hebrew and Arabic encodings listed below, whether
+// they store text in logical or visual order.
+bool IsRightToLeftEncoding(Encoding enc) {
+  const bool is_arabic = (enc == MSFT_CP1256 || enc == ARABIC_ENCODING);
+  const bool is_hebrew = (enc == MSFT_CP1255 ||
+                          enc == HEBREW_ENCODING ||
+                          enc == ISO_8859_8_I ||
+                          enc == HEBREW_VISUAL);
+  return is_arabic || is_hebrew;
+}
+
+// A right-to-left encoding is "logical" when it is not one of the
+// visual-order ones.
+bool IsLogicalRightToLeftEncoding(Encoding enc) {
+  if (!IsRightToLeftEncoding(enc)) {
+    return false;
+  }
+  return !IsVisualRightToLeftEncoding(enc);
+}
+
+// Only the two Hebrew encodings below count as visual order. Note that,
+// despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6) is NOT visual.
+bool IsVisualRightToLeftEncoding(Encoding enc) {
+  return enc == HEBREW_ENCODING || enc == HEBREW_VISUAL;
+}
+
+
+
+
+
+// Returns true for ISO-2022-JP (and its carrier variants), ISO-2022-KR and
+// ISO-2022-CN.
+bool IsIso2022Encoding(Encoding enc) {
+  if (IsIso2022JpOrVariant(enc)) {
+    return true;
+  }
+  return enc == ISO_2022_KR || enc == ISO_2022_CN;
+}
+
+// Returns true for plain ISO-2022-JP and for the KDDI and SoftBank variants.
+bool IsIso2022JpOrVariant(Encoding enc) {
+  switch (enc) {
+    case JAPANESE_JIS:
+    case KDDI_ISO_2022_JP:
+    case SOFTBANK_ISO_2022_JP:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true for Shift_JIS, CP932 and the KDDI, DoCoMo and SoftBank
+// Shift_JIS variants.
+bool IsShiftJisOrVariant(Encoding enc) {
+  switch (enc) {
+    case JAPANESE_SHIFT_JIS:
+    case JAPANESE_CP932:
+    case KDDI_SHIFT_JIS:
+    case DOCOMO_SHIFT_JIS:
+    case SOFTBANK_SHIFT_JIS:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true for the carrier-specific (KDDI, DoCoMo, SoftBank) variants of
+// Shift_JIS and ISO-2022-JP.
+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
+  switch (enc) {
+    case KDDI_ISO_2022_JP:
+    case KDDI_SHIFT_JIS:
+    case DOCOMO_SHIFT_JIS:
+    case SOFTBANK_SHIFT_JIS:
+    case SOFTBANK_ISO_2022_JP:
+      return true;
+    default:
+      return false;
+  }
+}
+
+
+// *************************************************************
+// ENCODING NAMES
+// EncodingName() [Encoding to name]
+// MimeEncodingName() [Encoding to name]
+// EncodingFromName() [name to Encoding]
+// EncodingNameAliasToEncoding() [name to Encoding]
+// default_encoding_name()
+// invalid_encoding_name()
+// *************************************************************
+
+// Returns the standard name of enc, or invalid_encoding_name() when enc is
+// outside [0, kNumEncodings).
+const char * EncodingName(const Encoding enc) {
+  if (!IsValidEncoding(enc)) {
+    return invalid_encoding_name();
+  }
+  const EncodingInfo& info = kEncodingInfoTable[enc];
+  return info.encoding_name_;
+}
+
+// TODO: Unify MimeEncodingName and EncodingName, or determine why
+// such a unification is not possible.
+
+// Returns the preferred MIME name of enc, or the empty string when enc is
+// outside [0, kNumEncodings).
+const char * MimeEncodingName(Encoding enc) {
+  if (!IsValidEncoding(enc)) {
+    return "";  // TODO: Should this be invalid_encoding_name()?
+  }
+  const EncodingInfo& info = kEncodingInfoTable[enc];
+  return info.mime_encoding_name_;
+}
+
+// Looks up enc_name (case-insensitively) among the standard encoding names.
+// On a match stores the encoding in *encoding and returns true; otherwise
+// leaves *encoding set to UNKNOWN_ENCODING and returns false. A NULL
+// enc_name is treated as "no match".
+bool EncodingFromName(const char* enc_name, Encoding *encoding) {
+  *encoding = UNKNOWN_ENCODING;
+  if (enc_name == NULL) {
+    return false;
+  }
+
+  for (int index = 0; index < kNumEncodings; ++index) {
+    const char* candidate = kEncodingInfoTable[index].encoding_name_;
+    if (base::strcasecmp(enc_name, candidate) != 0) {
+      continue;
+    }
+    *encoding = static_cast<Encoding>(index);
+    return true;
+  }
+  return false;
+}
+
+// EncodingMap maps standard and non-standard encoding names (C strings)
+// to Encoding enums. It is used only by EncodingNameAliasToEncoding.
+// Hashing and comparison go through CStringAlnumCaseHash and
+// CStringAlnumCaseEqual, so lookups are case-insensitive.
+
+using EncodingMap = std::unordered_map<const char *, Encoding,
+                                       CStringAlnumCaseHash,
+                                       CStringAlnumCaseEqual>;
+
+// Builds the name -> Encoding alias map from scratch. Called exactly once,
+// from GetEncodingMap().
+static EncodingMap BuildEncodingMap() {
+  EncodingMap encoding_map;
+
+  // Initialize the map with all the "standard" encoding names,
+  // i.e., the ones returned by EncodingName and MimeEncodingName.
+  //
+  // First, add internal encoding names returned by EncodingName().
+  // Internal names are expected to be unique; nothing enforces that here
+  // (the original CHECK_EQ is disabled), so a duplicate would silently let
+  // the later enum value win.
+  for (int i = 0; i < NUM_ENCODINGS; ++i) {
+    Encoding e = static_cast<Encoding>(i);
+    const char *encoding_name = EncodingName(e);
+    // CHECK_EQ(0, encoding_map.count(encoding_name))
+    // << "Duplicate found for " << encoding_name;
+    encoding_map[encoding_name] = e;
+  }
+  // Then, add mime encoding names returned by MimeEncodingName().
+  // We don't override existing entries, to give precedence to entries
+  // added earlier.
+  for (int i = 0; i < NUM_ENCODINGS; ++i) {
+    Encoding e = static_cast<Encoding>(i);
+    // Note that MimeEncodingName() can return the same mime encoding
+    // name for different encoding enums like JAPANESE_SHIFT_JIS and
+    // KDDI_SHIFT_JIS. In that case, the encoding enum first seen
+    // will be the value for the encoding name in the map.
+    const char *mime_encoding_name = MimeEncodingName(e);
+    if (encoding_map.count(mime_encoding_name) == 0) {
+      encoding_map[mime_encoding_name] = e;
+    }
+  }
+
+  // Add some non-standard names: alternate spellings, common typos,
+  // etc. (It does no harm to add names already in the map.) Note
+  // that although the map is case-insensitive, by convention the
+  // keys are written here in lower case. For ease of maintenance,
+  // they are listed in alphabetical order.
+  encoding_map["5601"] = KOREAN_EUC_KR;
+  encoding_map["646"] = ASCII_7BIT;
+  encoding_map["852"] = CZECH_CP852;
+  encoding_map["866"] = RUSSIAN_CP866;
+  encoding_map["8859-1"] = ISO_8859_1;
+  encoding_map["ansi-1251"] = RUSSIAN_CP1251;
+  encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
+  encoding_map["arabic"] = ISO_8859_6;
+  encoding_map["ascii"] = ISO_8859_1;
+  encoding_map["ascii-7-bit"] = ASCII_7BIT;  // not iana standard
+  encoding_map["asmo-708"] = ISO_8859_6;
+  encoding_map["bhaskar"] = BHASKAR;
+  encoding_map["big5"] = CHINESE_BIG5;
+  encoding_map["big5-cp950"] = CHINESE_BIG5_CP950;  // not iana standard
+  encoding_map["big5-hkscs"] = BIG5_HKSCS;
+  encoding_map["chinese"] = CHINESE_GB;
+  encoding_map["cns"] = CHINESE_CNS;  // not iana standard
+  encoding_map["cns11643"] = CHINESE_CNS;
+  encoding_map["cp1250"] = MSFT_CP1250;  // not iana standard
+  encoding_map["cp1251"] = RUSSIAN_CP1251;  // not iana standard
+  encoding_map["cp1252"] = MSFT_CP1252;  // not iana standard
+  encoding_map["cp1253"] = MSFT_CP1253;  // not iana standard
+  encoding_map["cp1254"] = MSFT_CP1254;  // not iana standard
+  encoding_map["cp1255"] = MSFT_CP1255;
+  encoding_map["cp1256"] = MSFT_CP1256;
+  encoding_map["cp1257"] = MSFT_CP1257;  // not iana standard
+  encoding_map["cp819"] = ISO_8859_1;
+  encoding_map["cp852"] = CZECH_CP852;
+  encoding_map["cp866"] = RUSSIAN_CP866;
+  encoding_map["cp-866"] = RUSSIAN_CP866;
+  encoding_map["cp874"] = MSFT_CP874;
+  encoding_map["cp932"] = JAPANESE_CP932;  // not iana standard
+  encoding_map["cp950"] = CHINESE_BIG5_CP950;  // not iana standard
+  encoding_map["csbig5"] = CHINESE_BIG5;
+  encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
+  encoding_map["cseuckr"] = KOREAN_EUC_KR;
+  encoding_map["csgb2312"] = CHINESE_GB;
+  encoding_map["csibm852"] = CZECH_CP852;
+  encoding_map["csibm866"] = RUSSIAN_CP866;
+  encoding_map["csiso2022jp"] = JAPANESE_JIS;
+  encoding_map["csiso2022kr"] = ISO_2022_KR;
+  encoding_map["csiso58gb231280"] = CHINESE_GB;
+  encoding_map["csiso88598i"] = ISO_8859_8_I;
+  encoding_map["csisolatin1"] = ISO_8859_1;
+  encoding_map["csisolatin2"] = ISO_8859_2;
+  encoding_map["csisolatin3"] = ISO_8859_3;
+  encoding_map["csisolatin4"] = ISO_8859_4;
+  encoding_map["csisolatin5"] = ISO_8859_9;
+  encoding_map["csisolatin6"] = ISO_8859_10;
+  encoding_map["csisolatinarabic"] = ISO_8859_6;
+  encoding_map["csisolatincyrillic"] = ISO_8859_5;
+  encoding_map["csisolatingreek"] = ISO_8859_7;
+  encoding_map["csisolatinhebrew"] = ISO_8859_8;
+  encoding_map["csksc56011987"] = KOREAN_EUC_KR;
+  encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
+  encoding_map["csn-369103"] = CZECH_CSN_369103;
+  encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["csunicode"] = UTF16BE;
+  encoding_map["csunicode11"] = UTF16BE;
+  encoding_map["csunicode11utf7"] = UTF7;
+  encoding_map["csunicodeascii"] = UTF16BE;
+  encoding_map["csunicodelatin1"] = UTF16BE;
+  encoding_map["cyrillic"] = ISO_8859_5;
+  encoding_map["ecma-114"] = ISO_8859_6;
+  encoding_map["ecma-118"] = ISO_8859_7;
+  encoding_map["elot_928"] = ISO_8859_7;
+  encoding_map["euc"] = CHINESE_EUC_DEC;  // not iana standard
+  encoding_map["euc-cn"] = CHINESE_EUC_CN;  // not iana standard
+  encoding_map["euc-dec"] = CHINESE_EUC_DEC;  // not iana standard
+  encoding_map["euc-jp"] = JAPANESE_EUC_JP;
+  encoding_map["euc-kr"] = KOREAN_EUC_KR;
+  encoding_map["eucgb2312_cn"] = CHINESE_GB;
+  encoding_map["gb"] = CHINESE_GB;  // not iana standard
+  encoding_map["gb18030"] = GB18030;
+  encoding_map["gb2132"] = CHINESE_GB;  // common typo
+  encoding_map["gb2312"] = CHINESE_GB;
+  encoding_map["gb_2312-80"] = CHINESE_GB;
+  encoding_map["gbk"] = GBK;
+  encoding_map["greek"] = ISO_8859_7;
+  encoding_map["greek8"] = ISO_8859_7;
+  encoding_map["hebrew"] = ISO_8859_8;
+  encoding_map["htchanakya"] = HTCHANAKYA;
+  encoding_map["hz-gb-2312"] = HZ_GB_2312;
+  encoding_map["ibm819"] = ISO_8859_1;
+  encoding_map["ibm852"] = CZECH_CP852;
+  encoding_map["ibm874"] = MSFT_CP874;
+  encoding_map["iso-10646"] = UTF16BE;
+  encoding_map["iso-10646-j-1"] = UTF16BE;
+  encoding_map["iso-10646-ucs-2"] = UNICODE;
+  encoding_map["iso-10646-ucs-4"] = UTF32BE;
+  encoding_map["iso-10646-ucs-basic"] = UTF16BE;
+  encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
+  encoding_map["iso-2022-cn"] = ISO_2022_CN;
+  encoding_map["iso-2022-jp"] = JAPANESE_JIS;
+  encoding_map["iso-2022-kr"] = ISO_2022_KR;
+  encoding_map["iso-8559-1"] = ISO_8859_1;  // common typo
+  encoding_map["iso-874"] = MSFT_CP874;
+  encoding_map["iso-8858-1"] = ISO_8859_1;  // common typo
+  // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
+  encoding_map["iso-8859-0"] = ISO_8859_15;
+  encoding_map["iso-8859-1"] = ISO_8859_1;
+  encoding_map["iso-8859-10"] = ISO_8859_10;
+  encoding_map["iso-8859-11"] = ISO_8859_11;
+  encoding_map["iso-8859-13"] = ISO_8859_13;
+  encoding_map["iso-8859-15"] = ISO_8859_15;
+  encoding_map["iso-8859-2"] = ISO_8859_2;
+  encoding_map["iso-8859-3"] = ISO_8859_3;
+  encoding_map["iso-8859-4"] = ISO_8859_4;
+  encoding_map["iso-8859-5"] = ISO_8859_5;
+  encoding_map["iso-8859-6"] = ISO_8859_6;
+  encoding_map["iso-8859-7"] = ISO_8859_7;
+  encoding_map["iso-8859-8"] = ISO_8859_8;
+  encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
+  encoding_map["iso-8859-9"] = ISO_8859_9;
+  encoding_map["iso-9959-1"] = ISO_8859_1;  // common typo
+  encoding_map["iso-ir-100"] = ISO_8859_1;
+  encoding_map["iso-ir-101"] = ISO_8859_2;
+  encoding_map["iso-ir-109"] = ISO_8859_3;
+  encoding_map["iso-ir-110"] = ISO_8859_4;
+  encoding_map["iso-ir-126"] = ISO_8859_7;
+  encoding_map["iso-ir-127"] = ISO_8859_6;
+  encoding_map["iso-ir-138"] = ISO_8859_8;
+  encoding_map["iso-ir-144"] = ISO_8859_5;
+  encoding_map["iso-ir-148"] = ISO_8859_9;
+  encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
+  encoding_map["iso-ir-157"] = ISO_8859_10;
+  encoding_map["iso-ir-58"] = CHINESE_GB;
+  encoding_map["iso-latin-1"] = ISO_8859_1;
+  encoding_map["iso_2022-cn"] = ISO_2022_CN;
+  encoding_map["iso_2022-kr"] = ISO_2022_KR;
+  encoding_map["iso_8859-1"] = ISO_8859_1;
+  encoding_map["iso_8859-10:1992"] = ISO_8859_10;
+  encoding_map["iso_8859-11"] = ISO_8859_11;
+  encoding_map["iso_8859-13"] = ISO_8859_13;
+  encoding_map["iso_8859-15"] = ISO_8859_15;
+  encoding_map["iso_8859-1:1987"] = ISO_8859_1;
+  encoding_map["iso_8859-2"] = ISO_8859_2;
+  encoding_map["iso_8859-2:1987"] = ISO_8859_2;
+  encoding_map["iso_8859-3"] = ISO_8859_3;
+  encoding_map["iso_8859-3:1988"] = ISO_8859_3;
+  encoding_map["iso_8859-4"] = ISO_8859_4;
+  encoding_map["iso_8859-4:1988"] = ISO_8859_4;
+  encoding_map["iso_8859-5"] = ISO_8859_5;
+  encoding_map["iso_8859-5:1988"] = ISO_8859_5;
+  encoding_map["iso_8859-6"] = ISO_8859_6;
+  encoding_map["iso_8859-6:1987"] = ISO_8859_6;
+  encoding_map["iso_8859-7"] = ISO_8859_7;
+  encoding_map["iso_8859-7:1987"] = ISO_8859_7;
+  encoding_map["iso_8859-8"] = ISO_8859_8;
+  encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
+  encoding_map["iso_8859-9"] = ISO_8859_9;
+  encoding_map["iso_8859-9:1989"] = ISO_8859_9;
+  encoding_map["jagran"] = JAGRAN;
+  encoding_map["jis"] = JAPANESE_JIS;  // not iana standard
+  encoding_map["koi8-cs"] = CZECH_CSN_369103;
+  encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
+  encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU;  // not iana standard
+  encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
+  encoding_map["koi8r"] = RUSSIAN_KOI8_R;  // not iana standard
+  encoding_map["koi8u"] = RUSSIAN_KOI8_RU;  // not iana standard
+  encoding_map["korean"] = KOREAN_EUC_KR;  // i assume this is what is meant
+  encoding_map["ks-c-5601"] = KOREAN_EUC_KR;  // not iana standard
+  encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR;  // not iana standard
+  encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
+  encoding_map["ksc"] = KOREAN_EUC_KR;  // not iana standard
+  encoding_map["l1"] = ISO_8859_1;
+  encoding_map["l2"] = ISO_8859_2;
+  encoding_map["l3"] = ISO_8859_3;
+  encoding_map["l4"] = ISO_8859_4;
+  encoding_map["l5"] = ISO_8859_9;
+  encoding_map["l6"] = ISO_8859_10;
+  encoding_map["latin-1"] = ISO_8859_1;  // not iana standard
+  encoding_map["latin1"] = ISO_8859_1;
+  encoding_map["latin2"] = ISO_8859_2;
+  encoding_map["latin3"] = ISO_8859_3;
+  encoding_map["latin4"] = ISO_8859_4;
+  encoding_map["latin5"] = ISO_8859_9;
+  encoding_map["latin6"] = ISO_8859_10;
+  encoding_map["mac"] = MACINTOSH_ROMAN;
+  encoding_map["macintosh"] = MACINTOSH_ROMAN;
+  encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
+  encoding_map["ms932"] = JAPANESE_CP932;  // not iana standard
+  encoding_map["ms_kanji"] = JAPANESE_CP932;
+  encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["sjis"] = JAPANESE_SHIFT_JIS;  // not iana standard
+  encoding_map["sjs"] = JAPANESE_SHIFT_JIS;  // not iana standard
+  encoding_map["sun_eu_greek"] = ISO_8859_7;
+  encoding_map["tab"] = TAMIL_BI;
+  encoding_map["tam"] = TAMIL_MONO;
+  encoding_map["tis-620"] = ISO_8859_11;
+  encoding_map["tscii"] = TSCII;
+  encoding_map["un"] = UNKNOWN_ENCODING;  // not iana standard
+  encoding_map["unicode"] = UNICODE;  // not iana standard
+  encoding_map["unicode-1-1-utf-7"] = UTF7;
+  encoding_map["unicode-1-1-utf-8"] = UTF8;
+  encoding_map["unicode-2-0-utf-7"] = UTF7;
+  encoding_map["unknown"] = UNKNOWN_ENCODING;  // not iana standard
+  encoding_map["us"] = ISO_8859_1;
+  encoding_map["us-ascii"] = ISO_8859_1;
+  encoding_map["utf-16be"] = UTF16BE;
+  encoding_map["utf-16le"] = UTF16LE;
+  encoding_map["utf-32be"] = UTF32BE;
+  encoding_map["utf-32le"] = UTF32LE;
+  encoding_map["utf-7"] = UTF7;
+  encoding_map["utf-8"] = UTF8;
+  encoding_map["utf7"] = UTF7;
+  encoding_map["utf8"] = UTF8;  // not iana standard
+  encoding_map["visual"] = HEBREW_VISUAL;
+  encoding_map["win-1250"] = MSFT_CP1250;  // not iana standard
+  encoding_map["win-1251"] = RUSSIAN_CP1251;  // not iana standard
+  encoding_map["window-874"] = MSFT_CP874;
+  encoding_map["windows-1250"] = MSFT_CP1250;
+  encoding_map["windows-1251"] = RUSSIAN_CP1251;
+  encoding_map["windows-1252"] = MSFT_CP1252;
+  encoding_map["windows-1253"] = MSFT_CP1253;
+  encoding_map["windows-1254"] = MSFT_CP1254;
+  encoding_map["windows-1255"] = MSFT_CP1255;
+  encoding_map["windows-1256"] = MSFT_CP1256;
+  encoding_map["windows-1257"] = MSFT_CP1257;
+  encoding_map["windows-31j"] = JAPANESE_CP932;
+  encoding_map["windows-874"] = MSFT_CP874;
+  encoding_map["windows-936"] = GBK;
+  encoding_map["x-big5"] = CHINESE_BIG5;
+  encoding_map["x-binaryenc"] = BINARYENC;  // not iana standard
+  encoding_map["x-cp1250"] = MSFT_CP1250;
+  encoding_map["x-cp1251"] = RUSSIAN_CP1251;
+  encoding_map["x-cp1252"] = MSFT_CP1252;
+  encoding_map["x-cp1253"] = MSFT_CP1253;
+  encoding_map["x-cp1254"] = MSFT_CP1254;
+  encoding_map["x-cp1255"] = MSFT_CP1255;
+  encoding_map["x-cp1256"] = MSFT_CP1256;
+  encoding_map["x-cp1257"] = MSFT_CP1257;
+  encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
+  encoding_map["x-euc-tw"] = CHINESE_CNS;
+  encoding_map["x-gbk"] = GBK;
+  encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
+  encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
+  encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
+  encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
+  encoding_map["x-jis"] = JAPANESE_JIS;  // not iana standard
+  encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
+  encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS;  // not iana standard
+  encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
+  encoding_map["x-unicode-2-0-utf-7"] = UTF7;
+  encoding_map["x-utf8utf8"] = UTF8UTF8;  // not iana standard
+  encoding_map["x-x-big5"] = CHINESE_BIG5;
+  encoding_map["zh_cn.euc"] = CHINESE_GB;
+  encoding_map["zh_tw-big5"] = CHINESE_BIG5;
+  encoding_map["zh_tw-euc"] = CHINESE_CNS;
+
+  // Remove the entry for the empty string, if any.
+  encoding_map.erase("");
+
+  return encoding_map;
+}
+
+// Returns the process-wide alias map, building it on first use.
+//
+// The map is bound to a function-local static, whose initialization C++11
+// guarantees to run exactly once even with concurrent callers. The previous
+// "is it empty yet?" check let a second thread observe a non-empty but
+// still-growing map while the first thread was inserting into it.
+static const EncodingMap& GetEncodingMap() {
+  static const EncodingMap encoding_map = BuildEncodingMap();
+  return encoding_map;
+}
+
+// ----------------------------------------------------------------------
+// EncodingNameAliasToEncoding()
+//
+// This function takes an encoding name/alias and returns the Encoding
+// enum. The input is case insensitive. It is the union of the common
+// IANA standard names, the charset names used in Netscape Navigator,
+// and some common names we have been using.
+// See: http://www.iana.org/assignments/character-sets
+// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
+//
+// UNKNOWN_ENCODING is returned if none matches.
+//
+// TODO: Check if it is possible to remove the non-standard,
+// non-netscape-use names. It is because this routine is used for
+// encoding detections from html meta info. Non-standard names may
+// introduce noise on encoding detection.
+//
+// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
+// or determine why such a unification is not possible.
+// ----------------------------------------------------------------------
+Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
+  // A NULL name cannot match anything.
+  if (encoding_name == NULL) {
+    return UNKNOWN_ENCODING;
+  }
+
+  const EncodingMap& aliases = GetEncodingMap();
+  const EncodingMap::const_iterator match = aliases.find(encoding_name);
+  return (match == aliases.end()) ? UNKNOWN_ENCODING : match->second;
+}
+
+// Returns the standard name stored in the table row for LATIN1.
+const char* default_encoding_name() {
+  const EncodingInfo& latin1_info = kEncodingInfoTable[LATIN1];
+  return latin1_info.encoding_name_;
+}
+
+// Name reported for Encoding values that are out of range.
+static const char kInvalidEncodingName[] = "invalid_encoding";
+
+// Returns the placeholder name used by EncodingName() for invalid encodings.
+const char *invalid_encoding_name() {
+  const char *name = kInvalidEncodingName;
+  return name;
+}
+
+
+
+// *************************************************************
+// Miscellany
+// *************************************************************
+
+
+// Returns the encoding to use for Web output in place of enc, as recorded in
+// kEncodingInfoTable. Invalid encodings fall back to UTF8.
+Encoding PreferredWebOutputEncoding(Encoding enc) {
+  if (!IsValidEncoding(enc)) {
+    return UTF8;
+  }
+  return kEncodingInfoTable[enc].preferred_web_output_encoding_;
+}
diff --git a/contrib/google-ced/util/encodings/encodings.h b/contrib/google-ced/util/encodings/encodings.h
new file mode 100644
index 0000000..6477974
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings.h
@@ -0,0 +1,299 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_ENCODINGS_ENCODINGS_H_
+#define UTIL_ENCODINGS_ENCODINGS_H_
+
+// This interface defines the Encoding enum and various functions that
+// depend only on Encoding values.
+
+// A hash-function for Encoding, hash<Encoding>, is defined in
+// i18n/encodings/public/encodings-hash.h
+
+// On some Windows projects, UNICODE may be defined, which would prevent the
+// Encoding enum below from compiling. Note that this is a quick fix that does
+// not break any existing projects. The UNICODE enum may someday be changed
+// to something more specific and non-colliding, but this involves careful
+// testing of changes in many other projects.
+#undef UNICODE
+
+// NOTE: The Encoding enum must always start at 0. This assumption has
+// been made and used.
+
+#ifndef SWIG
+
+#include "util/encodings/encodings.pb.h"
+
+#else
+
+// TODO: Include a SWIG workaround header file.
+
+#endif
+
+// Number of Encoding values, taken from the NUM_ENCODINGS enumerator.
+const int kNumEncodings = NUM_ENCODINGS;
+
+// Some of the popular encoding aliases. Each macro expands to the Encoding
+// enumerator named on its right-hand side.
+// TODO: Make these static const Encoding values instead of macros.
+#define LATIN1 ISO_8859_1
+#define LATIN2 ISO_8859_2
+#define LATIN3 ISO_8859_3
+#define LATIN4 ISO_8859_4
+#define CYRILLIC ISO_8859_5
+#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
+#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
+#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
+#define LATIN5 ISO_8859_9
+#define LATIN6 ISO_8859_10
+#define KOREAN_HANGUL KOREAN_EUC_KR
+
+// The default Encoding (LATIN1).
+Encoding default_encoding();
+
+
+
+// *************************************************************
+// Encoding predicates
+// IsValidEncoding()
+// IsEncEncCompatible
+// IsSupersetOfAscii7Bit
+// Is8BitEncoding
+// IsCJKEncoding
+// IsHebrewEncoding
+// IsRightToLeftEncoding
+// IsLogicalRightToLeftEncoding
+// IsVisualRightToLeftEncoding
+// IsIso2022Encoding
+// IsIso2022JpOrVariant
+// IsShiftJisOrVariant
+// IsJapaneseCellPhoneCarrierSpecificEncoding
+// *************************************************************
+
+// IsValidEncoding
+// ===================================
+//
+// Function to check if the input encoding enum is within range.
+//
+
+bool IsValidEncoding(Encoding enc);
+
+//
+// IsEncEncCompatible
+// ------------------
+//
+// This function is to determine whether or not converting from the
+// first encoding to the second requires any changes to the underlying
+// text (e.g. ASCII_7BIT is a subset of UTF8).
+//
+// TODO: the current implementation is likely incomplete. It would be
+// good to consider the full matrix of all pairs of encodings and to fish out
+// all compatible pairs.
+//
+bool IsEncEncCompatible(const Encoding from, const Encoding to);
+
+// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
+// encoding represent the same characters as they do in ISO_8859_1.
+
+// WARNING: This function does not currently return true for all encodings that
+// are supersets of Ascii 7-bit.
+bool IsSupersetOfAscii7Bit(Encoding e);
+
+// To be an 8-bit encoding means that there are at most 256 symbols.
+// Each byte determines a new character; there are no multi-byte sequences.
+
+// WARNING: This function does not currently return true for all encodings that
+// are 8-bit encodings.
+bool Is8BitEncoding(Encoding e);
+
+// IsCJKEncoding
+// -------------
+//
+// This function returns true if the encoding is either Chinese
+// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
+// considered a CJK encoding.
+bool IsCJKEncoding(Encoding e);
+
+// IsHebrewEncoding
+// -------------
+//
+// This function returns true if the encoding is a Hebrew specific
+// encoding (not UTF8, etc).
+bool IsHebrewEncoding(Encoding e);
+
+// IsRightToLeftEncoding
+// ---------------------
+//
+// Returns true if the encoding is a right-to-left encoding.
+//
+// Note that the name of this function is somewhat misleading. There is nothing
+// "right to left" about these encodings. They merely contain code points for
+// characters in RTL languages such as Hebrew and Arabic. But this is also
+// true for UTF-8.
+//
+// TODO: Get rid of this function. The only special-case we
+// should need to worry about are visual encodings. Anything we
+// need to do for all 'RTL' encodings we need to do for UTF-8 as well.
+bool IsRightToLeftEncoding(Encoding enc);
+
+// IsLogicalRightToLeftEncoding
+// ----------------------------
+//
+// Returns true if the encoding is a logical right-to-left encoding.
+// Logical right-to-left encodings are those that the browser renders
+// right-to-left and applies the BiDi algorithm to. Therefore the characters
+// appear in reading order in the file, and indexing, snippet generation etc.
+// should all just work with no special processing.
+//
+// TODO: Get rid of this function. The only special-case we
+// should need to worry about are visual encodings.
+bool IsLogicalRightToLeftEncoding(Encoding enc);
+
+// IsVisualRightToLeftEncoding
+// ---------------------------
+//
+// Returns true if the encoding is a visual right-to-left encoding.
+// Visual right-to-left encodings are those that the browser renders
+// left-to-right and does not apply the BiDi algorithm to. Therefore each
+// line appears in reverse order in the file, lines are manually wrapped
+// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
+// the prehistoric days when browsers couldn't render right-to-left, but
+// unfortunately some visual pages persist to this day. These documents require
+// special processing so that we don't index or snippet them with each line
+// reversed.
+bool IsVisualRightToLeftEncoding(Encoding enc);
+
+// IsIso2022Encoding
+// -----------------
+//
+// Returns true if the encoding is a kind of ISO 2022 such as
+// ISO-2022-JP.
+bool IsIso2022Encoding(Encoding enc);
+
+// IsIso2022JpOrVariant
+// --------------------
+//
+// Returns true if the encoding is ISO-2022-JP or a variant such as
+// KDDI's ISO-2022-JP.
+bool IsIso2022JpOrVariant(Encoding enc);
+
+// IsShiftJisOrVariant
+// --------------------
+//
+// Returns true if the encoding is Shift_JIS or a variant such as
+// KDDI's Shift_JIS.
+bool IsShiftJisOrVariant(Encoding enc);
+
+// IsJapaneseCellPhoneCarrierSpecificEncoding
+// ------------------------------------------
+//
+// Returns true if the encoding is specific to a Japanese cell phone carrier,
+// such as KDDI_SHIFT_JIS.
+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
+
+
+
+// *************************************************************
+// ENCODING NAMES
+//
+// This interface defines a standard name for each valid encoding, and
+// a standard name for invalid encodings. (Some names use all upper
+// case, but others use mixed case.)
+//
+// EncodingName() [Encoding to name]
+// MimeEncodingName() [Encoding to name]
+// EncodingFromName() [name to Encoding]
+// EncodingNameAliasToEncoding() [name to Encoding]
+// default_encoding_name()
+// invalid_encoding_name()
+// *************************************************************
+
+// EncodingName
+// ------------
+//
+// Given the encoding, returns its standard name.
+// Return invalid_encoding_name() if the encoding is invalid.
+//
+const char* EncodingName(Encoding enc);
+
+//
+// MimeEncodingName
+// ----------------
+//
+// Return the "preferred MIME name" of an encoding.
+//
+// This name is suitable for using in HTTP headers, HTML tags,
+// and as the "charset" parameter of a MIME Content-Type.
+const char* MimeEncodingName(Encoding enc);
+
+
+// The maximum length of an encoding name.
+// NOTE(review): it is not visible here whether this bound counts the
+// terminating NUL -- confirm before sizing buffers with it.
+const int kMaxEncodingNameSize = 50;
+
+// The standard name of the default encoding.
+const char* default_encoding_name();
+
+// The name used for an invalid encoding.
+const char* invalid_encoding_name();
+
+// EncodingFromName
+// ----------------
+//
+// If enc_name matches the standard name of an Encoding, using a
+// case-insensitive comparison, set *encoding to that Encoding and
+// return true. Otherwise set *encoding to UNKNOWN_ENCODING and
+// return false.
+//
+// REQUIRES: encoding must not be NULL.
+//
+bool EncodingFromName(const char* enc_name, Encoding *encoding);
+
+//
+// EncodingNameAliasToEncoding
+// ---------------------------
+//
+// If enc_name matches the standard name or an alias of an Encoding,
+// using a case-insensitive comparison, return that
+// Encoding. Otherwise, return UNKNOWN_ENCODING.
+//
+// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
+// GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
+// common variations with hyphens and underscores (e.g., "koi8-u" and
+// "koi8u" for RUSSIAN_KOI8_R).
+
+Encoding EncodingNameAliasToEncoding(const char *enc_name);
+
+// *************************************************************
+// Miscellany
+// *************************************************************
+
+// PreferredWebOutputEncoding
+// --------------------------
+//
+// Some multi-byte encodings use byte values that coincide with the
+// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
+// can misinterpret these, as indicated in an external XSS report from
+// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
+// also use UTF8 instead of encodings that we don't support in our
+// output, and we generally try to be conservative in what we send out.
+// Where the client asks for single- or double-byte encodings that are
+// not as common, we substitute a more common single- or double-byte
+// encoding, if there is one, thereby preserving the client's intent
+// to use less space than UTF-8. This also means that characters
+// outside the destination set will be converted to HTML NCRs (&#NNN;)
+// if requested.
+Encoding PreferredWebOutputEncoding(Encoding enc);
+
+
+#endif // UTIL_ENCODINGS_ENCODINGS_H_
diff --git a/contrib/google-ced/util/encodings/encodings.pb.h b/contrib/google-ced/util/encodings/encodings.pb.h
new file mode 100644
index 0000000..ffbd716
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings.pb.h
@@ -0,0 +1,181 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
+#define UTIL_ENCODINGS_ENCODINGS_PB_H_
+
+// Identifiers for the character encodings known to this library.
+// Every enumerator carries an explicit numeric value, starting at 0
+// (encodings.h notes that other code relies on the enum starting at 0).
+// NUM_ENCODINGS, the last enumerator, is a count and not a real encoding.
+enum Encoding {
+ ISO_8859_1 = 0, // Teragram ASCII
+ ISO_8859_2 = 1, // Teragram Latin2
+ ISO_8859_3 = 2, // in BasisTech but not in Teragram
+ ISO_8859_4 = 3, // Teragram Latin4
+ ISO_8859_5 = 4, // Teragram ISO-8859-5
+ ISO_8859_6 = 5, // Teragram Arabic
+ ISO_8859_7 = 6, // Teragram Greek
+ ISO_8859_8 = 7, // Teragram Hebrew
+ ISO_8859_9 = 8, // in BasisTech but not in Teragram
+ ISO_8859_10 = 9, // in BasisTech but not in Teragram
+ JAPANESE_EUC_JP = 10, // Teragram EUC_JP
+ JAPANESE_SHIFT_JIS = 11, // Teragram SJS
+ JAPANESE_JIS = 12, // Teragram JIS
+ CHINESE_BIG5 = 13, // Teragram BIG5
+ CHINESE_GB = 14, // Teragram GB
+ CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
+ // CNS11643EUC, before that Teragram EUC-CN(!)
+ // See //i18n/basistech/basistech_encodings.h
+ KOREAN_EUC_KR = 16, // Teragram KSC
+ UNICODE = 17, // Teragram Unicode
+ // (encodings.h #undefs a predefined UNICODE macro so this name compiles.)
+ CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was Basis Tech
+ // CNS11643EUC, before that Teragram EUC.
+ CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was Basis Tech
+ // CNS11643EUC, before that Teragram CNS.
+ CHINESE_BIG5_CP950 = 20, // Teragram BIG5_CP950
+ JAPANESE_CP932 = 21, // Teragram CP932
+ UTF8 = 22,
+ UNKNOWN_ENCODING = 23,
+ ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
+ // Should be present only in the crawler
+ // and in the repository,
+ // *never* as a result of Document::encoding().
+ RUSSIAN_KOI8_R = 25, // Teragram KOI8R
+ RUSSIAN_CP1251 = 26, // Teragram CP1251
+
+ //----------------------------------------------------------
+ // These are _not_ output from teragram. Instead, they are as
+ // detected in the headers of usenet articles.
+ MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
+ RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
+ // Misnamed, this is _not_ KOI8-RU but KOI8-U.
+ // KOI8-U is used much more often than KOI8-RU.
+ MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
+ ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
+ //----------------------------------------------------------
+
+ //----------------------------------------------------------
+ // These are in BasisTech but not in Teragram. They are
+ // needed for new interface languages. Now detected by
+ // research langid
+ MSFT_CP1254 = 31, // used for Turkish
+ MSFT_CP1257 = 32, // used in Baltic countries
+ //----------------------------------------------------------
+
+ //----------------------------------------------------------
+ //----------------------------------------------------------
+ // New encodings detected by Teragram
+ ISO_8859_11 = 33, // aka TIS-620, used for Thai
+ MSFT_CP874 = 34, // used for Thai
+ MSFT_CP1256 = 35, // used for Arabic
+
+ //----------------------------------------------------------
+ // Detected as ISO_8859_8 by Teragram, but can be found in META tags
+ MSFT_CP1255 = 36, // Logical Hebrew Microsoft
+ ISO_8859_8_I = 37, // Iso Hebrew Logical
+ HEBREW_VISUAL = 38, // Iso Hebrew Visual
+ //----------------------------------------------------------
+
+ //----------------------------------------------------------
+ // Detected by research langid
+ CZECH_CP852 = 39,
+ CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
+ MSFT_CP1253 = 41, // used for Greek
+ RUSSIAN_CP866 = 42,
+ //----------------------------------------------------------
+
+ //----------------------------------------------------------
+ // Handled by iconv in glibc
+ ISO_8859_13 = 43,
+ ISO_2022_KR = 44,
+ GBK = 45,
+ GB18030 = 46,
+ BIG5_HKSCS = 47,
+ ISO_2022_CN = 48,
+
+ //-----------------------------------------------------------
+ // Detected by xin liu's detector
+ // Handled by transcoder
+ // (Indic encodings)
+
+ TSCII = 49,
+ TAMIL_MONO = 50,
+ TAMIL_BI = 51,
+ JAGRAN = 52,
+
+
+ MACINTOSH_ROMAN = 53,
+ UTF7 = 54,
+ BHASKAR = 55, // Indic encoding - Devanagari
+ HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
+
+ //-----------------------------------------------------------
+ // These allow a single place (inputconverter and outputconverter)
+ // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
+ // bulk conversions, with interchange-valid checking on input and
+ // fallback if needed on output.
+ UTF16BE = 57, // big-endian UTF-16
+ UTF16LE = 58, // little-endian UTF-16
+ UTF32BE = 59, // big-endian UTF-32
+ UTF32LE = 60, // little-endian UTF-32
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // An encoding that means "This is not text, but it may have some
+ // simple ASCII text embedded". Intended input conversion (not yet
+ // implemented) is to keep strings of >=4 seven-bit ASCII characters
+ // (follow each kept string with an ASCII space), delete the rest of
+ // the bytes. This will pick up and allow indexing of e.g. captions
+ // in JPEGs. No output conversion needed.
+ BINARYENC = 61,
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
+ // ~{ ... ~} for 2-byte pairs, and the browsers support this.
+ HZ_GB_2312 = 62,
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Some external vendors make the common input error of
+ // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
+ UTF8UTF8 = 63,
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Handled by transcoder for tamil language specific font
+ // encodings without the support for detection at present.
+ TAM_ELANGO = 64, // Elango - Tamil
+ TAM_LTTMBARANI = 65, // Barani - Tamil
+ TAM_SHREE = 66, // Shree - Tamil
+ TAM_TBOOMIS = 67, // TBoomis - Tamil
+ TAM_TMNEWS = 68, // TMNews - Tamil
+ TAM_WEBTAMIL = 69, // Webtamil - Tamil
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Shift_JIS variants used by Japanese cell phone carriers.
+ KDDI_SHIFT_JIS = 70,
+ DOCOMO_SHIFT_JIS = 71,
+ SOFTBANK_SHIFT_JIS = 72,
+ // ISO-2022-JP variants used by KDDI and SoftBank.
+ KDDI_ISO_2022_JP = 73,
+ SOFTBANK_ISO_2022_JP = 74,
+ //-----------------------------------------------------------
+
+ NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
+ // valid Encoding enum, it is only used to
+ // indicate the total number of Encodings.
+};
+
+#endif // UTIL_ENCODINGS_ENCODINGS_PB_H_
diff --git a/contrib/google-ced/util/encodings/encodings_unittest.cc b/contrib/google-ced/util/encodings/encodings_unittest.cc
new file mode 100644
index 0000000..223e3e4
--- /dev/null
+++ b/contrib/google-ced/util/encodings/encodings_unittest.cc
@@ -0,0 +1,34 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "util/encodings/encodings.h"
+
+#include "gtest/gtest.h"
+
+// Exercises EncodingNameAliasToEncoding: names that differ only in
+// punctuation or spacing resolve to the same Encoding, names that differ in
+// letters or digits do not, and NULL or unmatched names yield
+// UNKNOWN_ENCODING.
+TEST(EncodingsTest, EncodingNameAliasToEncoding) {
+  // Test that case and non-alphanumeric characters are ignored.
+  EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso_8859_1"));
+  EXPECT_EQ(ISO_8859_1, EncodingNameAliasToEncoding("iso-8859-1"));
+
+  // Test that spaces are ignored.
+  EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF8"));
+  EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF 8"));
+  EXPECT_EQ(UTF8, EncodingNameAliasToEncoding("UTF-8"));
+
+  // Test alphanumeric differences are counted.
+  EXPECT_NE(UTF8, EncodingNameAliasToEncoding("UTF-7"));
+  EXPECT_NE(KOREAN_EUC_KR, EncodingNameAliasToEncoding("euc-jp"));
+
+  // Test that a NULL name takes the early-return path and that a name
+  // matching no alias falls through to UNKNOWN_ENCODING.
+  EXPECT_EQ(UNKNOWN_ENCODING, EncodingNameAliasToEncoding(NULL));
+  EXPECT_EQ(UNKNOWN_ENCODING,
+            EncodingNameAliasToEncoding("no-such-encoding"));
+}