diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /contrib/google-ced/util/languages | |
parent | Initial commit. (diff) | |
download | rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip |
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/google-ced/util/languages')
-rw-r--r-- | contrib/google-ced/util/languages/languages.cc | 349 | ||||
-rw-r--r-- | contrib/google-ced/util/languages/languages.h | 381 | ||||
-rw-r--r-- | contrib/google-ced/util/languages/languages.pb.h | 191 |
3 files changed, 921 insertions, 0 deletions
diff --git a/contrib/google-ced/util/languages/languages.cc b/contrib/google-ced/util/languages/languages.cc new file mode 100644 index 0000000..852351f --- /dev/null +++ b/contrib/google-ced/util/languages/languages.cc @@ -0,0 +1,349 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "util/languages/languages.h" + +#include "util/basictypes.h" +#include "util/string_util.h" + + +Language default_language() {return ENGLISH;} + + +// Language names and codes + +struct LanguageInfo { + const char * language_name_; + const char * language_code_639_1_; // the ISO-639-1 code for the language + const char * language_code_639_2_; // the ISO-639-2 code for the language + const char * language_code_other_; // some nonstandard code for the language +}; + +static const LanguageInfo kLanguageInfoTable[] = { + { "ENGLISH", "en", "eng", NULL}, + { "DANISH", "da", "dan", NULL}, + { "DUTCH", "nl", "dut", NULL}, + { "FINNISH", "fi", "fin", NULL}, + { "FRENCH", "fr", "fre", NULL}, + { "GERMAN", "de", "ger", NULL}, + { "HEBREW", "he", "heb", NULL}, + { "ITALIAN", "it", "ita", NULL}, + { "Japanese", "ja", "jpn", NULL}, + { "Korean", "ko", "kor", NULL}, + { "NORWEGIAN", "nb", "nor", NULL}, + { "POLISH", "pl", "pol", NULL}, + { "PORTUGUESE", "pt", "por", NULL}, + { "RUSSIAN", "ru", "rus", NULL}, + { "SPANISH", "es", "spa", NULL}, + { "SWEDISH", "sv", 
"swe", NULL}, + { "Chinese", "zh", "chi", "zh-CN"}, + { "CZECH", "cs", "cze", NULL}, + { "GREEK", "el", "gre", NULL}, + { "ICELANDIC", "is", "ice", NULL}, + { "LATVIAN", "lv", "lav", NULL}, + { "LITHUANIAN", "lt", "lit", NULL}, + { "ROMANIAN", "ro", "rum", NULL}, + { "HUNGARIAN", "hu", "hun", NULL}, + { "ESTONIAN", "et", "est", NULL}, + // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE" + // and "Unknown", they are essentially the same. Need to unify them. + // "un" and "ut" are invented by us, not from ISO-639. + // + { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"}, + { "Unknown", NULL, NULL, "un"}, + { "BULGARIAN", "bg", "bul", NULL}, + { "CROATIAN", "hr", "scr", NULL}, + { "SERBIAN", "sr", "scc", NULL}, + { "IRISH", "ga", "gle", NULL}, + { "GALICIAN", "gl", "glg", NULL}, + // Impossible to tell Tagalog from Filipino at the moment. + // Use ISO 639-2 code for Filipino here. + { "TAGALOG", NULL, "fil", NULL}, + { "TURKISH", "tr", "tur", NULL}, + { "UKRAINIAN", "uk", "ukr", NULL}, + { "HINDI", "hi", "hin", NULL}, + { "MACEDONIAN", "mk", "mac", NULL}, + { "BENGALI", "bn", "ben", NULL}, + { "INDONESIAN", "id", "ind", NULL}, + { "LATIN", "la", "lat", NULL}, + { "MALAY", "ms", "may", NULL}, + { "MALAYALAM", "ml", "mal", NULL}, + { "WELSH", "cy", "wel", NULL}, + { "NEPALI", "ne", "nep", NULL}, + { "TELUGU", "te", "tel", NULL}, + { "ALBANIAN", "sq", "alb", NULL}, + { "TAMIL", "ta", "tam", NULL}, + { "BELARUSIAN", "be", "bel", NULL}, + { "JAVANESE", "jw", "jav", NULL}, + { "OCCITAN", "oc", "oci", NULL}, + { "URDU", "ur", "urd", NULL}, + { "BIHARI", "bh", "bih", NULL}, + { "GUJARATI", "gu", "guj", NULL}, + { "THAI", "th", "tha", NULL}, + { "ARABIC", "ar", "ara", NULL}, + { "CATALAN", "ca", "cat", NULL}, + { "ESPERANTO", "eo", "epo", NULL}, + { "BASQUE", "eu", "baq", NULL}, + { "INTERLINGUA", "ia", "ina", NULL}, + { "KANNADA", "kn", "kan", NULL}, + { "PUNJABI", "pa", "pan", NULL}, + { "SCOTS_GAELIC", "gd", "gla", NULL}, + { "SWAHILI", "sw", "swa", NULL}, + { 
"SLOVENIAN", "sl", "slv", NULL}, + { "MARATHI", "mr", "mar", NULL}, + { "MALTESE", "mt", "mlt", NULL}, + { "VIETNAMESE", "vi", "vie", NULL}, + { "FRISIAN", "fy", "fry", NULL}, + { "SLOVAK", "sk", "slo", NULL}, + { "ChineseT", + NULL, NULL, // We intentionally set these 2 fields to NULL to avoid + // confusion between CHINESE_T and CHINESE. + "zh-TW"}, + { "FAROESE", "fo", "fao", NULL}, + { "SUNDANESE", "su", "sun", NULL}, + { "UZBEK", "uz", "uzb", NULL}, + { "AMHARIC", "am", "amh", NULL}, + { "AZERBAIJANI", "az", "aze", NULL}, + { "GEORGIAN", "ka", "geo", NULL}, + { "TIGRINYA", "ti", "tir", NULL}, + { "PERSIAN", "fa", "per", NULL}, + { "BOSNIAN", "bs", "bos", NULL}, + { "SINHALESE", "si", "sin", NULL}, + { "NORWEGIAN_N", "nn", "nno", NULL}, + { "PORTUGUESE_P", NULL, NULL, "pt-PT"}, + { "PORTUGUESE_B", NULL, NULL, "pt-BR"}, + { "XHOSA", "xh", "xho", NULL}, + { "ZULU", "zu", "zul", NULL}, + { "GUARANI", "gn", "grn", NULL}, + { "SESOTHO", "st", "sot", NULL}, + { "TURKMEN", "tk", "tuk", NULL}, + { "KYRGYZ", "ky", "kir", NULL}, + { "BRETON", "br", "bre", NULL}, + { "TWI", "tw", "twi", NULL}, + { "YIDDISH", "yi", "yid", NULL}, + { "SERBO_CROATIAN", "sh", NULL, NULL}, + { "SOMALI", "so", "som", NULL}, + { "UIGHUR", "ug", "uig", NULL}, + { "KURDISH", "ku", "kur", NULL}, + { "MONGOLIAN", "mn", "mon", NULL}, + { "ARMENIAN", "hy", "arm", NULL}, + { "LAOTHIAN", "lo", "lao", NULL}, + { "SINDHI", "sd", "snd", NULL}, + { "RHAETO_ROMANCE", "rm", "roh", NULL}, + { "AFRIKAANS", "af", "afr", NULL}, + { "LUXEMBOURGISH", "lb", "ltz", NULL}, + { "BURMESE", "my", "bur", NULL}, + // KHMER is known as Cambodian for Google user interfaces. 
+ { "KHMER", "km", "khm", NULL}, + { "TIBETAN", "bo", "tib", NULL}, + { "DHIVEHI", "dv", "div", NULL}, + { "CHEROKEE", NULL, "chr", NULL}, + { "SYRIAC", NULL, "syr", NULL}, + { "LIMBU", NULL, NULL, "sit-NP"}, + { "ORIYA", "or", "ori", NULL}, + { "ASSAMESE", "as", "asm", NULL}, + { "CORSICAN", "co", "cos", NULL}, + { "INTERLINGUE", "ie", "ine", NULL}, + { "KAZAKH", "kk", "kaz", NULL}, + { "LINGALA", "ln", "lin", NULL}, + { "MOLDAVIAN", "mo", "mol", NULL}, + { "PASHTO", "ps", "pus", NULL}, + { "QUECHUA", "qu", "que", NULL}, + { "SHONA", "sn", "sna", NULL}, + { "TAJIK", "tg", "tgk", NULL}, + { "TATAR", "tt", "tat", NULL}, + { "TONGA", "to", "tog", NULL}, + { "YORUBA", "yo", "yor", NULL}, + { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL}, + { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL}, + { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL}, + { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL}, + { "MAORI", "mi", "mao", NULL}, + { "WOLOF", "wo", "wol", NULL}, + { "ABKHAZIAN", "ab", "abk", NULL}, + { "AFAR", "aa", "aar", NULL}, + { "AYMARA", "ay", "aym", NULL}, + { "BASHKIR", "ba", "bak", NULL}, + { "BISLAMA", "bi", "bis", NULL}, + { "DZONGKHA", "dz", "dzo", NULL}, + { "FIJIAN", "fj", "fij", NULL}, + { "GREENLANDIC", "kl", "kal", NULL}, + { "HAUSA", "ha", "hau", NULL}, + { "HAITIAN_CREOLE", "ht", NULL, NULL}, + { "INUPIAK", "ik", "ipk", NULL}, + { "INUKTITUT", "iu", "iku", NULL}, + { "KASHMIRI", "ks", "kas", NULL}, + { "KINYARWANDA", "rw", "kin", NULL}, + { "MALAGASY", "mg", "mlg", NULL}, + { "NAURU", "na", "nau", NULL}, + { "OROMO", "om", "orm", NULL}, + { "RUNDI", "rn", "run", NULL}, + { "SAMOAN", "sm", "smo", NULL}, + { "SANGO", "sg", "sag", NULL}, + { "SANSKRIT", "sa", "san", NULL}, + { "SISWANT", "ss", "ssw", NULL}, + { "TSONGA", "ts", "tso", NULL}, + { "TSWANA", "tn", "tsn", NULL}, + { "VOLAPUK", "vo", "vol", NULL}, + { "ZHUANG", "za", "zha", NULL}, + { "KHASI", NULL, "kha", NULL}, + { "SCOTS", NULL, "sco", NULL}, + { "GANDA", "lg", 
"lug", NULL}, + { "MANX", "gv", "glv", NULL}, + { "MONTENEGRIN", NULL, NULL, "sr-ME"}, + { "XX", NULL, NULL, "XX"}, +}; + +COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1, + kLanguageInfoTable_has_incorrect_length); + + +// LANGUAGE NAMES + +const char* default_language_name() { + return kLanguageInfoTable[ENGLISH].language_name_; +} + +static const char* const kInvalidLanguageName = "invalid_language"; + +const char *invalid_language_name() { + return kInvalidLanguageName; +} + +const char* LanguageName(Language lang) { + return IsValidLanguage(lang) + ? kLanguageInfoTable[lang].language_name_ + : kInvalidLanguageName; +} + + + +// LANGUAGE CODES + + +// The space before invalid_language_code is intentional. It is used +// to prevent it matching any two letter language code. +// +static const char* const kInvalidLanguageCode = " invalid_language_code"; + +const char *invalid_language_code() { + return kInvalidLanguageCode; +} + +const char * LanguageCode(Language lang) { + if (! IsValidLanguage(lang)) + return kInvalidLanguageCode; + const LanguageInfo& info = kLanguageInfoTable[lang]; + if (info.language_code_639_1_) { + return info.language_code_639_1_; + } else if (info.language_code_639_2_) { + return info.language_code_639_2_; + } else if (info.language_code_other_) { + return info.language_code_other_; + } else { + return kInvalidLanguageCode; + } +} + +const char* default_language_code() { + return kLanguageInfoTable[ENGLISH].language_code_639_1_; +} + +const char* LanguageCodeISO639_1(Language lang) { + if (! IsValidLanguage(lang)) + return kInvalidLanguageCode; + if (const char* code = kLanguageInfoTable[lang].language_code_639_1_) + return code; + return kInvalidLanguageCode; +} + +const char* LanguageCodeISO639_2(Language lang) { + if (! 
IsValidLanguage(lang)) + return kInvalidLanguageCode; + if (const char* code = kLanguageInfoTable[lang].language_code_639_2_) + return code; + return kInvalidLanguageCode; +} + +const char* LanguageCodeWithDialects(Language lang) { + if (lang == CHINESE) + return "zh-CN"; + return LanguageCode(lang); +} + + + +bool LanguageFromCode(const char* lang_code, Language *language) { + *language = UNKNOWN_LANGUAGE; + if ( lang_code == NULL ) return false; + + for ( int i = 0 ; i < kNumLanguages ; i++ ) { + const LanguageInfo& info = kLanguageInfoTable[i]; + if ((info.language_code_639_1_ && + !base::strcasecmp(lang_code, info.language_code_639_1_)) || + (info.language_code_639_2_ && + !base::strcasecmp(lang_code, info.language_code_639_2_)) || + (info.language_code_other_ && + !base::strcasecmp(lang_code, info.language_code_other_))) { + *language = static_cast<Language>(i); + return true; + } + } + + // For convenience, this function can also parse the non-standard + // five-letter language codes "zh-cn" and "zh-tw" which are used by + // front-ends such as GWS to distinguish Simplified from Traditional + // Chinese. + if (!base::strcasecmp(lang_code, "zh-cn") || + !base::strcasecmp(lang_code, "zh_cn")) { + *language = CHINESE; + return true; + } + if (!base::strcasecmp(lang_code, "zh-tw") || + !base::strcasecmp(lang_code, "zh_tw")) { + *language = CHINESE_T; + return true; + } + if (!base::strcasecmp(lang_code, "sr-me") || + !base::strcasecmp(lang_code, "sr_me")) { + *language = MONTENEGRIN; + return true; + } + + // Process language-code synonyms. + if (!base::strcasecmp(lang_code, "he")) { + *language = HEBREW; // Use "iw". + return true; + } + if (!base::strcasecmp(lang_code, "in")) { + *language = INDONESIAN; // Use "id". + return true; + } + if (!base::strcasecmp(lang_code, "ji")) { + *language = YIDDISH; // Use "yi". + return true; + } + + // Process language-detection synonyms. 
+ // These distinct languages cannot be differentiated by our current + // language-detection algorithms. + if (!base::strcasecmp(lang_code, "fil")) { + *language = TAGALOG; + return true; + } + + return false; +} diff --git a/contrib/google-ced/util/languages/languages.h b/contrib/google-ced/util/languages/languages.h new file mode 100644 index 0000000..4237961 --- /dev/null +++ b/contrib/google-ced/util/languages/languages.h @@ -0,0 +1,381 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_LANGUAGES_LANGUAGES_H_ +#define UTIL_LANGUAGES_LANGUAGES_H_ + +// This interface defines the Language enum and functions that depend +// only on Language values. + +// A hash-function for Language, hash<Language>, is defined in +// i18n/languages/public/languages-hash.h + +#ifndef SWIG +// Language enum defined in languages.proto +// Also description on how to add languages. +#include "util/languages/languages.pb.h" + +#else + +// TODO: Include a header containing swig-compatible enum. + +#endif + +const int kNumLanguages = NUM_LANGUAGES; + +// Return the default language (ENGLISH). 
Language default_language();


// *******************************************
// Language predicates
//   IsValidLanguage()
//   IS_LANGUAGE_UNKNOWN()
//   IsCJKLanguage()
//   IsChineseLanguage()
//   IsNorwegianLanguage()
//   IsPortugueseLanguage()
//   IsRightToLeftLanguage()
//   IsMaybeRightToLeftLanguage()
//   IsSameLanguage()
//   IsScriptRequiringLongerSnippets()
// *******************************************

// IsValidLanguage
// ===============
//
// Function to check if the input is within range of the Language enum. If
// IsValidLanguage(lang) returns true, it is safe to call
// static_cast<Language>(lang).
//
inline bool IsValidLanguage(int lang) {
  return ((lang >= 0) && (lang < kNumLanguages));
}

// Return true if the language is "unknown", i.e. either TG_UNKNOWN_LANGUAGE
// or UNKNOWN_LANGUAGE. (This function was previously a macro, hence the
// spelling in all caps.)
//
inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
  return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
}

// IsCJKLanguage
// -------------
//
// This function returns true if the language is either Chinese
// (simplified or traditional), Japanese, or Korean.
bool IsCJKLanguage(Language lang);

// IsChineseLanguage
// -----------------
//
// This function returns true if the language is either Chinese
// (simplified or traditional)
bool IsChineseLanguage(Language lang);

// IsNorwegianLanguage
// --------------------
//
// This function returns true if the language is any of the Norwegian
// languages (regular or Nynorsk).
bool IsNorwegianLanguage(Language lang);

// IsPortugueseLanguage
// --------------------
//
// This function returns true if the language is any of the Portuguese
// languages (regular, Portugal or Brazil)
bool IsPortugueseLanguage(Language lang);

// IsSameLanguage
// --------------
//
// WARNING: This function provides only a simple test on the values of
// the two Language arguments. It returns false if either language is
// invalid. It returns true if the language arguments are equal, or
// if they are both Chinese languages, both Norwegian languages, or
// both Portuguese languages, as defined by IsChineseLanguage,
// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
// false.
bool IsSameLanguage(Language lang1, Language lang2);


// IsRightToLeftLanguage
// ---------------------
//
// This function returns true if the language is only written right-to-left
// (E.g., Hebrew, Arabic, Persian etc.)
//
// IMPORTANT NOTE: Technically we're talking about scripts, not languages.
// There are languages that can be written in more than one script.
// Examples:
//   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
//     Latin or Cyrillic script, and right-to-left in Arabic script.
//   - Sindhi and Punjabi are written in different scripts, depending on
//     region and dialect.
//   - Turkmen used an Arabic script historically, but not any more.
//   - Pashto and Uyghur can use Arabic script, but use a Roman script
//     on the Internet.
//   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
//
// This function only returns true for languages that are always, unequivocally
// written in right-to-left script.
//
// TODO: If we want to do anything special with multi-script languages
// we should create new 'languages' for each language+script, as we do for
// traditional vs. simplified Chinese. However most such languages are rare in
// use and even rarer on the web, so this is unlikely to be something we'll
// be concerned with for a while.
bool IsRightToLeftLanguage(Language lang);

// IsMaybeRightToLeftLanguage
// --------------------------
//
// This function returns true if the language may appear on the web in a
// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
//
// NOTE: See important notes under IsRightToLeftLanguage(...).
//
// This function returns true for languages that *may* appear on the web in a
// right-to-left script, even if they may also appear in a left-to-right
// script.
//
// This function should typically be used in cases where doing some work on
// left-to-right text would be OK (usually a no-op), and this function is used
// just to cut down on unnecessary work on regular, LTR text.
bool IsMaybeRightToLeftLanguage(Language lang);

// IsScriptRequiringLongerSnippets
// --------------------
//
// This function returns true if the script characteristics require longer
// snippet length (Devanagari, Bengali, Gurmukhi,
// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
// bool IsScriptRequiringLongerSnippets(UnicodeScript script);


// *******************************************
// LANGUAGE NAMES
//
// This interface defines a standard name for each valid Language,
// and a standard name for invalid languages. Some language names use all
// uppercase letters, but others use mixed case.
//   LanguageName() [Language to name]
//   LanguageEnumName() [language to enum name]
//   LanguageFromName() [name to Language]
//   default_language_name()
//   invalid_language_name()
// *******************************************

// Given a Language, returns its standard name.
// Return invalid_language_name() if the language is invalid.
const char* LanguageName(Language lang);

// Given a Language, return the name of the enum constant for that
// language. In all but a few cases, this is the same as its standard
// name. For example, LanguageName(CHINESE) returns "Chinese", but
// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
// code that is generating C++ code, where the enum constant is more
// useful than its integer value. Return "NUM_LANGUAGES" if
// the language is invalid.
const char* LanguageEnumName(Language lang);

// The maximum length of a standard language name.
const int kMaxLanguageNameSize = 50;

// The standard name for the default language.
const char* default_language_name();

// The standard name for all invalid languages.
const char* invalid_language_name();

// If lang_name matches the standard name of a Language, using a
// case-insensitive comparison, set *language to that Language and
// return true.
// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
//
// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
// CHINESE_T (i.e., a synonym for "ChineseT").
//
// REQUIRES: language must not be NULL.
//
bool LanguageFromName(const char* lang_name, Language *language);



// *******************************************
// LANGUAGE CODES
//
// This interface defines a standard code for each valid language, and
// a standard code for invalid languages. These are derived from ISO codes,
// with some Google additions.
//   LanguageCode()
//   default_language_code()
//   invalid_language_code()
//   LanguageCodeWithDialects()
//   LanguageCodeISO639_1()
//   LanguageCodeISO639_2()
// *******************************************

// Given a Language, return its standard code. There are Google-specific codes:
//     For CHINESE_T, return "zh-TW".
//     For TG_UNKNOWN_LANGUAGE, return "ut".
//     For UNKNOWN_LANGUAGE, return "un".
//     For PORTUGUESE_P, return "pt-PT".
//     For PORTUGUESE_B, return "pt-BR".
//     For LIMBU, return "sit-NP".
//     For CHEROKEE, return "chr".
//     For SYRIAC, return "syr".
// Otherwise return the ISO 639-1 two-letter language code for lang. If lang
// has no ISO 639-1 code, return its ISO 639-2 three-letter code instead
// (e.g. "fil" for TAGALOG), and failing that its non-standard code
// (e.g. "sr-ME" for MONTENEGRIN).
// If lang is invalid, return invalid_language_code().
//
// NOTE: See the note below about the codes for Chinese languages.
//
const char* LanguageCode(Language lang);

// The maximum length of a language code.
const int kMaxLanguageCodeSize = 50;

// The standard code for the default language.
const char* default_language_code();

// The standard code for all invalid languages.
const char* invalid_language_code();


// --------------------------------------------
// NOTE: CHINESE LANGUAGE CODES
//
// There are three functions that return codes for Chinese languages.
// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
// The following list shows the different results.
//
// LanguageCode(CHINESE) returns "zh"
// LanguageCode(CHINESE_T) returns "zh-TW".
//
// LanguageCodeWithDialects(CHINESE) returns "zh-CN".
// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
//
// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
//
// --------------------------------------------

// LanguageCodeWithDialects
// ------------------------
//
// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
const char* LanguageCodeWithDialects(Language lang);

// LanguageCodeISO639_1
// --------------------
//
// Return the ISO 639-1 two-letter language code for lang.
// Return invalid_language_code() if lang is invalid or does not have
// an ISO 639-1 two-letter language code.
const char* LanguageCodeISO639_1(Language lang);

// LanguageCodeISO639_2
// --------------------
//
// Return the ISO 639-2 three-letter language code for lang.
// Return invalid_language_code() if lang is invalid or does not have
// an ISO 639-2 three-letter language code.
const char* LanguageCodeISO639_2(Language lang);

// LanguageFromCode
// ----------------
//
// If lang_code matches the code for a Language, using a case-insensitive
// comparison, set *language to that Language and return true.
// Otherwise (including when lang_code is NULL), set *language to
// UNKNOWN_LANGUAGE and return false.
//
// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
// (three-letter) code, or a Google-specific code (see LanguageCode).
//
// Certain language-code aliases are also allowed:
//   For "zh-cn" and "zh_cn", set *language to CHINESE.
//   For "zh-tw" and "zh_tw", set *language to CHINESE_T.
//   For "sr-me" and "sr_me", set *language to MONTENEGRIN.
//   For "he", set *language to HEBREW.
//   For "in", set *language to INDONESIAN.
//   For "ji", set *language to YIDDISH.
//   For "fil", set *language to TAGALOG.
//
// REQUIRES: 'language' must not be NULL.
bool LanguageFromCode(const char* lang_code, Language *language);


// LanguageFromCodeOrName
// ----------------------
//
// If lang_code_or_name is a language code or a language name,
// set *language to the corresponding Language and return true.
// Otherwise set *language to UNKNOWN_LANGUAGE and return false.
//
bool LanguageFromCodeOrName(const char* lang_code_or_name,
                            Language* language);

// LanguageNameFromCode
// --------------------
//
// If language_code is the code for a Language (see LanguageFromCode),
// return the standard name of that language (see LanguageName).
// Otherwise return invalid_language_name().
//
const char* LanguageNameFromCode(const char* language_code);


// Miscellany

// LanguageCodeToUnderscoreForm
// ----------------------------
//
// Given a language code, convert the dash "-" to underscore "_".
//
// Specifically, if result_length <= strlen(lang_code), set result[0]
// to '\0' and return false. Otherwise, copy lang_code to result,
// converting every dash to an underscore, converting every character
// before the first dash or underscore to lower case, and converting
// every character after the first dash or underscore to upper
// case. If there is no dash or underscore, convert the entire string
// to lower case.
//
// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.

bool LanguageCodeToUnderscoreForm(const char* lang_code,
                                  char* result,
                                  int result_length);

//
// AlwaysPutInExpectedRestrict
// ---------------------------
//
// For Web pages in certain top-level domains, Web Search always
// applies a "country restrict". If 'tld' matches one of those, using
// a case-SENSITIVE comparison, set *expected_language to the Language
// most commonly found in that top-level domain and return true.
// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);


#endif  // UTIL_LANGUAGES_LANGUAGES_H_
diff --git a/contrib/google-ced/util/languages/languages.pb.h b/contrib/google-ced/util/languages/languages.pb.h
new file mode 100644
index 0000000..84f1d6a
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.pb.h
@@ -0,0 +1,191 @@
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_LANGUAGES_LANGUAGES_PB_H_
#define UTIL_LANGUAGES_LANGUAGES_PB_H_

// Identifiers for the languages known to this library.
//
// The numeric values are used directly as indices into kLanguageInfoTable in
// languages.cc, so the order and values here must stay in step with that
// table (a COMPILE_ASSERT there checks only the total count).
enum Language {
  ENGLISH = 0,
  DANISH = 1,
  DUTCH = 2,
  FINNISH = 3,
  FRENCH = 4,
  GERMAN = 5,
  HEBREW = 6,
  ITALIAN = 7,
  JAPANESE = 8,
  KOREAN = 9,
  NORWEGIAN = 10,
  POLISH = 11,
  PORTUGUESE = 12,
  RUSSIAN = 13,
  SPANISH = 14,
  SWEDISH = 15,
  CHINESE = 16,
  CZECH = 17,
  GREEK = 18,
  ICELANDIC = 19,
  LATVIAN = 20,
  LITHUANIAN = 21,
  ROMANIAN = 22,
  HUNGARIAN = 23,
  ESTONIAN = 24,
  TG_UNKNOWN_LANGUAGE = 25,
  UNKNOWN_LANGUAGE = 26,
  BULGARIAN = 27,
  CROATIAN = 28,
  SERBIAN = 29,
  IRISH = 30,  // UI only.
  GALICIAN = 31,
  TAGALOG = 32,  // Tagalog (tl) + Filipino (fil),
  TURKISH = 33,
  UKRAINIAN = 34,
  HINDI = 35,
  MACEDONIAN = 36,
  BENGALI = 37,
  INDONESIAN = 38,
  LATIN = 39,  // UI only.
  MALAY = 40,
  MALAYALAM = 41,
  WELSH = 42,  // UI only.
  NEPALI = 43,
  TELUGU = 44,
  ALBANIAN = 45,
  TAMIL = 46,
  BELARUSIAN = 47,
  JAVANESE = 48,  // UI only.
  OCCITAN = 49,  // UI only.
  URDU = 50,
  BIHARI = 51,
  GUJARATI = 52,
  THAI = 53,
  ARABIC = 54,
  CATALAN = 55,
  ESPERANTO = 56,
  BASQUE = 57,
  INTERLINGUA = 58,  // UI only.
  KANNADA = 59,
  PUNJABI = 60,
  SCOTS_GAELIC = 61,  // UI only.
  SWAHILI = 62,
  SLOVENIAN = 63,
  MARATHI = 64,
  MALTESE = 65,
  VIETNAMESE = 66,
  FRISIAN = 67,  // UI only.
  SLOVAK = 68,
  CHINESE_T = 69,  // This is added to solve the problem of
                   // distinguishing Traditional and Simplified
                   // Chinese when the encoding is UTF8.
  FAROESE = 70,  // UI only.
  SUNDANESE = 71,  // UI only.
  UZBEK = 72,
  AMHARIC = 73,
  AZERBAIJANI = 74,
  GEORGIAN = 75,
  TIGRINYA = 76,  // UI only.
  PERSIAN = 77,
  BOSNIAN = 78,  // UI only. LangId language: CROATIAN (28)
  SINHALESE = 79,
  NORWEGIAN_N = 80,  // UI only. LangId language: NORWEGIAN (10)
  PORTUGUESE_P = 81,  // UI only. LangId language: PORTUGUESE (12)
  PORTUGUESE_B = 82,  // UI only. LangId language: PORTUGUESE (12)
  XHOSA = 83,  // UI only.
  ZULU = 84,  // UI only.
  GUARANI = 85,
  SESOTHO = 86,  // UI only.
  TURKMEN = 87,  // UI only.
  KYRGYZ = 88,
  BRETON = 89,  // UI only.
  TWI = 90,  // UI only.
  YIDDISH = 91,  // UI only.
  SERBO_CROATIAN= 92,  // UI only. LangId language: SERBIAN (29)
  SOMALI = 93,  // UI only.
  UIGHUR = 94,
  KURDISH = 95,
  MONGOLIAN = 96,
  ARMENIAN = 97,
  LAOTHIAN = 98,
  SINDHI = 99,
  RHAETO_ROMANCE= 100,  // UI only.
  AFRIKAANS = 101,
  LUXEMBOURGISH = 102,  // UI only.
  BURMESE = 103,
  KHMER = 104,
  TIBETAN = 105,
  DHIVEHI = 106,  // sometimes spelled Divehi, lang of Maldives
  CHEROKEE = 107,
  SYRIAC = 108,  // UI only.
  LIMBU = 109,  // UI only.
  ORIYA = 110,
  ASSAMESE = 111,  // UI only.
  CORSICAN = 112,  // UI only.
  INTERLINGUE = 113,  // UI only.
  KAZAKH = 114,
  LINGALA = 115,  // UI only.
  MOLDAVIAN = 116,  // UI only. LangId language: ROMANIAN (22)
  PASHTO = 117,
  QUECHUA = 118,  // UI only.
  SHONA = 119,  // UI only.
  TAJIK = 120,
  TATAR = 121,  // UI only.
  TONGA = 122,  // UI only.
  YORUBA = 123,  // UI only.
  CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,  // UI only.
  CREOLES_AND_PIDGINS_FRENCH_BASED = 125,  // UI only.
  CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,  // UI only.
  CREOLES_AND_PIDGINS_OTHER = 127,  // UI only.
  MAORI = 128,  // UI only.
  WOLOF = 129,  // UI only.
  ABKHAZIAN = 130,  // UI only.
  AFAR = 131,  // UI only.
  AYMARA = 132,  // UI only.
  BASHKIR = 133,  // UI only.
  BISLAMA = 134,  // UI only.
  DZONGKHA = 135,  // UI only.
  FIJIAN = 136,  // UI only.
  GREENLANDIC = 137,  // UI only.
  HAUSA = 138,  // UI only.
  HAITIAN_CREOLE= 139,  // UI only.
  INUPIAK = 140,  // UI only.
  INUKTITUT = 141,
  KASHMIRI = 142,  // UI only.
  KINYARWANDA = 143,  // UI only.
  MALAGASY = 144,  // UI only.
  NAURU = 145,  // UI only.
  OROMO = 146,  // UI only.
  RUNDI = 147,  // UI only.
  SAMOAN = 148,  // UI only.
  SANGO = 149,  // UI only.
  SANSKRIT = 150,
  SISWANT = 151,  // UI only.
  TSONGA = 152,  // UI only.
  TSWANA = 153,  // UI only.
  VOLAPUK = 154,  // UI only.
  ZHUANG = 155,  // UI only.
  KHASI = 156,  // UI only.
  SCOTS = 157,  // UI only.
  GANDA = 158,  // UI only.
  MANX = 159,  // UI only.
  MONTENEGRIN = 160,  // UI only. LangId language: SERBIAN (29)
  NUM_LANGUAGES = 161,  // Always keep this at the end. It is not a
                        // valid Language enum. It is only used to
                        // indicate the total number of Languages.
  // NOTE: If you add a language, you will break a unittest. See the note
  // at the top of this enum.
  // NOTE(review): no such note is present in this header; it presumably
  // lived in the languages.proto this enum was derived from - confirm.
};

#endif  // UTIL_LANGUAGES_LANGUAGES_PB_H_