Adding upstream version 3.8.1.upstream/3.8.1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit: 133a45c109da5310add55824db21af5239951f93 (patch)
tree: ba6ac4c0a950a0dda56451944315d66409923918 /contrib/google-ced/util/languages
parent: Initial commit. (diff)
download: rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
3 files changed, 921 insertions, 0 deletions
diff --git a/contrib/google-ced/util/languages/languages.cc b/contrib/google-ced/util/languages/languages.cc
new file mode 100644
index 0000000..852351f
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.cc
@@ -0,0 +1,349 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "util/languages/languages.h"
+
+#include "util/basictypes.h"
+#include "util/string_util.h"
+
+
+Language default_language() { return ENGLISH; }  // ENGLISH is the default.
+
+
+// Language names and codes
+
+struct LanguageInfo {
+  const char * language_name_;         // the standard name (see LanguageName())
+  const char * language_code_639_1_;   // the ISO-639-1 code, or NULL if none
+  const char * language_code_639_2_;   // the ISO-639-2 code, or NULL if none
+  const char * language_code_other_;   // some nonstandard code, or NULL if none
+};
+
+static const LanguageInfo kLanguageInfoTable[] = {  // row i describes Language value i
+  { "ENGLISH",             "en", "eng", NULL},
+  { "DANISH",              "da", "dan", NULL},
+  { "DUTCH",               "nl", "dut", NULL},
+  { "FINNISH",             "fi", "fin", NULL},
+  { "FRENCH",              "fr", "fre", NULL},
+  { "GERMAN",              "de", "ger", NULL},
+  { "HEBREW",              "he", "heb", NULL},
+  { "ITALIAN",             "it", "ita", NULL},
+  { "Japanese",            "ja", "jpn", NULL},
+  { "Korean",              "ko", "kor", NULL},
+  { "NORWEGIAN",           "nb", "nor", NULL},
+  { "POLISH",              "pl", "pol", NULL},
+  { "PORTUGUESE",          "pt", "por", NULL},
+  { "RUSSIAN",             "ru", "rus", NULL},
+  { "SPANISH",             "es", "spa", NULL},
+  { "SWEDISH",             "sv", "swe", NULL},
+  { "Chinese",             "zh", "chi", "zh-CN"},
+  { "CZECH",               "cs", "cze", NULL},
+  { "GREEK",               "el", "gre", NULL},
+  { "ICELANDIC",           "is", "ice", NULL},
+  { "LATVIAN",             "lv", "lav", NULL},
+  { "LITHUANIAN",          "lt", "lit", NULL},
+  { "ROMANIAN",            "ro", "rum", NULL},
+  { "HUNGARIAN",           "hu", "hun", NULL},
+  { "ESTONIAN",            "et", "est", NULL},
+  // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
+  // and "Unknown", they are essentially the same. Need to unify them.
+  // "un" and "ut" are invented by us, not from ISO-639.
+  //
+  { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
+  { "Unknown",             NULL, NULL, "un"},
+  { "BULGARIAN",           "bg", "bul", NULL},
+  { "CROATIAN",            "hr", "scr", NULL},
+  { "SERBIAN",             "sr", "scc", NULL},
+  { "IRISH",               "ga", "gle", NULL},
+  { "GALICIAN",            "gl", "glg", NULL},
+  // Impossible to tell Tagalog from Filipino at the moment.
+  // Use ISO 639-2 code for Filipino here.
+  { "TAGALOG",             NULL, "fil", NULL},
+  { "TURKISH",             "tr", "tur", NULL},
+  { "UKRAINIAN",           "uk", "ukr", NULL},
+  { "HINDI",               "hi", "hin", NULL},
+  { "MACEDONIAN",          "mk", "mac", NULL},
+  { "BENGALI",             "bn", "ben", NULL},
+  { "INDONESIAN",          "id", "ind", NULL},
+  { "LATIN",               "la", "lat", NULL},
+  { "MALAY",               "ms", "may", NULL},
+  { "MALAYALAM",           "ml", "mal", NULL},
+  { "WELSH",               "cy", "wel", NULL},
+  { "NEPALI",              "ne", "nep", NULL},
+  { "TELUGU",              "te", "tel", NULL},
+  { "ALBANIAN",            "sq", "alb", NULL},
+  { "TAMIL",               "ta", "tam", NULL},
+  { "BELARUSIAN",          "be", "bel", NULL},
+  { "JAVANESE",            "jw", "jav", NULL},
+  { "OCCITAN",             "oc", "oci", NULL},
+  { "URDU",                "ur", "urd", NULL},
+  { "BIHARI",              "bh", "bih", NULL},
+  { "GUJARATI",            "gu", "guj", NULL},
+  { "THAI",                "th", "tha", NULL},
+  { "ARABIC",              "ar", "ara", NULL},
+  { "CATALAN",             "ca", "cat", NULL},
+  { "ESPERANTO",           "eo", "epo", NULL},
+  { "BASQUE",              "eu", "baq", NULL},
+  { "INTERLINGUA",         "ia", "ina", NULL},
+  { "KANNADA",             "kn", "kan", NULL},
+  { "PUNJABI",             "pa", "pan", NULL},
+  { "SCOTS_GAELIC",        "gd", "gla", NULL},
+  { "SWAHILI",             "sw", "swa", NULL},
+  { "SLOVENIAN",           "sl", "slv", NULL},
+  { "MARATHI",             "mr", "mar", NULL},
+  { "MALTESE",             "mt", "mlt", NULL},
+  { "VIETNAMESE",          "vi", "vie", NULL},
+  { "FRISIAN",             "fy", "fry", NULL},
+  { "SLOVAK",              "sk", "slo", NULL},
+  { "ChineseT",
+    NULL,  NULL,  // We intentionally set these 2 fields to NULL to avoid
+                  // confusion between CHINESE_T and CHINESE.
+    "zh-TW"},
+  { "FAROESE",             "fo", "fao", NULL},
+  { "SUNDANESE",           "su", "sun", NULL},
+  { "UZBEK",               "uz", "uzb", NULL},
+  { "AMHARIC",             "am", "amh", NULL},
+  { "AZERBAIJANI",         "az", "aze", NULL},
+  { "GEORGIAN",            "ka", "geo", NULL},
+  { "TIGRINYA",            "ti", "tir", NULL},
+  { "PERSIAN",             "fa", "per", NULL},
+  { "BOSNIAN",             "bs", "bos", NULL},
+  { "SINHALESE",           "si", "sin", NULL},
+  { "NORWEGIAN_N",         "nn", "nno", NULL},
+  { "PORTUGUESE_P",        NULL, NULL, "pt-PT"},
+  { "PORTUGUESE_B",        NULL, NULL, "pt-BR"},
+  { "XHOSA",               "xh", "xho", NULL},
+  { "ZULU",                "zu", "zul", NULL},
+  { "GUARANI",             "gn", "grn", NULL},
+  { "SESOTHO",             "st", "sot", NULL},
+  { "TURKMEN",             "tk", "tuk", NULL},
+  { "KYRGYZ",              "ky", "kir", NULL},
+  { "BRETON",              "br", "bre", NULL},
+  { "TWI",                 "tw", "twi", NULL},
+  { "YIDDISH",             "yi", "yid", NULL},
+  { "SERBO_CROATIAN",      "sh", NULL, NULL},
+  { "SOMALI",              "so", "som", NULL},
+  { "UIGHUR",              "ug", "uig", NULL},
+  { "KURDISH",             "ku", "kur", NULL},
+  { "MONGOLIAN",           "mn", "mon", NULL},
+  { "ARMENIAN",            "hy", "arm", NULL},
+  { "LAOTHIAN",            "lo", "lao", NULL},
+  { "SINDHI",              "sd", "snd", NULL},
+  { "RHAETO_ROMANCE",      "rm", "roh", NULL},
+  { "AFRIKAANS",           "af", "afr", NULL},
+  { "LUXEMBOURGISH",       "lb", "ltz", NULL},
+  { "BURMESE",             "my", "bur", NULL},
+  // KHMER is known as Cambodian for Google user interfaces.
+  { "KHMER",               "km", "khm", NULL},
+  { "TIBETAN",             "bo", "tib", NULL},
+  { "DHIVEHI",             "dv", "div", NULL},
+  { "CHEROKEE",            NULL, "chr", NULL},
+  { "SYRIAC",              NULL, "syr", NULL},
+  { "LIMBU",               NULL, NULL, "sit-NP"},
+  { "ORIYA",               "or", "ori", NULL},
+  { "ASSAMESE",            "as", "asm", NULL},
+  { "CORSICAN",            "co", "cos", NULL},
+  { "INTERLINGUE",         "ie", "ine", NULL},
+  { "KAZAKH",              "kk", "kaz", NULL},
+  { "LINGALA",             "ln", "lin", NULL},
+  { "MOLDAVIAN",           "mo", "mol", NULL},
+  { "PASHTO",              "ps", "pus", NULL},
+  { "QUECHUA",             "qu", "que", NULL},
+  { "SHONA",               "sn", "sna", NULL},
+  { "TAJIK",               "tg", "tgk", NULL},
+  { "TATAR",               "tt", "tat", NULL},
+  { "TONGA",               "to", "tog", NULL},
+  { "YORUBA",              "yo", "yor", NULL},
+  { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
+  { "CREOLES_AND_PIDGINS_FRENCH_BASED",  NULL, "cpf", NULL},
+  { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
+  { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
+  { "MAORI",               "mi", "mao", NULL},
+  { "WOLOF",               "wo", "wol", NULL},
+  { "ABKHAZIAN",           "ab", "abk", NULL},
+  { "AFAR",                "aa", "aar", NULL},
+  { "AYMARA",              "ay", "aym", NULL},
+  { "BASHKIR",             "ba", "bak", NULL},
+  { "BISLAMA",             "bi", "bis", NULL},
+  { "DZONGKHA",            "dz", "dzo", NULL},
+  { "FIJIAN",              "fj", "fij", NULL},
+  { "GREENLANDIC",         "kl", "kal", NULL},
+  { "HAUSA",               "ha", "hau", NULL},
+  { "HAITIAN_CREOLE",       "ht", NULL, NULL},
+  { "INUPIAK",             "ik", "ipk", NULL},
+  { "INUKTITUT",           "iu", "iku", NULL},
+  { "KASHMIRI",            "ks", "kas", NULL},
+  { "KINYARWANDA",         "rw", "kin", NULL},
+  { "MALAGASY",            "mg", "mlg", NULL},
+  { "NAURU",               "na", "nau", NULL},
+  { "OROMO",               "om", "orm", NULL},
+  { "RUNDI",               "rn", "run", NULL},
+  { "SAMOAN",              "sm", "smo", NULL},
+  { "SANGO",               "sg", "sag", NULL},
+  { "SANSKRIT",            "sa", "san", NULL},
+  { "SISWANT",             "ss", "ssw", NULL},
+  { "TSONGA",              "ts", "tso", NULL},
+  { "TSWANA",              "tn", "tsn", NULL},
+  { "VOLAPUK",             "vo", "vol", NULL},
+  { "ZHUANG",              "za", "zha", NULL},
+  { "KHASI",               NULL, "kha", NULL},
+  { "SCOTS",               NULL, "sco", NULL},
+  { "GANDA",               "lg", "lug", NULL},
+  { "MANX",                "gv", "glv", NULL},
+  { "MONTENEGRIN",         NULL, NULL, "sr-ME"},
+  { "XX",                  NULL, NULL, "XX"},  // extra row at index NUM_LANGUAGES
+};
+
+COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,  // +1: trailing "XX" row
+               kLanguageInfoTable_has_incorrect_length);
+
+
+// LANGUAGE NAMES
+
+const char* default_language_name() {  // Standard name of the default language.
+  return kLanguageInfoTable[default_language()].language_name_;
+}
+
+// Name reported for every value outside the valid Language range.
+static const char* const kInvalidLanguageName = "invalid_language";
+
+// Accessor for the invalid-language name.
+const char* invalid_language_name() { return kInvalidLanguageName; }
+
+// Standard name for lang; kInvalidLanguageName when lang is out of range.
+const char* LanguageName(Language lang) {
+  if (!IsValidLanguage(lang)) return kInvalidLanguageName;
+  return kLanguageInfoTable[lang].language_name_;
+}
+
+
+
+// LANGUAGE CODES
+
+
+// The space before invalid_language_code is intentional. It is used
+// to prevent it matching any two letter language code.
+//
+static const char* const kInvalidLanguageCode = " invalid_language_code";
+
+// Accessor for the sentinel code returned by the LanguageCode*() functions
+// when lang is invalid or has no code of the requested kind.
+const char* invalid_language_code() { return kInvalidLanguageCode; }
+
+// Returns the preferred code for lang: the ISO 639-1 code if the table
+// has one, otherwise the ISO 639-2 code, otherwise the nonstandard code.
+// Returns kInvalidLanguageCode when lang is out of range or its row has
+// no code at all.
+const char* LanguageCode(Language lang) {
+  if (!IsValidLanguage(lang)) return kInvalidLanguageCode;
+
+  const LanguageInfo& row = kLanguageInfoTable[lang];
+  // First non-NULL field wins, in order of preference.
+  if (row.language_code_639_1_ != NULL) return row.language_code_639_1_;
+  if (row.language_code_639_2_ != NULL) return row.language_code_639_2_;
+  if (row.language_code_other_ != NULL) return row.language_code_other_;
+  return kInvalidLanguageCode;
+}
+
+const char* default_language_code() {  // ISO 639-1 code of the default language.
+  return kLanguageInfoTable[default_language()].language_code_639_1_;
+}
+
+// ISO 639-1 code for lang, or kInvalidLanguageCode when lang is out of
+// range or its table row has no ISO 639-1 code.
+const char* LanguageCodeISO639_1(Language lang) {
+  if (!IsValidLanguage(lang)) return kInvalidLanguageCode;
+  const char* const iso1 = kLanguageInfoTable[lang].language_code_639_1_;
+  return iso1 != NULL ? iso1 : kInvalidLanguageCode;
+}
+
+// ISO 639-2 code for lang, or kInvalidLanguageCode when lang is out of
+// range or its table row has no ISO 639-2 code.
+const char* LanguageCodeISO639_2(Language lang) {
+  if (!IsValidLanguage(lang)) return kInvalidLanguageCode;
+  const char* const iso2 = kLanguageInfoTable[lang].language_code_639_2_;
+  return iso2 != NULL ? iso2 : kInvalidLanguageCode;
+}
+
+// Same as LanguageCode(lang), except that CHINESE is reported with its
+// dialect-qualified code "zh-CN".
+const char* LanguageCodeWithDialects(Language lang) {
+  return lang == CHINESE ? "zh-CN" : LanguageCode(lang);
+}
+
+
+
+// Returns true if candidate is non-NULL and equals lang_code, ignoring case.
+static bool MatchesCode(const char* lang_code, const char* candidate) {
+  return candidate != NULL && !base::strcasecmp(lang_code, candidate);
+}
+
+// Maps a language code to a Language.
+//
+// Sets *language and returns true when lang_code equals, ignoring case,
+// a code stored in kLanguageInfoTable (ISO 639-1, ISO 639-2 or nonstandard)
+// or one of the aliases in kCodeAliases below.
+// Otherwise sets *language to UNKNOWN_LANGUAGE and returns false; a NULL
+// lang_code never matches.
+//
+// REQUIRES: language must not be NULL.
+bool LanguageFromCode(const char* lang_code, Language *language) {
+  // Codes accepted in addition to those stored in kLanguageInfoTable.
+  static const struct {
+    const char* code;
+    Language language;
+  } kCodeAliases[] = {
+    // Region-qualified codes, accepted with '-' or '_' as the separator.
+    { "zh-cn", CHINESE },
+    { "zh_cn", CHINESE },
+    { "zh-tw", CHINESE_T },
+    { "zh_tw", CHINESE_T },
+    { "sr-me", MONTENEGRIN },
+    { "sr_me", MONTENEGRIN },
+    // Withdrawn ISO 639-1 codes; the table holds their replacements "he",
+    // "id" and "yi". The Hebrew alias must be "iw": "he" is already matched
+    // by the table scan, so testing it here would never fire.
+    { "iw", HEBREW },
+    { "in", INDONESIAN },
+    { "ji", YIDDISH },
+    // Filipino cannot be told apart from Tagalog by language detection.
+    { "fil", TAGALOG },
+  };
+  const int kNumCodeAliases = sizeof(kCodeAliases) / sizeof(kCodeAliases[0]);
+
+  *language = UNKNOWN_LANGUAGE;
+  if (lang_code == NULL) return false;
+
+  // The table is scanned first, so a code stored there wins over an alias.
+  for (int i = 0; i < kNumLanguages; ++i) {
+    const LanguageInfo& row = kLanguageInfoTable[i];
+    if (MatchesCode(lang_code, row.language_code_639_1_) ||
+        MatchesCode(lang_code, row.language_code_639_2_) ||
+        MatchesCode(lang_code, row.language_code_other_)) {
+      *language = static_cast<Language>(i);
+      return true;
+    }
+  }
+
+  for (int i = 0; i < kNumCodeAliases; ++i) {
+    if (!base::strcasecmp(lang_code, kCodeAliases[i].code)) {
+      *language = kCodeAliases[i].language;
+      return true;
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/google-ced/util/languages/languages.h b/contrib/google-ced/util/languages/languages.h
new file mode 100644
index 0000000..4237961
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.h
@@ -0,0 +1,381 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_LANGUAGES_LANGUAGES_H_
+#define UTIL_LANGUAGES_LANGUAGES_H_
+
+// This interface defines the Language enum and functions that depend
+// only on Language values.
+
+// A hash-function for Language, hash<Language>, is defined in
+// i18n/languages/public/languages-hash.h
+
+#ifndef SWIG
+// Language enum defined in languages.proto
+// Also description on how to add languages.
+#include "util/languages/languages.pb.h"
+
+#else
+
+// TODO: Include a header containing swig-compatible enum.
+
+#endif
+
+const int kNumLanguages = NUM_LANGUAGES;  // exclusive upper bound of valid Language values
+
+// Return the default language (ENGLISH).
+Language default_language();
+
+
+// *******************************************
+// Language predicates
+//   IsValidLanguage()
+//   IS_LANGUAGE_UNKNOWN()
+//   IsCJKLanguage()
+//   IsChineseLanguage()
+//   IsNorwegianLanguage()
+//   IsPortugueseLanguage()
+//   IsRightToLeftLanguage()
+//   IsMaybeRightToLeftLanguage()
+//   IsSameLanguage()
+//   IsScriptRequiringLongerSnippets()
+// *******************************************
+
+// IsValidLanguage
+// ===============
+//
+// Function to check if the input is within range of the Language enum. If
+// IsValidLanguage(lang) returns true, it is safe to call
+// static_cast<Language>(lang).
+//
+inline bool IsValidLanguage(int lang) {
+  return 0 <= lang && lang < kNumLanguages;  // lang in [0, kNumLanguages)
+}
+
+// Return true if the language is "unknown". (This function was
+// previously a macro, hence the spelling in all caps.)
+//
+inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
+  return lang == UNKNOWN_LANGUAGE || lang == TG_UNKNOWN_LANGUAGE;  // either "unknown" value
+}
+
+// IsCJKLanguage
+// -------------
+//
+// This function returns true if the language is either Chinese
+// (simplified or traditional), Japanese, or Korean.
+bool IsCJKLanguage(Language lang);
+
+// IsChineseLanguage
+// -----------------
+//
+// This function returns true if the language is either Chinese
+// (simplified or traditional)
+bool IsChineseLanguage(Language lang);
+
+// IsNorwegianLanguage
+// --------------------
+//
+// This function returns true if the language is any of the Norwegian
+// languages (regular or Nynorsk).
+bool IsNorwegianLanguage(Language lang);
+
+// IsPortugueseLanguage
+// --------------------
+//
+// This function returns true if the language is any of the Portuguese
+// languages (regular, Portugal or Brazil)
+bool IsPortugueseLanguage(Language lang);
+
+// IsSameLanguage
+// --------------
+//
+// WARNING: This function provides only a simple test on the values of
+// the two Language arguments. It returns false if either language is
+// invalid. It returns true if the language arguments are equal, or
+// if they are both Chinese languages, both Norwegian languages, or
+// both Portuguese languages, as defined by IsChineseLanguage,
+// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
+// false.
+bool IsSameLanguage(Language lang1, Language lang2);
+
+
+// IsRightToLeftLanguage
+// ---------------------
+//
+// This function returns true if the language is only written right-to-left
+// (E.g., Hebrew, Arabic, Persian etc.)
+//
+// IMPORTANT NOTE: Technically we're talking about scripts, not languages.
+// There are languages that can be written in more than one script.
+// Examples:
+//   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
+//     Latin or Cyrillic script, and right-to-left in Arabic script.
+//   - Sindhi and Punjabi are written in different scripts, depending on
+//     region and dialect.
+//   - Turkmen used an Arabic script historically, but not any more.
+//   - Pashto and Uyghur can use Arabic script, but use a Roman script
+//     on the Internet.
+//   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
+//
+// This function only returns true for languages that are always, unequivocally
+// written in right-to-left script.
+//
+// TODO: If we want to do anything special with multi-script languages
+// we should create new 'languages' for each language+script, as we do for
+// traditional vs. simplified Chinese. However most such languages are rare in
+// use and even rarer on the web, so this is unlikely to be something we'll
+// be concerned with for a while.
+bool IsRightToLeftLanguage(Language lang);
+
+// IsMaybeRightToLeftLanguage
+// --------------------------
+//
+// This function returns true if the language may appear on the web in a
+// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
+//
+// NOTE: See important notes under IsRightToLeftLanguage(...).
+//
+// This function returns true for languages that *may* appear on the web in a
+// right-to-left script, even if they may also appear in a left-to-right
+// script.
+//
+// This function should typically be used in cases where doing some work on
+// left-to-right text would be OK (usually a no-op), and this function is used
+// just to cut down on unnecessary work on regular, LTR text.
+bool IsMaybeRightToLeftLanguage(Language lang);
+
+// IsScriptRequiringLongerSnippets
+// --------------------
+//
+// This function returns true if the script characteristics require longer
+// snippet length (Devanagari, Bengali, Gurmukhi,
+// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
+// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
+// bool IsScriptRequiringLongerSnippets(UnicodeScript script);
+
+
+// *******************************************
+// LANGUAGE NAMES
+//
+// This interface defines a standard name for each valid Language,
+// and a standard name for invalid languages. Some language names use all
+// uppercase letters, but others use mixed case.
+//   LanguageName() [Language to name]
+//   LanguageEnumName() [language to enum name]
+//   LanguageFromName() [name to Language]
+//   default_language_name()
+//   invalid_language_name()
+// *******************************************
+
+// Given a Language, returns its standard name.
+// Return invalid_language_name() if the language is invalid.
+const char* LanguageName(Language lang);
+
+// Given a Language, return the name of the enum constant for that
+// language. In all but a few cases, this is the same as its standard
+// name. For example, LanguageName(CHINESE) returns "Chinese", but
+// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
+// code that is generating C++ code, where the enum constant is more
+// useful than its integer value.  Return "NUM_LANGUAGES" if
+// the language is invalid.
+const char* LanguageEnumName(Language lang);
+
+// The maximum length of a standard language name.
+const int kMaxLanguageNameSize = 50;
+
+// The standard name for the default language.
+const char* default_language_name();
+
+// The standard name for all invalid languages.
+const char* invalid_language_name();
+
+// If lang_name matches the standard name of a Language, using a
+// case-insensitive comparison, set *language to that Language and
+// return true.
+// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
+//
+// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
+// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
+// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
+// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
+// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
+// CHINESE_T (i.e., a synonym for "ChineseT").
+//
+// REQUIRES: language must not be NULL.
+//
+bool LanguageFromName(const char* lang_name, Language *language);
+
+
+
+// *******************************************
+// LANGUAGE CODES
+//
+// This interface defines a standard code for each valid language, and
+// a standard code for invalid languages. These are derived from ISO codes,
+// with some Google additions.
+//   LanguageCode()
+//   default_language_code()
+//   invalid_language_code()
+//   LanguageCodeWithDialects()
+//   LanguageCodeISO639_1()
+//   LanguageCodeISO639_2()
+// *******************************************
+
+// Given a Language, return its standard code: the ISO 639-1 two-letter code
+// when one exists, else the ISO 639-2 code, else a Google-specific code, e.g.:
+//     For CHINESE_T, return "zh-TW".
+//     For TG_UNKNOWN_LANGUAGE, return "ut".
+//     For UNKNOWN_LANGUAGE, return "un".
+//     For PORTUGUESE_P, return "pt-PT".
+//     For PORTUGUESE_B, return "pt-BR".
+//     For LIMBU, return "sit-NP".
+//     For CHEROKEE, return "chr".
+//     For SYRIAC, return "syr".
+// If lang is invalid, return invalid_language_code().
+//
+// NOTE: See the note below about the codes for Chinese languages.
+//
+const char* LanguageCode(Language lang);
+
+// The maximum length of a language code.
+const int kMaxLanguageCodeSize = 50;
+
+// The standard code for the default language.
+const char* default_language_code();
+
+// The standard code for all invalid languages.
+const char* invalid_language_code();
+
+
+// --------------------------------------------
+// NOTE: CHINESE LANGUAGE CODES
+//
+// There are three functions that return codes for Chinese languages.
+// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
+// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
+// The following list shows the different results.
+//
+// LanguageCode(CHINESE) returns "zh"
+// LanguageCode(CHINESE_T) returns "zh-TW".
+//
+// LanguageCodeWithDialects(CHINESE) returns "zh-CN".
+// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
+//
+// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
+// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
+// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
+//
+// --------------------------------------------
+
+// LanguageCodeWithDialects
+// ------------------------
+//
+// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
+const char* LanguageCodeWithDialects(Language lang);
+
+// LanguageCodeISO639_1
+// --------------------
+//
+// Return the ISO 639-1 two-letter language code for lang.
+// Return invalid_language_code() if lang is invalid or does not have
+// an ISO 639-1 two-letter language code.
+const char* LanguageCodeISO639_1(Language lang);
+
+// LanguageCodeISO639_2
+// --------------------
+//
+// Return the ISO 639-2 three-letter language code for lang.
+// Return invalid_language_code() if lang is invalid or does not have
+// an ISO 639-2 three-letter language code.
+const char* LanguageCodeISO639_2(Language lang);
+
+// LanguageFromCode
+// ----------------
+//
+// If lang_code matches the code for a Language, using a case-insensitive
+// comparison, set *language to that Language and return true. Otherwise,
+// set *language to UNKNOWN_LANGUAGE and return false. lang_code can be an
+// ISO 639-1 (two-letter) code, an ISO 639-2 (three-letter) code, or a
+// Google-specific code (see LanguageCode).
+//
+// Certain language-code aliases are also allowed:
+//   For "zh-cn" and "zh_cn", set *language to CHINESE.
+//   For "zh-tw" and "zh_tw", set *language to CHINESE_T.
+//   For "sr-me" and "sr_me", set *language to MONTENEGRIN.
+//   For "he", set *language to HEBREW.
+//   For "in", set *language to INDONESIAN.
+//   For "ji", set *language to YIDDISH.
+//   For "fil", set *language to TAGALOG.
+//
+// REQUIRES: 'language' must not be NULL.
+bool LanguageFromCode(const char* lang_code, Language *language);
+
+
+// LanguageFromCodeOrName
+// ----------------------
+//
+// If lang_code_or_name is a language code or a language name,
+// set *language to the corresponding Language and return true.
+// Otherwise set *language to UNKNOWN_LANGUAGE and return false.
+//
+bool LanguageFromCodeOrName(const char* lang_code_or_name,
+                            Language* language);
+
+// LanguageNameFromCode
+// --------------------
+//
+// If language_code is the code for a Language (see LanguageFromCode),
+// return the standard name of that language (see LanguageName).
+// Otherwise return invalid_language_name().
+//
+const char* LanguageNameFromCode(const char* language_code);
+
+
+// Miscellany
+
+// LanguageCodeToUnderscoreForm
+// ----------------------------
+//
+// Given a language code, convert the dash "-" to underscore "_".
+//
+// Specifically, if result_length <= strlen(lang_code), set result[0]
+// to '\0' and return false. Otherwise, copy lang_code to result,
+// converting every dash to an underscore, converting every character
+// before the first dash or underscore to lower case, and converting
+// every character after the first dash or underscore to upper
+// case. If there is no dash or underscore, convert the entire string
+// to lower case.
+//
+// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
+
+bool LanguageCodeToUnderscoreForm(const char* lang_code,
+                                  char* result,
+                                  int result_length);
+
+//
+// AlwaysPutInExpectedRestrict
+// ---------------------------
+//
+// For Web pages in certain top-level domains, Web Search always
+// applies a "country restrict". If 'tld' matches one of those, using
+// a case-SENSITIVE comparison, set *expected_language to the Language
+// most commonly found in that top-level domain and return true.
+// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
+bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
+
+
+#endif  // UTIL_LANGUAGES_LANGUAGES_H_
diff --git a/contrib/google-ced/util/languages/languages.pb.h b/contrib/google-ced/util/languages/languages.pb.h
new file mode 100644
index 0000000..84f1d6a
--- /dev/null
+++ b/contrib/google-ced/util/languages/languages.pb.h
@@ -0,0 +1,191 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef UTIL_LANGUAGES_LANGUAGES_PB_H_
+#define UTIL_LANGUAGES_LANGUAGES_PB_H_
+
+enum Language {  // Explicit, contiguous values starting at 0.
+  ENGLISH       = 0,       // Returned by default_language().
+  DANISH        = 1,
+  DUTCH         = 2,
+  FINNISH       = 3,
+  FRENCH        = 4,
+  GERMAN        = 5,
+  HEBREW        = 6,
+  ITALIAN       = 7,
+  JAPANESE      = 8,
+  KOREAN        = 9,
+  NORWEGIAN     = 10,
+  POLISH        = 11,
+  PORTUGUESE    = 12,
+  RUSSIAN       = 13,
+  SPANISH       = 14,
+  SWEDISH       = 15,
+  CHINESE       = 16,
+  CZECH         = 17,
+  GREEK         = 18,
+  ICELANDIC     = 19,
+  LATVIAN       = 20,
+  LITHUANIAN    = 21,
+  ROMANIAN      = 22,
+  HUNGARIAN     = 23,
+  ESTONIAN      = 24,
+  TG_UNKNOWN_LANGUAGE   = 25,
+  UNKNOWN_LANGUAGE      = 26,  // Set by AlwaysPutInExpectedRestrict on no match.
+  BULGARIAN     = 27,
+  CROATIAN      = 28,
+  SERBIAN       = 29,
+  IRISH         = 30,      // UI only.
+  GALICIAN      = 31,
+  TAGALOG       = 32,      // Tagalog (tl) + Filipino (fil).
+  TURKISH       = 33,
+  UKRAINIAN     = 34,
+  HINDI         = 35,
+  MACEDONIAN    = 36,
+  BENGALI       = 37,
+  INDONESIAN    = 38,
+  LATIN         = 39,      // UI only.
+  MALAY         = 40,
+  MALAYALAM     = 41,
+  WELSH         = 42,      // UI only.
+  NEPALI        = 43,
+  TELUGU        = 44,
+  ALBANIAN      = 45,
+  TAMIL         = 46,
+  BELARUSIAN    = 47,
+  JAVANESE      = 48,      // UI only.
+  OCCITAN       = 49,      // UI only.
+  URDU          = 50,
+  BIHARI        = 51,
+  GUJARATI      = 52,
+  THAI          = 53,
+  ARABIC        = 54,
+  CATALAN       = 55,
+  ESPERANTO     = 56,
+  BASQUE        = 57,
+  INTERLINGUA   = 58,      // UI only.
+  KANNADA       = 59,
+  PUNJABI       = 60,
+  SCOTS_GAELIC  = 61,      // UI only.
+  SWAHILI       = 62,
+  SLOVENIAN     = 63,
+  MARATHI       = 64,
+  MALTESE       = 65,
+  VIETNAMESE    = 66,
+  FRISIAN       = 67,      // UI only.
+  SLOVAK        = 68,
+  CHINESE_T     = 69,      // This is added to solve the problem of
+                           // distinguishing Traditional and Simplified
+                           // Chinese when the encoding is UTF-8.
+  FAROESE       = 70,      // UI only.
+  SUNDANESE     = 71,      // UI only.
+  UZBEK         = 72,
+  AMHARIC       = 73,
+  AZERBAIJANI   = 74,
+  GEORGIAN      = 75,
+  TIGRINYA      = 76,      // UI only.
+  PERSIAN       = 77,
+  BOSNIAN       = 78,      // UI only. LangId language: CROATIAN (28)
+  SINHALESE     = 79,
+  NORWEGIAN_N   = 80,      // UI only. LangId language: NORWEGIAN (10)
+  PORTUGUESE_P  = 81,      // UI only. LangId language: PORTUGUESE (12)
+  PORTUGUESE_B  = 82,      // UI only. LangId language: PORTUGUESE (12)
+  XHOSA         = 83,      // UI only.
+  ZULU          = 84,      // UI only.
+  GUARANI       = 85,
+  SESOTHO       = 86,      // UI only.
+  TURKMEN       = 87,      // UI only.
+  KYRGYZ        = 88,
+  BRETON        = 89,      // UI only.
+  TWI           = 90,      // UI only.
+  YIDDISH       = 91,      // UI only.
+  SERBO_CROATIAN= 92,      // UI only. LangId language: SERBIAN (29)
+  SOMALI        = 93,      // UI only.
+  UIGHUR        = 94,
+  KURDISH       = 95,
+  MONGOLIAN     = 96,
+  ARMENIAN      = 97,
+  LAOTHIAN      = 98,
+  SINDHI        = 99,
+  RHAETO_ROMANCE= 100,     // UI only.
+  AFRIKAANS     = 101,
+  LUXEMBOURGISH = 102,     // UI only.
+  BURMESE       = 103,
+  KHMER         = 104,
+  TIBETAN       = 105,
+  DHIVEHI       = 106,     // sometimes spelled Divehi, lang of Maldives
+  CHEROKEE      = 107,
+  SYRIAC        = 108,     // UI only.
+  LIMBU         = 109,     // UI only.
+  ORIYA         = 110,
+  ASSAMESE      = 111,     // UI only.
+  CORSICAN      = 112,     // UI only.
+  INTERLINGUE   = 113,     // UI only.
+  KAZAKH        = 114,
+  LINGALA       = 115,     // UI only.
+  MOLDAVIAN     = 116,     // UI only. LangId language: ROMANIAN (22)
+  PASHTO        = 117,
+  QUECHUA       = 118,     // UI only.
+  SHONA         = 119,     // UI only.
+  TAJIK         = 120,
+  TATAR         = 121,     // UI only.
+  TONGA         = 122,     // UI only.
+  YORUBA        = 123,     // UI only.
+  CREOLES_AND_PIDGINS_ENGLISH_BASED       = 124,   // UI only.
+  CREOLES_AND_PIDGINS_FRENCH_BASED        = 125,   // UI only.
+  CREOLES_AND_PIDGINS_PORTUGUESE_BASED    = 126,   // UI only.
+  CREOLES_AND_PIDGINS_OTHER               = 127,   // UI only.
+  MAORI         = 128,     // UI only.
+  WOLOF         = 129,     // UI only.
+  ABKHAZIAN     = 130,     // UI only.
+  AFAR          = 131,     // UI only.
+  AYMARA        = 132,     // UI only.
+  BASHKIR       = 133,     // UI only.
+  BISLAMA       = 134,     // UI only.
+  DZONGKHA      = 135,     // UI only.
+  FIJIAN        = 136,     // UI only.
+  GREENLANDIC   = 137,     // UI only.
+  HAUSA         = 138,     // UI only.
+  HAITIAN_CREOLE= 139,     // UI only.
+  INUPIAK       = 140,     // UI only.
+  INUKTITUT     = 141,
+  KASHMIRI      = 142,     // UI only.
+  KINYARWANDA   = 143,     // UI only.
+  MALAGASY      = 144,     // UI only.
+  NAURU         = 145,     // UI only.
+  OROMO         = 146,     // UI only.
+  RUNDI         = 147,     // UI only.
+  SAMOAN        = 148,     // UI only.
+  SANGO         = 149,     // UI only.
+  SANSKRIT      = 150,
+  SISWANT       = 151,     // UI only.
+  TSONGA        = 152,     // UI only.
+  TSWANA        = 153,     // UI only.
+  VOLAPUK       = 154,     // UI only.
+  ZHUANG        = 155,     // UI only.
+  KHASI         = 156,     // UI only.
+  SCOTS         = 157,     // UI only.
+  GANDA         = 158,     // UI only.
+  MANX          = 159,     // UI only.
+  MONTENEGRIN   = 160,     // UI only. LangId language: SERBIAN (29)
+  NUM_LANGUAGES = 161,        // Always keep this at the end. It is not a
+                              // valid Language value; it only indicates
+                              // the total number of Languages.
+  // NOTE: If you add a language, you will break a unittest. The original
+  // "note at the top of this enum" is not included in this header.
+};
+
+#endif  // UTIL_LANGUAGES_LANGUAGES_PB_H_
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
commit	133a45c109da5310add55824db21af5239951f93 (patch)
tree	ba6ac4c0a950a0dda56451944315d66409923918 /contrib/google-ced/util/languages
parent	Initial commit. (diff)
download	rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip