diff options
Diffstat (limited to 'contrib/google-ced/util/languages/languages.h')
-rw-r--r-- | contrib/google-ced/util/languages/languages.h | 381 |
1 files changed, 381 insertions, 0 deletions
diff --git a/contrib/google-ced/util/languages/languages.h b/contrib/google-ced/util/languages/languages.h new file mode 100644 index 0000000..4237961 --- /dev/null +++ b/contrib/google-ced/util/languages/languages.h @@ -0,0 +1,381 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef UTIL_LANGUAGES_LANGUAGES_H_ +#define UTIL_LANGUAGES_LANGUAGES_H_ + +// This interface defines the Language enum and functions that depend +// only on Language values. + +// A hash-function for Language, hash<Language>, is defined in +// i18n/languages/public/languages-hash.h + +#ifndef SWIG +// Language enum defined in languages.proto +// Also description on how to add languages. +#include "util/languages/languages.pb.h" + +#else + +// TODO: Include a header containing swig-compatible enum. + +#endif + +const int kNumLanguages = NUM_LANGUAGES; + +// Return the default language (ENGLISH). 
+Language default_language(); + + +// ******************************************* +// Language predicates +// IsValidLanguage() +// IS_LANGUAGE_UNKNOWN() +// IsCJKLanguage() +// IsChineseLanguage() +// IsNorwegianLanguage() +// IsPortugueseLanguage() +// IsRightToLeftLanguage() +// IsMaybeRightToLeftLanguage() +// IsSameLanguage() +// IsScriptRequiringLongerSnippets() +// ******************************************* + +// IsValidLanguage +// =============== +// +// Function to check if the input is within range of the Language enum. If +// IsValidLanguage(lang) returns true, it is safe to call +// static_cast<Language>(lang). +// +inline bool IsValidLanguage(int lang) { + return ((lang >= 0) && (lang < kNumLanguages)); +} + +// Return true if the language is "unknown", i.e. it is either +// TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE. (This function was +// previously a macro, hence the spelling in all caps.) +// +inline bool IS_LANGUAGE_UNKNOWN(Language lang) { + return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE; +} + +// IsCJKLanguage +// ------------- +// +// This function returns true if the language is either Chinese +// (simplified or traditional), Japanese, or Korean. +bool IsCJKLanguage(Language lang); + +// IsChineseLanguage +// ----------------- +// +// This function returns true if the language is either Chinese +// (simplified or traditional). +bool IsChineseLanguage(Language lang); + +// IsNorwegianLanguage +// -------------------- +// +// This function returns true if the language is any of the Norwegian +// languages (regular or Nynorsk). +bool IsNorwegianLanguage(Language lang); + +// IsPortugueseLanguage +// -------------------- +// +// This function returns true if the language is any of the Portuguese +// languages (regular, Portugal or Brazil). +bool IsPortugueseLanguage(Language lang); + +// IsSameLanguage +// -------------- +// +// WARNING: This function provides only a simple test on the values of +// the two Language arguments. It returns false if either language is +// invalid. 
It returns true if the language arguments are equal, or +// if they are both Chinese languages, both Norwegian languages, or +// both Portuguese languages, as defined by IsChineseLanguage, +// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns +// false. +bool IsSameLanguage(Language lang1, Language lang2); + + +// IsRightToLeftLanguage +// --------------------- +// +// This function returns true if the language is only written right-to-left +// (E.g., Hebrew, Arabic, Persian etc.) +// +// IMPORTANT NOTE: Technically we're talking about scripts, not languages. +// There are languages that can be written in more than one script. +// Examples: +// - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in +// Latin or Cyrillic script, and right-to-left in Arabic script. +// - Sindhi and Punjabi are written in different scripts, depending on +// region and dialect. +// - Turkmen used an Arabic script historically, but not any more. +// - Pashto and Uyghur can use Arabic script, but use a Roman script +// on the Internet. +// - Kashmiri and Urdu are written either with Arabic or Devanagari script. +// +// This function only returns true for languages that are always, unequivocally +// written in right-to-left script. +// +// TODO: If we want to do anything special with multi-script languages +// we should create new 'languages' for each language+script, as we do for +// traditional vs. simplified Chinese. However most such languages are rare in +// use and even rarer on the web, so this is unlikely to be something we'll +// be concerned with for a while. +bool IsRightToLeftLanguage(Language lang); + +// IsMaybeRightToLeftLanguage +// -------------------------- +// +// This function returns true if the language may appear on the web in a +// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.) +// +// NOTE: See important notes under IsRightToLeftLanguage(...). 
+// +// This function returns true for languages that *may* appear on the web in a +// right-to-left script, even if they may also appear in a left-to-right +// script. +// +// This function should typically be used in cases where doing some work on +// left-to-right text would be OK (usually a no-op), and this function is used +// just to cut down on unnecessary work on regular, LTR text. +bool IsMaybeRightToLeftLanguage(Language lang); + +// IsScriptRequiringLongerSnippets +// -------------------- +// +// This function returns true if the script characteristics require longer +// snippet length (Devanagari, Bengali, Gurmukhi, +// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam). +// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE +// bool IsScriptRequiringLongerSnippets(UnicodeScript script); + + +// ******************************************* +// LANGUAGE NAMES +// +// This interface defines a standard name for each valid Language, +// and a standard name for invalid languages. Some language names use all +// uppercase letters, but others use mixed case. +// LanguageName() [Language to name] +// LanguageEnumName() [Language to enum name] +// LanguageFromName() [name to Language] +// default_language_name() +// invalid_language_name() +// ******************************************* + +// Given a Language, returns its standard name. +// Return invalid_language_name() if the language is invalid. +const char* LanguageName(Language lang); + +// Given a Language, return the name of the enum constant for that +// language. In all but a few cases, this is the same as its standard +// name. For example, LanguageName(CHINESE) returns "Chinese", but +// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for +// code that is generating C++ code, where the enum constant is more +// useful than its integer value. Return "NUM_LANGUAGES" if +// the language is invalid. 
+const char* LanguageEnumName(Language lang); + +// The maximum length of a standard language name. +const int kMaxLanguageNameSize = 50; + +// The standard name for the default language. +const char* default_language_name(); + +// The standard name for all invalid languages. +const char* invalid_language_name(); + +// If lang_name matches the standard name of a Language, using a +// case-insensitive comparison, set *language to that Language and +// return true. +// Otherwise, set *language to UNKNOWN_LANGUAGE and return false. +// +// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name +// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA. +// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed +// as a name for UNKNOWN_LANGUAGE (the return value is true in this case, +// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for +// CHINESE_T (i.e., a synonym for "ChineseT"). +// +// REQUIRES: language must not be NULL. +// +bool LanguageFromName(const char* lang_name, Language *language); + + + +// ******************************************* +// LANGUAGE CODES +// +// This interface defines a standard code for each valid language, and +// a standard code for invalid languages. These are derived from ISO codes, +// with some Google additions. +// LanguageCode() +// default_language_code() +// invalid_language_code() +// LanguageCodeWithDialects() +// LanguageCodeISO639_1() +// LanguageCodeISO639_2() +// ******************************************* + +// Given a Language, return its standard code. There are Google-specific codes: +// For CHINESE_T, return "zh-TW". +// For TG_UNKNOWN_LANGUAGE, return "ut". +// For UNKNOWN_LANGUAGE, return "un". +// For PORTUGUESE_P, return "pt-PT". +// For PORTUGUESE_B, return "pt-BR". +// For LIMBU, return "sit-NP". +// For CHEROKEE, return "chr". +// For SYRIAC, return "syr". +// Otherwise return the ISO 639-1 two-letter language code for lang. 
+// If lang is invalid, return invalid_language_code(). +// +// NOTE: See the note below about the codes for Chinese languages. +// +const char* LanguageCode(Language lang); + +// The maximum length of a language code. +const int kMaxLanguageCodeSize = 50; + +// The standard code for the default language. +const char* default_language_code(); + +// The standard code for all invalid languages. +const char* invalid_language_code(); + + +// -------------------------------------------- +// NOTE: CHINESE LANGUAGE CODES +// +// There are three functions that return codes for Chinese languages. +// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here. +// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h. +// The following list shows the different results. +// +// LanguageCode(CHINESE) returns "zh". +// LanguageCode(CHINESE_T) returns "zh-TW". +// +// LanguageCodeWithDialects(CHINESE) returns "zh-CN". +// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW". +// +// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW". +// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW". +// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN". +// +// -------------------------------------------- + +// LanguageCodeWithDialects +// ------------------------ +// +// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang). +const char* LanguageCodeWithDialects(Language lang); + +// LanguageCodeISO639_1 +// -------------------- +// +// Return the ISO 639-1 two-letter language code for lang. +// Return invalid_language_code() if lang is invalid or does not have +// an ISO 639-1 two-letter language code. +const char* LanguageCodeISO639_1(Language lang); + +// LanguageCodeISO639_2 +// -------------------- +// +// Return the ISO 639-2 three-letter language code for lang. +// Return invalid_language_code() if lang is invalid or does not have +// an ISO 639-2 three-letter language code. 
+const char* LanguageCodeISO639_2(Language lang); + +// LanguageFromCode +// ---------------- +// +// If lang_code matches the code for a Language, using a case-insensitive +// comparison, set *language to that Language and return true. +// Otherwise, set *language to UNKNOWN_LANGUAGE and return false. +// +// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2 +// (three-letter) code, or a Google-specific code (see LanguageCode). +// +// Certain language-code aliases are also allowed: +// For "zh-cn" and "zh_cn", set *language to CHINESE. +// For "zh-tw" and "zh_tw", set *language to CHINESE_T. +// For "he", set *language to HEBREW. +// For "in", set *language to INDONESIAN. +// For "ji", set *language to YIDDISH. +// For "fil", set *language to TAGALOG. +// +// REQUIRES: 'language' must not be NULL. +bool LanguageFromCode(const char* lang_code, Language *language); + + +// LanguageFromCodeOrName +// ---------------------- +// +// If lang_code_or_name is a language code or a language name, +// set *language to the corresponding Language and return true. +// Otherwise set *language to UNKNOWN_LANGUAGE and return false. +// +bool LanguageFromCodeOrName(const char* lang_code_or_name, + Language* language); + +// LanguageNameFromCode +// -------------------- +// +// If language_code is the code for a Language (see LanguageFromCode), +// return the standard name of that language (see LanguageName). +// Otherwise return invalid_language_name(). +// +const char* LanguageNameFromCode(const char* language_code); + + +// Miscellany + +// LanguageCodeToUnderscoreForm +// ---------------------------- +// +// Given a language code, convert the dash "-" to underscore "_". +// +// Specifically, if result_length <= strlen(lang_code), set result[0] +// to '\0' and return false. 
Otherwise, copy lang_code to result, +// converting every dash to an underscore, converting every character +// before the first dash or underscore to lower case, and converting +// every character after the first dash or underscore to upper +// case. If there is no dash or underscore, convert the entire string +// to lower case. +// +// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL. + +bool LanguageCodeToUnderscoreForm(const char* lang_code, + char* result, + int result_length); + +// +// AlwaysPutInExpectedRestrict +// --------------------------- +// +// For Web pages in certain top-level domains, Web Search always +// applies a "country restrict". If 'tld' matches one of those, using +// a case-SENSITIVE comparison, set *expected_language to the Language +// most commonly found in that top-level domain and return true. +// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false. +bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language); + + +#endif // UTIL_LANGUAGES_LANGUAGES_H_ |