summaryrefslogtreecommitdiffstats
path: root/contrib/google-ced/util/languages/languages.h
blob: 4237961e32626304a81c5e3aa45732b02347a123 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_LANGUAGES_LANGUAGES_H_
#define UTIL_LANGUAGES_LANGUAGES_H_

// This interface defines the Language enum and functions that depend
// only on Language values.

// A hash-function for Language, hash<Language>, is defined in
// i18n/languages/public/languages-hash.h

#ifndef SWIG
// Language enum defined in languages.proto
// Also description on how to add languages.
#include "util/languages/languages.pb.h"

#else

// TODO: Include a header containing swig-compatible enum.

#endif

const int kNumLanguages = NUM_LANGUAGES;

// Return the default language (ENGLISH).
Language default_language();


// *******************************************
// Language predicates
//   IsValidLanguage()
//   IS_LANGUAGE_UNKNOWN()
//   IsCJKLanguage()
//   IsChineseLanguage()
//   IsNorwegianLanguage()
//   IsPortugueseLanguage()
//   IsRightToLeftLanguage()
//   IsMaybeRightToLeftLanguage()
//   IsSameLanguage()
//   IsScriptRequiringLongerSnippets()
// *******************************************

// IsValidLanguage
// ===============
//
// Function to check if the input is within range of the Language enum. If
// IsValidLanguage(lang) returns true, it is safe to call
// static_cast<Language>(lang).
//
inline bool IsValidLanguage(int lang) {
  return ((lang >= 0) && (lang < kNumLanguages));
}

// Return true if the language is "unknown". (This function was
// previously a macro, hence the spelling in all caps.)
//
inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
  return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
}

// IsCJKLanguage
// -------------
//
// This function returns true if the language is either Chinese
// (simplified or traditional), Japanese, or Korean.
bool IsCJKLanguage(Language lang);

// IsChineseLanguage
// -----------------
//
// This function returns true if the language is either Chinese
// (simplified or traditional)
bool IsChineseLanguage(Language lang);

// IsNorwegianLanguage
// --------------------
//
// This function returns true if the language is any of the Norwegian
// (regular or Nynorsk).
bool IsNorwegianLanguage(Language lang);

// IsPortugueseLanguage
// --------------------
//
// This function returns true if the language is any of the Portuguese
// languages (regular, Portugal or Brazil)
bool IsPortugueseLanguage(Language lang);

// IsSameLanguage
// --------------
//
// WARNING: This function provides only a simple test on the values of
// the two Language arguments. It returns false if either language is
// invalid. It returns true if the language arguments are equal, or
// if they are both Chinese languages, both Norwegian languages, or
// both Portuguese languages, as defined by IsChineseLanguage,
// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
// false.
bool IsSameLanguage(Language lang1, Language lang2);


// IsRightToLeftLanguage
// ---------------------
//
// This function returns true if the language is only written right-to-left
// (E.g., Hebrew, Arabic, Persian etc.)
//
// IMPORTANT NOTE: Technically we're talking about scripts, not languages.
// There are languages that can be written in more than one script.
// Examples:
//   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
//     Latin or Cyrillic script, and right-to-left in Arabic script.
//   - Sindhi and Punjabi are written in different scripts, depending on
//     region and dialect.
//   - Turkmen used an Arabic script historically, but not any more.
//   - Pashto and Uyghur can use Arabic script, but use a Roman script
//     on the Internet.
//   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
//
// This function only returns true for languages that are always, unequivocally
// written in right-to-left script.
//
// TODO: If we want to do anything special with multi-script languages
// we should create new 'languages' for each language+script, as we do for
// traditional vs. simplified Chinese. However most such languages are rare in
// use and even rarer on the web, so this is unlikely to be something we'll
// be concerned with for a while.
bool IsRightToLeftLanguage(Language lang);

// IsMaybeRightToLeftLanguage
// --------------------------
//
// This function returns true if the language may appear on the web in a
// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
//
// NOTE: See important notes under IsRightToLeftLanguage(...).
//
// This function returns true for languages that *may* appear on the web in a
// right-to-left script, even if they may also appear in a left-to-right
// script.
//
// This function should typically be used in cases where doing some work on
// left-to-right text would be OK (usually a no-op), and this function is used
// just to cut down on unnecessary work on regular, LTR text.
bool IsMaybeRightToLeftLanguage(Language lang);

// IsScriptRequiringLongerSnippets
// --------------------
//
// This function returns true if the script chracteristics require longer
// snippet length (Devanagari, Bengali, Gurmukhi,
// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
// bool IsScriptRequiringLongerSnippets(UnicodeScript script);


// *******************************************
// LANGUAGE NAMES
//
// This interface defines a standard name for each valid Language,
// and a standard name for invalid languages. Some language names use all
// uppercase letters, but others use mixed case.
//   LanguageName() [Language to name]
//   LanguageEnumName() [language to enum name]
//   LanguageFromName() [name to Language]
//   default_language_name()
//   invalid_language_name()
// *******************************************

// Given a Language, returns its standard name.
// Return invalid_language_name() if the language is invalid.
const char* LanguageName(Language lang);

// Given a Language, return the name of the enum constant for that
// language. In all but a few cases, this is the same as its standard
// name. For example, LanguageName(CHINESE) returns "Chinese", but
// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
// code that is generating C++ code, where the enum constant is more
// useful than its integer value.  Return "NUM_LANGUAGES" if
// the language is invalid.
const char* LanguageEnumName(Language lang);

// The maximum length of a standard language name.
const int kMaxLanguageNameSize = 50;

// The standard name for the default language.
const char* default_language_name();

// The standard name for all invalid languages.
const char* invalid_language_name();

// If lang_name matches the standard name of a Language, using a
// case-insensitive comparison, set *language to that Language and
// return true.
// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
//
// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
// CHINESE_T (i.e., a synonym for "ChineseT").
//
// REQUIRES: language must not be NULL.
//
bool LanguageFromName(const char* lang_name, Language *language);



// *******************************************
// LANGUAGE CODES
//
// This interface defines a standard code for each valid language, and
// a standard code for invalid languages. These are derived from ISO codes,
// with some Google additions.
//   LanguageCode()
//   default_language_code()
//   invalid_language_code()
//   LanguageCodeWithDialects()
//   LanguageCodeISO639_1()
//   LanguageCodeISO639_2()
// *******************************************

// Given a Language, return its standard code. There are Google-specific codes:
//     For CHINESE_T, return "zh-TW".
//     For TG_UNKNOWN_LANGUAGE, return "ut".
//     For UNKNOWN_LANGUAGE, return "un".
//     For PORTUGUESE_P, return "pt-PT".
//     For PORTUGUESE_B, return "pt-BR".
//     For LIMBU, return "sit-NP".
//     For CHEROKEE, return "chr".
//     For SYRIAC, return "syr".
// Otherwise return the ISO 639-1 two-letter language code for lang.
// If lang is invalid, return invalid_language_code().
//
// NOTE: See the note below about the codes for Chinese languages.
//
const char* LanguageCode(Language lang);

// The maximum length of a language code.
const int kMaxLanguageCodeSize = 50;

// The standard code for the default language.
const char* default_language_code();

// The standard code for all invalid languages.
const char* invalid_language_code();


// --------------------------------------------
// NOTE: CHINESE LANGUAGE CODES
//
// There are three functions that return codes for Chinese languages.
// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
// The following list shows the different results.
//
// LanguageCode(CHINESE) returns "zh"
// LanguageCode(CHINESE_T) returns "zh-TW".
//
// LanguageCodeWithDialects(CHINESE) returns "zh-CN".
// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
//
// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
//
// --------------------------------------------

// LanguageCodeWithDialects
// ------------------------
//
// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
const char* LanguageCodeWithDialects(Language lang);

// LanguageCodeISO639_1
// --------------------
//
// Return the ISO 639-1 two-letter language code for lang.
// Return invalid_language_code() if lang is invalid or does not have
// an ISO 639-1 two-letter language code.
const char* LanguageCodeISO639_1(Language lang);

// LanguageCodeISO639_2
// --------------------
//
// Return the ISO 639-2 three-letter language for lang.
// Return invalid_language_code() if lang is invalid or does not have
// an ISO 639-2 three-letter language code.
const char* LanguageCodeISO639_2(Language lang);

// LanguageFromCode
// ----------------
//
// If lang_code matches the code for a Language, using a case-insensitive
// comparison, set *lang to that Language and return true.
// Otherwise, set *lang to UNKNOWN_LANGUAGE and return false.
//
// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
// (three-letter) code, or a Google-specific code (see LanguageCode).
//
// Certain language-code aliases are also allowed:
//   For "zh-cn" and "zh_cn", set *lang to CHINESE.
//   For "zh-tw" and "zh_tw", set *lang to CHINESE_T.
//   For "he", set *lang to HEBREW.
//   For "in", set *lang to INDONESIAN.
//   For "ji", set *lang to YIDDISH.
//   For "fil", set *lang to TAGALOG.
//
// REQUIRES: 'lang' must not be NULL.
bool LanguageFromCode(const char* lang_code, Language *language);


// LanguageFromCodeOrName
// ----------------------
//
// If lang_code_or_name is a language code or a language name.
// set *language to the corresponding Language and return true.
// Otherwise set *language to UNKNOWN_LANGUAGE and return false.
//
bool LanguageFromCodeOrName(const char* lang_code_or_name,
                            Language* language);

// LanguageNameFromCode
// --------------------
//
// If language_code is the code for a Language (see LanguageFromCode),
// return the standard name of that language (see LanguageName).
// Otherwise return invalid_language_name().
//
const char* LanguageNameFromCode(const char* language_code);


// Miscellany

// LanguageCodeToUnderscoreForm
// ----------------------------
//
// Given a language code, convert the dash "-" to underscore "_".
//
// Specifically, if result_length <= strlen(lang_code), set result[0]
// to '\0' and return false. Otherwise, copy lang_code to result,
// converting every dash to an underscore, converting every character
// before the first dash or underscore to lower case, and converting
// every character after the first dash or underscore to upper
// case. If there is no dash or underscore, convert the entire string
// to lower case.
//
// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.

bool LanguageCodeToUnderscoreForm(const char* lang_code,
                                  char* result,
                                  int result_length);

//
// AlwaysPutInExpectedRestrict
// ---------------------------
//
// For Web pages in certain top-level domains, Web Search always
// applies a "country restrict". If 'tld' matches one of those, using
// a case-SENSITIVE comparison, set *expected_language to the Language
// most commonly found in that top-level domain and return true.
// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);


#endif  // UTIL_LANGUAGES_LANGUAGES_H_