diff options
Diffstat (limited to 'toolkit/components/translation/cld2/internal/compact_lang_det_hint_code.cc')
-rw-r--r-- | toolkit/components/translation/cld2/internal/compact_lang_det_hint_code.cc | 1649 |
1 files changed, 1649 insertions, 0 deletions
diff --git a/toolkit/components/translation/cld2/internal/compact_lang_det_hint_code.cc b/toolkit/components/translation/cld2/internal/compact_lang_det_hint_code.cc new file mode 100644 index 0000000000..9bde8a86a8 --- /dev/null +++ b/toolkit/components/translation/cld2/internal/compact_lang_det_hint_code.cc @@ -0,0 +1,1649 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + +#include "compact_lang_det_hint_code.h" + +#include <stdlib.h> // for abs() +#include <stdio.h> // for sprintf() +#include <string.h> // +#include "lang_script.h" +#include "port.h" + +using namespace std; + +namespace CLD2 { + +static const int kCLDPriorEncodingWeight = 4; // 100x more likely +static const int kCLDPriorLanguageWeight = 8; // 10000x more likely + + +// Tables to map lang="..." language code lists to actual languages. +// based on scraping and hand-edits, dsites June 2011 + +// n = f(string, &a) gives list of n<=4 language pairs: primary, secondary + +// For close pairs like ms/id, more weight on TLD and lang= +// Alternately, weaker boost but mark others of set as negative; +// makes "neither" an easier result. +// lang=en low weight 4 +// tld=lu boost lu maaybe 4. but lang= alwyas overcomes tld and encoding +// (except maybe en) + +// TLD to separate, e.g., burundi from rwanda + +// Encoding lookup: OneLangProb array +// TLD lookup: tld OneLangProb pairs + + +typedef struct { + const char* const langtag; // Lowercased, hyphen only lookup key + const char* const langcode; // Canonical language codes; two if ambiguous + OneCLDLangPrior onelangprior1; + OneCLDLangPrior onelangprior2; +} LangTagLookup; + +typedef struct { + const char* const tld; // Lowercased, hyphen only lookup key + OneCLDLangPrior onelangprior1; + OneCLDLangPrior onelangprior2; +} TLDLookup; + + +#define W2 (2 << 10) // 3**2 = 10x more likely +#define W4 (4 << 10) // 3**4 = 100x more likely +#define W6 (6 << 10) // 3**6 = 1000x more likely +#define W8 (8 << 10) // 3**8 = 10K x more likely +#define W10 (10 << 10) // 3**10 = 100K x more likely +#define W12 (12 << 10) // 3**12 = 1M x more likely + +// TODO: more about ba hr sr sr-ME and sl +// Temporary state of affairs: +// BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN +// Eventually, we want to do all four, but it requires a CLD change to handle +// up to six languages per quadgram. + + +// Close pairs boost one of pair, demote other. +// Statistically close pairs: +// INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used +// +// INDONESIAN MALAY coef=0.4698 Problematic w/o extra words +// TIBETAN DZONGKHA coef=0.4571 +// CZECH SLOVAK coef=0.4273 +// NORWEGIAN NORWEGIAN_N coef=0.4182 +// +// HINDI MARATHI coef=0.3795 +// ZULU XHOSA coef=0.3716 +// +// DANISH NORWEGIAN coef=0.3672 Usually OK +// BIHARI HINDI coef=0.3668 Usually OK +// ICELANDIC FAROESE coef=0.3519 Usually OK + +// +// Table to look up lang= tags longer than three characters +// Overrides table below, which is truncated at first hyphen +// In alphabetical order for binary search +static const int kCLDTable1Size = 213; +static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { + {"abkhazian", "ab", ABKHAZIAN + W10, 0}, + {"afar", "aa", AFAR + W10, 0}, + {"afrikaans", "af", AFRIKAANS + W10, 0}, + {"akan", "ak", AKAN + W10, 0}, + {"albanian", "sq", ALBANIAN + W10, 0}, + {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous + {"amharic", "am", AMHARIC + W10, 0}, + {"arabic", "ar", ARABIC + W10, 0}, + {"argentina", "es", SPANISH + W10, 0}, + {"armenian", "hy", ARMENIAN + W10, 0}, + {"assamese", "as", ASSAMESE + W10, 0}, + {"aymara", "ay", AYMARA + W10, 0}, + {"azerbaijani", "az", AZERBAIJANI + W10, 0}, + + {"bangla", "bn", BENGALI + W10, 0}, + {"bashkir", "ba", BASHKIR + W10, 0}, + {"basque", "eu", BASQUE + W10, 0}, + {"belarusian", "be", BELARUSIAN + W10, 0}, + {"bengali", "bn", BENGALI + W10, 0}, + {"bihari", "bh", BIHARI + W10, HINDI - W4}, + {"bislama", "bi", BISLAMA + W10, 0}, + {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian + {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous + {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous + {"breton", "br", BRETON + W10, 0}, + {"bulgarian", "bg", BULGARIAN + W10, 0}, + {"burmese", "my", BURMESE + W10, 0}, // Myanmar + + {"catalan", "ca", CATALAN + W10, 0}, + {"cherokee", "chr", CHEROKEE + W10, 0}, + {"chichewa", "ny", NYANJA + W10, 0}, + + {"chinese", "zh", CHINESE + W10, 0}, + {"chinese-t", "zhT", CHINESE_T + W10, 0}, + {"chineset", "zhT", CHINESE_T + W10, 0}, + {"corsican", "co", CORSICAN + W10, 0}, + {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based + {"croatian", "hr", CROATIAN + W10, 0}, + {"czech", "cs", CZECH + W10, SLOVAK - W4}, + + {"danish", "da", DANISH + W10, NORWEGIAN - W4}, + {"deutsch", "de", GERMAN + W10, 0}, + {"dhivehi", "dv", DHIVEHI + W10, 0}, + {"dutch", "nl", DUTCH + W10, 0}, + {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4}, + + {"ell-gr", "el", GREEK + W10, 0}, + {"english", "en", ENGLISH + W4, 0}, + {"esperanto", "eo", ESPERANTO + W10, 0}, + {"estonian", "et", ESTONIAN + W10, 0}, + {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding + {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding + + {"faroese", "fo", FAROESE + W10, ICELANDIC - W4}, + {"fijian", "fj", FIJIAN + W10, 0}, + {"finnish", "fi", FINNISH + W10, 0}, + {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII + {"francais", "fr", FRENCH + W10, 0}, + {"french", "fr", FRENCH + W10, 0}, + {"frisian", "fy", FRISIAN + W10, 0}, + + {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous + {"galician", "gl", GALICIAN + W10, 0}, + {"ganda", "lg", GANDA + W10, 0}, + {"georgian", "ka", GEORGIAN + W10, 0}, + {"german", "de", GERMAN + W10, 0}, + {"greek", "el", GREEK + W10, 0}, + {"greenlandic", "kl", GREENLANDIC + W10, 0}, + {"guarani", "gn", GUARANI + W10, 0}, + {"gujarati", "gu", GUJARATI + W10, 0}, + + {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0}, + {"hausa", "ha", HAUSA + W10, 0}, + {"hawaiian", "haw", HAWAIIAN + W10, 0}, + {"hebrew", "he", HEBREW + W10, 0}, + {"hindi", "hi", HINDI + W10, MARATHI - W4}, + {"hn-in", "hi", HINDI + W10, MARATHI - W4}, + {"hungarian", "hu", HUNGARIAN + W10, 0}, + + {"icelandic", "is", ICELANDIC + W10, FAROESE - W4}, + {"igbo", "ig", IGBO + W10, 0}, + {"indonesian", "id", INDONESIAN + W10, MALAY - W4}, + {"interlingua", "ia", INTERLINGUA + W10, 0}, + {"interlingue", "ie", INTERLINGUE + W10, 0}, + // 1:2 iu-Cans ik-Latn + {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 + {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 + {"ir-ie", "ga", IRISH + W10, 0}, // Irish + {"irish", "ga", IRISH + W10, 0}, + {"italian", "it", ITALIAN + W10, 0}, + + {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding + {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding + {"japanese", "ja", JAPANESE + W10, 0}, + {"javanese", "jw", JAVANESE + W10, 0}, + + {"kannada", "kn", KANNADA + W10, 0}, + {"kashmiri", "ks", KASHMIRI + W10, 0}, + {"kazakh", "kk", KAZAKH + W10, 0}, + {"khasi", "kha", KHASI + W10, 0}, + {"khmer", "km", KHMER + W10, 0}, + {"kinyarwanda", "rw", KINYARWANDA + W10, 0}, + {"klingon", "tlh", X_KLINGON + W10, 0}, + {"korean", "ko", KOREAN + W10, 0}, + {"kurdish", "ku", KURDISH + W10, 0}, + {"kyrgyz", "ky", KYRGYZ + W10, 0}, + + {"laothian", "lo", LAOTHIAN + W10, 0}, + {"latin", "la", LATIN + W10, 0}, + {"latvian", "lv", LATVIAN + W10, 0}, + {"limbu", "sit", LIMBU + W10, 0}, + {"lingala", "ln", LINGALA + W10, 0}, + {"lithuanian", "lt", LITHUANIAN + W10, 0}, + {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0}, + + {"macedonian", "mk", MACEDONIAN + W10, 0}, + {"malagasy", "mg", MALAGASY + W10, 0}, + {"malay", "ms", MALAY + W10, INDONESIAN - W4}, + {"malayalam", "ml", MALAYALAM + W10, 0}, + {"maltese", "mt", MALTESE + W10, 0}, + {"manx", "gv", MANX + W10, 0}, + {"maori", "mi", MAORI + W10, 0}, + {"marathi", "mr", MARATHI + W10, HINDI - W4}, + {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0}, + {"moldavian", "mo", ROMANIAN + W10, 0}, + {"mongolian", "mn", MONGOLIAN + W10, 0}, + {"montenegrin", "sr-me", MONTENEGRIN + W10, 0}, + {"myanmar", "my", BURMESE + W10, 0}, // Myanmar + {"nauru", "na", NAURU + W10, 0}, + {"ndebele", "nr", NDEBELE + W10, 0}, + {"nepali", "ne", NEPALI + W10, 0}, + {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal + {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, + {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal + {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, + {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk + {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, + {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, + {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, + {"nyanja", "ny", NYANJA + W10, 0}, + + {"occitan", "oc", OCCITAN + W10, 0}, + {"oriya", "or", ORIYA + W10, 0}, + {"oromo", "om", OROMO + W10, 0}, + {"parsi", "fa", PERSIAN + W10, 0}, + + {"pashto", "ps", PASHTO + W10, 0}, + {"pedi", "nso", PEDI + W10, 0}, + {"persian", "fa", PERSIAN + W10, 0}, + {"polish", "pl", POLISH + W10, 0}, + {"polska", "pl", POLISH + W10, 0}, + {"polski", "pl", POLISH + W10, 0}, + {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII + {"portuguese", "pt", PORTUGUESE + W10, 0}, + {"punjabi", "pa", PUNJABI + W10, 0}, + + {"quechua", "qu", QUECHUA + W10, 0}, + + {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0}, + {"romanian", "ro", ROMANIAN + W10, 0}, + {"rundi", "rn", RUNDI + W10, 0}, + {"russian", "ru", RUSSIAN + W10, 0}, + + {"samoan", "sm", SAMOAN + W10, 0}, + {"sango", "sg", SANGO + W10, 0}, + {"sanskrit", "sa", SANSKRIT + W10, 0}, + {"scots", "sco", SCOTS + W10, ENGLISH - W4}, + {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0}, + {"serbian", "sr", SERBIAN + W10, 0}, + {"seselwa", "crs", SESELWA + W10, 0}, + {"sesotho", "st", SESOTHO + W10, 0}, + {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding + {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding + {"shona", "sn", SHONA + W10, 0}, + {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous + {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous + {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous + {"sindhi", "sd", SINDHI + W10, 0}, + {"sinhalese", "si", SINHALESE + W10, 0}, + {"siswant", "ss", SISWANT + W10, 0}, + {"sit-np", "sit", LIMBU + W10, 0}, + {"slovak", "sk", SLOVAK + W10, CZECH - W4}, + {"slovenian", "sl", SLOVENIAN + W10, 0}, + {"somali", "so", SOMALI + W10, 0}, + {"spanish", "es", SPANISH + W10, 0}, + {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin + {"sundanese", "su", SUNDANESE + W10, 0}, + {"suomi", "fi", FINNISH + W10, 0}, // Finnish + {"swahili", "sw", SWAHILI + W10, 0}, + {"swedish", "sv", SWEDISH + W10, 0}, + {"syriac", "syr", SYRIAC + W10, 0}, + + {"tagalog", "tl", TAGALOG + W10, 0}, + {"tajik", "tg", TAJIK + W10, 0}, + {"tamil", "ta", TAMIL + W10, 0}, + {"tatar", "tt", TATAR + W10, 0}, + {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet + {"tchinese", "zhT", CHINESE_T + W10, 0}, + {"telugu", "te", TELUGU + W10, 0}, + {"thai", "th", THAI + W10, 0}, + {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4}, + {"tigrinya", "ti", TIGRINYA + W10, 0}, + {"tonga", "to", TONGA + W10, 0}, + {"tsonga", "ts", TSONGA + W10, 0}, + {"tswana", "tn", TSWANA + W10, 0}, + {"tt-ru", "tt", TATAR + W10, 0}, + {"tur-tr", "tr", TURKISH + W10, 0}, + {"turkish", "tr", TURKISH + W10, 0}, + {"turkmen", "tk", TURKMEN + W10, 0}, + {"uighur", "ug", UIGHUR + W10, 0}, + {"ukrainian", "uk", UKRAINIAN + W10, 0}, + {"urdu", "ur", URDU + W10, 0}, + {"uzbek", "uz", UZBEK + W10, 0}, + + {"venda", "ve", VENDA + W10, 0}, + {"vietnam", "vi", VIETNAMESE + W10, 0}, + {"vietnamese", "vi", VIETNAMESE + W10, 0}, + {"volapuk", "vo", VOLAPUK + W10, 0}, + + {"welsh", "cy", WELSH + W10, 0}, + {"wolof", "wo", WOLOF + W10, 0}, + + {"xhosa", "xh", XHOSA + W10, ZULU - W4}, + + {"yiddish", "yi", YIDDISH + W10, 0}, + {"yoruba", "yo", YORUBA + W10, 0}, + + {"zh-classical", "zhT", CHINESE_T + W10, 0}, + {"zh-cn", "zh", CHINESE + W10, 0}, + {"zh-hans", "zh", CHINESE + W10, 0}, + {"zh-hant", "zhT", CHINESE_T + W10, 0}, + {"zh-hk", "zhT", CHINESE_T + W10, 0}, + {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT + {"zh-sg", "zhT", CHINESE_T + W10, 0}, + {"zh-tw", "zhT", CHINESE_T + W10, 0}, + {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese + {"zhuang", "za", ZHUANG + W10, 0}, + {"zulu", "zu", ZULU + W10, XHOSA - W4}, +}; + + + +// Table to look up lang= tags of two/three characters after truncate at hyphen +// In alphabetical order for binary search +static const int kCLDTable2Size = 257; +static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = { + {"aa", "aa", AFAR + W10, 0}, + {"ab", "ab", ABKHAZIAN + W10, 0}, + {"af", "af", AFRIKAANS + W10, 0}, + {"ak", "ak", AKAN + W10, 0}, + {"al", "sq", ALBANIAN + W10, 0}, // Albania + {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian + {"ar", "ar", ARABIC + W10, 0}, + {"ara", "ar", ARABIC + W10, 0}, + {"arm", "hy", ARMENIAN + W10, 0}, // Armenia + {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic + {"as", "as", ASSAMESE + W10, 0}, + {"at", "de", GERMAN + W10, 0}, // Austria + {"au", "de", GERMAN + W10, 0}, // Austria + {"ay", "ay", AYMARA + W10, 0}, + {"az", "az", AZERBAIJANI + W10, 0}, + {"aze", "az", AZERBAIJANI + W10, 0}, + + {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia + {"be", "be", BELARUSIAN + W10, 0}, + {"bel", "be", BELARUSIAN + W10, 0}, + {"bg", "bg", BULGARIAN + W10, 0}, + {"bh", "bh", BIHARI + W10, HINDI - W4}, + {"bi", "bi", BISLAMA + W10, 0}, + {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding + {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia + {"bn", "bn", BENGALI + W10, 0}, + {"bo", "bo", TIBETAN + W10, DZONGKHA - W4}, + // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win + {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil + {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian + + {"ca", "ca", CATALAN + W10, 0}, + {"cat", "ca", CATALAN + W10, 0}, + {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland + {"chn", "zh", CHINESE + W10, 0}, + {"chr", "chr", CHEROKEE + W10, 0}, + {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish + {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker. + // Offset by 2 so that TLD=tw or + // enc=big5 will put zhT ahead + {"co", "co", CORSICAN + W10, 0}, + {"cro", "hr", CROATIAN + W10, 0}, // Croatia + {"crs", "crs", SESELWA + W10, 0}, + {"cs", "cs", CZECH + W10, SLOVAK - W4}, + {"ct", "ca", CATALAN + W10, 0}, + {"cy", "cy", WELSH + W10, 0}, + {"cym", "cy", WELSH + W10, 0}, + {"cz", "cs", CZECH + W10, SLOVAK - W4}, + + {"da", "da", DANISH + W10, NORWEGIAN - W4}, + {"dan", "da", DANISH + W10, NORWEGIAN - W4}, + {"de", "de", GERMAN + W10, 0}, + {"deu", "de", GERMAN + W10, 0}, + {"div", "dv", DHIVEHI + W10, 0}, + {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark + {"dut", "nl", DUTCH + W10, 0}, // Dutch + {"dv", "dv", DHIVEHI + W10, 0}, + {"dz", "dz", DZONGKHA + W10, TIBETAN - W4}, + + {"ee", "et", ESTONIAN + W10, 0}, // Estonia + {"eg", "ar", ARABIC + W10, 0}, // Egypt + {"el", "el", GREEK + W10, 0}, + {"en", "en", ENGLISH + W4, 0}, + {"eng", "en", ENGLISH + W4, 0}, + {"eo", "eo", ESPERANTO + W10, 0}, + {"er", "ur", URDU + W10, 0}, // "Erdu" + {"es", "es", SPANISH + W10, 0}, + {"esp", "es", SPANISH + W10, 0}, + {"est", "et", ESTONIAN + W10, 0}, + {"et", "et", ESTONIAN + W10, 0}, + {"eu", "eu", BASQUE + W10, 0}, + + {"fa", "fa", PERSIAN + W10, 0}, + {"far", "fa", PERSIAN + W10, 0}, + {"fi", "fi", FINNISH + W10, 0}, + {"fil", "tl", TAGALOG + W10, 0}, // Philippines + {"fj", "fj", FIJIAN + W10, 0}, + {"fo", "fo", FAROESE + W10, ICELANDIC - W4}, + {"fr", "fr", FRENCH + W10, 0}, + {"fra", "fr", FRENCH + W10, 0}, + {"fre", "fr", FRENCH + W10, 0}, + {"fy", "fy", FRISIAN + W10, 0}, + + {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician + {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either + {"gal", "gl", GALICIAN + W10, 0}, + {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding + {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding + {"gd", "gd", SCOTS_GAELIC + W10, 0}, + {"ge", "ka", GEORGIAN + W10, 0}, // Georgia + {"geo", "ka", GEORGIAN + W10, 0}, + {"ger", "de", GERMAN + W10, 0}, + {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse + {"gn", "gn", GUARANI + W10, 0}, + {"gr", "el", GREEK + W10, 0}, // Greece + {"gu", "gu", GUJARATI + W10, 0}, + {"gv", "gv", MANX + W10, 0}, + + {"ha", "ha", HAUSA + W10, 0}, + {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti + {"haw", "haw", HAWAIIAN + W10, 0}, + {"hb", "he", HEBREW + W10, 0}, + {"he", "he", HEBREW + W10, 0}, + {"heb", "he", HEBREW + W10, 0}, + {"hi", "hi", HINDI + W10, MARATHI - W4}, + {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong + {"hr", "hr", CROATIAN + W10, 0}, + {"ht", "ht", HAITIAN_CREOLE + W10, 0}, + {"hu", "hu", HUNGARIAN + W10, 0}, + {"hun", "hu", HUNGARIAN + W10, 0}, + {"hy", "hy", ARMENIAN + W10, 0}, + + {"ia", "ia", INTERLINGUA + W10, 0}, + {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland + {"id", "id", INDONESIAN + W10, MALAY - W4}, + {"ids", "id", INDONESIAN + W10, MALAY - W4}, + {"ie", "ie", INTERLINGUE + W10, 0}, + {"ig", "ig", IGBO + W10, 0}, + // 1:2 iu-Cans ik-Latn + {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 + {"in", "id", INDONESIAN + W10, MALAY - W4}, + {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia + {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 + {"is", "is", ICELANDIC + W10, FAROESE - W4}, + {"it", "it", ITALIAN + W10, 0}, + {"ita", "it", ITALIAN + W10, 0}, + {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 + {"iw", "he", HEBREW + W10, 0}, + + {"ja", "ja", JAPANESE + W10, 0}, + {"jp", "ja", JAPANESE + W10, 0}, // Japan + {"jpn", "ja", JAPANESE + W10, 0}, + {"jv", "jw", JAVANESE + W10, 0}, + {"jw", "jw", JAVANESE + W10, 0}, + + {"ka", "ka", GEORGIAN + W10, 0}, + {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua + {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan + {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia) + {"kha", "kha", KHASI + W10, 0}, + {"kk", "kk", KAZAKH + W10, 0}, // Kazakh + {"kl", "kl", GREENLANDIC + W10, 0}, + {"km", "km", KHMER + W10, 0}, + {"kn", "kn", KANNADA + W10, 0}, + {"ko", "ko", KOREAN + W10, 0}, + {"kor", "ko", KOREAN + W10, 0}, + {"kr", "ko", KOREAN + W10, 0}, // Country code Korea + {"ks", "ks", KASHMIRI + W10, 0}, + {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding + {"ku", "ku", KURDISH + W10, 0}, + {"ky", "ky", KYRGYZ + W10, 0}, + {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan + {"la", "la", LATIN + W10, 0}, + {"lao", "lo", LAOTHIAN + W10, 0}, // Laos + + {"lb", "lb", LUXEMBOURGISH + W10, 0}, + {"lg", "lg", GANDA + W10, 0}, + {"lit", "lt", LITHUANIAN + W10, 0}, + {"ln", "ln", LINGALA + W10, 0}, + {"lo", "lo", LAOTHIAN + W10, 0}, + {"lt", "lt", LITHUANIAN + W10, 0}, + {"ltu", "lt", LITHUANIAN + W10, 0}, + {"lv", "lv", LATVIAN + W10, 0}, + + {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0}, + {"mg", "mg", MALAGASY + W10, 0}, + {"mi", "mi", MAORI + W10, 0}, + {"mk", "mk", MACEDONIAN + W10, 0}, + {"ml", "ml", MALAYALAM + W10, 0}, + {"mn", "mn", MONGOLIAN + W10, 0}, + {"mo", "mo", ROMANIAN + W10, 0}, + {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian + {"mr", "mr", MARATHI + W10, HINDI - W4}, + {"ms", "ms", MALAY + W10, INDONESIAN - W4}, + {"mt", "mt", MALTESE + W10, 0}, + {"mx", "es", SPANISH + W10, 0}, // Mexico + {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia + + {"na", "na", NAURU + W10, 0}, + {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, + {"ne", "ne", NEPALI + W10, 0}, + {"nl", "nl", DUTCH + W10, 0}, + {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, + {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, + {"nr", "nr", NDEBELE + W10, 0}, + {"nso", "nso", PEDI + W10, 0}, + {"ny", "ny", NYANJA + W10, 0}, + + {"oc", "oc", OCCITAN + W10, 0}, + {"om", "om", OROMO + W10, 0}, + {"or", "or", ORIYA + W10, 0}, + + {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab + {"per", "fa", PERSIAN + W10, 0}, + {"ph", "tl", TAGALOG + W10, 0}, // Philippines + {"pk", "ur", URDU + W10, 0}, // Pakistan + {"pl", "pl", POLISH + W10, 0}, + {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi + {"pol", "pl", POLISH + W10, 0}, + {"por", "pt", PORTUGUESE + W10, 0}, + {"ps", "ps", PASHTO + W10, 0}, + {"pt", "pt", PORTUGUESE + W10, 0}, + {"ptg", "pt", PORTUGUESE + W10, 0}, + {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code + {"qu", "qu", QUECHUA + W10, 0}, + + {"rm", "rm", RHAETO_ROMANCE + W10, 0}, + {"rn", "rn", RUNDI + W10, 0}, + {"ro", "ro", ROMANIAN + W10, 0}, + {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code + {"ru", "ru", RUSSIAN + W10, 0}, + {"rus", "ru", RUSSIAN + W10, 0}, + {"rw", "rw", KINYARWANDA + W10, 0}, + + {"sa", "sa", SANSKRIT + W10, 0}, + {"sco", "sco", SCOTS + W10, ENGLISH - W4}, + {"sd", "sd", SINDHI + W10, 0}, + {"se", "sv", SWEDISH + W10, 0}, + {"sg", "sg", SANGO + W10, 0}, + {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovinia + {"sk", "sk", SLOVAK + W10, CZECH - W4}, + {"sl", "sl", SLOVENIAN + W10, 0}, + {"slo", "sl", SLOVENIAN + W10, 0}, + {"sm", "sm", SAMOAN + W10, 0}, + {"sn", "sn", SHONA + W10, 0}, + {"so", "so", SOMALI + W10, 0}, + {"sp", "es", SPANISH + W10, 0}, + {"sq", "sq", ALBANIAN + W10, 0}, + {"sr", "sr", SERBIAN + W10, 0}, + {"srb", "sr", SERBIAN + W10, 0}, + {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin + {"srp", "sr", SERBIAN + W10, 0}, + {"ss", "ss", SISWANT + W10, 0}, + {"st", "st", SESOTHO + W10, 0}, + {"su", "su", SUNDANESE + W10, 0}, + {"sv", "sv", SWEDISH + W10, 0}, + {"sve", "sv", SWEDISH + W10, 0}, + {"sw", "sw", SWAHILI + W10, 0}, + {"swe", "sv", SWEDISH + W10, 0}, + {"sy", "syr", SYRIAC + W10, 0}, + {"syr", "syr", SYRIAC + W10, 0}, + + {"ta", "ta", TAMIL + W10, 0}, + {"te", "te", TELUGU + W10, 0}, + {"tg", "tg", TAJIK + W10, 0}, + {"th", "th", THAI + W10, 0}, + {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet + {"tj", "tg", TAJIK + W10, 0}, // Tajikistan + {"tk", "tk", TURKMEN + W10, 0}, + {"tl", "tl", TAGALOG + W10, 0}, + {"tlh", "tlh", X_KLINGON + W10, 0}, + {"tn", "tn", TSWANA + W10, 0}, + {"to", "to", TONGA + W10, 0}, + {"tr", "tr", TURKISH + W10, 0}, + {"ts", "ts", TSONGA + W10, 0}, + {"tt", "tt", TATAR + W10, 0}, + {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan + {"twi", "ak", AKAN + W10, 0}, // Twi => Akan + + {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine + {"ug", "ug", UIGHUR + W10, 0}, + {"uk", "uk", UKRAINIAN + W10, 0}, + {"ur", "ur", URDU + W10, 0}, + {"uz", "uz", UZBEK + W10, 0}, + + {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan + {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan + {"ve", "ve", VENDA + W10, 0}, + {"vi", "vi", VIETNAMESE + W10, 0}, + {"vie", "vi", VIETNAMESE + W10, 0}, + {"vn", "vi", VIETNAMESE + W10, 0}, + {"vo", "vo", VOLAPUK + W10, 0}, + + {"wo", "wo", WOLOF + W10, 0}, + + {"xh", "xh", XHOSA + W10, ZULU - W4}, + {"xho", "xh", XHOSA + W10, ZULU - W4}, + + {"yi", "yi", YIDDISH + W10, 0}, + {"yo", "yo", YORUBA + W10, 0}, + + {"za", "za", ZHUANG + W10, 0}, + {"zh", "zh", CHINESE + W10, 0}, + {"zht", "zhT", CHINESE_T + W10, 0}, + {"zu", "zu", ZULU + W10, XHOSA - W4}, +}; + + +// Possibly map to tl: +// -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano +// -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano +// -LangTags tl-Latn /7val.com/ ,war 1 Waray + + + +// Table to look up country TLD (no general TLD) +// In alphabetical order for binary search +static const int kCLDTable3Size = 181; +static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = { + {"ac", JAPANESE + W2, 0}, + {"ad", CATALAN + W4, 0}, + {"ae", ARABIC + W4, 0}, + {"af", PASHTO + W4, PERSIAN + W4}, + {"ag", GERMAN + W2, 0}, // meager + // {"ai", 0, 0}, // meager + {"al", ALBANIAN + W4, 0}, + {"am", ARMENIAN + W4, 0}, + {"an", DUTCH + W4, 0}, // meager + {"ao", PORTUGUESE + W4, 0}, + // {"aq", 0, 0}, // meager + {"ar", SPANISH + W4, 0}, + // {"as", 0, 0}, + {"at", GERMAN + W4, 0}, + {"au", ENGLISH + W2, 0}, + {"aw", DUTCH + W4, 0}, + {"ax", SWEDISH + W4, 0}, + {"az", AZERBAIJANI + W4, 0}, + + {"ba", BOSNIAN + W8, CROATIAN - W4}, + // {"bb", 0, 0}, + {"bd", BENGALI + W4, 0}, + {"be", DUTCH + W4, FRENCH + W4}, + {"bf", FRENCH + W4, 0}, + {"bg", BULGARIAN + W4, 0}, + {"bh", ARABIC + W4, 0}, + {"bi", RUNDI + W4, FRENCH + W4}, + {"bj", FRENCH + W4, 0}, + {"bm", ENGLISH + W2, 0}, + {"bn", MALAY + W4, INDONESIAN - W4}, + {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA + {"br", PORTUGUESE + W4, 0}, + // {"bs", 0, 0}, + {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha + {"bw", TSWANA + W4, 0}, + {"by", BELARUSIAN + W4, 0}, + // {"bz", 0, 0}, + + {"ca", FRENCH + W4, ENGLISH + W2}, + {"cat", CATALAN + W4, 0}, + {"cc", 0, 0}, + {"cd", FRENCH + W4, 0}, + {"cf", FRENCH + W4, 0}, + {"cg", FRENCH + W4, 0}, + {"ch", GERMAN + W4, FRENCH + W4}, + {"ci", FRENCH + W4, 0}, + // {"ck", 0, 0}, + {"cl", SPANISH + W4, 0}, + {"cm", FRENCH + W4, 0}, + {"cn", CHINESE + W4, 0}, + {"co", SPANISH + W4, 0}, + {"cr", SPANISH + W4, 0}, + {"cu", SPANISH + W4, 0}, + {"cv", PORTUGUESE + W4, 0}, + // {"cx", 0, 0}, + {"cy", GREEK + W4, TURKISH + W4}, + {"cz", CZECH + W4, SLOVAK - W4}, + + {"de", GERMAN + W4, 0}, + {"dj", 0, 0}, + {"dk", DANISH + W4, NORWEGIAN - W4}, + {"dm", 0, 0}, + {"do", SPANISH + W4, 0}, + {"dz", FRENCH + W4, ARABIC + W4}, + + {"ec", SPANISH + W4, 0}, + {"ee", ESTONIAN + W4, 0}, + {"eg", ARABIC + W4, 0}, + {"er", AFAR + W4, 0}, + {"es", SPANISH + W4, 0}, + {"et", AMHARIC + W4, AFAR + W4}, + + {"fi", FINNISH + W4, 0}, + {"fj", FIJIAN + W4, 0}, + // {"fk", 0, 0}, + // {"fm", 0, 0}, + {"fo", FAROESE + W4, ICELANDIC - W4}, + {"fr", FRENCH + W4, 0}, + + {"ga", FRENCH + W4, 0}, + {"gd", 0, 0}, + {"ge", GEORGIAN + W4, 0}, + {"gf", FRENCH + W4, 0}, + // {"gg", 0, 0}, + // {"gh", 0, 0}, + // {"gi", 0, 0}, + {"gl", GREENLANDIC + W4, DANISH + W4}, + // {"gm", 0, 0}, + {"gn", FRENCH + W4, 0}, + // {"gp", 0, 0}, + // {"gq", 0, 0}, + {"gr", GREEK + W4, 0}, + // {"gs", 0, 0}, + {"gt", SPANISH + W4, 0}, + // {"gu", 0, 0}, + // {"gy", 0, 0}, + + {"hk", CHINESE_T + W4, 0}, + // {"hm", 0, 0}, + {"hn", SPANISH + W4, 0}, + {"hr", CROATIAN + W8, BOSNIAN - W4}, + {"ht", HAITIAN_CREOLE + W4, FRENCH + W4}, + {"hu", HUNGARIAN + W4, 0}, + + {"id", INDONESIAN + W4, MALAY - W4}, + {"ie", IRISH + W4, 0}, + {"il", HEBREW + W4, 0}, + {"im", MANX + W4, 0}, + // {"in", 0, 0}, + // {"io", 0, 0}, + {"iq", ARABIC + W4, 0}, + {"ir", PERSIAN + W4, 0}, + {"is", ICELANDIC + W4, FAROESE - W4}, + {"it", ITALIAN + W4, 0}, + + // {"je", 0, 0}, + // {"jm", 0, 0}, + {"jo", ARABIC + W4, 0}, + {"jp", JAPANESE + W4, 0}, + + // {"ke", 0, 0}, + {"kg", KYRGYZ + W4, 0}, + {"kh", KHMER + W4, 0}, + // {"ki", 0, 0}, + {"km", FRENCH + W4, 0}, + // {"kn", 0, 0}, + {"kp", KOREAN + W4, 0}, + {"kr", KOREAN + W4, 0}, + {"kw", ARABIC + W4, 0}, + // {"ky", 0, 0}, + {"kz", KAZAKH + W4, 0}, + + {"la", LAOTHIAN + W4, 0}, + {"lb", ARABIC + W4, FRENCH + W4}, + // {"lc", 0, 0}, + {"li", GERMAN + W4, 0}, + {"lk", SINHALESE + W4, 0}, + // {"lr", 0, 0}, + {"ls", SESOTHO + W4, 0}, + {"lt", LITHUANIAN + W4, 0}, + {"lu", LUXEMBOURGISH + W4}, + {"lv", LATVIAN + W4, 0}, + {"ly", ARABIC + W4, 0}, + + {"ma", FRENCH + W4, 0}, + {"mc", FRENCH + W4, 0}, + {"md", ROMANIAN + W4, 0}, + {"me", MONTENEGRIN + W8, SERBIAN - W4}, + {"mg", FRENCH + W4, 0}, + {"mk", MACEDONIAN + W4, 0}, + {"ml", FRENCH + W4, 0}, + {"mm", BURMESE + W4, 0}, + {"mn", MONGOLIAN + W4, 0}, + {"mo", CHINESE_T + W4, PORTUGUESE + W4}, + // {"mp", 0, 0}, + {"mq", FRENCH + W4, 0}, + {"mr", FRENCH + W4, ARABIC + W4}, + // {"ms", 0, 0}, + {"mt", MALTESE + W4, 0}, + // {"mu", 0, 0}, + {"mv", DHIVEHI + W4, 0}, + // {"mw", 0, 0}, + {"mx", SPANISH + W4, 0}, + {"my", MALAY + W4, INDONESIAN - W4}, + {"mz", PORTUGUESE + W4, 0}, + + {"na", 0, 0}, // Namibia + {"nc", FRENCH + W4, 0}, + {"ne", FRENCH + W4, 0}, + {"nf", FRENCH + W4, 0}, + // {"ng", 0, 0}, + {"ni", SPANISH + W4, 0}, + {"nl", DUTCH + W4, 0}, + {"no", NORWEGIAN + W4, NORWEGIAN_N + W2}, + {"np", NEPALI + W4, 0}, + {"nr", NAURU + W4, 0}, + {"nu", SWEDISH + W4, 0}, + {"nz", MAORI + W4, ENGLISH + W2}, + + {"om", ARABIC + W4, 0}, + + {"pa", SPANISH + W4, 0}, + {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA + {"pf", FRENCH + W4, 0}, + // {"pg", 0, 0}, + {"ph", TAGALOG + W4, 0}, + {"pk", URDU + W4, 0}, + {"pl", POLISH + W4, 0}, + // {"pn", 0, 0}, + {"pr", SPANISH + W4, 0}, + {"ps", ARABIC + W4, 0}, + {"pt", PORTUGUESE + W4, 0}, + {"py", SPANISH + W4, GUARANI + W2}, + + {"qa", ARABIC + W4, 0}, + + {"re", FRENCH + W4, 0}, + {"ro", ROMANIAN + W4, 0}, + {"rs", SERBIAN + W8, MONTENEGRIN - W4}, + {"ru", RUSSIAN + W4, 0}, + {"rw", KINYARWANDA + W4, FRENCH + W2}, + + {"sa", ARABIC + W4, 0}, + // {"sb", 0, 0}, + {"sc", SESELWA + W4, 0}, + {"sd", ARABIC + W4, 0}, + {"se", SWEDISH + W4, 0}, + // {"sg", 0, 0}, + // {"sh", 0, 0}, + {"si", SLOVENIAN + W4, 0}, + {"sk", SLOVAK + W4, CZECH - W4}, + // {"sl", 0, 0}, + {"sm", ITALIAN + W4, 0}, + {"sn", FRENCH + W4, 0}, + // {"sr", 0, 0}, + {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07 + // {"st", 0, 0}, + {"su", RUSSIAN + W4, 0}, + {"sv", SPANISH + W4, 0}, + {"sy", ARABIC + W4, 0}, + // {"sz", 0, 0}, + + // {"tc", 0, 0}, + {"td", FRENCH + W4, 0}, + // {"tf", 0, 0}, + {"tg", FRENCH + W4, 0}, + {"th", THAI + W4, 0}, + // Tibet has no country code (see .cn) + {"tj", TAJIK + W4, 0}, + // {"tk", 0, 0}, + // {"tl", 0, 0}, + {"tm", TURKISH + W4, 0}, + {"tn", FRENCH + W4, ARABIC + W4}, + // {"to", 0, 0}, + {"tp", JAPANESE + W4, 0}, + {"tr", TURKISH + W4, 0}, + // {"tt", 0, 0}, + // {"tv", 0, 0}, + {"tw", CHINESE_T + W4, 0}, + {"tz", SWAHILI + W4, AKAN + W4}, + + {"ua", UKRAINIAN + W4, 0}, + {"ug", GANDA + W4, 0}, + {"uk", ENGLISH + W2, 0}, + {"us", ENGLISH + W2, 0}, + {"uy", SPANISH + W4, 0}, + {"uz", UZBEK + W4, 0}, + + {"va", ITALIAN + W4, LATIN + W2}, + // {"vc", 0, 0}, + {"ve", SPANISH + W4, 0}, + // {"vg", 0, 0}, + // {"vi", 0, 0}, + {"vn", VIETNAMESE + W4, 0}, + // {"vu", 0, 0}, + + {"wf", FRENCH + W4, 0}, + // {"ws", 0, 0}, + + {"ye", ARABIC + W4, 0}, + + {"za", AFRIKAANS + W4, 0}, + // {"zm", 0, 0}, + // {"zw", 0, 0}, +}; + +#undef W2 +#undef W4 +#undef W6 +#undef W8 +#undef W10 +#undef W12 + + + + + +inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) { + *olp = (*olp & 0x3ff) + (w << 10); +} +inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) { + *olp = (*olp & ~0x3ff) + lang; +} + +OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) { + return (w << 10) + lang; +} + +inline int MaxInt(int a, int b) { + return (a >= b) ? a : b; +} + +// Merge in another language prior, taking max if already there +void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) { + if (olp == 0) {return;} + Language target_lang = GetCLDPriorLang(olp); + for (int i = 0; i < lps->n; ++i) { + if (GetCLDPriorLang(lps->prior[i]) == target_lang) { + int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]), + GetCLDPriorWeight(olp)); + SetCLDPriorWeight(new_weight, &lps->prior[i]); + return; + } + } + // Not found; add it if room + if (lps->n >= kMaxOneCLDLangPrior) {return;} + lps->prior[lps->n++] = olp; +} + +// Merge in another language prior, boosting 10x if already there +void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) { + if (olp == 0) {return;} + Language target_lang = GetCLDPriorLang(olp); + for (int i = 0; i < lps->n; ++i) { + if (GetCLDPriorLang(lps->prior[i]) == target_lang) { + int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2; + SetCLDPriorWeight(new_weight, &lps->prior[i]); + return; + } + } + // Not found; add it if room + if (lps->n >= kMaxOneCLDLangPrior) {return;} + lps->prior[lps->n++] = olp; +} + + +// Trim language priors to no more than max_entries, keeping largest abs weights +void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) { + if (lps->n <= max_entries) {return;} + + // Insertion sort in-place by abs(weight) + for (int i = 0; i < lps->n; ++i) { + OneCLDLangPrior temp_olp = lps->prior[i]; + int w = abs(GetCLDPriorWeight(temp_olp)); + int kk = i; + for (; kk > 0; --kk) { + if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) { + // Move down and continue + lps->prior[kk] = lps->prior[kk - 1]; + } else { + // abs(weight[kk - 1]) >= w, time to stop + break; + } + } + lps->prior[kk] = temp_olp; + } + + lps->n = max_entries; +} + +int CountCommas(const string& langtags) { + int commas = 0; + for (int i = 0; i < static_cast<int>(langtags.size()); ++i) { + if (langtags[i] == ',') {++commas;} + } + return commas; +} + +// Binary lookup on language tag +const LangTagLookup* DoLangTagLookup(const char* key, + const LangTagLookup* tbl, int tbl_size) { + // Key is always in range [lo..hi) + int lo = 0; + int hi = tbl_size; + while (lo < hi) { + int mid = (lo + hi) >> 1; + int comp = strcmp(tbl[mid].langtag, key); + if (comp < 0) { + lo = mid + 1; + } else if (comp > 0) { + hi = mid; + } else { + return &tbl[mid]; + } + } + return NULL; +} + +// Binary lookup on tld +const TLDLookup* DoTLDLookup(const char* key, + const TLDLookup* tbl, int tbl_size) { + // Key is always in range [lo..hi) + int lo = 0; + int hi = tbl_size; + while (lo < hi) { + int mid = (lo + hi) >> 1; + int comp = strcmp(tbl[mid].tld, key); + if (comp < 0) { + lo = mid + 1; + } else if (comp > 0) { + hi = mid; + } else { + return &tbl[mid]; + } + } + return NULL; +} + + + +// Trim language tag string to canonical form for each language +// Input is from GetLangTagsFromHtml(), already lowercased +string TrimCLDLangTagsHint(const string& langtags) { + string retval; + if (langtags.empty()) {return retval;} + int commas = CountCommas(langtags); + if (commas > 4) {return retval;} // Ignore if too many language tags + + char temp[20]; + int pos = 0; + while (pos < static_cast<int>(langtags.size())) { + int comma = langtags.find(',', pos); + if (comma == string::npos) {comma = langtags.size();} // fake trailing comma + int len = comma - pos; + if (len <= 16) { + // Short enough to use + memcpy(temp, &langtags[pos], len); + temp[len] = '\0'; + const LangTagLookup* entry = DoLangTagLookup(temp, + kCLDLangTagsHintTable1, + kCLDTable1Size); + if (entry != NULL) { + // First table hit + retval.append(entry->langcode); // may be "code1,code2" + retval.append(1, ','); + } else { + // Try second table with language code truncated at first hyphen + char* hyphen = strchr(temp, '-'); + if (hyphen != NULL) {*hyphen = '\0';} + len = strlen(temp); + if (len <= 3) { // Short enough to use + entry = DoLangTagLookup(temp, + kCLDLangTagsHintTable2, + kCLDTable2Size); + if (entry != NULL) { + // Second table hit + retval.append(entry->langcode); // may be "code1,code2" + retval.append(1, ','); + } + } + } + } + pos = comma + 1; + } + + // Remove trainling comma, if any + if (!retval.empty()) {retval.resize(retval.size() - 1);} + return retval; +} + + + +//============================================================================== + +// Little state machine to scan insides of language attribute quoted-string. +// Each language code is lowercased and copied to the output string. Underscore +// is mapped to minus. Space, tab, and comma are all mapped to comma, and +// multiple consecutive commas are removed. +// Each language code in the output list will be followed by a single comma. + +// There are three states, and we start in state 1: +// State 0: After a letter. +// Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2] +// State 1: Just after a comma. +// Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2] +// State 2: Skipping. +// All characters except comma skip and stay in [2]. comma goes to [1] + +// The thing that is copied is kLangCodeRemap[c] when going to state 0, +// and always comma when going to state 1 or 2. The design depends on copying +// a comma at the *beginning* of skipping, and in state 2 never doing a copy. + +// We pack all this into 8 bits: +// +--+---+---+ +// |78|654|321| +// +--+---+---+ +// +// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78 +// where . is always zero +// Of these 3 bits, low two are next state ss, high bit is copy bit C. +// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma + +#define SKIP0 0 +#define SKIP1 1 +#define SKIP2 2 +#define COPY0 4 // copy kLangCodeRemap[c] +#define COPY1 5 // copy ',' +#define COPY2 6 // copy ',' + +// These combined actions pack three states into one byte. +// Ninth bit must be zero, so all state 2 values must be skips. +// state[2] state[1] state[0] +#define LTR ((SKIP2 << 6) + (COPY0 << 3) + COPY0) +#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0) +#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1) +#define Bad ((SKIP2 << 6) + (COPY2 << 3) + COPY2) + +// Treat as letter: a-z, A-Z +// Treat as minus: 2D minus, 5F underscore +// Treat as comma: 09 tab, 20 space, 2C comma + +static const unsigned char kLangCodeAction[256] = { + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + + Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, + LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS, + Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, + LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad, + + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, +}; + +// This does lowercasing, maps underscore to minus, and maps tab/space to comma +static const unsigned char kLangCodeRemap[256] = { + 0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + + 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', + 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore + 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', + 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0, + + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, +}; + +#undef LTR +#undef MINUS +#undef COMMA +#undef Bad + +#undef SKIP0 +#undef SKIP1 +#undef SKIP2 +#undef COPY0 +#undef COPY1 +#undef COPY2 + + +// Find opening '<' for HTML tag +// Note: this is all somewhat insensitive to mismatched quotes +int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) { + int i = pos; + // Advance i by 4 if none of the next 4 bytes are '<' + for (i = pos; i < (max_pos - 3); i += 4) { + // Fast check for any < + const char* p = &utf8_body[i]; + uint32 s0123 = UNALIGNED_LOAD32(p); + uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<< + if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) { + // At least one byte is '<' + break; + } + } + // Continue, advancing i by 1 + for (; i < max_pos; ++i) { + if (utf8_body[i] == '<') {return i;} + } + return -1; +} + + +// Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing) +int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) { + // Always outside quotes + for (int i = pos; i < max_pos; ++i) { + char c = utf8_body[i]; + if (c == '>') {return i;} + if (c == '<') {return i - 1;} + if (c == '&') {return i - 1;} + } + return -1; // nothing found +} + +// Find opening quote or apostrophe, skipping spaces +// Note: this is all somewhat insensitive to mismatched quotes +int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) { + for (int i = pos; i < max_pos; ++i) { + char c = utf8_body[i]; + if (c == '"') {return i;} + if (c == '\'') {return i;} + if (c != ' ') {return -1;} + } + return -1; +} + +// Find closing quot/apos. Also stop on = > < and & (simplistic parsing) +int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) { + // Always outside quotes + for (int i = pos; i < max_pos; ++i) { + char c = utf8_body[i]; + if (c == '"') {return i;} + if (c == '\'') {return i;} + if (c == '>') {return i - 1;} + if (c == '=') {return i - 1;} + if (c == '<') {return i - 1;} + if (c == '&') {return i - 1;} + } + return -1; // nothing found +} + +int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) { + // Outside quotes/apostrophes loop + for (int i = pos; i < max_pos; ++i) { + char c = utf8_body[i]; + if (c == '=') { // Found bare equal sign inside tag + return i; + } else if (c == '"') { + // Inside quotes loop + int j; + for (j = i + 1; j < max_pos; ++j) { + if (utf8_body[j] == '"') { + break; + } else if (utf8_body[j] == '\\') { + ++j; + } + } + i = j; + } else if (c == '\'') { + // Inside apostrophes loop + int j; + for (j = i + 1; j < max_pos; ++j) { + if (utf8_body[j] == '\'') { + break; + } else if (utf8_body[j] == '\\') { + ++j; + } + } + i = j; + } + + } + return -1; // nothing found +} + +// Scan backwards for case-insensitive string s in [min_pos..pos) +// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] +// Cheap lowercase. Control codes will masquerade as 20..3f +bool FindBefore(const char* utf8_body, + int32 min_pos, int32 pos, const char* s) { + int len = strlen(s); + if ((pos - min_pos) < len) {return false;} // Too small to fit s + + // Skip trailing spaces + int i = pos; + while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;} + i -= len; + if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found + + const char* p = &utf8_body[i]; + for (int j = 0; j < len; ++j) { + if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte + } + return true; // All bytes equal at i +} + +// Scan forwards for case-insensitive string s in [pos..max_pos) +// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] +// Cheap lowercase. Control codes will masquerade as 20..3f +// Allows but does not require quoted/apostrophe string +bool FindAfter(const char* utf8_body, + int32 pos, int32 max_pos, const char* s) { + int len = strlen(s); + if ((max_pos - pos) < len) {return false;} // Too small to fit s + + // Skip leading spaces, quote, apostrophe + int i = pos; + while (i < (max_pos - len)) { + unsigned char c = utf8_body[i]; + if ((c == ' ') || (c == '"') || (c == '\'')) {++i;} + else {break;} + } + + const char* p = &utf8_body[i]; + for (int j = 0; j < len; ++j) { + if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte + } + return true; // All bytes equal +} + + + +// Copy attribute value in [pos..max_pos) +// pos is just after an opening quote/apostrophe and max_pos is the ending one +// String must all be on a single line. +// Return slightly-normalized language list, empty or ending in comma +// Does lowercasing and removes excess punctuation/space +string CopyOneQuotedString(const char* utf8_body, + int32 pos, int32 max_pos) { + string s; + int state = 1; // Front is logically just after a comma + for (int i = pos; i < max_pos; ++i) { + unsigned char c = utf8_body[i]; + int e = kLangCodeAction[c] >> (3 * state); + state = e & 3; // Update to next state + if ((e & 4) != 0) { + // Copy a remapped byte if going to state 0, else copy a comma + if (state == 0) { + s.append(1, kLangCodeRemap[c]); + } else { + s.append(1, ','); + } + } + } + + // Add final comma if needed + if (state == 0) { + s.append(1, ','); + } + return s; +} + +// Find and copy attribute value: quoted string in [pos..max_pos) +// Return slightly-normalized language list, empty or ending in comma +string CopyQuotedString(const char* utf8_body, + int32 pos, int32 max_pos) { + int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos); + if (start_quote < 0) {return string("");} + int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos); + if (end_quote < 0) {return string("");} + + return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote); +} + +// Add hints to vector of langpriors +// Input is from GetLangTagsFromHtml(), already lowercased +void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) { + if (langtags.empty()) {return;} + int commas = CountCommas(langtags); + if (commas > 4) {return;} // Ignore if too many language tags + + char temp[20]; + int pos = 0; + while (pos < static_cast<int>(langtags.size())) { + int comma = langtags.find(',', pos); + if (comma == string::npos) {comma = langtags.size();} // fake trailing comma + int len = comma - pos; + if (len <= 16) { + // Short enough to use + memcpy(temp, &langtags[pos], len); + temp[len] = '\0'; + const LangTagLookup* entry = DoLangTagLookup(temp, + kCLDLangTagsHintTable1, + kCLDTable1Size); + if (entry != NULL) { + // First table hit + MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); + MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); + } else { + // Try second table with language code truncated at first hyphen + char* hyphen = strchr(temp, '-'); + if (hyphen != NULL) {*hyphen = '\0';} + len = strlen(temp); + if (len <= 3) { // Short enough to use + entry = DoLangTagLookup(temp, + kCLDLangTagsHintTable2, + kCLDTable2Size); + if (entry != NULL) { + // Second table hit + MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); + MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); + } + } + } + } + pos = comma + 1; + } +} + +// Add hints to vector of langpriors +// Input is string after HTTP header Content-Language: +void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) { + string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang)); + SetCLDLangTagsHint(langtags, langpriors); +} + +// Add hints to vector of langpriors +// Input is last element of hostname (no dot), e.g. from GetTLD() +void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) { + int len = strlen(tld); + if (len > 3) {return;} // Ignore if more than three letters + char local_tld[4]; + strncpy(local_tld, tld, 4); + local_tld[3] = '\0'; // Safety move + // Lowercase + for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;} + const TLDLookup* entry = DoTLDLookup(local_tld, + kCLDTLDHintTable, + kCLDTable3Size); + if (entry != NULL) { + // Table hit + MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors); + MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors); + } +} + +// Add hints to vector of langpriors +// Input is from DetectEncoding() +void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) { + OneCLDLangPrior olp; + switch (enc) { + case CHINESE_GB: + case GBK: + case GB18030: + case ISO_2022_CN: + case HZ_GB_2312: + olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight); + MergeCLDLangPriorsBoost(olp, langpriors); + break; + case CHINESE_BIG5: + case CHINESE_BIG5_CP950: + case BIG5_HKSCS: + olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight); + MergeCLDLangPriorsBoost(olp, langpriors); + break; + case JAPANESE_EUC_JP: + case JAPANESE_SHIFT_JIS: + case JAPANESE_CP932: + case JAPANESE_JIS: // ISO-2022-JP + olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight); + MergeCLDLangPriorsBoost(olp, langpriors); + break; + case KOREAN_EUC_KR: + case ISO_2022_KR: + olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight); + MergeCLDLangPriorsBoost(olp, langpriors); + break; + + default: + break; + } +} + +// Add hints to vector of langpriors +// Input is from random source +void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) { + OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight); + MergeCLDLangPriorsBoost(olp, langpriors); +} + + +// Make printable string of priors +string DumpCLDLangPriors(const CLDLangPriors* langpriors) { + string retval; + for (int i = 0; i < langpriors->n; ++i) { + char temp[64]; + sprintf(temp, "%s.%d ", + LanguageCode(GetCLDPriorLang(langpriors->prior[i])), + GetCLDPriorWeight(langpriors->prior[i])); + retval.append(temp); + } + return retval; +} + + + + +// Look for +// <html lang="en"> +// <doc xml:lang="en"> +// <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US"> +// <meta http-equiv="content-language" content="en-GB" /> +// <meta name="language" content="Srpski"> +// <meta name="DC.language" scheme="RFCOMMA766" content="en"> +// <SPAN id="msg1" class="info" lang='en'> +// +// Do not trigger on +// <!-- lang=french ...--> +// <font lang=postscript ...> +// <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" /> +// <META name="Author" lang="fr" content="Arnaud Le Hors"> +// +// Stop fairly quickly on mismatched quotes +// +// Allowed language characters +// a-z A-Z -_ , space\t +// Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr +// zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue +// de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation) +// GB2312 => gb +// Big5 => big +// zh_CN.gb18030_C => zh-cn +// +// Remove duplicates and extra spaces as we go +// Lowercase as we go. + +// Get language tag hints from HTML body +// Normalize: remove spaces and make lowercase comma list + +string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, + int32 max_scan_bytes) { + string retval; + if (max_scan_bytes > utf8_body_len) { + max_scan_bytes = utf8_body_len; + } + + int32 k = 0; + while (k < max_scan_bytes) { + int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes); + if (start_tag < 0) {break;} + int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes); + // FindTagEnd exits on < > & + if (end_tag < 0) {break;} + + // Skip <!--...> + // Skip <font ...> + // Skip <script ...> + // Skip <link ...> + // Skip <img ...> + // Skip <a ...> + if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") || + FindAfter(utf8_body, start_tag + 1, end_tag, "font ") || + FindAfter(utf8_body, start_tag + 1, end_tag, "script ") || + FindAfter(utf8_body, start_tag + 1, end_tag, "link ") || + FindAfter(utf8_body, start_tag + 1, end_tag, "img ") || + FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) { + k = end_tag + 1; + continue; + } + + // Remember <meta ...> + bool in_meta = false; + if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) { + in_meta = true; + } + + // Scan for each equal sign inside tag + bool content_is_lang = false; + int32 kk = start_tag + 1; + int32 equal_sign; + while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) { + // eq exits on < > & + + // Look inside a meta tag + // <meta ... http-equiv="content-language" ...> + // <meta ... name="language" ...> + // <meta ... name="dc.language" ...> + if (in_meta) { + if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") && + FindAfter(utf8_body, equal_sign + 1, end_tag, + "content-language ")) { + content_is_lang = true; + } else if (FindBefore(utf8_body, kk, equal_sign, " name") && + (FindAfter(utf8_body, equal_sign + 1, end_tag, + "dc.language ") || + FindAfter(utf8_body, equal_sign + 1, end_tag, + "language "))) { + content_is_lang = true; + } + } + + // Look inside any tag + // <meta ... content="lang-list" ...> + // <... lang="lang-list" ...> + // <... xml:lang="lang-list" ...> + if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign, + " content")) || + FindBefore(utf8_body, kk, equal_sign, " lang") || + FindBefore(utf8_body, kk, equal_sign, ":lang")) { + string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag); + + // Append new lang tag(s) if not a duplicate + if (!temp.empty() && (retval.find(temp) == string::npos)) { + retval.append(temp); + } + } + + kk = equal_sign + 1; + } + k = end_tag + 1; + } + + // Strip last comma + if (retval.size() > 1) { + retval.erase(retval.size() - 1); + } + return retval; +} + +} // End namespace CLD2 + +//============================================================================== |