From ed5640d8b587fbcfed7dd7967f3de04b37a76f26 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 11:06:44 +0200 Subject: Adding upstream version 4:7.4.7. Signed-off-by: Daniel Baumann --- i18nlangtag/source/isolang/MS-LCID-to-list.sh | 104 + i18nlangtag/source/isolang/MS-LCID.lst | 468 +++ i18nlangtag/source/isolang/insys.cxx | 34 + i18nlangtag/source/isolang/inunx.cxx | 162 + i18nlangtag/source/isolang/inwnt.cxx | 93 + i18nlangtag/source/isolang/isolang.cxx | 1548 ++++++++++ i18nlangtag/source/isolang/langid.pl | 471 +++ i18nlangtag/source/isolang/lcid.awk | 187 ++ i18nlangtag/source/isolang/mslangid.cxx | 601 ++++ i18nlangtag/source/languagetag/languagetag.cxx | 3261 +++++++++++++++++++++ i18nlangtag/source/languagetag/languagetagicu.cxx | 71 + 11 files changed, 7000 insertions(+) create mode 100755 i18nlangtag/source/isolang/MS-LCID-to-list.sh create mode 100644 i18nlangtag/source/isolang/MS-LCID.lst create mode 100644 i18nlangtag/source/isolang/insys.cxx create mode 100644 i18nlangtag/source/isolang/inunx.cxx create mode 100644 i18nlangtag/source/isolang/inwnt.cxx create mode 100644 i18nlangtag/source/isolang/isolang.cxx create mode 100755 i18nlangtag/source/isolang/langid.pl create mode 100644 i18nlangtag/source/isolang/lcid.awk create mode 100644 i18nlangtag/source/isolang/mslangid.cxx create mode 100644 i18nlangtag/source/languagetag/languagetag.cxx create mode 100644 i18nlangtag/source/languagetag/languagetagicu.cxx (limited to 'i18nlangtag/source') diff --git a/i18nlangtag/source/isolang/MS-LCID-to-list.sh b/i18nlangtag/source/isolang/MS-LCID-to-list.sh new file mode 100755 index 000000000..adb2e1b14 --- /dev/null +++ b/i18nlangtag/source/isolang/MS-LCID-to-list.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Generates language ID table and defines and mappings of +# https://winprotocoldoc.blob.core.windows.net/productionwindowsarchives/MS-LCID/[MS-LCID].pdf +# downloaded from http://msdn.microsoft.com/library/cc233965.aspx +# At least this worked for Release: Monday, July 22, 2013; 08/08/2013 Revision 6.0 +# Also worked for 6/30/2015 revision 7.0 +# Also worked for 12/1/2017 revision 11.0 +# +# Uses pdftotext (from poppler-utils), grep and gawk. +# +# The script expects the downloaded [MS-LCID].pdf as MS-LCID.pdf +# +# Files created/OVERWRITTEN: MS-LCID.txt, MS-LCID.lst, MS-LCID.lst.h +# +# Best invoked in a temporary directory ... +# +# As the PDF layout may change, MS-LCID.lst is generated with uppercase hex +# digits and unified spaces (which gawk $1=... automatically does). +# Still, if needed, diff MS-LCID.lst with ignore spaces against the previous +# version for changes and additions, e.g. 
+# gvimdiff -c 'set diffopt+=iwhite' ../MS-LCID.lst MS-LCID.lst +# The generated MS-LCID.lst.h file is only a copy&paste help to add entries in +# isolang.cxx and not to be committed, the #define names have to be adapted for +# lang.h and isolang.cxx + +pdftotext -layout MS-LCID.pdf +grep '^ *0x[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] ' MS-LCID.txt | \ + gawk -e '{ $1 = "0x" toupper( substr( $1, 3)); print; }' > MS-LCID.lst +gawk -e ' +{ + val = $1; + tag = $2; + tag = gensub( /,.*/, "", 1, tag); + def = $2; + for (i=3; i<=NF; ++i) + { + def = def "_" $i; + } + def = gensub( /[^a-zA-Z0-9_]/, "_", "g", def); + def = "LANGUAGE_" def + if (def == "LANGUAGE_Neither_defined_nor_reserved") + { + def = def "_" val + } + usedef = def "," + n = split( tag, arr, /-/); + switch (n) + { + case 1: + # lll + mapping = sprintf( " { %-36s %5s, \"\" , k0 },", usedef, "\"" arr[1] "\""); + break; + case 2: + if (length(arr[2]) == 2) + { + # lll-CC + mapping = sprintf( " { %-36s %5s, \"%s\", k0 },", usedef, "\"" arr[1] "\"", arr[2]); + } + else if (length(arr[2]) == 4) + { + # lll-Ssss + mapping = sprintf( " { %-44s %10s, \"\" , k0 },", usedef, "\"" tag "\""); + } + else + { + # lll-### or lll-vvvvvvvv + mapping = sprintf( " { %-33s %16s, \"\", \"\" },", usedef, "\"" tag "\""); + } + break; + default: + if (length(arr[2]) == 2) + { + # lll-CC-vvvvvvvv + mapping = sprintf( " { %-33s %16s, \"%s\", \"%s\" },", usedef, "\"" tag "\"", arr[2], arr[1] "-" arr[3]); + } + else if (length(arr[2]) == 4) + { + # lll-Ssss-CC + mapping = sprintf( " { %-44s %10s, \"%s\", k0 },", usedef, "\"" arr[1] "-" arr[2] "\"", arr[3]); + } + else + { + # grandfathered or stuff; a third subtag of exactly 2 characters is emitted as the country field + if (length(arr[3]) == 2) + mapping = sprintf( " { %-33s %16s, \"%s\", \"\" },", usedef, "\"" tag "\"", arr[3]); + else + mapping = sprintf( " { %-33s %16s, \"\", \"\" },", usedef, "\"" tag "\""); + } + break; + } + printf "#define %-35s LanguageType(%s)\n", def, val; + print mapping; + print "" +} +' MS-LCID.lst > MS-LCID.lst.h + +# 
vim: set noet sw=4 ts=4: diff --git a/i18nlangtag/source/isolang/MS-LCID.lst b/i18nlangtag/source/isolang/MS-LCID.lst new file mode 100644 index 000000000..e09eb8c44 --- /dev/null +++ b/i18nlangtag/source/isolang/MS-LCID.lst @@ -0,0 +1,468 @@ +0x0001 ar +0x0002 bg +0x0003 ca +0x0004 zh-Hans +0x0005 cs +0x0006 da +0x0007 de +0x0008 el +0x0009 en +0x000A es +0x000B fi +0x000C fr +0x000D he +0x000E hu +0x000F is +0x0010 it +0x0011 ja +0x0012 ko +0x0013 nl +0x0014 no +0x0015 pl +0x0016 pt +0x0017 rm +0x0018 ro +0x0019 ru +0x001A hr +0x001B sk +0x001C sq +0x001D sv +0x001E th +0x001F tr +0x0020 ur +0x0021 id +0x0022 uk +0x0023 be +0x0024 sl +0x0025 et +0x0026 lv +0x0027 lt +0x0028 tg +0x0029 fa +0x002A vi +0x002B hy +0x002C az +0x002D eu +0x002E hsb +0x002F mk +0x0030 st +0x0031 ts +0x0032 tn +0x0033 ve +0x0034 xh +0x0035 zu +0x0036 af +0x0037 ka +0x0038 fo +0x0039 hi +0x003A mt +0x003B se +0x003C ga +0x003D yi, reserved +0x003E ms +0x003F kk +0x0040 ky +0x0041 sw +0x0042 tk +0x0043 uz +0x0044 tt +0x0045 bn +0x0046 pa +0x0047 gu +0x0048 or +0x0049 ta +0x004A te +0x004B kn +0x004C ml +0x004D as +0x004E mr +0x004F sa +0x0050 mn +0x0051 bo +0x0052 cy +0x0053 km +0x0054 lo +0x0055 my +0x0056 gl +0x0057 kok +0x0058 mni, reserved +0x0059 sd +0x005A syr +0x005B si +0x005C chr +0x005D iu +0x005E am +0x005F tzm +0x0060 ks +0x0061 ne +0x0062 fy +0x0063 ps +0x0064 fil +0x0065 dv +0x0066 bin, reserved +0x0067 ff +0x0068 ha +0x0069 ibb, reserved +0x006A yo +0x006B quz +0x006C nso +0x006D ba +0x006E lb +0x006F kl +0x0070 ig +0x0071 kr, reserved +0x0072 om +0x0073 ti +0x0074 gn +0x0075 haw +0x0076 la, reserved +0x0077 so, reserved +0x0078 ii +0x0079 pap, reserved +0x007A arn +0x007B Neither defined nor reserved +0x007C moh +0x007D Neither defined nor reserved +0x007E br +0x007F Reserved for invariant locale behavior +0x0080 ug +0x0081 mi +0x0082 oc +0x0083 co +0x0084 gsw +0x0085 sah +0x0086 qut +0x0087 rw +0x0088 wo +0x0089 Neither defined nor reserved +0x008A Neither defined nor 
reserved +0x008B Neither defined nor reserved +0x008C prs +0x008D Neither defined nor reserved +0x008E Neither defined nor reserved +0x008F Neither defined nor reserved +0x0090 Neither defined nor reserved +0x0091 gd +0x0092 ku +0x0093 quc, reserved +0x0401 ar-SA +0x0402 bg-BG +0x0403 ca-ES +0x0404 zh-TW +0x0405 cs-CZ +0x0406 da-DK +0x0407 de-DE +0x0408 el-GR +0x0409 en-US +0x040A es-ES_tradnl +0x040B fi-FI +0x040C fr-FR +0x040D he-IL +0x040E hu-HU +0x040F is-IS +0x0410 it-IT +0x0411 ja-JP +0x0412 ko-KR +0x0413 nl-NL +0x0414 nb-NO +0x0415 pl-PL +0x0416 pt-BR +0x0417 rm-CH +0x0418 ro-RO +0x0419 ru-RU +0x041A hr-HR +0x041B sk-SK +0x041C sq-AL +0x041D sv-SE +0x041E th-TH +0x041F tr-TR +0x0420 ur-PK +0x0421 id-ID +0x0422 uk-UA +0x0423 be-BY +0x0424 sl-SI +0x0425 et-EE +0x0426 lv-LV +0x0427 lt-LT +0x0428 tg-Cyrl-TJ +0x0429 fa-IR +0x042A vi-VN +0x042B hy-AM +0x042C az-Latn-AZ +0x042D eu-ES +0x042E hsb-DE +0x042F mk-MK +0x0430 st-ZA +0x0431 ts-ZA +0x0432 tn-ZA +0x0433 ve-ZA +0x0434 xh-ZA +0x0435 zu-ZA +0x0436 af-ZA +0x0437 ka-GE +0x0438 fo-FO +0x0439 hi-IN +0x043A mt-MT +0x043B se-NO +0x043D yi-001 +0x043E ms-MY +0x043F kk-KZ +0x0440 ky-KG +0x0441 sw-KE +0x0442 tk-TM +0x0443 uz-Latn-UZ +0x0444 tt-RU +0x0445 bn-IN +0x0446 pa-IN +0x0447 gu-IN +0x0448 or-IN +0x0449 ta-IN +0x044A te-IN +0x044B kn-IN +0x044C ml-IN +0x044D as-IN +0x044E mr-IN +0x044F sa-IN +0x0450 mn-MN +0x0451 bo-CN +0x0452 cy-GB +0x0453 km-KH +0x0454 lo-LA +0x0455 my-MM +0x0456 gl-ES +0x0457 kok-IN +0x0458 mni-IN, reserved +0x0459 sd-Deva-IN, reserved +0x045A syr-SY +0x045B si-LK +0x045C chr-Cher-US +0x045D iu-Cans-CA +0x045E am-ET +0x045F tzm-Arab-MA +0x0460 ks-Arab +0x0461 ne-NP +0x0462 fy-NL +0x0463 ps-AF +0x0464 fil-PH +0x0465 dv-MV +0x0466 bin-NG, reserved +0x0467 ff-NG, ff-Latn-NG +0x0468 ha-Latn-NG +0x0469 ibb-NG, reserved +0x046A yo-NG +0x046B quz-BO +0x046C nso-ZA +0x046D ba-RU +0x046E lb-LU +0x046F kl-GL +0x0470 ig-NG +0x0471 kr-Latn-NG +0x0472 om-ET +0x0473 ti-ET +0x0474 gn-PY +0x0475 haw-US 
+0x0476 la-VA +0x0477 so-SO +0x0478 ii-CN +0x0479 pap-029, reserved +0x047A arn-CL +0x047C moh-CA +0x047E br-FR +0x0480 ug-CN +0x0481 mi-NZ +0x0482 oc-FR +0x0483 co-FR +0x0484 gsw-FR +0x0485 sah-RU +0x0486 qut-GT, reserved +0x0487 rw-RW +0x0488 wo-SN +0x048C prs-AF +0x048D plt-MG, reserved +0x048E zh-yue-HK, reserved +0x048F tdd-Tale-CN, reserved +0x0490 khb-Talu-CN, reserved +0x0491 gd-GB +0x0492 ku-Arab-IQ +0x0493 quc-CO, reserved +0x0501 qps-ploc +0x05FE qps-ploca +0x0801 ar-IQ +0x0803 ca-ES-valencia +0x0804 zh-CN +0x0807 de-CH +0x0809 en-GB +0x080A es-MX +0x080C fr-BE +0x0810 it-CH +0x0811 ja-Ploc-JP, reserved +0x0813 nl-BE +0x0814 nn-NO +0x0816 pt-PT +0x0818 ro-MD +0x0819 ru-MD +0x081A sr-Latn-CS +0x081D sv-FI +0x0820 ur-IN +0x0827 Neither defined nor reserved +0x082C az-Cyrl-AZ, reserved +0x082E dsb-DE +0x0832 tn-BW +0x083B se-SE +0x083C ga-IE +0x083E ms-BN +0x083F kk-Latn-KZ, reserved +0x0843 uz-Cyrl-UZ, reserved +0x0845 bn-BD +0x0846 pa-Arab-PK +0x0849 ta-LK +0x0850 mn-Mong-CN, reserved +0x0851 bo-BT, reserved +0x0859 sd-Arab-PK +0x085D iu-Latn-CA +0x085F tzm-Latn-DZ +0x0860 ks-Deva-IN +0x0861 ne-IN +0x0867 ff-Latn-SN +0x086B quz-EC +0x0873 ti-ER +0x09FF qps-plocm +0x0C01 ar-EG +0x0C04 zh-HK +0x0C07 de-AT +0x0C09 en-AU +0x0C0A es-ES +0x0C0C fr-CA +0x0C1A sr-Cyrl-CS +0x0C3B se-FI +0x0C50 mn-Mong-MN +0x0C51 dz-BT +0x0C5F tmz-MA, reserved +0x0C6B quz-PE +0x1001 ar-LY +0x1004 zh-SG +0x1007 de-LU +0x1009 en-CA +0x100A es-GT +0x100C fr-CH +0x101A hr-BA +0x103B smj-NO +0x105F tzm-Tfng-MA +0x1401 ar-DZ +0x1404 zh-MO +0x1407 de-LI +0x1409 en-NZ +0x140A es-CR +0x140C fr-LU +0x141A bs-Latn-BA +0x143B smj-SE +0x1801 ar-MA +0x1809 en-IE +0x180A es-PA +0x180C fr-MC +0x181A sr-Latn-BA +0x183B sma-NO +0x1C01 ar-TN +0x1C09 en-ZA +0x1C0A es-DO +0x1C0C fr-029 +0x1C1A sr-Cyrl-BA +0x1C3B sma-SE +0x2001 ar-OM +0x2008 Neither defined nor reserved +0x2009 en-JM +0x200A es-VE +0x200C fr-RE +0x201A bs-Cyrl-BA +0x203B sms-FI +0x2401 ar-YE +0x2409 en-029, reserved +0x240A es-CO 
+0x240C fr-CD +0x241A sr-Latn-RS +0x243B smn-FI +0x2801 ar-SY +0x2809 en-BZ +0x280A es-PE +0x280C fr-SN +0x281A sr-Cyrl-RS +0x2C01 ar-JO +0x2C09 en-TT +0x2C0A es-AR +0x2C0C fr-CM +0x2C1A sr-Latn-ME +0x3001 ar-LB +0x3009 en-ZW +0x300A es-EC +0x300C fr-CI +0x301A sr-Cyrl-ME +0x3401 ar-KW +0x3409 en-PH +0x340A es-CL +0x340C fr-ML +0x3801 ar-AE +0x3809 en-ID, reserved +0x380A es-UY +0x380C fr-MA +0x3C01 ar-BH +0x3C09 en-HK +0x3C0A es-PY +0x3C0C fr-HT +0x4001 ar-QA +0x4009 en-IN +0x400A es-BO +0x4401 ar-Ploc-SA, reserved +0x4409 en-MY +0x440A es-SV +0x4801 ar-145, reserved +0x4809 en-SG +0x480A es-HN +0x4C09 en-AE +0x4C0A es-NI +0x5009 en-BH, reserved +0x500A es-PR +0x5409 en-EG, reserved +0x540A es-US +0x5809 en-JO, reserved +0x580A es-419, reserved +0x5C09 en-KW, reserved +0x5C0A es-CU +0x6009 en-TR, reserved +0x6409 en-YE, reserved +0x641A bs-Cyrl +0x681A bs-Latn +0x6C1A sr-Cyrl +0x701A sr-Latn +0x703B smn +0x742C az-Cyrl +0x743B sms +0x7804 zh +0x7814 nn +0x781A bs +0x782C az-Latn +0x783B sma +0x783F kk-Cyrl, reserved +0x7843 uz-Cyrl +0x7850 mn-Cyrl +0x785D iu-Cans +0x785F tzm-Tfng +0x7C04 zh-Hant +0x7C14 nb +0x7C1A sr +0x7C28 tg-Cyrl +0x7C2E dsb +0x7C3B smj +0x7C3F kk-Latn, reserved +0x7C43 uz-Latn +0x7C46 pa-Arab +0x7C50 mn-Mong +0x7C59 sd-Arab +0x7C5C chr-Cher +0x7C5D iu-Latn +0x7C5F tzm-Latn +0x7C67 ff-Latn +0x7C68 ha-Latn +0x7C92 ku-Arab +0xF2EE reserved +0xE40C fr-015, reserved +0xEEEE reserved diff --git a/i18nlangtag/source/isolang/insys.cxx b/i18nlangtag/source/isolang/insys.cxx new file mode 100644 index 000000000..1bfe10d68 --- /dev/null +++ b/i18nlangtag/source/isolang/insys.cxx @@ -0,0 +1,34 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#if defined(_WIN32) + +#include "inwnt.cxx" + +#elif defined(UNX) + +#include "inunx.cxx" + +#else + +#error unknown platform + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18nlangtag/source/isolang/inunx.cxx b/i18nlangtag/source/isolang/inunx.cxx new file mode 100644 index 000000000..4dd4cdb35 --- /dev/null +++ b/i18nlangtag/source/isolang/inunx.cxx @@ -0,0 +1,162 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . 
+ */ + +#include + +#ifdef MACOSX +#include +#include +#include +#include + +#else // MACOSX +#include +#endif // MACOSX + +#include +#include +#include + + +static LanguageType nImplSystemLanguage = LANGUAGE_DONTKNOW; +static LanguageType nImplSystemUILanguage = LANGUAGE_DONTKNOW; + + +// Get locale of category LC_CTYPE of environment variables +static const char* getLangFromEnvironment( bool& rbColonList ) +{ + static const char* const pFallback = "C"; + const char *pLang = nullptr; + + rbColonList = false; + pLang = getenv ( "LC_ALL" ); + if (! pLang || pLang[0] == 0) + pLang = getenv ( "LC_CTYPE" ); + if (! pLang || pLang[0] == 0) + pLang = getenv( "LANG" ); + if (! pLang || pLang[0] == 0) + pLang = pFallback; + + return pLang; +} + + +// Get locale of category LC_MESSAGES of environment variables +static const char* getUILangFromEnvironment( bool& rbColonList ) +{ + static const char* const pFallback = "C"; + const char *pLang = nullptr; + + rbColonList = true; + pLang = getenv ( "LANGUAGE" ); // respect the GNU extension + if (! pLang || pLang[0] == 0) + { + rbColonList = false; + pLang = getenv ( "LC_ALL" ); + } + if (! pLang || pLang[0] == 0) + pLang = getenv ( "LC_MESSAGES" ); + if (! pLang || pLang[0] == 0) + pLang = getenv( "LANG" ); + if (! 
pLang || pLang[0] == 0) + pLang = pFallback; + + return pLang; +} + + +typedef const char * (*getLangFromEnv)( bool& rbColonList ); + +static void getPlatformSystemLanguageImpl( LanguageType& rSystemLanguage, + getLangFromEnv pGetLangFromEnv ) +{ + /* get the language from the user environment */ + LanguageType nLang = rSystemLanguage; + if ( nLang != LANGUAGE_DONTKNOW ) + return; + + ::osl::MutexGuard aGuard( ::osl::Mutex::getGlobalMutex()); + nLang = rSystemLanguage; + if ( nLang == LANGUAGE_DONTKNOW ) + { +#ifdef MACOSX + rtl_Locale *procLocale; + (void) pGetLangFromEnv; /* unused */ + + if ( osl_getProcessLocale(&procLocale) == osl_Process_E_None ) + { + nLang = LanguageTag( *procLocale ).makeFallback().getLanguageType(); + OSL_DOUBLE_CHECKED_LOCKING_MEMORY_BARRIER(); + rSystemLanguage = nLang; +#ifdef DEBUG + if ( rSystemLanguage == LANGUAGE_DONTKNOW ) + fprintf( stderr, "intnunx.cxx: failed to convert osl_getProcessLocale() language to system language.\n" ); +#endif + } +#else /* MACOSX */ + bool bColonList = false; + OString aUnxLang( pGetLangFromEnv( bColonList)); + if (bColonList) + { + // Only a very simple "take first". If empty try second or keep empty. 
+ sal_Int32 n = aUnxLang.indexOf(':'); + if (n >= 0) + { + sal_Int32 s = 0; + if (n == 0 && aUnxLang.getLength() > 1) + { + n = aUnxLang.indexOf(':', 1); + if (n < 0) + n = aUnxLang.getLength(); + if (n < 2) + s = n = 0; + else + { + s = 1; + --n; + } + } + aUnxLang = aUnxLang.copy(s,n); + } + } + nLang = MsLangId::convertUnxByteStringToLanguage( aUnxLang ); + OSL_DOUBLE_CHECKED_LOCKING_MEMORY_BARRIER(); + rSystemLanguage = nLang; +#endif /* MACOSX */ + } + else { + OSL_DOUBLE_CHECKED_LOCKING_MEMORY_BARRIER(); + } +} + + +LanguageType MsLangId::getPlatformSystemLanguage() +{ + getPlatformSystemLanguageImpl( nImplSystemLanguage, &getLangFromEnvironment); + return nImplSystemLanguage; +} + + +LanguageType MsLangId::getPlatformSystemUILanguage() +{ + getPlatformSystemLanguageImpl( nImplSystemUILanguage, &getUILangFromEnvironment); + return nImplSystemUILanguage; +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18nlangtag/source/isolang/inwnt.cxx b/i18nlangtag/source/isolang/inwnt.cxx new file mode 100644 index 000000000..76fe58b04 --- /dev/null +++ b/i18nlangtag/source/isolang/inwnt.cxx @@ -0,0 +1,93 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + + +#include + +#include +#include +#include +#include +#include +#include + +static LanguageType nImplSystemLanguage = LANGUAGE_DONTKNOW; +static LanguageType nImplSystemUILanguage = LANGUAGE_DONTKNOW; + + +static LanguageType GetSVLang( LANGID nWinLangId ) +{ + // No Translation, we work with the original MS code without the SORT_ID. + // So we can get never LANG-ID's from MS, which are currently not defined + // by us. + return LanguageType( static_cast<sal_uInt16>(nWinLangId & 0xffff)); +} + + +typedef LANGID (WINAPI *getLangFromEnv)(); + +static void getPlatformSystemLanguageImpl( LanguageType& rSystemLanguage, + getLangFromEnv pGetUserDefault, getLangFromEnv pGetSystemDefault ) +{ + LanguageType nLang = rSystemLanguage; + if ( nLang == LANGUAGE_DONTKNOW ) + { + ::osl::MutexGuard aGuard( ::osl::Mutex::getGlobalMutex()); + nLang = rSystemLanguage; + if ( nLang == LANGUAGE_DONTKNOW ) + { + LANGID nLangId; + + nLangId = pGetUserDefault(); + nLang = GetSVLang( nLangId ); + + if ( nLang == LANGUAGE_DONTKNOW ) + { + nLangId = pGetSystemDefault(); + nLang = GetSVLang( nLangId ); + } + OSL_DOUBLE_CHECKED_LOCKING_MEMORY_BARRIER(); + rSystemLanguage = nLang; + } + else + { + OSL_DOUBLE_CHECKED_LOCKING_MEMORY_BARRIER(); + } + } +} + + +LanguageType MsLangId::getPlatformSystemLanguage() +{ + getPlatformSystemLanguageImpl( nImplSystemLanguage, + &GetUserDefaultLangID, &GetSystemDefaultLangID); + return nImplSystemLanguage; +} + + +LanguageType MsLangId::getPlatformSystemUILanguage() +{ + // TODO: this could be distinguished, #if(WINVER >= 0x0500) + // needs _run_ time differentiation though, not at compile time. 
+ getPlatformSystemLanguageImpl( nImplSystemUILanguage, + &GetUserDefaultUILanguage, &GetSystemDefaultUILanguage); + return nImplSystemUILanguage; +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18nlangtag/source/isolang/isolang.cxx b/i18nlangtag/source/isolang/isolang.cxx new file mode 100644 index 000000000..d20014e67 --- /dev/null +++ b/i18nlangtag/source/isolang/isolang.cxx @@ -0,0 +1,1548 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include +#include +#include + +#include +#include + +using namespace com::sun::star; + + +constexpr LanguageType k0(0); +constexpr LanguageType kSAME(0xffff); + +namespace { +LanguageType getOverrideLang( LanguageType nLang, LanguageType nOverride ) +{ + return nOverride ? ((nOverride == kSAME) ? nLang : nOverride) : nLang; +} +} + +/* Usage of override mechanism: + * If a table entry's mnOverride is not 0, an override entry with an mnLang + * value of (mnOverride==kSAME ? mnLang : mnOverride) exists that should be + * used instead. 
There MUST exist one such entry that does not have an + * mnOverride value and within one table it MUST be located before any entry + * with that mnLang and an mnOverride value of not 0. Usually kSAME is used as + * override value, with rare exceptions, see tables below. + * + * The override serves these purposes: + * - With getOverride() it indicates that there is a different language tag + * (locale) that the requested language tag should be "canonicalized" to. + * - With lookupFallbackLocale() a locale may be returned where the language + * tag differs. + * - With convertLanguageToLocaleImpl() and bIgnoreOverride=false the override + * is followed and the override locale returned. + * - With convertLocaleToLanguageImpl() a different LangID may be returned in + * rare cases where the actual mapped ID differs. + */ + +struct IsoLanguageCountryEntry +{ + LanguageType mnLang; + char maLanguage[4]; + char maCountry[3]; + LanguageType mnOverride; + + /** Obtain a language tag string with '-' separator. */ + OUString getTagString() const; + + /** Obtain a locale. */ + css::lang::Locale getLocale() const; +}; + +struct IsoLanguageScriptCountryEntry +{ + LanguageType mnLang; + char maLanguageScript[9]; ///< "ll-Ssss" or "lll-Ssss" + char maCountry[3]; + LanguageType mnOverride; + + /** Obtain a language tag string with '-' separator. */ + OUString getTagString() const; + + /** Obtain a locale. */ + css::lang::Locale getLocale() const; + + /** If rStr starts with maLanguageScript ignoring case. + + We don't have OUString::startsWithIgnoreAsciiCaseAscii() + */ + bool startsInIgnoreAsciiCase( const OUString & rStr ) const; +}; + +struct Bcp47CountryEntry +{ + LanguageType mnLang; + const char* mpBcp47; + char maCountry[3]; + const char* mpFallback; + LanguageType mnOverride; + + /** Obtain a language tag string with '-' separator. */ + OUString getTagString() const; + + /** Obtain a locale. 
*/ + css::lang::Locale getLocale() const; +}; + +namespace { + +struct IsoLangEngEntry +{ + LanguageType mnLang; + char maCountry[3]; +}; + +struct IsoLangNoneStdEntry +{ + LanguageType mnLang; + char maLanguage[4]; + char maCountry[9]; +}; + +struct IsoLangOtherEntry +{ + LanguageType mnLang; + const char* mpLanguage; +}; + +} + +// Entries for languages are lower case, for countries upper case, as +// recommended by rfc5646 (obsoletes rfc4646 (obsoletes rfc3066 (obsoletes +// rfc1766))). convertIsoNamesToLanguage(), convertLocaleToLanguageImpl() +// and lookupFallbackLocale() are case insensitive. +// +// Sort order: Most used first and within one language the default fallback +// locale of that language first. +// +// The default entry for a LangID <-> ISO mapping has to be first. For +// conversion of legacy mappings one LangID can map to multiple ISO codes +// except if the LangID is primary-only, and one ISO code combination can map +// to multiple LangIDs. + +/* Currently (2013-08-29 and 2021-10-24) only these primary LangID are still + * used literally in code: + * LANGUAGE_ENGLISH LANGUAGE_ARABIC_PRIMARY_ONLY + */ + +IsoLanguageCountryEntry const aImplIsoLangEntries[] = +{ + // MS-LANGID codes, ISO639-1/2/3, ISO3166, override + { LANGUAGE_ENGLISH_US, "en", "US", k0 }, + { LANGUAGE_ENGLISH_UK, "en", "GB", k0 }, + { LANGUAGE_ENGLISH, "en", "" , k0 }, + { LANGUAGE_ENGLISH_AUS, "en", "AU", k0 }, + { LANGUAGE_ENGLISH_CAN, "en", "CA", k0 }, + { LANGUAGE_FRENCH, "fr", "FR", k0 }, + { LANGUAGE_GERMAN, "de", "DE", k0 }, + { LANGUAGE_ITALIAN, "it", "IT", k0 }, + { LANGUAGE_DUTCH, "nl", "NL", k0 }, + { LANGUAGE_SPANISH_MODERN, "es", "ES", k0 }, + { LANGUAGE_PORTUGUESE, "pt", "PT", k0 }, + { LANGUAGE_PORTUGUESE_BRAZILIAN, "pt", "BR", k0 }, + { LANGUAGE_DANISH, "da", "DK", k0 }, + { LANGUAGE_GREEK, "el", "GR", k0 }, + { LANGUAGE_CHINESE_SIMPLIFIED, "zh", "CN", k0 }, + { LANGUAGE_CHINESE_SIMPLIFIED_LEGACY, "zh", "CN", k0 }, + { LANGUAGE_CHINESE_SIMPLIFIED, "cmn", "CN", 
kSAME }, + { LANGUAGE_CHINESE_TRADITIONAL, "zh", "TW", k0 }, + { LANGUAGE_CHINESE_TRADITIONAL, "cmn", "TW", kSAME }, + { LANGUAGE_CHINESE_HONGKONG, "zh", "HK", k0 }, + { LANGUAGE_CHINESE_SINGAPORE, "zh", "SG", k0 }, + { LANGUAGE_CHINESE_MACAU, "zh", "MO", k0 }, + { LANGUAGE_CHINESE_LSO, "zh", "" , k0 }, + { LANGUAGE_YUE_CHINESE_HONGKONG, "yue", "HK", k0 }, + { LANGUAGE_ENGLISH_HONG_KONG_SAR, "en", "HK", k0 }, + { LANGUAGE_JAPANESE, "ja", "JP", k0 }, + { LANGUAGE_KOREAN, "ko", "KR", k0 }, + { LANGUAGE_KOREAN_JOHAB, "ko", "KR", k0 }, + { LANGUAGE_USER_KOREAN_NORTH, "ko", "KP", k0 }, + { LANGUAGE_SWEDISH, "sv", "SE", k0 }, + { LANGUAGE_SWEDISH_FINLAND, "sv", "FI", k0 }, + { LANGUAGE_FINNISH, "fi", "FI", k0 }, + { LANGUAGE_RUSSIAN, "ru", "RU", k0 }, + { LANGUAGE_TATAR, "tt", "RU", k0 }, + { LANGUAGE_ENGLISH_NZ, "en", "NZ", k0 }, + { LANGUAGE_ENGLISH_EIRE, "en", "IE", k0 }, + { LANGUAGE_DUTCH_BELGIAN, "nl", "BE", k0 }, + { LANGUAGE_FRENCH_BELGIAN, "fr", "BE", k0 }, + { LANGUAGE_FRENCH_CANADIAN, "fr", "CA", k0 }, + { LANGUAGE_FRENCH_SWISS, "fr", "CH", k0 }, + { LANGUAGE_GERMAN_SWISS, "de", "CH", k0 }, + { LANGUAGE_GERMAN_AUSTRIAN, "de", "AT", k0 }, + { LANGUAGE_ITALIAN_SWISS, "it", "CH", k0 }, + { LANGUAGE_ALBANIAN, "sq", "AL", k0 }, + { LANGUAGE_ARABIC_SAUDI_ARABIA, "ar", "SA", k0 }, + { LANGUAGE_ARABIC_EGYPT, "ar", "EG", k0 }, + { LANGUAGE_ARABIC_UAE, "ar", "AE", k0 }, + { LANGUAGE_ARABIC_IRAQ, "ar", "IQ", k0 }, + { LANGUAGE_ARABIC_LIBYA, "ar", "LY", k0 }, + { LANGUAGE_ARABIC_ALGERIA, "ar", "DZ", k0 }, + { LANGUAGE_ARABIC_MOROCCO, "ar", "MA", k0 }, + { LANGUAGE_ARABIC_TUNISIA, "ar", "TN", k0 }, + { LANGUAGE_ARABIC_OMAN, "ar", "OM", k0 }, + { LANGUAGE_ARABIC_YEMEN, "ar", "YE", k0 }, + { LANGUAGE_ARABIC_SYRIA, "ar", "SY", k0 }, + { LANGUAGE_ARABIC_JORDAN, "ar", "JO", k0 }, + { LANGUAGE_ARABIC_LEBANON, "ar", "LB", k0 }, + { LANGUAGE_ARABIC_KUWAIT, "ar", "KW", k0 }, + { LANGUAGE_ARABIC_BAHRAIN, "ar", "BH", k0 }, + { LANGUAGE_ARABIC_QATAR, "ar", "QA", k0 }, + { 
LANGUAGE_USER_ARABIC_CHAD, "ar", "TD", k0 }, + { LANGUAGE_USER_ARABIC_COMOROS, "ar", "KM", k0 }, + { LANGUAGE_USER_ARABIC_DJIBOUTI, "ar", "DJ", k0 }, + { LANGUAGE_USER_ARABIC_ERITREA, "ar", "ER", k0 }, + { LANGUAGE_USER_ARABIC_ISRAEL, "ar", "IL", k0 }, + { LANGUAGE_USER_ARABIC_MAURITANIA, "ar", "MR", k0 }, + { LANGUAGE_USER_ARABIC_PALESTINE, "ar", "PS", k0 }, + { LANGUAGE_USER_ARABIC_SOMALIA, "ar", "SO", k0 }, + { LANGUAGE_USER_ARABIC_SUDAN, "ar", "SD", k0 }, + { LANGUAGE_ARABIC_PRIMARY_ONLY, "ar", "" , k0 }, + { LANGUAGE_BASQUE, "eu", "ES", k0 }, + { LANGUAGE_BASQUE, "eu", "" , kSAME }, // our earlier definition + { LANGUAGE_BULGARIAN, "bg", "BG", k0 }, + { LANGUAGE_CZECH, "cs", "CZ", k0 }, + { LANGUAGE_CZECH, "cz", "" , kSAME }, + { LANGUAGE_ENGLISH_JAMAICA, "en", "JM", k0 }, + { LANGUAGE_ENGLISH_CARIBBEAN, "en", "BS", k0 }, // not 100%, because AG is Bahamas + { LANGUAGE_ENGLISH_BELIZE, "en", "BZ", k0 }, + { LANGUAGE_ENGLISH_TRINIDAD, "en", "TT", k0 }, + { LANGUAGE_ENGLISH_ZIMBABWE, "en", "ZW", k0 }, + { LANGUAGE_ENGLISH_INDONESIA, "en", "ID", k0 }, // MS reserved + { LANGUAGE_ESTONIAN, "et", "EE", k0 }, + { LANGUAGE_FAEROESE, "fo", "FO", k0 }, + { LANGUAGE_FARSI, "fa", "IR", k0 }, + { LANGUAGE_FRENCH_LUXEMBOURG, "fr", "LU", k0 }, + { LANGUAGE_FRENCH_MONACO, "fr", "MC", k0 }, + { LANGUAGE_GERMAN_LUXEMBOURG, "de", "LU", k0 }, + { LANGUAGE_GERMAN_LIECHTENSTEIN, "de", "LI", k0 }, + { LANGUAGE_HEBREW, "he", "IL", k0 }, // new: old was "iw" + { LANGUAGE_HEBREW, "iw", "IL", kSAME }, // old: new is "he" + { LANGUAGE_HUNGARIAN, "hu", "HU", k0 }, + { LANGUAGE_ICELANDIC, "is", "IS", k0 }, + { LANGUAGE_INDONESIAN, "id", "ID", k0 }, // new: old was "in" + { LANGUAGE_INDONESIAN, "in", "ID", kSAME }, // old: new is "id" + { LANGUAGE_NORWEGIAN, "no", "NO", k0 }, + { LANGUAGE_NORWEGIAN_BOKMAL, "nb", "NO", k0 }, + { LANGUAGE_NORWEGIAN_BOKMAL_LSO, "nb", "" , k0 }, + { LANGUAGE_NORWEGIAN_NYNORSK, "nn", "NO", k0 }, + { LANGUAGE_NORWEGIAN_NYNORSK_LSO, "nn", "" , k0 }, + { 
LANGUAGE_POLISH, "pl", "PL", k0 }, + { LANGUAGE_RHAETO_ROMAN, "rm", "CH", k0 }, + { LANGUAGE_ROMANIAN, "ro", "RO", k0 }, + { LANGUAGE_ROMANIAN_MOLDOVA, "ro", "MD", k0 }, + { LANGUAGE_SLOVAK, "sk", "SK", k0 }, + { LANGUAGE_SLOVENIAN, "sl", "SI", k0 }, + { LANGUAGE_SPANISH_MEXICAN, "es", "MX", k0 }, + { LANGUAGE_SPANISH_GUATEMALA, "es", "GT", k0 }, + { LANGUAGE_SPANISH_COSTARICA, "es", "CR", k0 }, + { LANGUAGE_SPANISH_PANAMA, "es", "PA", k0 }, + { LANGUAGE_SPANISH_DOMINICAN_REPUBLIC, "es", "DO", k0 }, + { LANGUAGE_SPANISH_VENEZUELA, "es", "VE", k0 }, + { LANGUAGE_SPANISH_COLOMBIA, "es", "CO", k0 }, + { LANGUAGE_SPANISH_PERU, "es", "PE", k0 }, + { LANGUAGE_SPANISH_ARGENTINA, "es", "AR", k0 }, + { LANGUAGE_SPANISH_ECUADOR, "es", "EC", k0 }, + { LANGUAGE_SPANISH_CHILE, "es", "CL", k0 }, + { LANGUAGE_SPANISH_URUGUAY, "es", "UY", k0 }, + { LANGUAGE_SPANISH_PARAGUAY, "es", "PY", k0 }, + { LANGUAGE_SPANISH_BOLIVIA, "es", "BO", k0 }, + { LANGUAGE_SPANISH_EL_SALVADOR, "es", "SV", k0 }, + { LANGUAGE_SPANISH_HONDURAS, "es", "HN", k0 }, + { LANGUAGE_SPANISH_NICARAGUA, "es", "NI", k0 }, + { LANGUAGE_SPANISH_PUERTO_RICO, "es", "PR", k0 }, + { LANGUAGE_SPANISH_UNITED_STATES, "es", "US", k0 }, + { LANGUAGE_SPANISH_LATIN_AMERICA, "es", "" , k0 }, + { LANGUAGE_TURKISH, "tr", "TR", k0 }, + { LANGUAGE_UKRAINIAN, "uk", "UA", k0 }, + { LANGUAGE_VIETNAMESE, "vi", "VN", k0 }, + { LANGUAGE_LATVIAN, "lv", "LV", k0 }, + { LANGUAGE_MACEDONIAN, "mk", "MK", k0 }, + { LANGUAGE_MALAY_MALAYSIA, "ms", "MY", k0 }, + { LANGUAGE_MALAY_BRUNEI_DARUSSALAM, "ms", "BN", k0 }, + { LANGUAGE_ENGLISH_MALAYSIA, "en", "MY", k0 }, + { LANGUAGE_THAI, "th", "TH", k0 }, + { LANGUAGE_LITHUANIAN, "lt", "LT", k0 }, + { LANGUAGE_LITHUANIAN_CLASSIC, "lt", "LT", k0 }, + { LANGUAGE_CROATIAN, "hr", "HR", k0 }, // Croatian in Croatia + { LANGUAGE_CROATIAN_BOSNIA_HERZEGOVINA, "hr", "BA", k0 }, + { LANGUAGE_BOSNIAN_LATIN_BOSNIA_HERZEGOVINA, "bs", "BA", k0 }, + { LANGUAGE_BOSNIAN_LSO, "bs", "" , k0 }, // so what is 'bs' vs 
'bs-Latn'? + { LANGUAGE_SERBIAN_CYRILLIC_SERBIA, "sr", "RS", k0 }, // Serbian Cyrillic in Serbia + { LANGUAGE_OBSOLETE_USER_SERBIAN_CYRILLIC_SERBIA, "sr", "RS", k0 }, + { LANGUAGE_SERBIAN_CYRILLIC_SAM, "sr", "CS", k0 }, // Serbian Cyrillic in Serbia and Montenegro + { LANGUAGE_SERBIAN_CYRILLIC_SAM, "sr", "YU", kSAME }, // legacy Serbian Cyrillic in Serbia and Montenegro (former Yugoslavia); kludge, sr_CS not supported by ICU 2.6 (3.4 does) + { LANGUAGE_SERBIAN_CYRILLIC_MONTENEGRO, "sr", "ME", k0 }, + { LANGUAGE_OBSOLETE_USER_SERBIAN_CYRILLIC_MONTENEGRO, "sr", "ME", k0 }, + { LANGUAGE_SERBIAN_CYRILLIC_BOSNIA_HERZEGOVINA, "sr", "BA", k0 }, + { LANGUAGE_SERBIAN_CYRILLIC_LSO, "sr", "" , k0 }, + { LANGUAGE_SERBIAN_LATIN_SERBIA, "sh", "RS", kSAME }, // legacy kludge, is sr-Latn-RS now + { LANGUAGE_OBSOLETE_USER_SERBIAN_LATIN_SERBIA, "sh", "RS", kSAME }, // legacy kludge, is sr-Latn-RS now + { LANGUAGE_SERBIAN_LATIN_SAM, "sh", "CS", kSAME }, // legacy kludge, is sr-Latn-CS now + { LANGUAGE_SERBIAN_LATIN_SAM, "sh", "YU", kSAME }, // legacy kludge, is sr-Latn-YU now + { LANGUAGE_SERBIAN_LATIN_MONTENEGRO, "sh", "ME", kSAME }, // legacy kludge, is sr-Latn-ME now + { LANGUAGE_OBSOLETE_USER_SERBIAN_LATIN_MONTENEGRO, "sh", "ME", kSAME }, // legacy kludge, is sr-Latn-ME now + { LANGUAGE_SERBIAN_LATIN_BOSNIA_HERZEGOVINA, "sh", "BA", kSAME }, // legacy kludge, is sr-Latn-BA now + { LANGUAGE_SERBIAN_LATIN_LSO, "sh", "" , kSAME }, // legacy kludge, is sr-Latn now + { LANGUAGE_ARMENIAN, "hy", "AM", k0 }, + { LANGUAGE_USER_ARMENIAN_RUSSIA, "hy", "RU", k0 }, + { LANGUAGE_USER_ARMENIAN_IRAN, "hy", "IR", k0 }, + { LANGUAGE_AZERI_LATIN, "az", "AZ", k0 }, // macrolanguage code + { LANGUAGE_UZBEK_LATIN, "uz", "UZ", k0 }, // macrolanguage code + { LANGUAGE_UZBEK_LATIN_LSO, "uz", "" , k0 }, // macrolanguage code + { LANGUAGE_BENGALI_BANGLADESH, "bn", "BD", k0 }, + { LANGUAGE_BENGALI, "bn", "IN", k0 }, + { LANGUAGE_BURMESE, "my", "MM", k0 }, + { LANGUAGE_KAZAKH, "kk", "KZ", k0 }, + { 
LANGUAGE_ENGLISH_INDIA, "en", "IN", k0 }, + { LANGUAGE_URDU_INDIA, "ur", "IN", k0 }, + { LANGUAGE_URDU_PAKISTAN, "ur", "PK", k0 }, + { LANGUAGE_HINDI, "hi", "IN", k0 }, + { LANGUAGE_GUJARATI, "gu", "IN", k0 }, + { LANGUAGE_KANNADA, "kn", "IN", k0 }, + { LANGUAGE_ASSAMESE, "as", "IN", k0 }, + { LANGUAGE_KASHMIRI_INDIA, "ks", "IN", kSAME }, + { LANGUAGE_KASHMIRI, "ks", "" , kSAME }, // Kashmiri in "Jammu and Kashmir" ... no ISO3166 code for that + { LANGUAGE_MALAYALAM, "ml", "IN", k0 }, + { LANGUAGE_MANIPURI, "mni", "IN", k0 }, // MS reserved + { LANGUAGE_MARATHI, "mr", "IN", k0 }, + { LANGUAGE_KONKANI, "kok", "IN", k0 }, + { LANGUAGE_NEPALI, "ne", "NP", k0 }, + { LANGUAGE_NEPALI_INDIA, "ne", "IN", k0 }, + { LANGUAGE_ODIA, "or", "IN", k0 }, + { LANGUAGE_PUNJABI, "pa", "IN", k0 }, + { LANGUAGE_SANSKRIT, "sa", "IN", k0 }, + { LANGUAGE_TAMIL, "ta", "IN", k0 }, + { LANGUAGE_TAMIL_SRI_LANKA, "ta", "LK", k0 }, + { LANGUAGE_TELUGU, "te", "IN", k0 }, + { LANGUAGE_PUNJABI_PAKISTAN, "pnb", "PK", k0 }, + { LANGUAGE_PUNJABI_ARABIC_LSO, "pnb", "" , k0 }, + { LANGUAGE_PUNJABI_PAKISTAN, "lah", "PK", kSAME }, // macrolanguage code, earlier preferred 'lah' over 'pa' for Western Panjabi, now there is 'pnb' + { LANGUAGE_PUNJABI_PAKISTAN, "pa", "PK", kSAME }, // MS maps this to 'pa-Arab-PK', but 'pa'='pan' Eastern Panjabi is not used in PK, only in + { LANGUAGE_SINDHI_PAKISTAN, "sd", "PK", kSAME }, // Arabic script + { LANGUAGE_SINDHI, "sd", "IN", kSAME }, // Devanagari script + { LANGUAGE_BELARUSIAN, "be", "BY", k0 }, + { LANGUAGE_CATALAN, "ca", "ES", k0 }, // Spain (default) + { LANGUAGE_CATALAN, "ca", "AD", k0 }, // Andorra + //LANGUAGE_CATALAN_VALENCIAN ca-ES-valencia Bcp47CountryEntry takes precedence + { LANGUAGE_CATALAN_VALENCIAN, "ca", "XV", kSAME }, // XV: ISO 3166 user-assigned; old workaround for UI localization only, in case it escaped to document content + { LANGUAGE_CATALAN_VALENCIAN, "qcv", "ES", kSAME }, // qcv: ISO 639-3 reserved-for-local-use; old UI localization quirk 
only, in case it escaped to document content + { LANGUAGE_FRENCH_CAMEROON, "fr", "CM", k0 }, + { LANGUAGE_FRENCH_COTE_D_IVOIRE, "fr", "CI", k0 }, + { LANGUAGE_FRENCH_MALI, "fr", "ML", k0 }, + { LANGUAGE_FRENCH_SENEGAL, "fr", "SN", k0 }, + { LANGUAGE_FRENCH_ZAIRE, "fr", "CD", k0 }, // Democratic Republic Of Congo + { LANGUAGE_FRENCH_MOROCCO, "fr", "MA", k0 }, + { LANGUAGE_FRENCH_REUNION, "fr", "RE", k0 }, + { LANGUAGE_FRISIAN_NETHERLANDS, "fy", "NL", k0 }, + { LANGUAGE_GAELIC_IRELAND, "ga", "IE", k0 }, + { LANGUAGE_GAELIC_SCOTLAND, "gd", "GB", k0 }, + { LANGUAGE_GAELIC_SCOTLAND_LEGACY, "gd", "GB", k0 }, + { LANGUAGE_GALICIAN, "gl", "ES", k0 }, + { LANGUAGE_GEORGIAN, "ka", "GE", k0 }, + { LANGUAGE_KHMER, "km", "KH", k0 }, + { LANGUAGE_KIRGHIZ, "ky", "KG", k0 }, + { LANGUAGE_LAO, "lo", "LA", k0 }, + { LANGUAGE_MALTESE, "mt", "MT", k0 }, + { LANGUAGE_MONGOLIAN_CYRILLIC_MONGOLIA, "mn", "MN", k0 }, // macrolanguage code; should be khk-MN; Cyrillic script + { LANGUAGE_MONGOLIAN_CYRILLIC_LSO, "mn", "" , k0 }, // macrolanguage code; should be khk; Cyrillic script + { LANGUAGE_ROMANIAN_MOLDOVA, "ro", "MD", k0 }, + { LANGUAGE_ROMANIAN_MOLDOVA, "mo", "MD", k0 }, // mo-MD was associated with Russian Moldova LCID, apparently an error; 'mo' is retired, merged with 'ro', see http://www-01.sil.org/iso639-3/documentation.asp?id=mol + { LANGUAGE_RUSSIAN_MOLDOVA, "ru", "MD", k0 }, // as per [MS-LCID] rev. 7.0 2015-06-30 + { LANGUAGE_SWAHILI, "sw", "KE", k0 }, + { LANGUAGE_USER_SWAHILI_TANZANIA, "sw", "TZ", k0 }, + { LANGUAGE_TAJIK, "tg", "TJ", k0 }, + { LANGUAGE_TAJIK_LSO, "tg", "" , k0 }, + { LANGUAGE_TIBETAN, "bo", "CN", k0 }, // CN politically correct? 
+ { LANGUAGE_USER_TIBETAN_INDIA, "bo", "IN", k0 }, + { LANGUAGE_USER_TIBETAN_BHUTAN, "bo", "BT", k0 }, + { LANGUAGE_DZONGKHA_BHUTAN, "dz", "BT", k0 }, + { LANGUAGE_TIBETAN_BHUTAN, "dz", "BT", k0 }, // MS reserved for bo-BT, but LCID was used as Dzongkha, see #i53497# + { LANGUAGE_USER_DZONGKHA_MAP_LONLY, "dz", "" , k0 }, // because of the MS error, see lang.h + { LANGUAGE_TURKMEN, "tk", "TM", k0 }, + { LANGUAGE_WELSH, "cy", "GB", k0 }, + { LANGUAGE_SESOTHO, "st", "ZA", k0 }, + { LANGUAGE_SEPEDI, "nso", "ZA", k0 }, + { LANGUAGE_SEPEDI, "ns", "ZA", kSAME }, // fake "ns" for compatibility with existing OOo1.1.x localization to be able to read those documents + { LANGUAGE_TSONGA, "ts", "ZA", k0 }, + { LANGUAGE_TSWANA, "tn", "ZA", k0 }, + { LANGUAGE_ENGLISH_SAFRICA, "en", "ZA", k0 }, + { LANGUAGE_AFRIKAANS, "af", "ZA", k0 }, + { LANGUAGE_VENDA, "ve", "ZA", k0 }, // default 639-1 + { LANGUAGE_VENDA, "ven", "ZA", kSAME }, // 639-2 may have been used temporarily since 2004-07-23 + { LANGUAGE_XHOSA, "xh", "ZA", k0 }, + { LANGUAGE_ZULU, "zu", "ZA", k0 }, +// { LANGUAGE_QUECHUA_COLOMBIA, "quc", "CO", k0 }, // MS reserved, and looks wrong, quc would be in Guatemala, not Colombia + { LANGUAGE_QUECHUA_ECUADOR, "quz", "EC", k0 }, // MS + { LANGUAGE_QUECHUA_ECUADOR, "qu", "EC", kSAME }, // macrolanguage code + { LANGUAGE_QUECHUA_PERU, "quz", "PE", k0 }, // MS + { LANGUAGE_QUECHUA_PERU, "qu", "PE", kSAME }, // macrolanguage code + { LANGUAGE_QUECHUA_BOLIVIA, "qu", "BO", k0 }, // macrolanguage code, TODO instead: quh-BO or qul-BO; MS says quz-BO which is wrong + { LANGUAGE_PASHTO, "ps", "AF", k0 }, + { LANGUAGE_OROMO, "om", "ET", k0 }, + { LANGUAGE_DHIVEHI, "dv", "MV", k0 }, + { LANGUAGE_UIGHUR_CHINA, "ug", "CN", k0 }, + { LANGUAGE_TIGRIGNA_ETHIOPIA, "ti", "ET", k0 }, + { LANGUAGE_TIGRIGNA_ERITREA, "ti", "ER", k0 }, + { LANGUAGE_AMHARIC_ETHIOPIA, "am", "ET", k0 }, + { LANGUAGE_GUARANI_PARAGUAY, "gug", "PY", k0 }, + { LANGUAGE_HAWAIIAN_UNITED_STATES, "haw", "US", k0 }, + { 
LANGUAGE_EDO, "bin", "NG", k0 }, // MS reserved + { LANGUAGE_FULFULDE_NIGERIA, "ff", "NG", k0 }, // macrolanguage code; MS since rev.15 + { LANGUAGE_FULFULDE_NIGERIA, "fuv", "NG", kSAME }, // MS reserved until rev.15, since rev.15 "ff-NG" and "ff-Latn-NG" + { LANGUAGE_FULFULDE_SENEGAL, "ff", "SN", k0 }, // macrolanguage code + { LANGUAGE_HAUSA_NIGERIA, "ha", "NG", kSAME }, + { LANGUAGE_USER_HAUSA_GHANA, "ha", "GH", kSAME }, + { LANGUAGE_IGBO_NIGERIA, "ig", "NG", k0 }, + { LANGUAGE_KANURI_NIGERIA, "kr", "NG", k0 }, // macrolanguage code; MS reserved until rev.15 + { LANGUAGE_YORUBA, "yo", "NG", k0 }, + { LANGUAGE_SOMALI, "so", "SO", k0 }, + { LANGUAGE_PAPIAMENTU, "pap", "AN", k0 }, + { LANGUAGE_USER_PAPIAMENTU_ARUBA, "pap", "AW", k0 }, + { LANGUAGE_USER_PAPIAMENTU_CURACAO, "pap", "CW", k0 }, + { LANGUAGE_USER_PAPIAMENTU_BONAIRE, "pap", "BQ", k0 }, + { LANGUAGE_ENGLISH_SINGAPORE, "en", "SG", k0 }, + { LANGUAGE_USER_YIDDISH_US, "yi", "US", k0 }, + { LANGUAGE_USER_YIDDISH_ISRAEL, "yi", "IL", k0 }, // new: old was "ji" + { LANGUAGE_USER_YIDDISH_ISRAEL, "ji", "IL", kSAME }, // old: new is "yi" + { LANGUAGE_SYRIAC, "syr", "TR", k0 }, // "TR" according to http://www.ethnologue.com/show_language.asp?code=SYC + { LANGUAGE_SINHALESE_SRI_LANKA, "si", "LK", k0 }, + { LANGUAGE_CHEROKEE_UNITED_STATES, "chr", "US", kSAME }, + { LANGUAGE_INUKTITUT_LATIN_CANADA, "iu", "CA", kSAME }, // macrolanguage code + { LANGUAGE_INUKTITUT_LATIN_LSO, "iu", "" , kSAME }, // macrolanguage code + { LANGUAGE_SAMI_NORTHERN_NORWAY, "se", "NO", k0 }, + { LANGUAGE_SAMI_INARI, "smn", "FI", k0 }, + { LANGUAGE_SAMI_INARI_LSO, "smn", "" , k0 }, + { LANGUAGE_SAMI_LULE_NORWAY, "smj", "NO", k0 }, + { LANGUAGE_SAMI_LULE_SWEDEN, "smj", "SE", k0 }, + { LANGUAGE_SAMI_LULE_LSO, "smj", "" , k0 }, + { LANGUAGE_SAMI_NORTHERN_FINLAND, "se", "FI", k0 }, + { LANGUAGE_SAMI_NORTHERN_SWEDEN, "se", "SE", k0 }, + { LANGUAGE_SAMI_SKOLT, "sms", "FI", k0 }, + { LANGUAGE_SAMI_SKOLT_LSO, "sms", "" , k0 }, + { 
LANGUAGE_SAMI_SOUTHERN_NORWAY, "sma", "NO", k0 }, + { LANGUAGE_SAMI_SOUTHERN_SWEDEN, "sma", "SE", k0 }, + { LANGUAGE_SAMI_SOUTHERN_LSO, "sma", "" , k0 }, + { LANGUAGE_USER_SAMI_KILDIN_RUSSIA, "sjd", "RU", k0 }, + { LANGUAGE_MAPUDUNGUN_CHILE, "arn", "CL", k0 }, + { LANGUAGE_CORSICAN_FRANCE, "co", "FR", k0 }, + { LANGUAGE_ALSATIAN_FRANCE, "gsw", "FR", k0 }, // in fact 'gsw' is Schwyzerduetsch (Swiss German), which is a dialect of Alemannic German, as is Alsatian. They aren't distinct languages and share this code. + { LANGUAGE_YAKUT_RUSSIA, "sah", "RU", k0 }, + { LANGUAGE_MOHAWK_CANADA, "moh", "CA", k0 }, + { LANGUAGE_BASHKIR_RUSSIA, "ba", "RU", k0 }, + { LANGUAGE_KICHE_GUATEMALA, "qut", "GT", k0 }, // MS reserved since rev.15 + { LANGUAGE_DARI_AFGHANISTAN, "prs", "AF", k0 }, + { LANGUAGE_DARI_AFGHANISTAN, "gbz", "AF", kSAME }, // was an error + { LANGUAGE_WOLOF_SENEGAL, "wo", "SN", k0 }, + { LANGUAGE_FILIPINO, "fil", "PH", k0 }, + { LANGUAGE_USER_TAGALOG, "tl", "PH", k0 }, + { LANGUAGE_ENGLISH_PHILIPPINES, "en", "PH", k0 }, + { LANGUAGE_IBIBIO_NIGERIA, "ibb", "NG", k0 }, // MS reserved + { LANGUAGE_YI, "ii", "CN", k0 }, + { LANGUAGE_ENGLISH_ARAB_EMIRATES, "en", "AE", k0 }, + { LANGUAGE_ENGLISH_BAHRAIN, "en", "BH", k0 }, // MS reserved + { LANGUAGE_ENGLISH_EGYPT, "en", "EG", k0 }, // MS reserved + { LANGUAGE_ENGLISH_JORDAN, "en", "JO", k0 }, // MS reserved + { LANGUAGE_ENGLISH_KUWAIT, "en", "KW", k0 }, // MS reserved + { LANGUAGE_ENGLISH_TURKEY, "en", "TR", k0 }, // MS reserved + { LANGUAGE_ENGLISH_YEMEN, "en", "YE", k0 }, // MS reserved + { LANGUAGE_TAMAZIGHT_LATIN_ALGERIA, "kab", "DZ", k0 }, // In practice Kabyle is the language used for this + { LANGUAGE_OBSOLETE_USER_KABYLE, "kab", "DZ", k0 }, + { LANGUAGE_TAMAZIGHT_LATIN_ALGERIA, "ber", "DZ", kSAME }, // In practice Algeria has standardized on Kabyle as the member of the "ber" collective which gets used there. 
+ { LANGUAGE_TAMAZIGHT_TIFINAGH_MOROCCO, "tmz", "MA", kSAME }, + { LANGUAGE_TAMAZIGHT_MOROCCO, "tmz", "MA", k0 }, // MS reserved + { LANGUAGE_TAMAZIGHT_TIFINAGH_MOROCCO, "ber", "MA", kSAME }, // Morocco is officially using Tifinagh for its Berber languages, old kludge to distinguish from LANGUAGE_TAMAZIGHT_LATIN_ALGERIA + { LANGUAGE_LATIN, "la", "VA", k0 }, + { LANGUAGE_OBSOLETE_USER_LATIN_VATICAN, "la", "VA", LANGUAGE_LATIN }, + { LANGUAGE_OBSOLETE_USER_LATIN, "la", "VA", LANGUAGE_LATIN }, + { LANGUAGE_LATIN, "la", "" , kSAME }, + { LANGUAGE_USER_ESPERANTO, "eo", "" , k0 }, + { LANGUAGE_USER_INTERLINGUA, "ia", "" , k0 }, + { LANGUAGE_USER_INTERLINGUE, "ie", "" , k0 }, + { LANGUAGE_MAORI_NEW_ZEALAND, "mi", "NZ", k0 }, + { LANGUAGE_OBSOLETE_USER_MAORI, "mi", "NZ", k0 }, + { LANGUAGE_KINYARWANDA_RWANDA, "rw", "RW", k0 }, + { LANGUAGE_OBSOLETE_USER_KINYARWANDA, "rw", "RW", k0 }, + { LANGUAGE_UPPER_SORBIAN_GERMANY, "hsb", "DE", k0 }, // MS maps this to 'wen-DE', which is nonsense. 'wen' is a collective language code, 'WEN' is a SIL code, see http://www.ethnologue.com/14/show_iso639.asp?code=wen and http://www.ethnologue.com/14/show_language.asp?code=WEN + { LANGUAGE_OBSOLETE_USER_UPPER_SORBIAN,"hsb", "DE", k0 }, + { LANGUAGE_LOWER_SORBIAN_GERMANY, "dsb", "DE", k0 }, // MS maps this to 'wee-DE', which is nonsense. 
'WEE' is a SIL code, see http://www.ethnologue.com/14/show_language.asp?code=WEE + { LANGUAGE_LOWER_SORBIAN_LSO, "dsb", "" , k0 }, + { LANGUAGE_OBSOLETE_USER_LOWER_SORBIAN,"dsb", "DE", k0 }, + { LANGUAGE_OCCITAN_FRANCE, "oc", "FR", kSAME }, + { LANGUAGE_OBSOLETE_USER_OCCITAN, "oc", "FR", LANGUAGE_OCCITAN_FRANCE }, + { LANGUAGE_USER_KURDISH_TURKEY, "kmr", "TR", kSAME }, + { LANGUAGE_USER_KURDISH_TURKEY, "ku", "TR", kSAME }, + { LANGUAGE_USER_KURDISH_SYRIA, "kmr", "SY", kSAME }, + { LANGUAGE_USER_KURDISH_SYRIA, "ku", "SY", kSAME }, + { LANGUAGE_KURDISH_ARABIC_IRAQ, "ckb", "IQ", k0 }, + { LANGUAGE_KURDISH_ARABIC_IRAQ, "ku", "IQ", kSAME }, + { LANGUAGE_OBSOLETE_USER_KURDISH_IRAQ, "ku", "IQ", LANGUAGE_KURDISH_ARABIC_IRAQ }, + { LANGUAGE_USER_KURDISH_SOUTHERN_IRAN, "sdh", "IR", k0 }, + { LANGUAGE_USER_KURDISH_SOUTHERN_IRAQ, "sdh", "IQ", k0 }, + { LANGUAGE_USER_KURDISH_IRAN, "ckb", "IR", k0 }, + { LANGUAGE_USER_KURDISH_IRAN, "ku", "IR", kSAME }, + { LANGUAGE_KURDISH_ARABIC_LSO, "ckb", "" , k0 }, + { LANGUAGE_USER_SARDINIAN, "sc", "IT", k0 }, // macrolanguage code + { LANGUAGE_USER_SARDINIAN_CAMPIDANESE, "sro", "IT", k0 }, + { LANGUAGE_USER_SARDINIAN_GALLURESE, "sdn", "IT", k0 }, + { LANGUAGE_USER_SARDINIAN_LOGUDORESE, "src", "IT", k0 }, + { LANGUAGE_USER_SARDINIAN_SASSARESE, "sdc", "IT", k0 }, + { LANGUAGE_BRETON_FRANCE, "br", "FR", k0 }, + { LANGUAGE_OBSOLETE_USER_BRETON, "br", "FR", k0 }, + { LANGUAGE_KALAALLISUT_GREENLAND, "kl", "GL", k0 }, + { LANGUAGE_OBSOLETE_USER_KALAALLISUT, "kl", "GL", k0 }, + { LANGUAGE_USER_SWAZI, "ss", "ZA", k0 }, + { LANGUAGE_USER_NDEBELE_SOUTH, "nr", "ZA", k0 }, + { LANGUAGE_TSWANA_BOTSWANA, "tn", "BW", k0 }, + { LANGUAGE_OBSOLETE_USER_TSWANA_BOTSWANA, "tn", "BW", k0 }, + { LANGUAGE_USER_ENGLISH_BOTSWANA, "en", "BW", k0 }, + { LANGUAGE_USER_MOORE, "mos", "BF", k0 }, + { LANGUAGE_USER_BAMBARA, "bm", "ML", k0 }, + { LANGUAGE_USER_AKAN, "ak", "GH", k0 }, + { LANGUAGE_LUXEMBOURGISH_LUXEMBOURG, "lb", "LU", k0 }, + { 
LANGUAGE_OBSOLETE_USER_LUXEMBOURGISH, "lb", "LU", k0 }, + { LANGUAGE_USER_FRIULIAN, "fur", "IT", k0 }, + { LANGUAGE_USER_FIJIAN, "fj", "FJ", k0 }, + { LANGUAGE_USER_AFRIKAANS_NAMIBIA, "af", "NA", k0 }, + { LANGUAGE_USER_ENGLISH_NAMIBIA, "en", "NA", k0 }, + { LANGUAGE_USER_WALLOON, "wa", "BE", k0 }, + { LANGUAGE_USER_COPTIC, "cop", "EG", k0 }, + { LANGUAGE_USER_GASCON, "gsc", "FR", k0 }, + { LANGUAGE_USER_GERMAN_BELGIUM, "de", "BE", k0 }, + { LANGUAGE_USER_CHUVASH, "cv", "RU", k0 }, + { LANGUAGE_USER_EWE_GHANA, "ee", "GH", k0 }, + { LANGUAGE_USER_ENGLISH_GHANA, "en", "GH", k0 }, + { LANGUAGE_USER_SANGO, "sg", "CF", k0 }, + { LANGUAGE_USER_GANDA, "lg", "UG", k0 }, + { LANGUAGE_USER_LINGALA_DRCONGO, "ln", "CD", k0 }, + { LANGUAGE_USER_LOW_GERMAN, "nds", "DE", k0 }, + { LANGUAGE_USER_HILIGAYNON, "hil", "PH", k0 }, + { LANGUAGE_USER_ENGLISH_MALAWI, "en", "MW", k0 }, /* en default for MW */ + { LANGUAGE_USER_NYANJA, "ny", "MW", k0 }, + { LANGUAGE_USER_KASHUBIAN, "csb", "PL", k0 }, + { LANGUAGE_SPANISH_CUBA, "es", "CU", k0 }, + { LANGUAGE_OBSOLETE_USER_SPANISH_CUBA, "es", "CU", k0 }, + { LANGUAGE_USER_QUECHUA_NORTH_BOLIVIA, "qul", "BO", k0 }, + { LANGUAGE_USER_QUECHUA_SOUTH_BOLIVIA, "quh", "BO", k0 }, + { LANGUAGE_USER_BODO_INDIA, "brx", "IN", k0 }, + { LANGUAGE_USER_DOGRI_INDIA, "dgo", "IN", k0 }, + { LANGUAGE_USER_MAITHILI_INDIA, "mai", "IN", k0 }, + { LANGUAGE_USER_SANTALI_INDIA, "sat", "IN", k0 }, + { LANGUAGE_USER_TETUN, "tet", "ID", k0 }, + { LANGUAGE_USER_TETUN_TIMOR_LESTE, "tet", "TL", k0 }, + { LANGUAGE_USER_TOK_PISIN, "tpi", "PG", k0 }, + { LANGUAGE_USER_SHUSWAP, "shs", "CA", k0 }, + { LANGUAGE_USER_ANCIENT_GREEK, "grc", "GR", k0 }, + { LANGUAGE_USER_ASTURIAN, "ast", "ES", k0 }, + { LANGUAGE_USER_LATGALIAN, "ltg", "LV", k0 }, + { LANGUAGE_USER_MAORE, "swb", "YT", k0 }, + { LANGUAGE_USER_BUSHI, "buc", "YT", k0 }, + { LANGUAGE_USER_TAHITIAN, "ty", "PF", k0 }, + { LANGUAGE_MALAGASY_PLATEAU, "plt", "MG", k0 }, // MS reserved + { LANGUAGE_MALAGASY_PLATEAU, "mg", 
"MG", kSAME }, + { LANGUAGE_OBSOLETE_USER_MALAGASY_PLATEAU, "plt", "MG", k0 }, + { LANGUAGE_USER_BAFIA, "ksf", "CM", k0 }, + { LANGUAGE_USER_GIKUYU, "ki", "KE", k0 }, + { LANGUAGE_USER_RUSYN_UKRAINE, "rue", "UA", k0 }, + { LANGUAGE_USER_RUSYN_SLOVAKIA, "rue", "SK", k0 }, + { LANGUAGE_USER_LIMBU, "lif", "NP", k0 }, + { LANGUAGE_USER_LOJBAN, "jbo", "" , k0 }, + { LANGUAGE_USER_HAITIAN, "ht", "HT", k0 }, + { LANGUAGE_FRENCH_HAITI, "fr", "HT", k0 }, + { LANGUAGE_USER_BEEMBE, "beq", "CG", k0 }, + { LANGUAGE_USER_BEKWEL, "bkw", "CG", k0 }, + { LANGUAGE_USER_KITUBA, "mkw", "CG", k0 }, + { LANGUAGE_USER_LARI, "ldi", "CG", k0 }, + { LANGUAGE_USER_MBOCHI, "mdw", "CG", k0 }, + { LANGUAGE_USER_TEKE_EBOO, "ebo", "CG", k0 }, + { LANGUAGE_USER_TEKE_IBALI, "tek", "CG", k0 }, + { LANGUAGE_USER_TEKE_TYEE, "tyx", "CG", k0 }, + { LANGUAGE_USER_VILI, "vif", "CG", k0 }, + { LANGUAGE_USER_PORTUGUESE_ANGOLA, "pt", "AO", k0 }, + { LANGUAGE_USER_MANX, "gv", "GB", k0 }, + { LANGUAGE_USER_ARAGONESE, "an", "ES", k0 }, + { LANGUAGE_USER_KEYID, "qtz", "" , k0 }, // key id pseudolanguage used for UI testing + { LANGUAGE_USER_PALI_LATIN, "pli", "" , kSAME }, // Pali with Latin script, ISO 639-3 (sigh..) back-compat, Latin is not a default script though... 
+ { LANGUAGE_USER_KYRGYZ_CHINA, "ky", "CN", k0 }, + { LANGUAGE_USER_KOMI_ZYRIAN, "kpv", "RU", k0 }, + { LANGUAGE_USER_KOMI_PERMYAK, "koi", "RU", k0 }, + { LANGUAGE_USER_PITJANTJATJARA, "pjt", "AU", k0 }, + { LANGUAGE_USER_ERZYA, "myv", "RU", k0 }, + { LANGUAGE_USER_MARI_MEADOW, "mhr", "RU", k0 }, + { LANGUAGE_USER_KHANTY, "kca", "RU", k0 }, + { LANGUAGE_USER_LIVONIAN, "liv", "RU", k0 }, + { LANGUAGE_USER_MOKSHA, "mdf", "RU", k0 }, + { LANGUAGE_USER_MARI_HILL, "mrj", "RU", k0 }, + { LANGUAGE_USER_NGANASAN, "nio", "RU", k0 }, + { LANGUAGE_USER_OLONETS, "olo", "RU", k0 }, + { LANGUAGE_USER_VEPS, "vep", "RU", k0 }, + { LANGUAGE_USER_VORO, "vro", "EE", k0 }, + { LANGUAGE_USER_NENETS, "yrk", "RU", k0 }, + { LANGUAGE_USER_AKA, "axk", "CF", k0 }, + { LANGUAGE_USER_AKA_CONGO, "axk", "CG", k0 }, + { LANGUAGE_USER_DIBOLE, "bvx", "CG", k0 }, + { LANGUAGE_USER_DOONDO, "dde", "CG", k0 }, + { LANGUAGE_USER_KAAMBA, "xku", "CG", k0 }, + { LANGUAGE_USER_KOONGO, "kng", "CD", k0 }, + { LANGUAGE_USER_KOONGO_CONGO, "kng", "CG", k0 }, + { LANGUAGE_USER_KUNYI, "njx", "CG", k0 }, + { LANGUAGE_USER_NGUNGWEL, "ngz", "CG", k0 }, + { LANGUAGE_USER_NJYEM, "njy", "CM", k0 }, + { LANGUAGE_USER_NJYEM_CONGO, "njy", "CG", k0 }, + { LANGUAGE_USER_PUNU, "puu", "GA", k0 }, + { LANGUAGE_USER_PUNU_CONGO, "puu", "CG", k0 }, + { LANGUAGE_USER_SUUNDI, "sdj", "CG", k0 }, + { LANGUAGE_USER_TEKE_KUKUYA, "kkw", "CG", k0 }, + { LANGUAGE_USER_TSAANGI, "tsa", "CG", k0 }, + { LANGUAGE_USER_YAKA, "iyx", "CG", k0 }, + { LANGUAGE_USER_YOMBE, "yom", "CD", k0 }, + { LANGUAGE_USER_YOMBE_CONGO, "yom", "CG", k0 }, + { LANGUAGE_USER_SIDAMA, "sid", "ET", k0 }, + { LANGUAGE_USER_NKO, "nqo", "GN", k0 }, + { LANGUAGE_USER_UDMURT, "udm", "RU", k0 }, + { LANGUAGE_USER_CORNISH, "kw", "GB", k0 }, + { LANGUAGE_USER_CORNISH, "kw", "UK", kSAME }, // old erroneous tag + { LANGUAGE_USER_SAMI_PITE_SWEDEN, "sje", "SE", k0 }, + { LANGUAGE_USER_NGAEBERE, "gym", "PA", k0 }, + { LANGUAGE_USER_KUMYK, "kum", "RU", k0 }, + { LANGUAGE_USER_NOGAI, 
"nog", "RU", k0 }, + { LANGUAGE_USER_LADIN, "lld", "IT", k0 }, + { LANGUAGE_USER_FRENCH_BURKINA_FASO, "fr", "BF", k0 }, + { LANGUAGE_USER_PUINAVE, "pui", "CO", k0 }, + { LANGUAGE_USER_AVAR, "av", "RU", k0 }, + { LANGUAGE_USER_LENGO, "lgr", "SB", k0 }, + { LANGUAGE_USER_FRENCH_BENIN, "fr", "BJ", k0 }, + { LANGUAGE_USER_FRENCH_NIGER, "fr", "NE", k0 }, + { LANGUAGE_USER_FRENCH_TOGO, "fr", "TG", k0 }, + { LANGUAGE_USER_KVEN_FINNISH, "fkv", "NO", k0 }, + { LANGUAGE_USER_CHURCH_SLAVIC, "cu", "RU", k0 }, + { LANGUAGE_USER_VENETIAN, "vec", "IT", k0 }, + { LANGUAGE_USER_ENGLISH_GAMBIA, "en", "GM", k0 }, + { LANGUAGE_USER_OCCITAN_ARANESE, "oc", "ES", kSAME }, + { LANGUAGE_USER_ARPITAN_FRANCE, "frp", "FR", k0 }, + { LANGUAGE_USER_ARPITAN_ITALY, "frp", "IT", k0 }, + { LANGUAGE_USER_ARPITAN_SWITZERLAND, "frp", "CH", k0 }, + { LANGUAGE_USER_APATANI, "apt", "IN", k0 }, + { LANGUAGE_USER_ENGLISH_MAURITIUS, "en", "MU", k0 }, + { LANGUAGE_USER_FRENCH_MAURITIUS, "fr", "MU", k0 }, + { LANGUAGE_USER_SILESIAN, "szl", "PL", k0 }, + { LANGUAGE_USER_MANCHU, "mnc", "CN", k0 }, + { LANGUAGE_USER_XIBE, "sjo", "CN", k0 }, + { LANGUAGE_USER_KITUBA_DRCONGO, "ktu", "CD", k0 }, + { LANGUAGE_USER_FON, "fon", "BJ", k0 }, + { LANGUAGE_USER_PLAUTDIETSCH, "pdt", "CA", k0 }, + { LANGUAGE_USER_ARMENIAN_WESTERN, "hyw", "AM", k0 }, + { LANGUAGE_USER_ARMENIAN_CLASSICAL, "xcl", "AM", k0 }, + { LANGUAGE_USER_JUHOAN, "ktz", "NA", k0 }, + { LANGUAGE_USER_NARO, "nhr", "BW", k0 }, + { LANGUAGE_USER_ILOKO, "ilo", "PH", k0 }, + { LANGUAGE_USER_ENGLISH_ZAMBIA, "en", "ZM", k0 }, + { LANGUAGE_USER_ENGLISH_SRI_LANKA, "en", "LK", k0 }, + { LANGUAGE_USER_ENGLISH_NIGERIA, "en", "NG", k0 }, + { LANGUAGE_USER_KABARDIAN, "kbd", "RU", k0 }, // Cyrillic script + { LANGUAGE_USER_GUADELOUPEAN_CREOLE_FRENCH, "gcf", "GP", k0 }, + { LANGUAGE_USER_LIGURIAN, "lij", "IT", k0 }, + { LANGUAGE_USER_MINANGKABAU, "min", "ID", k0 }, + { LANGUAGE_USER_SUNDANESE, "sun", "ID", k0 }, + { LANGUAGE_USER_YAKA_DRCONGO, "yaf", "CD", k0 }, + { 
LANGUAGE_USER_ENGLISH_KENYA, "en", "KE", k0 },
+    { LANGUAGE_USER_CABECAR, "cjp", "CR", k0 },
+    { LANGUAGE_USER_BRIBRI, "bzd", "CR", k0 },
+    { LANGUAGE_USER_ENGLISH_DENMARK, "en", "DK", k0 },
+    { LANGUAGE_USER_SESOTHO_LESOTHO, "st", "LS", k0 },
+    { LANGUAGE_USER_KLINGON, "tlh", "" , k0 },
+    { LANGUAGE_USER_ENGLISH_ISRAEL, "en", "IL", k0 },
+    { LANGUAGE_USER_PENNSYLVANIA_DUTCH, "pdc", "US", k0 },
+    { LANGUAGE_MULTIPLE, "mul", "" , k0 }, // multiple languages, many languages are used
+    { LANGUAGE_UNDETERMINED, "und", "" , k0 }, // undetermined language, language cannot be identified
+    { LANGUAGE_NONE, "zxx", "" , k0 }, // added to ISO 639-2 on 2006-01-11: Used to declare the absence of linguistic information
+    { LANGUAGE_DONTKNOW, "", "" , k0 } // marks end of table
+};
+
+// Table of entries that pair an MS-LangID with a combined "language-script"
+// string (ISO 639 code plus ISO 15924 script, e.g. "sr-Latn"), an ISO 3166
+// country code (may be empty) and an override value. The same LangID and the
+// same tag may occur in several rows. The final LANGUAGE_DONTKNOW row with
+// empty strings terminates the table.
+// NOTE(review): the meaning of k0 / kSAME / an explicit LangID in the
+// override column is defined outside this excerpt -- confirm there.
+IsoLanguageScriptCountryEntry const aImplIsoLangScriptEntries[] =
+{
+    // MS-LangID, ISO639-ISO15924, ISO3166, override
+    { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", k0 },
+    { LANGUAGE_OBSOLETE_USER_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", k0 },
+    { LANGUAGE_SERBIAN_LATIN_MONTENEGRO, "sr-Latn", "ME", k0 },
+    { LANGUAGE_OBSOLETE_USER_SERBIAN_LATIN_MONTENEGRO,"sr-Latn", "ME", k0 },
+    { LANGUAGE_SERBIAN_LATIN_BOSNIA_HERZEGOVINA, "sr-Latn", "BA", k0 },
+    { LANGUAGE_SERBIAN_LATIN_SAM, "sr-Latn", "CS", k0 }, // Serbian Latin in Serbia and Montenegro; note that not all applications may know about the 'CS' code reuse mess, see https://en.wikipedia.org/wiki/ISO_3166-2:CS
+    { LANGUAGE_SERBIAN_LATIN_SAM, "sr-Latn", "YU", k0 }, // legacy Serbian Latin in Yugoslavia
+    { LANGUAGE_SERBIAN_LATIN_LSO, "sr-Latn", "" , k0 },
+    { LANGUAGE_SERBIAN_LATIN_NEUTRAL, "sr-Latn", "" , LANGUAGE_SERBIAN_LATIN_LSO }, // MS lists this as 'sr' only, what a mess
+    { LANGUAGE_SERBIAN_CYRILLIC_SERBIA, "sr-Cyrl", "RS", kSAME }, // MS
+    { LANGUAGE_SERBIAN_CYRILLIC_MONTENEGRO, "sr-Cyrl", "ME", kSAME }, // MS
+    { LANGUAGE_SERBIAN_CYRILLIC_BOSNIA_HERZEGOVINA, "sr-Cyrl", "BA", kSAME }, // MS
+    { LANGUAGE_SERBIAN_CYRILLIC_SAM, "sr-Cyrl", "CS", kSAME }, // MS
+    { LANGUAGE_SERBIAN_CYRILLIC_LSO, "sr-Cyrl", "" , kSAME }, // MS
+    { LANGUAGE_BOSNIAN_CYRILLIC_BOSNIA_HERZEGOVINA, "bs-Cyrl", "BA", k0 },
+    { LANGUAGE_BOSNIAN_CYRILLIC_LSO, "bs-Cyrl", "" , k0 },
+    { LANGUAGE_AZERI_CYRILLIC, "az-Cyrl", "AZ", k0 }, // macrolanguage code; MS reserved since rev.15
+    { LANGUAGE_AZERI_CYRILLIC_LSO, "az-Cyrl", "" , k0 }, // macrolanguage code
+    { LANGUAGE_UZBEK_CYRILLIC, "uz-Cyrl", "UZ", k0 }, // macrolanguage code; MS reserved since rev.15
+    { LANGUAGE_UZBEK_CYRILLIC_LSO, "uz-Cyrl", "" , k0 }, // macrolanguage code
+    { LANGUAGE_MONGOLIAN_CYRILLIC_MONGOLIA, "mn-Cyrl", "MN", k0 }, // macrolanguage code; should be khk-MN or khk-Cyrl-MN
+    { LANGUAGE_MONGOLIAN_CYRILLIC_LSO, "mn-Cyrl", "" , k0 }, // macrolanguage code; MS, should be khk or khk-Cyrl
+    { LANGUAGE_MONGOLIAN_MONGOLIAN_MONGOLIA, "mn-Mong", "MN", k0 }, // macrolanguage code; MS, should be khk-Mong-MN
+    { LANGUAGE_MONGOLIAN_MONGOLIAN_CHINA, "mn-Mong", "CN", k0 }, // macrolanguage code; MS reserved since rev.15; should actually be mvf-CN
+    { LANGUAGE_MONGOLIAN_MONGOLIAN_LSO, "mn-Mong", "" , k0 }, // macrolanguage code
+    { LANGUAGE_USER_PALI_LATIN, "pi-Latn", "" , k0 },
+    { LANGUAGE_USER_PALI_THAI, "pi-Thai", "" , k0 },
+    { LANGUAGE_USER_KARAKALPAK_LATIN, "kaa-Latn", "UZ", k0 },
+    { LANGUAGE_TAJIK, "tg-Cyrl", "TJ", k0 }, // MS
+    { LANGUAGE_TAJIK_LSO, "tg-Cyrl", "" , k0 }, // MS
+    { LANGUAGE_AZERI_LATIN, "az-Latn", "AZ", k0 }, // macrolanguage code; MS
+    { LANGUAGE_AZERI_LATIN_LSO, "az-Latn", "" , k0 }, // macrolanguage code; MS
+    { LANGUAGE_USER_YIDDISH_US, "yi-Hebr", "US", kSAME }, // macrolanguage code; MS, Hebr is suppress-script
+    { LANGUAGE_USER_YIDDISH_ISRAEL, "yi-Hebr", "IL", kSAME }, // macrolanguage code; MS, Hebr is suppress-script
+    { LANGUAGE_UZBEK_LATIN, "uz-Latn", "UZ", k0 }, // macrolanguage code
+    { LANGUAGE_UZBEK_LATIN_LSO, "uz-Latn", "" , k0 },
+    { LANGUAGE_SINDHI, "sd-Deva", "IN", k0 }, // MS reserved
+    { LANGUAGE_SINDHI_PAKISTAN, "sd-Arab", "PK", k0 }, // MS
+    { LANGUAGE_SINDHI_ARABIC_LSO, "sd-Arab", "" , k0 },
+    { LANGUAGE_CHEROKEE_UNITED_STATES, "chr-Cher", "US", k0 }, // MS
+    { LANGUAGE_CHEROKEE_CHEROKEE_LSO, "chr-Cher", "" , k0 },
+    { LANGUAGE_INUKTITUT_SYLLABICS_CANADA, "iu-Cans", "CA", k0 }, // macrolanguage code, MS
+    { LANGUAGE_INUKTITUT_SYLLABICS_LSO, "iu-Cans", "" , k0 }, // macrolanguage code, MS
+    { LANGUAGE_INUKTITUT_LATIN_CANADA, "iu-Latn", "CA", k0 }, // macrolanguage code, MS
+    { LANGUAGE_INUKTITUT_LATIN_LSO, "iu-Latn", "" , k0 }, // macrolanguage code, MS
+    { LANGUAGE_TAMAZIGHT_TIFINAGH_MOROCCO, "tzm-Tfng", "MA", k0 },
+    { LANGUAGE_TAMAZIGHT_TIFINAGH_LSO, "tzm-Tfng", "" , k0 },
+    { LANGUAGE_KASHMIRI_INDIA, "ks-Deva", "IN", k0 }, // MS since rev.15, earlier was "ks-Deva" reserved
+    { LANGUAGE_KASHMIRI, "ks-Arab", "" , k0 }, // MS, Kashmiri in "Jammu and Kashmir" ... no ISO3166 code for that
+    { LANGUAGE_HAUSA_NIGERIA, "ha-Latn", "NG", k0 }, // MS
+    { LANGUAGE_USER_HAUSA_GHANA, "ha-Latn", "GH", k0 },
+    { LANGUAGE_HAUSA_LATIN_LSO, "ha-Latn", "" , k0 },
+    { LANGUAGE_LATIN, "la-Latn", "" , kSAME }, // MS reserved until rev.15, though Latn is suppress-script, "la-VA" since rev.15
+    { LANGUAGE_TAI_NUA_CHINA, "tdd-Tale", "CN", k0 }, // MS reserved
+    { LANGUAGE_LU_CHINA, "khb-Talu", "CN", k0 }, // MS reserved
+    { LANGUAGE_KURDISH_ARABIC_IRAQ, "ku-Arab", "IQ", kSAME }, // macrolanguage code, MS
+    { LANGUAGE_KURDISH_ARABIC_LSO, "ku-Arab", "" , kSAME }, // macrolanguage code, MS
+    { LANGUAGE_USER_KURDISH_TURKEY, "kmr-Latn", "TR", k0 },
+    { LANGUAGE_USER_KURDISH_SYRIA, "kmr-Latn", "SY", k0 },
+    { LANGUAGE_PUNJABI_PAKISTAN, "pnb-Arab", "PK", k0 },
+    { LANGUAGE_PUNJABI_ARABIC_LSO, "pnb-Arab", "" , k0 },
+    { LANGUAGE_PUNJABI_PAKISTAN, "pa-Arab", "PK", k0 }, // MS, incorrect
+    { LANGUAGE_PUNJABI_ARABIC_LSO, "pa-Arab", "" , k0 }, // MS, incorrect
+    { LANGUAGE_TAMAZIGHT_LATIN_ALGERIA, "tzm-Latn", "DZ", kSAME }, // MS
+    { LANGUAGE_TAMAZIGHT_LATIN_LSO, "tzm-Latn", "" , k0 }, // MS
+    { LANGUAGE_FULFULDE_NIGERIA, "ff-Latn", "NG", kSAME }, // macrolanguage code; MS since rev.15
+    { LANGUAGE_FULFULDE_SENEGAL, "ff-Latn", "SN", k0 }, // macrolanguage code, MS
+    { LANGUAGE_FULFULDE_LATIN_LSO, "ff-Latn", "" , k0 }, // macrolanguage code
+    { LANGUAGE_BOSNIAN_LATIN_BOSNIA_HERZEGOVINA, "bs-Latn", "BA", kSAME }, // MS, though Latn is suppress-script
+    { LANGUAGE_BOSNIAN_LATIN_LSO, "bs-Latn", "" , LANGUAGE_BOSNIAN_LSO }, // MS, though Latn is suppress-script
+    { LANGUAGE_CHINESE_TRADITIONAL_LSO, "zh-Hant", "" , k0 },
+    { LANGUAGE_USER_MANINKAKAN_EASTERN_LATIN, "emk-Latn", "GN", k0 },
+    { LANGUAGE_USER_CREE_PLAINS_LATIN, "crk-Latn", "CA", k0 },
+    { LANGUAGE_USER_CREE_PLAINS_SYLLABICS, "crk-Cans", "CA", k0 },
+    { LANGUAGE_USER_CREE_PLAINS_LATIN, "crk-Latn", "CN", kSAME }, // erroneous tdf#73973
+    { LANGUAGE_USER_CREE_PLAINS_SYLLABICS, "crk-Cans", "CN", kSAME }, // erroneous tdf#73973
+    { LANGUAGE_USER_HUNGARIAN_ROVAS, "hu-Hung", "HU", k0 },
+    { LANGUAGE_USER_MALAY_ARABIC_MALAYSIA, "ms-Arab", "MY", k0 },
+    { LANGUAGE_USER_MALAY_ARABIC_BRUNEI, "ms-Arab", "BN", k0 },
+    { LANGUAGE_KAZAKH_LATIN, "kk-Latn", "KZ", k0 },
+    { LANGUAGE_KAZAKH_LATIN_LSO, "kk-Latn", "" , k0 }, // MS reserved
+    { LANGUAGE_KAZAKH_CYRILLIC_LSO, "kk-Cyrl", "" , k0 }, // MS reserved
+    { LANGUAGE_KANURI_NIGERIA, "kr-Latn", "NG", k0 }, // macrolanguage code; MS since rev.15
+    { LANGUAGE_TAMAZIGHT_ARABIC_MOROCCO, "tzm-Arab", "MA", k0 }, // MS since rev.15, was reserved
+    { LANGUAGE_DONTKNOW, "", "" , k0 } // marks end of table
+};
+
+// Table of entries keyed by a complete BCP 47 tag. Each row has five fields;
+// the final LANGUAGE_DONTKNOW row with empty strings terminates the table.
+Bcp47CountryEntry const aImplBcp47CountryEntries[] =
+{
+    // MS-LangID, full BCP47, ISO3166, ISO639-Variant or other fallback, override
+    // NOTE(review): the fifth column is presumably the override value, as in the tables above -- confirm against the Bcp47CountryEntry declaration.
+    { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia", k0 },
+    { LANGUAGE_OBSOLETE_USER_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "", k0 }, // In case MS format files using the old value escaped into the wild, map them back.
+ { LANGUAGE_USER_ENGLISH_UK_OXENDICT, "en-GB-oxendict", "GB", "", k0 }, + { LANGUAGE_USER_ENGLISH_UK_OED, "en-GB-oed", "GB", "", LANGUAGE_USER_ENGLISH_UK_OXENDICT }, // grandfathered, deprecated, prefer en-GB-oxendict + { LANGUAGE_SPANISH_DATED, "es-ES-u-co-trad", "ES", "es-u-co-trad", k0 }, // RFC6067/CLDR + { LANGUAGE_SPANISH_DATED, "es-ES_tradnl", "ES", "", kSAME }, // MS malformed + { LANGUAGE_OCCITAN_FRANCE, "oc-FR-lengadoc", "FR", "oc-lengadoc", k0 }, + { LANGUAGE_USER_OCCITAN_ARANESE, "oc-ES-aranes", "ES", "oc-aranes", k0 }, +// { LANGUAGE_YUE_CHINESE_HONGKONG, "zh-yue-HK", "HK", "", 0 }, // MS reserved, prefer yue-HK; do not add unless LanguageTag::simpleExtract() can handle it to not call liblangtag for rsc! + { LANGUAGE_YIDDISH, "yi-001", "", "", k0 }, // MS since rev.15, was "yi-Hebr" reserved, "001"="World" + { LANGUAGE_FRENCH_WEST_INDIES, "fr-029", "", "", k0 }, // MS since rev.15, was "Neither defined nor reserved", "029"="Caribbean" + { LANGUAGE_USER_INTERSLAVIC_LATIN, "art-Latn-x-interslv", "", "", k0 }, // see discussion in tdf#145853 + { LANGUAGE_USER_INTERSLAVIC_CYRILLIC, "art-Cyrl-x-interslv", "", "", k0 }, + { LANGUAGE_DONTKNOW, "", "", "", k0 } // marks end of table +}; + +const IsoLanguageCountryEntry aLastResortFallbackEntry = +{ LANGUAGE_ENGLISH_US, "en", "US", k0 }; + +OUString IsoLanguageCountryEntry::getTagString() const +{ + if (maCountry[0]) + return OUString( OUString::createFromAscii( maLanguage) + "-" + OUString::createFromAscii( maCountry)); + else + return OUString::createFromAscii( maLanguage); +} + +css::lang::Locale IsoLanguageCountryEntry::getLocale() const +{ + return lang::Locale( OUString::createFromAscii( maLanguage), OUString::createFromAscii( maCountry), OUString()); +} + +OUString IsoLanguageScriptCountryEntry::getTagString() const +{ + if (maCountry[0]) + return OUString( OUString::createFromAscii( maLanguageScript) + "-" + OUString::createFromAscii( maCountry)); + else + return OUString::createFromAscii( 
maLanguageScript); +} + +css::lang::Locale IsoLanguageScriptCountryEntry::getLocale() const +{ + return lang::Locale( I18NLANGTAG_QLT, OUString::createFromAscii( maCountry), getTagString()); +} + +bool IsoLanguageScriptCountryEntry::startsInIgnoreAsciiCase( const OUString & rStr ) const +{ + return rStr.matchIgnoreAsciiCaseAsciiL( maLanguageScript, strlen( maLanguageScript) ); +} + +OUString Bcp47CountryEntry::getTagString() const +{ + return OUString::createFromAscii( mpBcp47); +} + +css::lang::Locale Bcp47CountryEntry::getLocale() const +{ + return lang::Locale( I18NLANGTAG_QLT, OUString::createFromAscii( maCountry), getTagString()); +} + + +// In this table are the countries which should mapped to a specific +// english language +IsoLangEngEntry const aImplIsoLangEngEntries[] = +{ + { LANGUAGE_ENGLISH_UK, "AO" }, // Angola + { LANGUAGE_ENGLISH_UK, "BJ" }, // Benin + { LANGUAGE_ENGLISH_UK, "BW" }, // Botswana + { LANGUAGE_ENGLISH_UK, "BI" }, // Burundi + { LANGUAGE_ENGLISH_UK, "CM" }, // Cameroon + { LANGUAGE_ENGLISH_UK, "GA" }, // Gabon + { LANGUAGE_ENGLISH_UK, "GM" }, // Gambia + { LANGUAGE_ENGLISH_UK, "GH" }, // Ghana + { LANGUAGE_ENGLISH_UK, "GN" }, // Guinea + { LANGUAGE_ENGLISH_UK, "LS" }, // Lesotho + { LANGUAGE_ENGLISH_UK, "MW" }, // Malawi + { LANGUAGE_ENGLISH_UK, "MT" }, // Malta + { LANGUAGE_ENGLISH_UK, "NA" }, // Namibia + { LANGUAGE_ENGLISH_UK, "NG" }, // Nigeria + { LANGUAGE_ENGLISH_UK, "UG" }, // Uganda + { LANGUAGE_ENGLISH_UK, "ZM" }, // Zambia + { LANGUAGE_ENGLISH_UK, "ZW" }, // Zimbabwe + { LANGUAGE_ENGLISH_UK, "SZ" }, // Swaziland + { LANGUAGE_ENGLISH_UK, "NG" }, // Sierra Leone + { LANGUAGE_ENGLISH_UK, "KN" }, // Saint Kitts and Nevis + { LANGUAGE_ENGLISH_UK, "SH" }, // St. 
Helena + { LANGUAGE_ENGLISH_UK, "IO" }, // British Indian Oceanic Territory + { LANGUAGE_ENGLISH_UK, "FK" }, // Falkland Islands + { LANGUAGE_ENGLISH_UK, "GI" }, // Gibraltar + { LANGUAGE_ENGLISH_UK, "KI" }, // Kiribati + { LANGUAGE_ENGLISH_UK, "VG" }, // Virgin Islands + { LANGUAGE_ENGLISH_UK, "MU" }, // Mauritius + { LANGUAGE_ENGLISH_UK, "FJ" }, // Fiji + { LANGUAGE_ENGLISH_US, "KI" }, // Kiribati + { LANGUAGE_ENGLISH_US, "LR" }, // Liberia + { LANGUAGE_ENGLISH_US, "GU" }, // Guam + { LANGUAGE_ENGLISH_US, "MH" }, // Marshall Islands + { LANGUAGE_ENGLISH_US, "PW" }, // Palau + { LANGUAGE_ENGLISH_CARIBBEAN, "AI" }, // Anguilla + { LANGUAGE_ENGLISH_CARIBBEAN, "AG" }, // Antigua and Barbuda + { LANGUAGE_ENGLISH_CARIBBEAN, "BS" }, // Bahamas + { LANGUAGE_ENGLISH_CARIBBEAN, "BB" }, // Barbados + { LANGUAGE_ENGLISH_CARIBBEAN, "BM" }, // Bermuda + { LANGUAGE_ENGLISH_CARIBBEAN, "KY" }, // Cayman Islands + { LANGUAGE_ENGLISH_CARIBBEAN, "GD" }, // Grenada + { LANGUAGE_ENGLISH_CARIBBEAN, "DM" }, // Dominica + { LANGUAGE_ENGLISH_CARIBBEAN, "HT" }, // Haiti + { LANGUAGE_ENGLISH_CARIBBEAN, "MS" }, // Montserrat + { LANGUAGE_ENGLISH_CARIBBEAN, "FM" }, // Micronesia + { LANGUAGE_ENGLISH_CARIBBEAN, "VC" }, // St. 
Vincent / Grenadines + { LANGUAGE_ENGLISH_CARIBBEAN, "LC" }, // Saint Lucia + { LANGUAGE_ENGLISH_CARIBBEAN, "TC" }, // Turks & Caicos Islands + { LANGUAGE_ENGLISH_CARIBBEAN, "GY" }, // Guyana + { LANGUAGE_ENGLISH_CARIBBEAN, "TT" }, // Trinidad and Tobago + { LANGUAGE_ENGLISH_AUS, "CX" }, // Christmas Islands + { LANGUAGE_ENGLISH_AUS, "CC" }, // Cocos (Keeling) Islands + { LANGUAGE_ENGLISH_AUS, "NF" }, // Norfolk Island + { LANGUAGE_ENGLISH_AUS, "PG" }, // Papua New Guinea + { LANGUAGE_ENGLISH_AUS, "SB" }, // Solomon Islands + { LANGUAGE_ENGLISH_AUS, "TV" }, // Tuvalu + { LANGUAGE_ENGLISH_AUS, "NR" }, // Nauru + { LANGUAGE_ENGLISH_NZ, "CK" }, // Cook Islands + { LANGUAGE_ENGLISH_NZ, "NU" }, // Niue + { LANGUAGE_ENGLISH_NZ, "TK" }, // Tokelau + { LANGUAGE_ENGLISH_NZ, "TO" }, // Tonga + { LANGUAGE_DONTKNOW, "" } // marks end of table +}; + + +IsoLangNoneStdEntry const aImplIsoNoneStdLangEntries[] = +{ + { LANGUAGE_NORWEGIAN_BOKMAL, "no", "BOK" }, // registered subtags for "no" in rfc1766 + { LANGUAGE_NORWEGIAN_NYNORSK, "no", "NYN" }, // registered subtags for "no" in rfc1766 + { LANGUAGE_SERBIAN_LATIN_SAM, "sr", "latin" }, + { LANGUAGE_SERBIAN_CYRILLIC_SAM, "sr", "cyrillic" }, + { LANGUAGE_AZERI_LATIN, "az", "latin" }, + { LANGUAGE_AZERI_CYRILLIC, "az", "cyrillic" }, + { LANGUAGE_DONTKNOW, "", "" } // marks end of table +}; + + +// in this table are only names to find the best language +IsoLangNoneStdEntry const aImplIsoNoneStdLangEntries2[] = +{ + { LANGUAGE_NORWEGIAN_BOKMAL, "no", "bokmaal" }, + { LANGUAGE_NORWEGIAN_BOKMAL, "no", "bokmal" }, + { LANGUAGE_NORWEGIAN_NYNORSK, "no", "nynorsk" }, + { LANGUAGE_DONTKNOW, "", "" } // marks end of table +}; + + +// in this table are only names to find the best language +IsoLangOtherEntry const aImplOtherEntries[] = +{ + { LANGUAGE_ENGLISH_US, "c" }, + { LANGUAGE_CHINESE, "chinese" }, + { LANGUAGE_GERMAN, "german" }, + { LANGUAGE_JAPANESE, "japanese" }, + { LANGUAGE_KOREAN, "korean" }, + { LANGUAGE_ENGLISH_US, "posix" }, + { 
LANGUAGE_CHINESE_TRADITIONAL, "tchinese" },
    { LANGUAGE_DONTKNOW, nullptr } // marks end of table
};


// in this table are only privateuse names
// (the end marker carries a nullptr name; loops stop on LANGUAGE_DONTKNOW
// before the name is read)
IsoLangOtherEntry const aImplPrivateUseEntries[] =
{
    { LANGUAGE_USER_PRIV_NOTRANSLATE, "x-no-translate" }, //! not BCP47 but legacy in .xcu configmgr
    { LANGUAGE_USER_PRIV_DEFAULT, "x-default" },
    { LANGUAGE_USER_PRIV_COMMENT, "x-comment" },
    { LANGUAGE_USER_PRIV_NONE, "x-none" },
    { LANGUAGE_USER_PRIV_JOKER, "*" }, //! not BCP47 but transferable in configmgr
    { LANGUAGE_DONTKNOW, nullptr } // marks end of table
};


/** Fill rLocale with the locale that corresponds to nLang.

    The BCP47 table, the lll-Ssss-CC table, the lll-CC table and finally the
    privateuse table are searched in that order. If a matching entry carries
    an override and bIgnoreOverride is false, nLang is replaced by
    getOverrideLang() of that entry and the whole search restarts.

    @param nLang            language ID to convert
    @param rLocale          receives the result; left unchanged if nLang is
                            found in none of the tables
    @param bIgnoreOverride  if true, a matching entry is used as is even if
                            it has an override
 */
// static
void MsLangId::Conversion::convertLanguageToLocaleImpl( LanguageType nLang,
        css::lang::Locale & rLocale, bool bIgnoreOverride )
{
    if (nLang == LANGUAGE_ENGLISH_US)
    {
        // Speed-up a gazillion fallback cases, not iterating through
        // aImplBcp47CountryEntries nor aImplIsoLangScriptEntries.
        rLocale.Language = "en";
        rLocale.Country = "US";
        rLocale.Variant.clear();
        return;
    }

    // Per table, the entry whose override was followed last. An entry is
    // not followed again while it is the remembered one, so the restart
    // below does not repeat the same jump.
    const Bcp47CountryEntry* pBcp47EntryOverride = nullptr;
    const IsoLanguageScriptCountryEntry* pScriptEntryOverride = nullptr;
    const IsoLanguageCountryEntry* pEntryOverride = nullptr;

Label_Override_Lang_Locale:

    // Search for LangID in BCP47
    for (const Bcp47CountryEntry* pBcp47Entry = aImplBcp47CountryEntries;
            pBcp47Entry->mnLang != LANGUAGE_DONTKNOW; ++pBcp47Entry)
    {
        if (pBcp47Entry->mnLang == nLang)
        {
            if (bIgnoreOverride || !pBcp47Entry->mnOverride)
            {
                rLocale.Language = I18NLANGTAG_QLT;
                rLocale.Country = OUString::createFromAscii( pBcp47Entry->maCountry);
                rLocale.Variant = pBcp47Entry->getTagString();
                return;
            }
            else if (pBcp47Entry->mnOverride && pBcp47EntryOverride != pBcp47Entry)
            {
                pBcp47EntryOverride = pBcp47Entry;
                nLang = getOverrideLang( pBcp47Entry->mnLang, pBcp47Entry->mnOverride);
                goto Label_Override_Lang_Locale;
            }
        }
    }

    // Search for LangID in ISO lll-Ssss-CC
    for (const IsoLanguageScriptCountryEntry* pScriptEntry = aImplIsoLangScriptEntries;
            pScriptEntry->mnLang != LANGUAGE_DONTKNOW; ++pScriptEntry)
    {
        if (pScriptEntry->mnLang == nLang)
        {
            if (bIgnoreOverride || !pScriptEntry->mnOverride)
            {
                rLocale.Language = I18NLANGTAG_QLT;
                rLocale.Country = OUString::createFromAscii( pScriptEntry->maCountry);
                rLocale.Variant = pScriptEntry->getTagString();
                return;
            }
            else if (pScriptEntry->mnOverride && pScriptEntryOverride != pScriptEntry)
            {
                pScriptEntryOverride = pScriptEntry;
                nLang = getOverrideLang( pScriptEntry->mnLang, pScriptEntry->mnOverride);
                goto Label_Override_Lang_Locale;
            }
        }
    }

    // Search for LangID in ISO lll-CC
    for (const IsoLanguageCountryEntry* pEntry = aImplIsoLangEntries;
            pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
    {
        if (pEntry->mnLang == nLang)
        {
            if (bIgnoreOverride || !pEntry->mnOverride)
            {
                rLocale.Language = OUString::createFromAscii( pEntry->maLanguage );
                rLocale.Country = OUString::createFromAscii( pEntry->maCountry );
                rLocale.Variant.clear();
                return;
            }
            else if (pEntry->mnOverride && pEntryOverride != pEntry)
            {
                pEntryOverride = pEntry;
                nLang = getOverrideLang( pEntry->mnLang, pEntry->mnOverride);
                goto Label_Override_Lang_Locale;
            }
        }
    }

    // Look for privateuse definitions.
    for (const IsoLangOtherEntry* pPrivateEntry = aImplPrivateUseEntries;
            pPrivateEntry->mnLang != LANGUAGE_DONTKNOW; ++pPrivateEntry)
    {
        if (pPrivateEntry->mnLang == nLang)
        {
            rLocale.Language = I18NLANGTAG_QLT;
            rLocale.Country.clear();
            rLocale.Variant = OUString::createFromAscii( pPrivateEntry->mpLanguage );
            return;
        }
    }

    // Not found. Passed rLocale argument remains unchanged.
}


/** @return the locale for an lll-CC entry: if the entry has an override, the
    locale of the override language (resolved with overrides honoured),
    otherwise the entry's own locale. */
// static
css::lang::Locale MsLangId::Conversion::getLocale( const IsoLanguageCountryEntry * pEntry )
{
    if (pEntry->mnOverride)
    {
        lang::Locale aLocale;
        convertLanguageToLocaleImpl( getOverrideLang( pEntry->mnLang, pEntry->mnOverride), aLocale, false);
        return aLocale;
    }
    else
        return pEntry->getLocale();
}

/** Same as above, for an lll-Ssss-CC entry. */
// static
css::lang::Locale MsLangId::Conversion::getLocale( const IsoLanguageScriptCountryEntry * pEntry )
{
    if (pEntry->mnOverride)
    {
        lang::Locale aLocale;
        convertLanguageToLocaleImpl( getOverrideLang( pEntry->mnLang, pEntry->mnOverride), aLocale, false);
        return aLocale;
    }
    else
        return pEntry->getLocale();
}

/** Same as above, for a BCP47 entry. */
// static
css::lang::Locale MsLangId::Conversion::getLocale( const Bcp47CountryEntry * pEntry )
{
    if (pEntry->mnOverride)
    {
        lang::Locale aLocale;
        convertLanguageToLocaleImpl( getOverrideLang( pEntry->mnLang, pEntry->mnOverride), aLocale, false);
        return aLocale;
    }
    else
        return pEntry->getLocale();
}

/** Find a known locale to use in place of rLocale.

    Always returns a locale taken from one of the tables; if nothing
    suitable is found the en-US last-resort entry is used.
 */
// static
css::lang::Locale MsLangId::Conversion::lookupFallbackLocale(
        const css::lang::Locale & rLocale )
{
    // language is lower case in table
    OUString aLowerLang = rLocale.Language.toAsciiLowerCase();
    // country is upper case in table
    OUString aUpperCountry = rLocale.Country.toAsciiUpperCase();
    sal_Int32 nCountryLen = aUpperCountry.getLength();

    if (rLocale.Language == I18NLANGTAG_QLT)
    {
        // Search in BCP47, only full match and one fallback, for other
        // fallbacks only LanguageTag can decide.
for (const Bcp47CountryEntry* pBcp47Entry = aImplBcp47CountryEntries;
                pBcp47Entry->mnLang != LANGUAGE_DONTKNOW; ++pBcp47Entry)
        {
            if (    rLocale.Variant.equalsIgnoreAsciiCase( pBcp47Entry->getTagString()) ||
                    rLocale.Variant.equalsIgnoreAsciiCaseAscii( pBcp47Entry->mpFallback))
                return getLocale( pBcp47Entry);     // may override
        }

        // Search in ISO lll-Ssss-CC
        // pFirstScript remembers the first entry whose language-script part
        // is a prefix of the tag, for use as fallback below.
        const IsoLanguageScriptCountryEntry* pFirstScript = nullptr;
        for (const IsoLanguageScriptCountryEntry* pScriptEntry = aImplIsoLangScriptEntries;
                pScriptEntry->mnLang != LANGUAGE_DONTKNOW; ++pScriptEntry)
        {
            if (pScriptEntry->startsInIgnoreAsciiCase( rLocale.Variant))
            {
                if (rLocale.Variant.equalsIgnoreAsciiCase( pScriptEntry->getTagString()))
                    return getLocale( pScriptEntry);    // may override
                if (!pFirstScript)
                    pFirstScript = pScriptEntry;
            }
        }
        // If at least a lll-Ssss matched, try that with country or use it as
        // fallback.
        if (pFirstScript)
        {
            // Check for country only if there is more than lll-Ssss-CC in tag
            // string, else we would have matched it already.
            if (!aUpperCountry.isEmpty() && rLocale.Variant.getLength() > 11)
            {
                for (const IsoLanguageScriptCountryEntry* pScriptEntry = pFirstScript;
                        pScriptEntry->mnLang != LANGUAGE_DONTKNOW; ++pScriptEntry)
                {
                    if (aUpperCountry.equalsAscii( pScriptEntry->maCountry) &&
                            pScriptEntry->startsInIgnoreAsciiCase( rLocale.Variant))
                        return getLocale( pScriptEntry);    // may override
                }
            }
            return getLocale( pFirstScript);    // may override
        }

        // Extract language from tag string, country is used as present in
        // Locale because in the tables that follow we have only ISO 3166
        // countries and if that is in the tag string we also have it in the
        // Locale.
        aLowerLang = rLocale.Variant.getToken(0, '-').toAsciiLowerCase();
        // Nothing with "x-..." or "i-..." or any 1 letter in lll-CC table that
        // follows.
        if (aLowerLang.getLength() == 1)
            return aLastResortFallbackEntry.getLocale();
    }

    // Search for locale and remember first lang-only.
    const IsoLanguageCountryEntry* pFirstLang = nullptr;
    const IsoLanguageCountryEntry* pEntry = aImplIsoLangEntries;
    for ( ; pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
    {
        if (aLowerLang.equalsAscii( pEntry->maLanguage))
        {
            if (*pEntry->maCountry)
            {
                // Entry with country: only an exact country match is taken here.
                if (nCountryLen && aUpperCountry.equalsAscii( pEntry->maCountry))
                    return getLocale( pEntry);  // may override
            }
            else
            {
                // Entry without country: taken only for the languages listed.
                if (pEntry->mnLang.anyOf(
                    // These are known to have no country assigned.
                    LANGUAGE_USER_ESPERANTO,
                    LANGUAGE_USER_INTERLINGUA,
                    LANGUAGE_USER_INTERLINGUE,
                    LANGUAGE_USER_LOJBAN,
                    LANGUAGE_KASHMIRI,
                    LANGUAGE_USER_KLINGON,
                    LANGUAGE_USER_KEYID,
                    // And the special codes without country.
                    LANGUAGE_MULTIPLE,
                    LANGUAGE_UNDETERMINED,
                    LANGUAGE_NONE))
                {
                    return getLocale( pEntry);  // may override
                }
            }
            if (!pFirstLang)
                pFirstLang = pEntry;
        }
    }

    // Language not found at all => use default.
    if (!pFirstLang)
        return aLastResortFallbackEntry.getLocale();

    // Search for first entry of language with any country.
    pEntry = pFirstLang;
    for ( ; pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
    {
        if (aLowerLang.equalsAscii( pEntry->maLanguage))
        {
            if (*pEntry->maCountry)
                return getLocale( pEntry);  // may override
        }
    }

    return aLastResortFallbackEntry.getLocale();
}


/** @return the language ID of the privateuse name rPriv (compared ASCII
    case-insensitively), or LANGUAGE_DONTKNOW if it is not in
    aImplPrivateUseEntries. */
// static
LanguageType MsLangId::Conversion::convertPrivateUseToLanguage( const OUString& rPriv )
{
    for (const IsoLangOtherEntry* pPrivateEntry = aImplPrivateUseEntries;
            pPrivateEntry->mnLang != LANGUAGE_DONTKNOW; ++pPrivateEntry)
    {
        if ( rPriv.equalsIgnoreAsciiCaseAscii( pPrivateEntry->mpLanguage ) )
            return pPrivateEntry->mnLang;
    }
    return LANGUAGE_DONTKNOW;
}


/** Map rLocale to a language ID by exact table match only.

    For a I18NLANGTAG_QLT locale the Variant is matched against the
    privateuse, BCP47 and lll-Ssss-CC tables; otherwise Language and Country
    are matched against the lll-CC table. Overrides are applied through
    getOverrideLang().

    @return the language ID, or LANGUAGE_DONTKNOW if nothing matches
 */
// static
LanguageType MsLangId::Conversion::convertLocaleToLanguageImpl(
        const css::lang::Locale& rLocale )
{
    if (rLocale.Language == I18NLANGTAG_QLT)
    {
        // "x-..." private use and the nasty "*" joker
        if (rLocale.Variant.startsWithIgnoreAsciiCase( "x-") || (rLocale.Variant == "*"))
            return convertPrivateUseToLanguage( rLocale.Variant);

        // Search in BCP47
        for (const Bcp47CountryEntry* pBcp47Entry = aImplBcp47CountryEntries;
                pBcp47Entry->mnLang != LANGUAGE_DONTKNOW; ++pBcp47Entry)
        {
            if (rLocale.Variant.equalsIgnoreAsciiCase( pBcp47Entry->getTagString()))
                return getOverrideLang( pBcp47Entry->mnLang, pBcp47Entry->mnOverride);
        }

        // Search in ISO lll-Ssss-CC
        for (const IsoLanguageScriptCountryEntry* pScriptEntry = aImplIsoLangScriptEntries;
                pScriptEntry->mnLang != LANGUAGE_DONTKNOW; ++pScriptEntry)
        {
            if (pScriptEntry->startsInIgnoreAsciiCase( rLocale.Variant))
            {
                if (rLocale.Variant.equalsIgnoreAsciiCase( pScriptEntry->getTagString()))
                    return getOverrideLang( pScriptEntry->mnLang, pScriptEntry->mnOverride);
            }
        }
    }
    else
    {
        // language is lower case in table
        OUString aLowerLang = rLocale.Language.toAsciiLowerCase();
        // country is upper case in table
        OUString aUpperCountry = rLocale.Country.toAsciiUpperCase();

        // Search in ISO lll-CC
        for (const
IsoLanguageCountryEntry* pEntry = aImplIsoLangEntries;
                pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
        {
            if (aLowerLang.equalsAscii( pEntry->maLanguage) && aUpperCountry.equalsAscii( pEntry->maCountry))
                return getOverrideLang( pEntry->mnLang, pEntry->mnOverride);
        }
    }
    return LANGUAGE_DONTKNOW;
}


/** Return the locale that overrides rLocale, if any.

    The table searches mirror convertLocaleToLanguageImpl(). Privateuse tags
    ("x-..." and "*") are returned unchanged.

    @return the (possibly overridden) locale of the matching table entry,
            rLocale itself for privateuse tags, or an empty Locale if no
            table entry matches
 */
// static
css::lang::Locale MsLangId::Conversion::getOverride( const css::lang::Locale& rLocale )
{
    if (rLocale.Language == I18NLANGTAG_QLT)
    {
        // "x-..." private use and the nasty "*" joker
        if (rLocale.Variant.startsWithIgnoreAsciiCase( "x-") || (rLocale.Variant == "*"))
            return rLocale;     // no overrides

        // Search in BCP47
        for (const Bcp47CountryEntry* pBcp47Entry = aImplBcp47CountryEntries;
                pBcp47Entry->mnLang != LANGUAGE_DONTKNOW; ++pBcp47Entry)
        {
            if (rLocale.Variant.equalsIgnoreAsciiCase( pBcp47Entry->getTagString()))
                return getLocale( pBcp47Entry);     // may override
        }

        // Search in ISO lll-Ssss-CC
        for (const IsoLanguageScriptCountryEntry* pScriptEntry = aImplIsoLangScriptEntries;
                pScriptEntry->mnLang != LANGUAGE_DONTKNOW; ++pScriptEntry)
        {
            if (pScriptEntry->startsInIgnoreAsciiCase( rLocale.Variant))
            {
                if (rLocale.Variant.equalsIgnoreAsciiCase( pScriptEntry->getTagString()))
                    return getLocale( pScriptEntry);    // may override
            }
        }
    }
    else
    {
        // language is lower case in table
        OUString aLowerLang = rLocale.Language.toAsciiLowerCase();
        // country is upper case in table
        OUString aUpperCountry = rLocale.Country.toAsciiUpperCase();

        // Search in ISO lll-CC
        for (const IsoLanguageCountryEntry* pEntry = aImplIsoLangEntries;
                pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
        {
            if (aLowerLang.equalsAscii( pEntry->maLanguage) && aUpperCountry.equalsAscii( pEntry->maCountry))
                return getLocale( pEntry);  // may override
        }
    }
    return lang::Locale();
}


/** Map a language name and a country name to a language ID.

    Lookup order: lll-CC table (skipped if bSkipIsoTable), English country
    table (skipped if bSkipIsoTable), the two non-standard tables, a
    country-only match in the lll-CC table (skipped if bSkipIsoTable),
    privateuse names, and finally the other legacy names.

    @param rLang          language name, matched in lower case
    @param rCountry       country name, matched in upper case; if empty the
                          first lll-CC entry of the language is taken
    @param bSkipIsoTable  if true, the steps using the lll-CC and English
                          country tables are left out
    @return the language ID, or LANGUAGE_DONTKNOW if nothing matches
 */
// static
LanguageType MsLangId::Conversion::convertIsoNamesToLanguage( const OUString& rLang,
        const OUString& rCountry, bool bSkipIsoTable )
{
    // language is lower case in table
    OUString aLowerLang = rLang.toAsciiLowerCase();
    // country is upper case in table
    OUString aUpperCountry = rCountry.toAsciiUpperCase();

    if (!bSkipIsoTable)
    {
        // first look for exact match
        for (const IsoLanguageCountryEntry* pEntry = aImplIsoLangEntries;
                pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
        {
            if ( aLowerLang.equalsAscii( pEntry->maLanguage ) )
            {
                if ( aUpperCountry.isEmpty() ||
                     aUpperCountry.equalsAscii( pEntry->maCountry ) )
                    return pEntry->mnLang;
            }
        }

        // some eng countries should be mapped to a specific english language
        if ( aLowerLang == "en" )
        {
            for (const IsoLangEngEntry* pEngEntry = aImplIsoLangEngEntries;
                    pEngEntry->mnLang != LANGUAGE_DONTKNOW; ++pEngEntry)
            {
                if ( aUpperCountry.equalsAscii( pEngEntry->maCountry ) )
                    return pEngEntry->mnLang;
            }
        }
    }

    // test for specific languages which do not use standard ISO 3166 codes
    for (const IsoLangNoneStdEntry* pNoneStdEntry = aImplIsoNoneStdLangEntries;
            pNoneStdEntry->mnLang != LANGUAGE_DONTKNOW; ++pNoneStdEntry)
    {
        if ( aLowerLang.equalsAscii( pNoneStdEntry->maLanguage ) )
        {
            // The countries in this table are not all in upper case
            if ( aUpperCountry.equalsIgnoreAsciiCaseAscii( pNoneStdEntry->maCountry ) )
                return pNoneStdEntry->mnLang;
        }
    }
    for (const IsoLangNoneStdEntry* pNoneStdEntry2 = aImplIsoNoneStdLangEntries2;
            pNoneStdEntry2->mnLang != LANGUAGE_DONTKNOW; ++pNoneStdEntry2)
    {
        if ( aLowerLang.equalsAscii( pNoneStdEntry2->maLanguage ) )
        {
            // The countries in this table are not all in upper case
            if ( aUpperCountry.equalsIgnoreAsciiCaseAscii( pNoneStdEntry2->maCountry ) )
                return pNoneStdEntry2->mnLang;
        }
    }

    if (!bSkipIsoTable)
    {
        // if only the country is set, look for any entry matching the country
        // (to allow reading country and language in separate steps, in any order)
        if ( !rCountry.isEmpty() && rLang.isEmpty() )
        {
            for (const IsoLanguageCountryEntry* pEntry2 = aImplIsoLangEntries;
                    pEntry2->mnLang != LANGUAGE_DONTKNOW; ++pEntry2)
            {
                if ( aUpperCountry.equalsAscii( pEntry2->maCountry ) )
                    return pEntry2->mnLang;
            }

            // No country matched: the lookups below treat the given country
            // string as the name to search for.
            aLowerLang = aUpperCountry.toAsciiLowerCase();
        }
    }

    // Look for privateuse definitions.
    LanguageType nLang = convertPrivateUseToLanguage( aLowerLang);
    if (nLang != LANGUAGE_DONTKNOW)
        return nLang;

    // Now look for all other definitions, which are not standard
    for (const IsoLangOtherEntry* pOtherEntry = aImplOtherEntries;
            pOtherEntry->mnLang != LANGUAGE_DONTKNOW; ++pOtherEntry)
    {
        if ( aLowerLang.equalsAscii( pOtherEntry->mpLanguage ) )
            return pOtherEntry->mnLang;
    }

    return LANGUAGE_DONTKNOW;
}


/** Byte-string variant of convertIsoNamesToLanguage().

    Runs the table lookup first; if that yields LANGUAGE_DONTKNOW the tag
    "lang" or "lang-country" is handed to LanguageTag to obtain a language ID.
    Never returns LANGUAGE_DONTKNOW or LANGUAGE_SYSTEM; LANGUAGE_ENGLISH_US is
    substituted for those.
 */
// static
LanguageType MsLangId::Conversion::convertIsoNamesToLanguage( std::string_view rLang,
        std::string_view rCountry )
{
    OUString aLang = OStringToOUString( rLang, RTL_TEXTENCODING_ASCII_US);
    OUString aCountry = OStringToOUString( rCountry, RTL_TEXTENCODING_ASCII_US);
    LanguageType nLang = convertIsoNamesToLanguage( aLang, aCountry, false);

    // XXX: called *only* by static convertUnxByteStringToLanguage() so we can
    // actually call into LanguageTag to create an on-the-fly mapping.
    if (nLang == LANGUAGE_DONTKNOW)
    {
        OUString aTag( aCountry.isEmpty() ? aLang : aLang + "-" + aCountry );
        nLang = LanguageTag( aTag).getLanguageType(false);
        SAL_WARN("i18nlangtag", "convertIsoNamesToLanguage(string_view): on-the-fly for {"
                << aTag << "} " << nLang);
        // Do not leave empty as SYSTEM unresolved.
if (nLang == LANGUAGE_DONTKNOW || nLang == LANGUAGE_SYSTEM)
        {
            SAL_WARN("i18nlangtag", "convertIsoNamesToLanguage(string_view): on-the-fly bad, using {en-US}");
            nLang = LANGUAGE_ENGLISH_US;
        }
    }
    return nLang;
}

namespace {

// One row of the glibc "@modifier" table below. The fixed-size arrays hold
// the NUL-terminated strings inline.
struct IsoLangGLIBCModifiersEntry
{
    LanguageType  mnLang;
    char          maLanguage[4];
    char          maCountry[3];
    char          maAtString[9];
};

}

// Locales that are told apart by the glibc "@modifier" part of a unix locale
// string. An empty country only matches when the locale string has none.
IsoLangGLIBCModifiersEntry const aImplIsoLangGLIBCModifiersEntries[] =
{
    // MS-LANGID codes                              ISO639-1/2/3 ISO3166 glibc modifier
    { LANGUAGE_BOSNIAN_CYRILLIC_BOSNIA_HERZEGOVINA, "bs", "BA", "cyrillic" },
    { LANGUAGE_USER_SERBIAN_LATIN_SERBIA,           "sr", "RS", "latin" },   // Serbian Latin in Serbia
    { LANGUAGE_SERBIAN_LATIN_SAM,                   "sr", "CS", "latin" },   // Serbian Latin in Serbia and Montenegro
    { LANGUAGE_USER_SERBIAN_LATIN_MONTENEGRO,       "sr", "ME", "latin" },   // Serbian Latin in Montenegro
    { LANGUAGE_SERBIAN_LATIN_LSO,                   "sr", "",   "latin" },
    { LANGUAGE_AZERI_CYRILLIC,                      "az", "AZ", "cyrillic" },
    { LANGUAGE_UZBEK_CYRILLIC,                      "uz", "UZ", "cyrillic" },
    { LANGUAGE_CATALAN_VALENCIAN,                   "ca", "ES", "valencia" },
    { LANGUAGE_DONTKNOW,                            "",   "",   "" }       // marks end of table
};

// convert a unix locale string into LanguageType

/** Split rString at '_', '.' and '@' into language, country and modifier and
    map the result to a language ID.

    With a modifier present the glibc modifier table is tried first; in all
    other cases, and if that table has no match, language and country are
    passed to Conversion::convertIsoNamesToLanguage().
 */
// static
LanguageType MsLangId::convertUnxByteStringToLanguage(
        std::string_view rString )
{
    OString  aLang;
    OString  aCountry;
    OString  aAtString;

    size_t nLangSepPos    = rString.find( '_' );
    size_t nCountrySepPos = rString.find( '.' );
    size_t nAtPos         = rString.find( '@' );

    // The language/country part ends at '.', else at '@', else at the end
    // of the string.
    if (nCountrySepPos == std::string_view::npos)
        nCountrySepPos = nAtPos;
    if (nCountrySepPos == std::string_view::npos)
        nCountrySepPos = rString.size();

    if (nAtPos != std::string_view::npos)
        aAtString = OString(rString.substr( nAtPos+1 ));

    if (((nLangSepPos != std::string_view::npos) && (nLangSepPos > nCountrySepPos)) || (nLangSepPos == std::string_view::npos))
    {
        // eg. "el.sun_eu_greek", "tchinese", "es.ISO8859-15"
        aLang    = OString(rString.substr( 0, nCountrySepPos ));
    }
    else if ( nLangSepPos != std::string_view::npos )
    {
        // well formed iso names like "en_US.UTF-8", "sh_BA.ISO8859-2@bosnia"
        aLang    = OString(rString.substr( 0, nLangSepPos ));
        aCountry = OString(rString.substr( nLangSepPos+1, nCountrySepPos - nLangSepPos - 1));
    }

    // if there is a glibc modifier, first look for exact match in modifier table
    if (!aAtString.isEmpty())
    {
        // language is lower case in table
        OString aLowerLang = aLang.toAsciiLowerCase();
        // country is upper case in table
        OString aUpperCountry = aCountry.toAsciiUpperCase();
        for (const IsoLangGLIBCModifiersEntry* pGLIBCModifiersEntry = aImplIsoLangGLIBCModifiersEntries;
                pGLIBCModifiersEntry->mnLang != LANGUAGE_DONTKNOW; ++pGLIBCModifiersEntry)
        {                         // avoid embedded \0 warning
            if (aLowerLang == pGLIBCModifiersEntry->maLanguage &&
                 aAtString == pGLIBCModifiersEntry->maAtString )
            {
                if (aUpperCountry.isEmpty() ||
                    aUpperCountry == pGLIBCModifiersEntry->maCountry )
                {
                    return pGLIBCModifiersEntry->mnLang;
                }
            }
        }
    }

    return Conversion::convertIsoNamesToLanguage( aLang, aCountry );
}


/** @return the tag string and language ID of every entry of the BCP47,
    lll-Ssss-CC and lll-CC tables, in that order. */
// static
::std::vector< MsLangId::LanguagetagMapping > MsLangId::getDefinedLanguagetags()
{
    ::std::vector< LanguagetagMapping > aVec;
    for (const Bcp47CountryEntry* pEntry = aImplBcp47CountryEntries;
            pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
    {
        aVec.emplace_back( pEntry->getTagString(), pEntry->mnLang);
    }
    for (const IsoLanguageScriptCountryEntry* pEntry = aImplIsoLangScriptEntries;
            pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
    {
        aVec.emplace_back( pEntry->getTagString(), pEntry->mnLang);
    }
    for (const IsoLanguageCountryEntry* pEntry = aImplIsoLangEntries;
            pEntry->mnLang != LANGUAGE_DONTKNOW; ++pEntry)
    {
        aVec.emplace_back( pEntry->getTagString(), pEntry->mnLang);
    }
    return aVec;
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18nlangtag/source/isolang/langid.pl b/i18nlangtag/source/isolang/langid.pl new file mode 100755 index 000000000..061c69288 --- /dev/null +++ b/i18nlangtag/source/isolang/langid.pl @@ -0,0 +1,471 @@ +: # -*- perl -*- vim: ft=perl +eval 'exec perl -w -S $0 ${1+"$@"}' +if 0; +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This file incorporates work covered by the following license notice: +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to you under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.apache.org/licenses/LICENSE-2.0 . +# + +# See Usage() below or invoke without arguments for short instructions. +# For long instructions use the source, Luke ;-) + +use strict; + +sub Usage() +{ + print STDERR + "\n", + "langid - a hackish utility to lookup lang.h language defines and LangIDs,\n", + "isolang.cxx ISO639/ISO3166 mapping, locale data files, langtab.hrc language\n", + "listbox entries, langlist.mk, file_ooo.scp registry name, languages.pm and\n", + "msi-encodinglist.txt\n\n", + + "Usage: $0 [--single] {language string} | {LangID} | {primarylanguage sublanguage} | {language-country}\n\n", + + "A language string will be used as a generic string match in all searched files.\n", + "You may enclose the language string in word delimiters,\n", + "e.g. \\blanguage_german\\b for a specific match.\n", + "If the language string expression matches more than one define,\n", + "e.g. 
as in 'german', all matching defines will be processed.\n",
        "If the language string does not match a define or an identifier in\n",
        "langtab.hrc, a generic string match of the listbox entries will be tried.\n\n",

        "Numeric values of LangID,primarylanguage,sublanguage can be given\n",
        "decimal, hexadecimal (leading 0x), octal (leading 0) or binary (leading 0b).\n",
        "The exact language_define of an exact match will be used in remaining lookups.\n\n",

        "A language-country pair will lookup a xx-YY mapping from isolang.cxx,\n",
        "for example: 'en-US' or 'de-' or '-CH',\n",
        "xx and YY can be given case insensitive, will be lowered-uppered internally,\n",
        "and xx and YY themselves may be regular expressions.\n",
        "Also here a list of matches will be processed.\n\n",

        "If option --single is given, only the first match will be processed.\n\n";
}

my $SRC_ROOT = $ENV{"SRC_ROOT"};
if (!defined($SRC_ROOT))
{
    print "\nNeed \$SRC_ROOT, please set your LibreOffice environment!\n";
    Usage();
    exit 1;
}

# The low 10 bits of a LangID are the primary language, the bits above them
# the sublanguage.
my $LANGUAGE_MASK_PRIMARY = 0x03ff;

# Returns the primary language part of a LangID.
sub getPrimaryLanguage($)
{
    my($lcid) = @_;
    return $lcid & $LANGUAGE_MASK_PRIMARY;
}

# Returns the sublanguage part of a LangID.
sub getSubLanguage($)
{
    my($lcid) = @_;
    return $lcid >> 10;
}

# Combines sublanguage and primary language into a LangID.
sub makeLangID($$)
{
    my( $sub, $pri) = @_;
    return ($sub << 10) | $pri;
}


# Prints and returns the lines of file $module/$name that match $regex.
# The file is looked up below the current directory first, then below $path.
# If $printmsg is true a "Not found" message is printed when nothing matched.
#
# Note that a regex needs a duplicated pair of backslashes to produce a literal
# \\ like in \\\\* to search for zero or more \ backslashes.
# @addregex can be an optional "block to grep" definition
# (regex-to-start-block, regex-to-end-block, regex-to-find-in-block)
sub grepFile($$$$$@)
{
    my( $regex, $path, $module, $name, $printmsg, @addregex) = @_;
    my @result;
    my $found = 0;
    my $areopen = 0;
    my $arecloser = '';
    # Try module under current working directory first to catch local
    # modifications.
    my $file = "./$module/$name";
    if (!($found = open( IN, $file)))
    {
        # Then with the given path.
        $file = "$path/$module/$name";
        if (!($found = open( IN, $file)))
        {
            print "No $file\n";
        }
    }
    if ($found)
    {
        # From here on $found means "at least one line matched"; the file
        # name is printed once before the first match.
        $found = 0;
        while (my $line = <IN>)
        {
            if ($line =~ /$regex/)
            {
                if (!$found)
                {
                    $found = 1;
                    print "$file:\n";
                }
                chomp( $line);
                print "$line\n";
                push( @result, $line);
            }
            elsif (@addregex)
            {
                # By convention first element is opener, second element is closer.
                if (!$areopen)
                {
                    if ($line =~ /$addregex[0]/)
                    {
                        $areopen = 1;
                        $arecloser = $addregex[1];
                    }
                }
                if ($areopen)
                {
                    for (my $i = 2; $i < @addregex; ++$i)
                    {
                        if ($line =~ /$addregex[$i]/)
                        {
                            if (!$found)
                            {
                                $found = 1;
                                print "$file:\n";
                            }
                            chomp( $line);
                            print "$line\n";
                            push( @result, $line);
                        }
                    }
                    if ($line =~ /$arecloser/)
                    {
                        $areopen = 0;
                    }
                }
            }
        }
        close( IN);
    }
    if (!$found && $printmsg) {
        print "Not found in $file\n";
        #print "Not found in $file for $regex @addregex\n";
    }
    return @result;
}


sub main()
{
    my( $lcid, @parts, $grepdef, $options, $single);
    $grepdef = 0;
    $single = 0;
    # Leading "--" arguments are options; $options ends up as the index of
    # the first non-option argument.
    for ($options = 0; $options < @ARGV && $ARGV[$options] =~ /^--/; ++$options)
    {
        if ($ARGV[$options] eq '--single') { $single = 1; }
        else { print "Unknown option: $ARGV[$options]\n"; }
    }
    if (@ARGV == 1 + $options)
    {
        # 0x hex, 0b bin, 0 oct
        if ($ARGV[$options] =~ /^0/) {
            # Convert the argument itself, not $ARGV[0], which is an option
            # whenever one was given.
            $lcid = oct( $ARGV[$options]); }
        elsif ($ARGV[$options] =~ /^[0-9]/) {
            $lcid = $ARGV[$options]; }
        else
        {
            $grepdef = $ARGV[$options];
            $lcid = 0;
        }
        $parts[0] = getPrimaryLanguage( $lcid);
        $parts[1] = getSubLanguage( $lcid);
    }
    elsif (@ARGV == 2 + $options)
    {
        for (my $i = $options; $i < 2 + $options; ++$i)
        {
            # Store at 0 and 1 regardless of how many options precede the
            # values; makeLangID() below reads $parts[0] and $parts[1].
            if ($ARGV[$i] =~ /^0/) {
                $parts[$i - $options] = oct( $ARGV[$i]); }
            else {
                $parts[$i - $options] = $ARGV[$i]; }
        }
        $lcid = makeLangID( $parts[1], $parts[0]);
    }
    else
    {
        Usage();
        return 1;
    }
    my $modifier = "(?i)";
    my (@resultlist, @greplist, $result);
    # If no string was given on the command line, but value(s)
were, lookup the + # LangID value to obtain the define identifier. + if ($grepdef) + { + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + @resultlist = grepFile( + $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef, + "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ()); + } + else + { + printf( "LangID: 0x%04X (dec %d), primary: 0x%03x, sub 0x%02x\n", $lcid, + $lcid, $parts[0], $parts[1]); + my $buf = sprintf( "0x%04X", $lcid); + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + @resultlist = grepFile( + '^\s*#\s*define\s+\w+\s+LanguageType\(' . $buf . '\)', + "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ()); + } + for $result (@resultlist) + { + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + # If the string given is of the form xx-yy lookup a language,country pair + # to obtain the define identifier. xx and yy may themselves be regexps. + # xx- is a short form for 'xx-.*' and -yy a short form for '.*-yy' + # Note that -Latn for '.*-Latn' also works, accidentally. + if ($grepdef =~ /^(.*)-$/) { + $grepdef = $1 . "-.*"; } + if ($grepdef =~ /^-(.*)$/) { + $grepdef = ".*-" . $1; } + if ($grepdef =~ /^([^-]{2,3})-([^-]{2,2})$/) # catches also .*-.* + { + my $lang = $1; + my $coun = $2; + $lang = lc($lang); + $coun = uc($coun); + # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 }, + @resultlist = grepFile( + '^\s*\{\s*\w+\s*,\s*"' . $lang . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ()); + for $result (@resultlist) + { + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/) + { + push( @greplist, '\b' . $1 . 
'\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + $grepdef = 0; + } + # Same for lll-Ssss or lll-Ssss-CC language tag. + if ($grepdef =~ /^([^-]{2,3})-([^-]{4,4})$/ || $grepdef =~ /^([^-]{2,3})-([^-]{4,4})-([^-]{2,2})$/) + { + my $lang = $1; + my $scri = $2; + my $coun = $3; + if (!defined($coun)) { + $coun = ""; } + $lang = lc($lang); + $scri = ucfirst(lc($scri)); + $coun = uc($coun); + # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 }, + @resultlist = grepFile( + '^\s*\{\s*\w+\s*,\s*"' . $lang . '-' . $scri . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ()); + for $result (@resultlist) + { + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + $grepdef = 0; + } + # And for any other language tag that MUST match case. + if ($grepdef =~ /^[^-]+-/) + { + # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" }, + @resultlist = grepFile( + '^\s*\{\s*\w+\s*,\s*"' . $grepdef . '"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ()); + for $result (@resultlist) + { + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + $grepdef = 0; + } + if (!@greplist && $grepdef) { + push( @greplist, $grepdef); } + for $grepdef (@greplist) + { + print "\nUsing: " . $grepdef . "\n"; + + # Decimal LCID, was needed for Langpack.ulf but isn't used anymore, + # keep just in case we'd need it again. + # #define LANGUAGE_AFRIKAANS 0x0436 + @resultlist = grepFile( + $modifier . '^\s*#\s*define\s+[A-Z_]*' . 
$grepdef, + "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ()); + my @lcidlist; + for $result (@resultlist) + { + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/) + { + push( @lcidlist, oct( $2)); + } + } + + my @allresultslist; + # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 }, + @resultlist = grepFile( + $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ()); + push( @allresultslist, @resultlist); + # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 }, + @resultlist = grepFile( + $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ()); + push( @allresultslist, @resultlist); + # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" }, + @resultlist = grepFile( + $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ()); + push( @allresultslist, @resultlist); + + my @langtaggreplist; + for $result (@allresultslist) + { + my $loca; + # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 }, + # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 }, + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/ || + $result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/) + { + my $lang = $2; + my $coun = $3; + if ($coun) + { + $loca = $lang . "_" . $coun; + push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?'); + } + else + { + $loca = $lang; + push( @langtaggreplist, '\b' . $lang . 
'\b'); + } + } + # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" }, + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/) + { + $loca = $2; + my $lang = $4; + my $coun = $3; + if ($lang) + { + if ($coun) + { + push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?'); + } + else + { + push( @langtaggreplist, '\b' . $lang . '\b'); + } + } + } + if ($loca) + { + $loca =~ s/-/_/g; + my $file = "$SRC_ROOT/i18npool/source/localedata/data/$loca.xml"; + my $found = open( LD, $file); + if ($found) + { + print "Found $file:\n"; + my $on = 0; + while (my $line = ) + { + if ($line =~ /<(Language|Country|Variant)>/) { + $on = 1; } + if ($on) { + print $line; } + if ($line =~ /<\/(Language|Country|Variant)>/) { + $on = 0; } + } + close( LD); + } + else { + print "No $file\n"; } + } + } + + # Find any special treatment, may need inspection then. + # $grepdef already has \b word delimiters. + grepFile( + $modifier . $grepdef, + "$SRC_ROOT", "i18nlangtag", "source/isolang/mslangid.cxx", 1, ()); + + my $module = "svtools"; + my $name = "inc/langtab.hrc"; + # { NC_("STR_ARR_SVT_LANGUAGE_TABLE", "Afrikaans (South Africa)") , LANGUAGE_AFRIKAANS }, + # lookup define + @resultlist = grepFile( + $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*"\s*\)\s*,.*' . $grepdef . '.*\}', + "$SRC_ROOT", $module, $name, 1, ()); + # lookup string + if (!@resultlist) { + grepFile( + $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*' . $grepdef . '.*"\s*\)\s*,.*\}', + "$SRC_ROOT", $module, $name, 1, ()); } + + for my $langtag (@langtaggreplist) + { + # Name (xxx) = "/registry/spool/org/openoffice/Office/Common-ctl.xcu"; + grepFile( + '^\s*Name\s*\(' . $langtag . '\)\s*=', + "$SRC_ROOT", "scp2", "source/ooo/file_ooo.scp", 1, ()); + + # completelangiso=af ar as-IN ... zu + grepFile( + '^\s*completelangiso\s*=\s*(\s*([a-z]{2,3})(-[A-Z][A-Z])?)*' . $langtag . 
'', + "$SRC_ROOT", "solenv", "inc/langlist.mk", 1, + # Also grep the list of tags, one per line, \ backslash continued. + ('^\s*completelangiso\s*=', '^\s*$', '^\s*' . $langtag . '\s*\\\\*$')); + + # af 1252 1078 # Afrikaans + grepFile( + '^\s*' . $langtag . '', + "$SRC_ROOT", "l10ntools", "source/ulfconv/msi-encodinglist.txt", 1, ()); + + # 27:af:afrikaans + grepFile( + '^\d*:' . $langtag . '', + "$SRC_ROOT", "bin", "lo-xlate-lang", 1, ()); + } + } + return 0; +} + +main(); diff --git a/i18nlangtag/source/isolang/lcid.awk b/i18nlangtag/source/isolang/lcid.awk new file mode 100644 index 000000000..db1a48d57 --- /dev/null +++ b/i18nlangtag/source/isolang/lcid.awk @@ -0,0 +1,187 @@ +#!/usr/bin/awk -f +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This file incorporates work covered by the following license notice: +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to you under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.apache.org/licenses/LICENSE-2.0 . +# +# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18nlangtag/lang.h +# Run in i18nlangtag/source/isolang +# +# outputs new #define LANGUAGE_... 0x... and also some commented out substrings +# that were matched in already existing defines. +# +# ATTENTION! The sed filter in the command line examples below assures that a +# '|' border is drawn by html2text in data tables, and nowhere else, on which +# this awk script relies. 
This script also heavily relies on the column layout +# encountered. Should MS decide to change their layout or their CSS names +# ("data..."), this would probably break. Should html2text decide that the last +# border="..." attribute encountered wins instead of the first, this may break +# also. +# +# sed -e 's/|/,/g; s/]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' +# +# After html2text best if file cleaned up to _only_ contain the table entries, +# but not necessary, entries are filtered. Check output. +# +# Expects input from the saved page of one of +# +# (1) +# http://www.microsoft.com/globaldev/reference/lcid-all.mspx +# filtered through ``html2text -nobs ...'', generated table: +# blank,name,hex,dec,blank fields: +# |Afrikaans_-_South_Africa___|0436___|1078___| +# +# complete command line: +# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/
]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile +# +# +# (2) +# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx +# filtered through ``html2text -nobs ...'', generated table: +# blank,name,hex,dec,inputlocales,collection,blank fields: +# |Afrikaans |0436 |1078 |0436:00000409, |Basic | +# +# complete command line: +# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/
]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile +# +# +# (3) +# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp +# filtered through ``html2text -nobs ...'', generated table: +# blank,hex,locale,name,blank fields: +# |0x0436___|af-ZA___|Afrikaans_(South_Africa)___| +# +# complete command line: +# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/
# Load the already defined LANGUAGE_* identifiers from lang.h into lang[HEX],
# then set up the per-input-format field layout tables.
BEGIN {
    # Candidate locations of lang.h, relative to i18nlangtag/source/isolang
    # where this script is expected to be run: first the historic
    # module-local inc/ directory, then the top-level include/ directory.
    langh[1] = "../../inc/i18nlangtag/lang.h"
    langh[2] = "../../../include/i18nlangtag/lang.h"
    nread = 0
    for (f = 1; f <= 2 && !nread; ++f)
    {
        while ((getline < langh[f]) > 0)
        {
            ++nread
            # Accept both value notations:
            #   #define LANGUAGE_AFRIKAANS 0x0436
            #   #define LANGUAGE_AFRIKAANS LanguageType(0x0436)
            if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*(LanguageType\()?0x[0-9a-fA-F]/ &&
                    match($3, /0x[0-9a-fA-F]+/))
            {
                # lang[HEX]=NAME, HEX without the leading "0x"
                lang[toupper(substr($3, RSTART + 2, RLENGTH - 2))] = toupper($2)
            }
        }
        close(langh[f])
    }
    if (!nread)
        print "Warning: lang.h not readable, every LCID will be reported as new." >>"/dev/stderr"

    # html2text table follows
    FS = "\|"
    # Input format is detected from the first usable table row.
    filetype = 0
    lcid_all = 1
    xp_lcid = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid] = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"
    # Field numbers of language name, hex LCID and locale string (0 = none)
    # for each input format.
    namefield[lcid_all] = 2
    namefield[xp_lcid] = 2
    namefield[nls_238z] = 4
    hexfield[lcid_all] = 3
    hexfield[xp_lcid] = 3
    hexfield[nls_238z] = 2
    locfield[lcid_all] = 0
    locfield[xp_lcid] = 0
    locfield[nls_238z] = 3
}

# Anything with fewer than 5 fields is not a table row.
(NF < 5) { next }

# Detect the input format once; rows seen before detection are skipped.
!filetype {
    if (NF == 5)
    {
        if ($2 ~ /^0x/)
            filetype = nls_238z
        else if ($2 ~ /^Afrikaans/)
            filetype = lcid_all
    }
    else if (NF == 7)
        filetype = xp_lcid
    if (!filetype)
        next
    name = namefield[filetype]
    hex = hexfield[filetype]
    loc = locfield[filetype]
}

# Normalize the fields of interest: drop a "prefix:" and anything after the
# first '.', strip leading/trailing blanks and underscores.
{
    gsub( /^[^:]*:/, "", $name)
    gsub( /\..*/, "", $name)
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# if only 464 instead of 0464, make it match lang.h
(length($hex) < 4) { $hex = "0" $hex }

# Rows whose hex field is not purely hexadecimal are remembered and reported
# at the end instead of being processed.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# all[HEX]=string
{ all[toupper($hex)] = $name }

(loc) { comment[toupper($hex)] = " /* " $loc " */" }

# new hex: newlang[HEX]=string
!(toupper($hex) in lang) { newlang[toupper($hex)] = $name }

# Report: proposed defines for new LCIDs, then the reverse check of lang.h
# entries missing from the input, then the filtered rows.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"
    # every new language
    for (x in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
        n = split(newlang[x],arr,/[^A-Za-z0-9]/)
        def = ""
        for (i=1; i<=n; ++i)
        {
            if (length(arr[i]))
            {
                # each identifier word of the language name
                if (def)
                    def = def "_"
                aup = toupper(arr[i])
                def = def aup
                for (l in lang)
                {
                    # contained in already existing definitions?
                    if (lang[l] ~ aup)
                        printf( "// %-50s %s\n", arr[i] ": " lang[l], l)
                }
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
    }
    print "\n// --- reverse check follows ----------------------------------\n"
    for (x in lang)
    {
        if (!(x in all))
            print "// not in input file: " x " " lang[x]
    }
    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (x in filtered)
        print "// filtered: " x " " filtered[x]
}
You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include +#include +#include +#include + +#include + +// Only very limited few functions that are guaranteed to not be called from +// LanguageTag may use LanguageTag ... +#include + + +LanguageType MsLangId::nConfiguredSystemLanguage = LANGUAGE_SYSTEM; +LanguageType MsLangId::nConfiguredSystemUILanguage = LANGUAGE_SYSTEM; + +LanguageType MsLangId::nConfiguredWesternFallback = LANGUAGE_SYSTEM; +LanguageType MsLangId::nConfiguredAsianFallback = LANGUAGE_SYSTEM; +LanguageType MsLangId::nConfiguredComplexFallback = LANGUAGE_SYSTEM; + +// static +void MsLangId::LanguageTagAccess::setConfiguredSystemLanguage( LanguageType nLang ) +{ + nConfiguredSystemLanguage = nLang; +} + + +// static +void MsLangId::setConfiguredSystemUILanguage( LanguageType nLang ) +{ + nConfiguredSystemUILanguage = nLang; +} + +// static +void MsLangId::setConfiguredWesternFallback( LanguageType nLang ) +{ + nConfiguredWesternFallback = nLang; +} + +// static +void MsLangId::setConfiguredAsianFallback( LanguageType nLang ) +{ + nConfiguredAsianFallback = nLang; +} + +// static +void MsLangId::setConfiguredComplexFallback( LanguageType nLang ) +{ + nConfiguredComplexFallback = nLang; +} + +// static +inline LanguageType MsLangId::simplifySystemLanguages( LanguageType nLang ) +{ + if (nLang.anyOf( LANGUAGE_PROCESS_OR_USER_DEFAULT, + LANGUAGE_SYSTEM_DEFAULT, + LANGUAGE_SYSTEM)) + nLang = LANGUAGE_SYSTEM; + return nLang; +} + +// static +LanguageType MsLangId::getRealLanguage( LanguageType nLang ) +{ + LanguageType simplifyLang = simplifySystemLanguages( nLang); + if (simplifyLang == LANGUAGE_SYSTEM ) + nLang = getConfiguredSystemLanguage(); + else if (simplifyLang == LANGUAGE_HID_HUMAN_INTERFACE_DEVICE) + nLang = getConfiguredSystemUILanguage(); + else + { + /* TODO: would this be useful here? 
*/ + //nLang = MsLangId::getReplacementForObsoleteLanguage( nLang); + ; // nothing + } + if (nLang == LANGUAGE_DONTKNOW) + nLang = LANGUAGE_ENGLISH_US; + return nLang; +} + + +// static +LanguageType MsLangId::getConfiguredSystemLanguage() +{ + if (nConfiguredSystemLanguage != LANGUAGE_SYSTEM) + return nConfiguredSystemLanguage; + SAL_WARN("i18nlangtag", "MsLangId::getConfiguredSystemLanguage() - not configured yet"); + return getSystemLanguage(); +} + + +// static +LanguageType MsLangId::getConfiguredSystemUILanguage() +{ + if (nConfiguredSystemUILanguage != LANGUAGE_SYSTEM) + return nConfiguredSystemUILanguage; + SAL_WARN("i18nlangtag", "MsLangId::getConfiguredSystemUILanguage() - not configured yet"); + return getSystemUILanguage(); +} + + +// static +LanguageType MsLangId::getSystemLanguage() +{ + return getPlatformSystemLanguage(); +} + + +// static +LanguageType MsLangId::getSystemUILanguage() +{ + return getPlatformSystemUILanguage(); +} + + +// static +LanguageType MsLangId::resolveSystemLanguageByScriptType( LanguageType nLang, sal_Int16 nType ) +{ + if (nLang == LANGUAGE_NONE) + return nLang; + + nLang = getRealLanguage(nLang); + if (nType != css::i18n::ScriptType::WEAK && getScriptType(nLang) != nType) + { + switch(nType) + { + case css::i18n::ScriptType::ASIAN: + if (nConfiguredAsianFallback == LANGUAGE_SYSTEM) + nLang = LANGUAGE_CHINESE_SIMPLIFIED; + else + nLang = nConfiguredAsianFallback; + break; + case css::i18n::ScriptType::COMPLEX: + if (nConfiguredComplexFallback == LANGUAGE_SYSTEM) + nLang = LANGUAGE_HINDI; + else + nLang = nConfiguredComplexFallback; + break; + default: + if (nConfiguredWesternFallback == LANGUAGE_SYSTEM) + nLang = LANGUAGE_ENGLISH_US; + else + nLang = nConfiguredWesternFallback; + break; + } + } + return nLang; +} + +// static +bool MsLangId::usesHyphenation(LanguageType nLang) +{ + if (primary(nLang).anyOf( + primary(LANGUAGE_ARABIC_PRIMARY_ONLY), + primary(LANGUAGE_FARSI), + primary(LANGUAGE_KASHMIRI), + 
primary(LANGUAGE_KURDISH_ARABIC_IRAQ), + primary(LANGUAGE_PUNJABI), + primary(LANGUAGE_SINDHI), + primary(LANGUAGE_USER_MALAY_ARABIC_MALAYSIA), + primary(LANGUAGE_SOMALI), + primary(LANGUAGE_SWAHILI), + primary(LANGUAGE_URDU_PAKISTAN), + primary(LANGUAGE_PASHTO), + primary(LANGUAGE_VIETNAMESE)) + || isCJK(nLang)) + { + return false; + } + return true; +} + + +// static +css::lang::Locale MsLangId::Conversion::convertLanguageToLocale( + LanguageType nLang ) +{ + css::lang::Locale aLocale; + // Still resolve LANGUAGE_DONTKNOW if resolving is not requested, + // but not LANGUAGE_SYSTEM or others. + LanguageType nOrigLang = nLang; + nLang = MsLangId::getRealLanguage(nLang); + convertLanguageToLocaleImpl( nLang, aLocale, true ); + if (aLocale.Language.isEmpty() && simplifySystemLanguages(nOrigLang) == LANGUAGE_SYSTEM) + { + // None found but resolve requested, last resort is "en-US". + aLocale.Language = "en"; + aLocale.Country = "US"; + aLocale.Variant.clear(); + } + return aLocale; +} + + +// static +LanguageType MsLangId::Conversion::convertLocaleToLanguage( + const css::lang::Locale& rLocale ) +{ + // empty language => LANGUAGE_SYSTEM + if (rLocale.Language.isEmpty()) + return LANGUAGE_SYSTEM; + + return convertLocaleToLanguageImpl( rLocale); +} + + +// static +css::lang::Locale MsLangId::getFallbackLocale( + const css::lang::Locale & rLocale ) +{ + // empty language => LANGUAGE_SYSTEM + if (rLocale.Language.isEmpty()) + return Conversion::lookupFallbackLocale( Conversion::convertLanguageToLocale( LANGUAGE_SYSTEM )); + else + return Conversion::lookupFallbackLocale( rLocale); +} + +// static +bool MsLangId::isRightToLeft( LanguageType nLang ) +{ + if (primary(nLang).anyOf( + primary(LANGUAGE_ARABIC_SAUDI_ARABIA), + primary(LANGUAGE_HEBREW), + primary(LANGUAGE_YIDDISH), + primary(LANGUAGE_URDU_PAKISTAN), + primary(LANGUAGE_FARSI), + primary(LANGUAGE_KASHMIRI), + primary(LANGUAGE_SINDHI), + primary(LANGUAGE_UIGHUR_CHINA), + primary(LANGUAGE_USER_KYRGYZ_CHINA), + 
primary(LANGUAGE_USER_NKO))) + { + return true; + } + if (nLang.anyOf( + LANGUAGE_USER_KURDISH_IRAN, + LANGUAGE_OBSOLETE_USER_KURDISH_IRAQ, + LANGUAGE_KURDISH_ARABIC_IRAQ, + LANGUAGE_KURDISH_ARABIC_LSO, + LANGUAGE_USER_KURDISH_SOUTHERN_IRAN, + LANGUAGE_USER_KURDISH_SOUTHERN_IRAQ, + LANGUAGE_USER_HUNGARIAN_ROVAS, + LANGUAGE_USER_MALAY_ARABIC_MALAYSIA, + LANGUAGE_USER_MALAY_ARABIC_BRUNEI)) + { + return true; + } + if (LanguageTag::isOnTheFlyID(nLang)) + return LanguageTag::getOnTheFlyScriptType(nLang) == LanguageTag::ScriptType::RTL; + return false; +} + +// static +bool MsLangId::isRightToLeftMath( LanguageType nLang ) +{ + //http://www.w3.org/TR/arabic-math/ + if (nLang == LANGUAGE_FARSI || nLang == LANGUAGE_ARABIC_MOROCCO) + return false; + return isRightToLeft(nLang); +} + +// static +bool MsLangId::isSimplifiedChinese( LanguageType nLang ) +{ + return isChinese(nLang) && !isTraditionalChinese(nLang); +} + +// static +bool MsLangId::isSimplifiedChinese( const css::lang::Locale & rLocale ) +{ + return rLocale.Language == "zh" && !isTraditionalChinese(rLocale); +} + +// static +bool MsLangId::isTraditionalChinese( LanguageType nLang ) +{ + return nLang.anyOf( + LANGUAGE_CHINESE_TRADITIONAL, + LANGUAGE_CHINESE_HONGKONG, + LANGUAGE_CHINESE_MACAU); +} + +// static +bool MsLangId::isTraditionalChinese( const css::lang::Locale & rLocale ) +{ + return rLocale.Language == "zh" && (rLocale.Country == "TW" || rLocale.Country == "HK" || rLocale.Country == "MO"); +} + +//static +bool MsLangId::isChinese( LanguageType nLang ) +{ + return MsLangId::getPrimaryLanguage(nLang) == MsLangId::getPrimaryLanguage(LANGUAGE_CHINESE) || + MsLangId::getPrimaryLanguage(nLang) == MsLangId::getPrimaryLanguage(LANGUAGE_YUE_CHINESE_HONGKONG); +} + +//static +bool MsLangId::isKorean( LanguageType nLang ) +{ + return MsLangId::getPrimaryLanguage(nLang) == MsLangId::getPrimaryLanguage(LANGUAGE_KOREAN); +} + +// static +bool MsLangId::isCJK( LanguageType nLang ) +{ + if (primary(nLang).anyOf( + 
primary(LANGUAGE_CHINESE), + primary(LANGUAGE_YUE_CHINESE_HONGKONG), + primary(LANGUAGE_JAPANESE), + primary(LANGUAGE_KOREAN))) + { + return true; + } + if (LanguageTag::isOnTheFlyID(nLang)) + return LanguageTag::getOnTheFlyScriptType(nLang) == LanguageTag::ScriptType::CJK; + return false; +} + +// static +bool MsLangId::isFamilyNameFirst( LanguageType nLang ) +{ + return isCJK(nLang) || nLang == LANGUAGE_HUNGARIAN; +} + +// static +bool MsLangId::hasForbiddenCharacters( LanguageType nLang ) +{ + return isCJK(nLang); +} + + +// static +bool MsLangId::needsSequenceChecking( LanguageType nLang ) +{ + return primary(nLang).anyOf( + primary(LANGUAGE_BURMESE), + primary(LANGUAGE_KHMER), + primary(LANGUAGE_LAO), + primary(LANGUAGE_THAI)) + || nLang.anyOf( + LANGUAGE_USER_PALI_THAI); +} + + +// static +sal_Int16 MsLangId::getScriptType( LanguageType nLang ) +{ + sal_Int16 nScript; + + // CTL + if( nLang.anyOf( + LANGUAGE_MONGOLIAN_MONGOLIAN_MONGOLIA, + LANGUAGE_MONGOLIAN_MONGOLIAN_CHINA, + LANGUAGE_MONGOLIAN_MONGOLIAN_LSO, + LANGUAGE_USER_KURDISH_IRAN, + LANGUAGE_OBSOLETE_USER_KURDISH_IRAQ, + LANGUAGE_KURDISH_ARABIC_IRAQ, + LANGUAGE_KURDISH_ARABIC_LSO, + LANGUAGE_USER_KURDISH_SOUTHERN_IRAN, + LANGUAGE_USER_KURDISH_SOUTHERN_IRAQ, + LANGUAGE_USER_KYRGYZ_CHINA, + LANGUAGE_USER_HUNGARIAN_ROVAS, + LANGUAGE_USER_MANCHU, + LANGUAGE_USER_XIBE, + LANGUAGE_USER_MALAY_ARABIC_MALAYSIA, + LANGUAGE_USER_MALAY_ARABIC_BRUNEI, + LANGUAGE_USER_PALI_THAI)) + { + nScript = css::i18n::ScriptType::COMPLEX; + } + // "Western" + else if (nLang.anyOf( + LANGUAGE_MONGOLIAN_CYRILLIC_MONGOLIA, + LANGUAGE_MONGOLIAN_CYRILLIC_LSO, + LANGUAGE_USER_KURDISH_SYRIA, + LANGUAGE_USER_KURDISH_TURKEY)) + { + nScript = css::i18n::ScriptType::LATIN; + } +// currently not knowing scripttype - defaulted to LATIN: +/* +#define LANGUAGE_ARMENIAN 0x042B +#define LANGUAGE_INDONESIAN 0x0421 +#define LANGUAGE_KAZAKH 0x043F +#define LANGUAGE_KONKANI 0x0457 +#define LANGUAGE_MACEDONIAN 0x042F +#define LANGUAGE_TATAR 0x0444 
+*/ + // CJK catcher + else if ( primary(nLang).anyOf( + primary(LANGUAGE_CHINESE ), + primary(LANGUAGE_YUE_CHINESE_HONGKONG ), + primary(LANGUAGE_JAPANESE ), + primary(LANGUAGE_KOREAN ) + )) + { + nScript = css::i18n::ScriptType::ASIAN; + } + // CTL catcher + else if (primary(nLang).anyOf( + primary(LANGUAGE_AMHARIC_ETHIOPIA ), + primary(LANGUAGE_ARABIC_SAUDI_ARABIA ), + primary(LANGUAGE_ASSAMESE ), + primary(LANGUAGE_BENGALI ), + primary(LANGUAGE_BURMESE ), + primary(LANGUAGE_DHIVEHI ), + primary(LANGUAGE_FARSI ), + primary(LANGUAGE_GUJARATI ), + primary(LANGUAGE_HEBREW ), + primary(LANGUAGE_HINDI ), + primary(LANGUAGE_KANNADA ), + primary(LANGUAGE_KASHMIRI ), + primary(LANGUAGE_KHMER ), + primary(LANGUAGE_LAO ), + primary(LANGUAGE_MALAYALAM ), + primary(LANGUAGE_MANIPURI ), + primary(LANGUAGE_MARATHI ), + primary(LANGUAGE_NEPALI ), + primary(LANGUAGE_ODIA ), + primary(LANGUAGE_PUNJABI ), + primary(LANGUAGE_SANSKRIT ), + primary(LANGUAGE_SINDHI ), + primary(LANGUAGE_SINHALESE_SRI_LANKA ), + primary(LANGUAGE_SYRIAC ), + primary(LANGUAGE_TAMIL ), + primary(LANGUAGE_TELUGU ), + primary(LANGUAGE_THAI ), + primary(LANGUAGE_TIBETAN ), // also LANGUAGE_DZONGKHA + primary(LANGUAGE_TIGRIGNA_ETHIOPIA ), + primary(LANGUAGE_UIGHUR_CHINA ), + primary(LANGUAGE_URDU_INDIA ), + primary(LANGUAGE_USER_BODO_INDIA ), + primary(LANGUAGE_USER_DOGRI_INDIA ), + primary(LANGUAGE_USER_LIMBU ), + primary(LANGUAGE_USER_MAITHILI_INDIA ), + primary(LANGUAGE_USER_NKO ), + primary(LANGUAGE_YIDDISH ))) + { + nScript = css::i18n::ScriptType::COMPLEX; + } + // Western (actually not necessarily Latin but also Cyrillic, + // for example) + else if (LanguageTag::isOnTheFlyID(nLang)) + { + switch (LanguageTag::getOnTheFlyScriptType(nLang)) + { + case LanguageTag::ScriptType::CJK : + nScript = css::i18n::ScriptType::ASIAN; + break; + case LanguageTag::ScriptType::CTL : + case LanguageTag::ScriptType::RTL : + nScript = css::i18n::ScriptType::COMPLEX; + break; + case LanguageTag::ScriptType::WESTERN : + 
case LanguageTag::ScriptType::UNKNOWN : + default: + nScript = css::i18n::ScriptType::LATIN; + break; + } + } + else + { + nScript = css::i18n::ScriptType::LATIN; + } + return nScript; +} + + +// static +bool MsLangId::isNonLatinWestern( LanguageType nLang ) +{ + if (nLang.anyOf( + LANGUAGE_AZERI_CYRILLIC, + LANGUAGE_AZERI_CYRILLIC_LSO, + LANGUAGE_BELARUSIAN, + LANGUAGE_BOSNIAN_CYRILLIC_BOSNIA_HERZEGOVINA, + LANGUAGE_BOSNIAN_CYRILLIC_LSO, + LANGUAGE_BULGARIAN, + LANGUAGE_GREEK, + LANGUAGE_MONGOLIAN_CYRILLIC_LSO, + LANGUAGE_MONGOLIAN_CYRILLIC_MONGOLIA, + LANGUAGE_RUSSIAN, + LANGUAGE_RUSSIAN_MOLDOVA, + LANGUAGE_SERBIAN_CYRILLIC_BOSNIA_HERZEGOVINA, + LANGUAGE_SERBIAN_CYRILLIC_LSO, + LANGUAGE_SERBIAN_CYRILLIC_MONTENEGRO, + LANGUAGE_SERBIAN_CYRILLIC_SAM, + LANGUAGE_SERBIAN_CYRILLIC_SERBIA, + LANGUAGE_UKRAINIAN, + LANGUAGE_UZBEK_CYRILLIC, + LANGUAGE_UZBEK_CYRILLIC_LSO)) + { + return true; + } + if (getScriptType( nLang) != css::i18n::ScriptType::LATIN) + return false; + LanguageTag aLanguageTag( nLang); + if (aLanguageTag.hasScript()) + return aLanguageTag.getScript() != "Latn"; + return false; +} + + +// static +bool MsLangId::isLegacy( LanguageType nLang ) +{ + return nLang.anyOf( + LANGUAGE_SERBIAN_CYRILLIC_SAM, + LANGUAGE_SERBIAN_LATIN_SAM); + /* TODO: activate once dictionary was renamed from pap-AN to + * pap-CW, or the pap-CW one supports also pap-AN, see fdo#44112 */ + //case LANGUAGE_PAPIAMENTU: +} + + +// static +LanguageType MsLangId::getReplacementForObsoleteLanguage( LanguageType nLang ) +{ + if (nLang == LANGUAGE_OBSOLETE_USER_LATIN) + nLang = LANGUAGE_LATIN; + else if (nLang == LANGUAGE_OBSOLETE_USER_LATIN_VATICAN) + nLang = LANGUAGE_LATIN; + else if (nLang == LANGUAGE_OBSOLETE_USER_MAORI) + nLang = LANGUAGE_MAORI_NEW_ZEALAND; + else if (nLang == LANGUAGE_OBSOLETE_USER_KINYARWANDA) + nLang = LANGUAGE_KINYARWANDA_RWANDA; + else if (nLang == LANGUAGE_OBSOLETE_USER_UPPER_SORBIAN) + nLang = LANGUAGE_UPPER_SORBIAN_GERMANY; + else if (nLang == 
LANGUAGE_OBSOLETE_USER_LOWER_SORBIAN) + nLang = LANGUAGE_LOWER_SORBIAN_GERMANY; + else if (nLang == LANGUAGE_OBSOLETE_USER_OCCITAN) + nLang = LANGUAGE_OCCITAN_FRANCE; + else if (nLang == LANGUAGE_OBSOLETE_USER_BRETON) + nLang = LANGUAGE_BRETON_FRANCE; + else if (nLang == LANGUAGE_OBSOLETE_USER_KALAALLISUT) + nLang = LANGUAGE_KALAALLISUT_GREENLAND; + else if (nLang == LANGUAGE_OBSOLETE_USER_LUXEMBOURGISH) + nLang = LANGUAGE_LUXEMBOURGISH_LUXEMBOURG; + else if (nLang == LANGUAGE_OBSOLETE_USER_KABYLE) + nLang = LANGUAGE_TAMAZIGHT_LATIN_ALGERIA; + else if (nLang == LANGUAGE_OBSOLETE_USER_CATALAN_VALENCIAN) + nLang = LANGUAGE_CATALAN_VALENCIAN; + else if (nLang == LANGUAGE_OBSOLETE_USER_MALAGASY_PLATEAU) + nLang = LANGUAGE_MALAGASY_PLATEAU; + else if (nLang == LANGUAGE_GAELIC_SCOTLAND_LEGACY) + nLang = LANGUAGE_GAELIC_SCOTLAND; + else if (nLang == LANGUAGE_OBSOLETE_USER_TSWANA_BOTSWANA) + nLang = LANGUAGE_TSWANA_BOTSWANA; + else if (nLang == LANGUAGE_OBSOLETE_USER_SERBIAN_LATIN_SERBIA) + nLang = LANGUAGE_SERBIAN_LATIN_SERBIA; + else if (nLang == LANGUAGE_OBSOLETE_USER_SERBIAN_LATIN_MONTENEGRO) + nLang = LANGUAGE_SERBIAN_LATIN_MONTENEGRO; + else if (nLang == LANGUAGE_OBSOLETE_USER_SERBIAN_CYRILLIC_SERBIA) + nLang = LANGUAGE_SERBIAN_CYRILLIC_SERBIA; + else if (nLang == LANGUAGE_OBSOLETE_USER_SERBIAN_CYRILLIC_MONTENEGRO) + nLang = LANGUAGE_SERBIAN_CYRILLIC_MONTENEGRO; + else if (nLang == LANGUAGE_OBSOLETE_USER_KURDISH_IRAQ) + nLang = LANGUAGE_KURDISH_ARABIC_IRAQ; + else if (nLang == LANGUAGE_OBSOLETE_USER_SPANISH_CUBA) + nLang = LANGUAGE_SPANISH_CUBA; + + // The following are not strictly obsolete but should be mapped to a + // replacement locale when encountered. + + // no_NO is an alias for nb_NO + else if (nLang == LANGUAGE_NORWEGIAN) + nLang = LANGUAGE_NORWEGIAN_BOKMAL; + + // The erroneous Tibetan vs. Dzongkha case, #i53497# + // We (and MS) have stored LANGUAGE_TIBETAN_BHUTAN. 
This will need + // special attention if MS one day decides to actually use + // LANGUAGE_TIBETAN_BHUTAN for bo-BT instead of having it reserved; + // then remove the mapping and hope every dz-BT user used ODF to store + // documents ;-) + else if (nLang == LANGUAGE_TIBETAN_BHUTAN) + nLang = LANGUAGE_DZONGKHA_BHUTAN; + + // en-GB-oed is deprecated, use en-GB-oxendict instead. + else if (nLang == LANGUAGE_USER_ENGLISH_UK_OED) + nLang = LANGUAGE_USER_ENGLISH_UK_OXENDICT; + return nLang; +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18nlangtag/source/languagetag/languagetag.cxx b/i18nlangtag/source/languagetag/languagetag.cxx new file mode 100644 index 000000000..7d881dd37 --- /dev/null +++ b/i18nlangtag/source/languagetag/languagetag.cxx @@ -0,0 +1,3261 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#define erDEBUG + +#if LIBLANGTAG_INLINE_FIX +#define LT_HAVE_INLINE +#endif +#include + +#ifdef ANDROID +#include +#endif + +using namespace com::sun::star; + +namespace { + +// Helper to ensure lt_error_t is free'd +struct myLtError +{ + lt_error_t* p; + myLtError() : p(nullptr) {} + ~myLtError() { if (p) lt_error_unref( p); } +}; + +} + +namespace { +std::recursive_mutex& theMutex() +{ + static std::recursive_mutex SINGLETON; + return SINGLETON; +} +} + +typedef std::unordered_set< OUString > KnownTagSet; +static const KnownTagSet & getKnowns() +{ + static KnownTagSet theKnowns = []() + { + KnownTagSet tmpSet; + ::std::vector< MsLangId::LanguagetagMapping > aDefined( MsLangId::getDefinedLanguagetags()); + for (auto const& elemDefined : aDefined) + { + // Do not use the BCP47 string here to initialize the + // LanguageTag because then canonicalize() would call this + // getKnowns() again... 
+ ::std::vector< OUString > aFallbacks( LanguageTag( elemDefined.mnLang).getFallbackStrings( true)); + for (auto const& fallback : aFallbacks) + { + tmpSet.insert(fallback); + } + } + return tmpSet; + }(); + return theKnowns; +} + + +namespace { +struct compareIgnoreAsciiCaseLess +{ + bool operator()( std::u16string_view r1, std::u16string_view r2 ) const + { + return o3tl::compareToIgnoreAsciiCase(r1, r2) < 0; + } +}; +typedef ::std::map< OUString, LanguageTag::ImplPtr, compareIgnoreAsciiCaseLess > MapBcp47; +typedef ::std::map< LanguageType, LanguageTag::ImplPtr > MapLangID; +MapBcp47& theMapBcp47() +{ + static MapBcp47 SINGLETON; + return SINGLETON; +} +MapLangID& theMapLangID() +{ + static MapLangID SINGLETON; + return SINGLETON; +} +LanguageTag::ImplPtr& theSystemLocale() +{ + static LanguageTag::ImplPtr SINGLETON; + return SINGLETON; +} +} + + +static LanguageType getNextOnTheFlyLanguage() +{ + static LanguageType nOnTheFlyLanguage(0); + std::unique_lock aGuard( theMutex()); + if (!nOnTheFlyLanguage) + nOnTheFlyLanguage = MsLangId::makeLangID( LANGUAGE_ON_THE_FLY_SUB_START, LANGUAGE_ON_THE_FLY_START); + else + { + if (MsLangId::getPrimaryLanguage( nOnTheFlyLanguage) != LANGUAGE_ON_THE_FLY_END) + ++nOnTheFlyLanguage; + else + { + LanguageType nSub = MsLangId::getSubLanguage( nOnTheFlyLanguage); + if (nSub != LANGUAGE_ON_THE_FLY_SUB_END) + nOnTheFlyLanguage = MsLangId::makeLangID( ++nSub, LANGUAGE_ON_THE_FLY_START); + else + { + SAL_WARN( "i18nlangtag", "getNextOnTheFlyLanguage: none left! 
(" + << ((sal_uInt16(LANGUAGE_ON_THE_FLY_END) - sal_uInt16(LANGUAGE_ON_THE_FLY_START) + 1) + * (sal_uInt16(LANGUAGE_ON_THE_FLY_SUB_END) - sal_uInt16(LANGUAGE_ON_THE_FLY_SUB_START) + 1)) + << " consumed?!?)"); + return LanguageType(0); + } + } + } +#if OSL_DEBUG_LEVEL > 0 + static size_t nOnTheFlies = 0; + ++nOnTheFlies; + SAL_INFO( "i18nlangtag", "getNextOnTheFlyLanguage: number " << nOnTheFlies); +#endif + return nOnTheFlyLanguage; +} + + +// static +bool LanguageTag::isOnTheFlyID( LanguageType nLang ) +{ + LanguageType nPri = MsLangId::getPrimaryLanguage( nLang); + LanguageType nSub = MsLangId::getSubLanguage( nLang); + return + LANGUAGE_ON_THE_FLY_START <= nPri && nPri <= LANGUAGE_ON_THE_FLY_END && + LANGUAGE_ON_THE_FLY_SUB_START <= nSub && nSub <= LANGUAGE_ON_THE_FLY_SUB_END; +} + +namespace { + +/** A reference holder for liblangtag data de/initialization, one static + instance. Currently implemented such that the first "ref" inits and dtor + (our library deinitialized) tears down. 
+*/ +class LiblangtagDataRef +{ +public: + LiblangtagDataRef(); + ~LiblangtagDataRef(); + void init() + { + if (!mbInitialized) + setup(); + } +private: + OString maDataPath; // path to liblangtag data, "|" if system + bool mbInitialized; + + void setupDataPath(); + void setup(); + static void teardown(); +}; + +LiblangtagDataRef& theDataRef() +{ + static LiblangtagDataRef SINGLETON; + return SINGLETON; +} +} + +LiblangtagDataRef::LiblangtagDataRef() + : + mbInitialized(false) +{ +} + +LiblangtagDataRef::~LiblangtagDataRef() +{ + if (mbInitialized) + teardown(); +} + +void LiblangtagDataRef::setup() +{ + SAL_INFO( "i18nlangtag", "LiblangtagDataRef::setup: initializing database"); + if (maDataPath.isEmpty()) + setupDataPath(); + lt_db_initialize(); + mbInitialized = true; +} + +void LiblangtagDataRef::teardown() +{ + SAL_INFO( "i18nlangtag", "LiblangtagDataRef::teardown: finalizing database"); + lt_db_finalize(); +} + +void LiblangtagDataRef::setupDataPath() +{ +#if defined(ANDROID) + maDataPath = OString(lo_get_app_data_dir()) + "/share/liblangtag"; +#else + // maDataPath is assumed to be empty here. + OUString aURL("$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/liblangtag"); + rtl::Bootstrap::expandMacros(aURL); //TODO: detect failure + + // Check if data is in our own installation, else assume system + // installation. + OUString aData = aURL + "/language-subtag-registry.xml"; + osl::DirectoryItem aDirItem; + if (osl::DirectoryItem::get( aData, aDirItem) == osl::DirectoryItem::E_None) + { + OUString aPath; + if (osl::FileBase::getSystemPathFromFileURL( aURL, aPath) == osl::FileBase::E_None) + maDataPath = OUStringToOString( aPath, RTL_TEXTENCODING_UTF8); + } +#endif + if (maDataPath.isEmpty()) + maDataPath = "|"; // assume system + else + lt_db_set_datadir( maDataPath.getStr()); +} + + +/* TODO: we could transform known vendor and browser-specific variants to known + * BCP 47 if available. 
For now just remove them to not confuse any later + * treatments that check for empty variants. This vendor stuff was never + * supported anyway. */ +static void handleVendorVariant( css::lang::Locale & rLocale ) +{ + if (!rLocale.Variant.isEmpty() && rLocale.Language != I18NLANGTAG_QLT) + rLocale.Variant.clear(); +} + + +class LanguageTagImpl +{ +public: + + explicit LanguageTagImpl( const LanguageTag & rLanguageTag ); + explicit LanguageTagImpl( const LanguageTagImpl & rLanguageTagImpl ); + ~LanguageTagImpl(); + LanguageTagImpl& operator=( const LanguageTagImpl & rLanguageTagImpl ); + +private: + + friend class LanguageTag; + + enum Decision + { + DECISION_DONTKNOW, + DECISION_NO, + DECISION_YES + }; + + mutable css::lang::Locale maLocale; + mutable OUString maBcp47; + mutable OUString maCachedLanguage; ///< cache getLanguage() + mutable OUString maCachedScript; ///< cache getScript() + mutable OUString maCachedCountry; ///< cache getCountry() + mutable OUString maCachedVariants; ///< cache getVariants() + mutable OUString maCachedGlibcString; ///< cache getGlibcLocaleString() + mutable lt_tag_t* mpImplLangtag; ///< liblangtag pointer + mutable LanguageType mnLangID; + mutable LanguageTag::ScriptType meScriptType; + mutable Decision meIsValid; + mutable Decision meIsIsoLocale; + mutable Decision meIsIsoODF; + mutable Decision meIsLiblangtagNeeded; ///< whether processing with liblangtag needed + bool mbSystemLocale : 1; + mutable bool mbInitializedBcp47 : 1; + mutable bool mbInitializedLocale : 1; + mutable bool mbInitializedLangID : 1; + mutable bool mbCachedLanguage : 1; + mutable bool mbCachedScript : 1; + mutable bool mbCachedCountry : 1; + mutable bool mbCachedVariants : 1; + mutable bool mbCachedGlibcString : 1; + + OUString const & getBcp47() const; + OUString const & getLanguage() const; + OUString const & getScript() const; + OUString const & getCountry() const; + OUString getRegion() const; + OUString const & getVariants() const; + bool hasScript() 
const; + OUString const & getGlibcLocaleString() const; + + void setScriptType(LanguageTag::ScriptType st); + LanguageTag::ScriptType getScriptType() const; + + bool isIsoLocale() const; + bool isIsoODF() const; + bool isValidBcp47() const; + + void convertLocaleToBcp47(); + bool convertLocaleToLang( bool bAllowOnTheFlyID ); + void convertBcp47ToLocale(); + void convertBcp47ToLang(); + void convertLangToLocale(); + void convertLangToBcp47(); + + /** @return whether BCP 47 language tag string was changed. */ + bool canonicalize(); + + /** Canonicalize if not yet done and synchronize initialized conversions. + + @return whether BCP 47 language tag string was changed. + */ + bool synCanonicalize(); + + OUString getLanguageFromLangtag(); + OUString getScriptFromLangtag(); + OUString getRegionFromLangtag(); + OUString getVariantsFromLangtag(); + + /** Generates on-the-fly LangID and registers the maBcp47,mnLangID pair. + + @param nRegisterID + If not 0 and not LANGUAGE_DONTKNOW, suggest (!) to use that ID + instead of generating an on-the-fly ID. Implementation may + still generate an ID if the suggested ID is already used for + another language tag. + + @return NULL if no ID could be obtained or registration failed. + */ + LanguageTag::ImplPtr registerOnTheFly( LanguageType nRegisterID ); + + /** Obtain Language, Script, Country and Variants via simpleExtract() and + assign them to the cached variables if successful. + + @return simpleExtract() successfully extracted and cached. + */ + bool cacheSimpleLSCV(); + + enum Extraction + { + EXTRACTED_NONE, + EXTRACTED_LSC, + EXTRACTED_LV, + EXTRACTED_C_LOCALE, + EXTRACTED_X, + EXTRACTED_X_JOKER, + EXTRACTED_KNOWN_BAD + }; + + /** Of a language tag of the form lll[-Ssss][-CC][-vvvvvvvv] extract the + portions. + + Does not check case or content! + + @return EXTRACTED_LSC if simple tag was detected (i.e. 
one that + would fulfill the isIsoODF() condition), + EXTRACTED_LV if a tag with variant was detected, + EXTRACTED_C_LOCALE if a 'C' locale was detected, + EXTRACTED_X if x-... privateuse tag was detected, + EXTRACTED_X_JOKER if "*" joker was detected, + EXTRACTED_KNOWN_BAD if a bad but known (to be remapped) tag was detected + EXTRACTED_NONE else. + */ + static Extraction simpleExtract( const OUString& rBcp47, + OUString& rLanguage, + OUString& rScript, + OUString& rCountry, + OUString& rVariants ); + + /** Convert Locale to BCP 47 string without resolving system and creating + temporary LanguageTag instances. */ + static OUString convertToBcp47( const css::lang::Locale& rLocale ); + +}; + + +LanguageTagImpl::LanguageTagImpl( const LanguageTag & rLanguageTag ) + : + maLocale( rLanguageTag.maLocale), + maBcp47( rLanguageTag.maBcp47), + mpImplLangtag( nullptr), + mnLangID( rLanguageTag.mnLangID), + meScriptType( LanguageTag::ScriptType::UNKNOWN), + meIsValid( DECISION_DONTKNOW), + meIsIsoLocale( DECISION_DONTKNOW), + meIsIsoODF( DECISION_DONTKNOW), + meIsLiblangtagNeeded( DECISION_DONTKNOW), + mbSystemLocale( rLanguageTag.mbSystemLocale), + mbInitializedBcp47( rLanguageTag.mbInitializedBcp47), + mbInitializedLocale( rLanguageTag.mbInitializedLocale), + mbInitializedLangID( rLanguageTag.mbInitializedLangID), + mbCachedLanguage( false), + mbCachedScript( false), + mbCachedCountry( false), + mbCachedVariants( false), + mbCachedGlibcString( false) +{ +} + + +LanguageTagImpl::LanguageTagImpl( const LanguageTagImpl & rLanguageTagImpl ) + : + maLocale( rLanguageTagImpl.maLocale), + maBcp47( rLanguageTagImpl.maBcp47), + maCachedLanguage( rLanguageTagImpl.maCachedLanguage), + maCachedScript( rLanguageTagImpl.maCachedScript), + maCachedCountry( rLanguageTagImpl.maCachedCountry), + maCachedVariants( rLanguageTagImpl.maCachedVariants), + maCachedGlibcString( rLanguageTagImpl.maCachedGlibcString), + mpImplLangtag( rLanguageTagImpl.mpImplLangtag ? 
+ lt_tag_copy( rLanguageTagImpl.mpImplLangtag) : nullptr), + mnLangID( rLanguageTagImpl.mnLangID), + meScriptType( rLanguageTagImpl.meScriptType), + meIsValid( rLanguageTagImpl.meIsValid), + meIsIsoLocale( rLanguageTagImpl.meIsIsoLocale), + meIsIsoODF( rLanguageTagImpl.meIsIsoODF), + meIsLiblangtagNeeded( rLanguageTagImpl.meIsLiblangtagNeeded), + mbSystemLocale( rLanguageTagImpl.mbSystemLocale), + mbInitializedBcp47( rLanguageTagImpl.mbInitializedBcp47), + mbInitializedLocale( rLanguageTagImpl.mbInitializedLocale), + mbInitializedLangID( rLanguageTagImpl.mbInitializedLangID), + mbCachedLanguage( rLanguageTagImpl.mbCachedLanguage), + mbCachedScript( rLanguageTagImpl.mbCachedScript), + mbCachedCountry( rLanguageTagImpl.mbCachedCountry), + mbCachedVariants( rLanguageTagImpl.mbCachedVariants), + mbCachedGlibcString( rLanguageTagImpl.mbCachedGlibcString) +{ + if (mpImplLangtag) + theDataRef().init(); +} + + +LanguageTagImpl& LanguageTagImpl::operator=( const LanguageTagImpl & rLanguageTagImpl ) +{ + if (&rLanguageTagImpl == this) + return *this; + + maLocale = rLanguageTagImpl.maLocale; + maBcp47 = rLanguageTagImpl.maBcp47; + maCachedLanguage = rLanguageTagImpl.maCachedLanguage; + maCachedScript = rLanguageTagImpl.maCachedScript; + maCachedCountry = rLanguageTagImpl.maCachedCountry; + maCachedVariants = rLanguageTagImpl.maCachedVariants; + maCachedGlibcString = rLanguageTagImpl.maCachedGlibcString; + lt_tag_t * oldTag = mpImplLangtag; + mpImplLangtag = rLanguageTagImpl.mpImplLangtag ? 
+ lt_tag_copy( rLanguageTagImpl.mpImplLangtag) : nullptr; + lt_tag_unref(oldTag); + mnLangID = rLanguageTagImpl.mnLangID; + meScriptType = rLanguageTagImpl.meScriptType; + meIsValid = rLanguageTagImpl.meIsValid; + meIsIsoLocale = rLanguageTagImpl.meIsIsoLocale; + meIsIsoODF = rLanguageTagImpl.meIsIsoODF; + meIsLiblangtagNeeded= rLanguageTagImpl.meIsLiblangtagNeeded; + mbSystemLocale = rLanguageTagImpl.mbSystemLocale; + mbInitializedBcp47 = rLanguageTagImpl.mbInitializedBcp47; + mbInitializedLocale = rLanguageTagImpl.mbInitializedLocale; + mbInitializedLangID = rLanguageTagImpl.mbInitializedLangID; + mbCachedLanguage = rLanguageTagImpl.mbCachedLanguage; + mbCachedScript = rLanguageTagImpl.mbCachedScript; + mbCachedCountry = rLanguageTagImpl.mbCachedCountry; + mbCachedVariants = rLanguageTagImpl.mbCachedVariants; + mbCachedGlibcString = rLanguageTagImpl.mbCachedGlibcString; + if (mpImplLangtag && !oldTag) + theDataRef().init(); + return *this; +} + + +LanguageTagImpl::~LanguageTagImpl() +{ + if (mpImplLangtag) + { + lt_tag_unref( mpImplLangtag); + } +} + + +LanguageTag::LanguageTag( const OUString & rBcp47LanguageTag, bool bCanonicalize ) + : + maBcp47( rBcp47LanguageTag), + mnLangID( LANGUAGE_DONTKNOW), + mbSystemLocale( rBcp47LanguageTag.isEmpty()), + mbInitializedBcp47( !mbSystemLocale), + mbInitializedLocale( false), + mbInitializedLangID( false), + mbIsFallback( false) +{ + if (bCanonicalize) + { + getImpl()->canonicalize(); + // Registration itself may already have canonicalized, so do an + // unconditional sync. 
+ syncFromImpl(); + } + +} + + +LanguageTag::LanguageTag( const css::lang::Locale & rLocale ) + : + maLocale( rLocale), + mnLangID( LANGUAGE_DONTKNOW), + mbSystemLocale( rLocale.Language.isEmpty()), + mbInitializedBcp47( false), + mbInitializedLocale( false), // we do not know which mess we got passed in + mbInitializedLangID( false), + mbIsFallback( false) +{ + handleVendorVariant( maLocale); +} + + +LanguageTag::LanguageTag( LanguageType nLanguage ) + : + mnLangID( nLanguage), + mbSystemLocale( nLanguage == LANGUAGE_SYSTEM), + mbInitializedBcp47( false), + mbInitializedLocale( false), + mbInitializedLangID( !mbSystemLocale), + mbIsFallback( false) +{ +} + + +LanguageTag::LanguageTag( const OUString& rBcp47, const OUString& rLanguage, + std::u16string_view rScript, const OUString& rCountry ) + : + maBcp47( rBcp47), + mnLangID( LANGUAGE_DONTKNOW), + mbSystemLocale( rBcp47.isEmpty() && rLanguage.isEmpty()), + mbInitializedBcp47( !rBcp47.isEmpty()), + mbInitializedLocale( false), + mbInitializedLangID( false), + mbIsFallback( false) +{ + if (mbSystemLocale || mbInitializedBcp47) + return; + + if (rScript.empty()) + { + maBcp47 = rLanguage + "-" + rCountry; + mbInitializedBcp47 = true; + maLocale.Language = rLanguage; + maLocale.Country = rCountry; + mbInitializedLocale = true; + } + else + { + if (rCountry.isEmpty()) + maBcp47 = rLanguage + "-" + rScript; + else + maBcp47 = rLanguage + "-" + rScript + "-" + rCountry; + mbInitializedBcp47 = true; + maLocale.Language = I18NLANGTAG_QLT; + maLocale.Country = rCountry; + maLocale.Variant = maBcp47; + mbInitializedLocale = true; + } +} + + +LanguageTag::LanguageTag( const rtl_Locale & rLocale ) + : + maLocale( rLocale.Language, rLocale.Country, rLocale.Variant), + mnLangID( LANGUAGE_DONTKNOW), + mbSystemLocale( maLocale.Language.isEmpty()), + mbInitializedBcp47( false), + mbInitializedLocale( !mbSystemLocale), + mbInitializedLangID( false), + mbIsFallback( false) +{ + convertFromRtlLocale(); +} + 
+LanguageTag::~LanguageTag() {} + +LanguageTag::ImplPtr LanguageTagImpl::registerOnTheFly( LanguageType nRegisterID ) +{ + LanguageTag::ImplPtr pImpl; + + if (!mbInitializedBcp47) + { + if (mbInitializedLocale) + { + maBcp47 = LanguageTagImpl::convertToBcp47( maLocale); + mbInitializedBcp47 = !maBcp47.isEmpty(); + } + } + if (maBcp47.isEmpty()) + { + SAL_WARN( "i18nlangtag", "LanguageTagImpl::registerOnTheFly: no Bcp47 string, no registering"); + return pImpl; + } + + std::unique_lock aGuard( theMutex()); + + MapBcp47& rMapBcp47 = theMapBcp47(); + MapBcp47::const_iterator it( rMapBcp47.find( maBcp47)); + bool bOtherImpl = false; + if (it != rMapBcp47.end()) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerOnTheFly: found impl for '" << maBcp47 << "'"); + pImpl = (*it).second; + if (pImpl.get() != this) + { + // Could happen for example if during registerImpl() the tag was + // changed via canonicalize() and the result was already present in + // the map before, for example 'bn-Beng' => 'bn'. This specific + // case is now taken care of in registerImpl() and doesn't reach + // here. However, use the already existing impl if it matches. + SAL_WARN( "i18nlangtag", "LanguageTag::registerOnTheFly: using other impl for this '" << maBcp47 << "'"); + *this = *pImpl; // ensure consistency + bOtherImpl = true; + } + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerOnTheFly: new impl for '" << maBcp47 << "'"); + pImpl = std::make_shared( *this); + rMapBcp47.insert( ::std::make_pair( maBcp47, pImpl)); + } + + if (!bOtherImpl || !pImpl->mbInitializedLangID) + { + if (nRegisterID == LanguageType(0) || nRegisterID == LANGUAGE_DONTKNOW) + nRegisterID = getNextOnTheFlyLanguage(); + else + { + // Accept a suggested ID only if it is not mapped yet to something + // different, otherwise we would end up with ambiguous assignments + // of different language tags, for example for the same primary + // LangID with "no", "nb" and "nn". 
+ const MapLangID& rMapLangID = theMapLangID(); + MapLangID::const_iterator itID( rMapLangID.find( nRegisterID)); + if (itID != rMapLangID.end()) + { + if ((*itID).second->maBcp47 != maBcp47) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerOnTheFly: not using suggested 0x" + << ::std::hex << nRegisterID << " for '" << maBcp47 << "' have '" + << (*itID).second->maBcp47 << "'"); + nRegisterID = getNextOnTheFlyLanguage(); + } + else + { + SAL_WARN( "i18nlangtag", "LanguageTag::registerOnTheFly: suggested 0x" + << ::std::hex << nRegisterID << " for '" << maBcp47 << "' already registered"); + } + } + } + if (!nRegisterID) + { + // out of IDs, nothing to register + return pImpl; + } + pImpl->mnLangID = nRegisterID; + pImpl->mbInitializedLangID = true; + if (pImpl.get() != this) + { + mnLangID = nRegisterID; + mbInitializedLangID = true; + } + } + + ::std::pair< MapLangID::const_iterator, bool > res( + theMapLangID().insert( ::std::make_pair( pImpl->mnLangID, pImpl))); + if (res.second) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerOnTheFly: cross-inserted 0x" + << ::std::hex << pImpl->mnLangID << " for '" << maBcp47 << "'"); + } + else + { + SAL_WARN( "i18nlangtag", "LanguageTag::registerOnTheFly: not cross-inserted 0x" + << ::std::hex << pImpl->mnLangID << " for '" << maBcp47 << "' have '" + << (*res.first).second->maBcp47 << "'"); + } + + return pImpl; +} + + +LanguageTag::ScriptType LanguageTag::getOnTheFlyScriptType( LanguageType nRegisterID ) +{ + const MapLangID& rMapLangID = theMapLangID(); + MapLangID::const_iterator itID( rMapLangID.find( nRegisterID)); + if (itID != rMapLangID.end()) + return (*itID).second->getScriptType(); + else + return ScriptType::UNKNOWN; +} + + +// static +void LanguageTag::setConfiguredSystemLanguage( LanguageType nLang ) +{ + if (nLang == LANGUAGE_DONTKNOW || nLang == LANGUAGE_SYSTEM) + { + SAL_WARN( "i18nlangtag", + "LanguageTag::setConfiguredSystemLanguage: refusing to set unresolved system locale 0x" << + ::std::hex << 
nLang); + return; + } + SAL_INFO( "i18nlangtag", "LanguageTag::setConfiguredSystemLanguage: setting to 0x" << ::std::hex << nLang); + MsLangId::LanguageTagAccess::setConfiguredSystemLanguage( nLang); + // Reset system locale to none and let registerImpl() do the rest to + // initialize a new one. + theSystemLocale().reset(); + LanguageTag aLanguageTag( LANGUAGE_SYSTEM); + aLanguageTag.registerImpl(); +} + +static bool lt_tag_parse_disabled = false; + +// static +void LanguageTag::disable_lt_tag_parse() +{ + lt_tag_parse_disabled = true; +} + +static bool lcl_isKnownOnTheFlyID( LanguageType nLang ) +{ + return nLang != LANGUAGE_DONTKNOW && nLang != LANGUAGE_SYSTEM && + (LanguageTag::isOnTheFlyID( nLang) || (nLang == MsLangId::getPrimaryLanguage( nLang))); +} + + +LanguageTag::ImplPtr LanguageTag::registerImpl() const +{ + // XXX NOTE: Do not use non-static LanguageTag::convert...() member methods + // here as they access getImpl() and syncFromImpl() and would lead to + // recursion. Also do not use the static LanguageTag::convertTo...() + // methods as they may create temporary LanguageTag instances. Only + // LanguageTagImpl::convertToBcp47(Locale) is ok. + + ImplPtr pImpl; + +#if OSL_DEBUG_LEVEL > 0 + static size_t nCalls = 0; + ++nCalls; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCalls << " calls"); +#endif + + // Do not register unresolved system locale, also force LangID if system + // and take the system locale shortcut if possible. 
+ if (mbSystemLocale) + { + pImpl = theSystemLocale(); + if (pImpl) + { +#if OSL_DEBUG_LEVEL > 0 + static size_t nCallsSystem = 0; + ++nCallsSystem; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCallsSystem << " system calls"); +#endif + return pImpl; + } + if (!mbInitializedLangID) + { + mnLangID = MsLangId::getRealLanguage( LANGUAGE_SYSTEM); + mbInitializedLangID = (mnLangID != LANGUAGE_SYSTEM); + SAL_WARN_IF( !mbInitializedLangID, "i18nlangtag", "LanguageTag::registerImpl: can't resolve system!"); + } + } + + if (mbInitializedLangID) + { + if (mnLangID == LANGUAGE_DONTKNOW) + { + static LanguageTag::ImplPtr theDontKnow; + // Heavy usage of LANGUAGE_DONTKNOW, make it an own Impl for all the + // conversion attempts. At the same time provide a central breakpoint + // to inspect such places. + if (!theDontKnow) + theDontKnow = std::make_shared( *this); + pImpl = theDontKnow; +#if OSL_DEBUG_LEVEL > 0 + static size_t nCallsDontKnow = 0; + ++nCallsDontKnow; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCallsDontKnow << " DontKnow calls"); +#endif + return pImpl; + } + else + { + // A great share are calls for a system equal locale. + pImpl = theSystemLocale(); + if (pImpl && pImpl->mnLangID == mnLangID) + { +#if OSL_DEBUG_LEVEL > 0 + static size_t nCallsSystemEqual = 0; + ++nCallsSystemEqual; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCallsSystemEqual + << " system equal LangID calls"); +#endif + return pImpl; + } + } + } + + // Force Bcp47 if not LangID. + if (!mbInitializedLangID && !mbInitializedBcp47) + { + // The one central point to set mbInitializedLocale=true if a + // LanguageTag was initialized with a Locale. We will now convert and + // possibly later resolve it. 
+ if (!mbInitializedLocale && (mbSystemLocale || !maLocale.Language.isEmpty())) + mbInitializedLocale = true; + SAL_WARN_IF( !mbInitializedLocale, "i18nlangtag", "LanguageTag::registerImpl: still not mbInitializedLocale"); + + maBcp47 = LanguageTagImpl::convertToBcp47( maLocale); + mbInitializedBcp47 = !maBcp47.isEmpty(); + } + + if (mbInitializedBcp47) + { + // A great share are calls for a system equal locale. + pImpl = theSystemLocale(); + if (pImpl && pImpl->maBcp47 == maBcp47) + { +#if OSL_DEBUG_LEVEL > 0 + static size_t nCallsSystemEqual = 0; + ++nCallsSystemEqual; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCallsSystemEqual << " system equal BCP47 calls"); +#endif + return pImpl; + } + } + +#if OSL_DEBUG_LEVEL > 0 + static size_t nCallsNonSystem = 0; + ++nCallsNonSystem; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCallsNonSystem << " non-system calls"); +#endif + + std::unique_lock aGuard( theMutex()); + +#if OSL_DEBUG_LEVEL > 0 + static tools::Long nRunning = 0; + // Entering twice here is ok, which is needed for fallback init in + // getKnowns() in canonicalize() via pImpl->convertBcp47ToLocale() below, + // everything else is suspicious. + SAL_WARN_IF( nRunning > 1, "i18nlangtag", "LanguageTag::registerImpl: re-entered for '" + << maBcp47 << "' 0x" << ::std::hex << mnLangID ); + struct Runner { Runner() { ++nRunning; } ~Runner() { --nRunning; } } aRunner; +#endif + + // Prefer LangID map as find+insert needs less comparison work. + if (mbInitializedLangID) + { + MapLangID& rMap = theMapLangID(); + MapLangID::const_iterator it( rMap.find( mnLangID)); + if (it != rMap.end()) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: found impl for 0x" << ::std::hex << mnLangID); + pImpl = (*it).second; + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: new impl for 0x" << ::std::hex << mnLangID); + pImpl = std::make_shared( *this); + rMap.insert( ::std::make_pair( mnLangID, pImpl)); + // Try round-trip. 
+ if (!pImpl->mbInitializedLocale) + pImpl->convertLangToLocale(); + LanguageType nLang = MsLangId::Conversion::convertLocaleToLanguage( pImpl->maLocale); + // If round-trip is identical cross-insert to Bcp47 map. + if (nLang == pImpl->mnLangID) + { + if (!pImpl->mbInitializedBcp47) + pImpl->convertLocaleToBcp47(); + ::std::pair< MapBcp47::const_iterator, bool > res( + theMapBcp47().insert( ::std::make_pair( pImpl->maBcp47, pImpl))); + if (res.second) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: cross-inserted '" << pImpl->maBcp47 << "' for 0x" << ::std::hex << mnLangID); + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: not cross-inserted '" << pImpl->maBcp47 << "' for 0x" << ::std::hex << mnLangID << " have 0x" + << ::std::hex << (*res.first).second->mnLangID); + } + } + else + { + if (!pImpl->mbInitializedBcp47) + pImpl->convertLocaleToBcp47(); + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: not cross-inserted '" << pImpl->maBcp47 << "' for 0x" << ::std::hex << mnLangID << " round-trip to 0x" << ::std::hex << nLang); + } + } + } + else if (!maBcp47.isEmpty()) + { + MapBcp47& rMap = theMapBcp47(); + MapBcp47::const_iterator it( rMap.find( maBcp47)); + if (it != rMap.end()) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: found impl for '" << maBcp47 << "'"); + pImpl = (*it).second; + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: new impl for '" << maBcp47 << "'"); + pImpl = std::make_shared( *this); + ::std::pair< MapBcp47::iterator, bool > insOrig( rMap.insert( ::std::make_pair( maBcp47, pImpl))); + // If changed after canonicalize() also add the resulting tag to + // the map. 
+ if (pImpl->synCanonicalize()) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: canonicalized to '" << pImpl->maBcp47 << "'"); + ::std::pair< MapBcp47::const_iterator, bool > insCanon( + rMap.insert( ::std::make_pair( pImpl->maBcp47, pImpl))); + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << (insCanon.second ? "" : "not ") + << "inserted '" << pImpl->maBcp47 << "'"); + // If the canonicalized tag already existed (was not inserted) + // and impls are different, make this impl that impl and skip + // the rest if that LangID is present as well. The existing + // entry may or may not be different, it may even be strictly + // identical to this if it differs only in case (e.g. ko-kr => + // ko-KR) which was corrected in canonicalize() hence also in + // the map entry but comparison is case insensitive and found + // it again. + if (!insCanon.second && (*insCanon.first).second != pImpl) + { + (*insOrig.first).second = pImpl = (*insCanon.first).second; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: share impl with 0x" + << ::std::hex << pImpl->mnLangID); + } + } + if (!pImpl->mbInitializedLangID) + { + // Try round-trip Bcp47->Locale->LangID->Locale->Bcp47. + if (!pImpl->mbInitializedLocale) + pImpl->convertBcp47ToLocale(); + if (!pImpl->mbInitializedLangID) + pImpl->convertLocaleToLang( true); + // Unconditionally insert (round-trip is possible) for + // on-the-fly IDs and (generated or not) suggested IDs. + bool bInsert = lcl_isKnownOnTheFlyID( pImpl->mnLangID); + OUString aBcp47; + if (!bInsert) + { + if (pImpl->mnLangID != LANGUAGE_DONTKNOW) + { + // May have involved canonicalize(), so compare with + // pImpl->maBcp47 instead of maBcp47! + aBcp47 = LanguageTagImpl::convertToBcp47( + MsLangId::Conversion::convertLanguageToLocale( pImpl->mnLangID )); + bInsert = (aBcp47 == pImpl->maBcp47); + } + } + // If round-trip is identical cross-insert to Bcp47 map. 
+ if (bInsert) + { + ::std::pair< MapLangID::const_iterator, bool > res( + theMapLangID().insert( ::std::make_pair( pImpl->mnLangID, pImpl))); + if (res.second) + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: cross-inserted 0x" + << ::std::hex << pImpl->mnLangID << " for '" << maBcp47 << "'"); + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: not cross-inserted 0x" + << ::std::hex << pImpl->mnLangID << " for '" << maBcp47 << "' have '" + << (*res.first).second->maBcp47 << "'"); + } + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: not cross-inserted 0x" + << ::std::hex << pImpl->mnLangID << " for '" << maBcp47 << "' round-trip to '" + << aBcp47 << "'"); + } + } + } + } + else + { + SAL_WARN( "i18nlangtag", "LanguageTag::registerImpl: can't register for 0x" << ::std::hex << mnLangID ); + pImpl = std::make_shared( *this); + } + + // If we reach here for mbSystemLocale we didn't have theSystemLocale + // above, so add it. + if (mbSystemLocale && mbInitializedLangID) + { + theSystemLocale() = pImpl; + SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: added system locale 0x" + << ::std::hex << pImpl->mnLangID << " '" << pImpl->maBcp47 << "'"); + } + + return pImpl; +} + + +LanguageTagImpl const * LanguageTag::getImpl() const +{ + if (!mpImpl) + { + mpImpl = registerImpl(); + syncVarsFromRawImpl(); + } + return mpImpl.get(); +} + +LanguageTagImpl * LanguageTag::getImpl() +{ + if (!mpImpl) + { + mpImpl = registerImpl(); + syncVarsFromRawImpl(); + } + return mpImpl.get(); +} + +void LanguageTag::resetVars() +{ + mpImpl.reset(); + maLocale = lang::Locale(); + maBcp47.clear(); + mnLangID = LANGUAGE_SYSTEM; + mbSystemLocale = true; + mbInitializedBcp47 = false; + mbInitializedLocale = false; + mbInitializedLangID = false; + mbIsFallback = false; +} + + +LanguageTag & LanguageTag::reset( const OUString & rBcp47LanguageTag ) +{ + resetVars(); + maBcp47 = rBcp47LanguageTag; + mbSystemLocale = rBcp47LanguageTag.isEmpty(); + 
mbInitializedBcp47 = !mbSystemLocale; + + return *this; +} + + +LanguageTag & LanguageTag::reset( const css::lang::Locale & rLocale ) +{ + resetVars(); + maLocale = rLocale; + mbSystemLocale = rLocale.Language.isEmpty(); + mbInitializedLocale = !mbSystemLocale; + handleVendorVariant( maLocale); + return *this; +} + + +LanguageTag & LanguageTag::reset( LanguageType nLanguage ) +{ + resetVars(); + mnLangID = nLanguage; + mbSystemLocale = nLanguage == LANGUAGE_SYSTEM; + mbInitializedLangID = !mbSystemLocale; + return *this; +} + + +bool LanguageTagImpl::canonicalize() +{ +#ifdef erDEBUG + // dump once + struct dumper + { + lt_tag_t** mpp; + explicit dumper( lt_tag_t** pp ) : mpp( *pp ? NULL : pp) {} + ~dumper() { if (mpp && *mpp) lt_tag_dump( *mpp); } + }; + dumper aDumper( &mpImplLangtag); +#endif + + bool bChanged = false; + + // Side effect: have maBcp47 in any case, resolved system. + // Some methods calling canonicalize() (or not calling it due to + // meIsLiblangtagNeeded==DECISION_NO) rely on this! Hence do not set + // meIsLiblangtagNeeded anywhere else than hereafter. + getBcp47(); + + // The simple cases and known locales don't need liblangtag processing, + // which also avoids loading liblangtag data on startup. + if (meIsLiblangtagNeeded == DECISION_DONTKNOW) + { + bool bTemporaryLocale = false; + bool bTemporaryLangID = false; + if (!mbInitializedLocale && !mbInitializedLangID) + { + if (mbSystemLocale) + { + mnLangID = MsLangId::getRealLanguage( LANGUAGE_SYSTEM); + mbInitializedLangID = true; + } + else + { + // Now this is getting funny... we only have some BCP47 string + // and want to determine if parsing it would be possible + // without using liblangtag just to see if it is a simple known + // locale or could fall back to one. 
+ OUString aLanguage, aScript, aCountry, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); + if (eExt != EXTRACTED_NONE) + { + if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV) + { + // Rebuild bcp47 with proper casing of tags. + OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() + + 1 + aCountry.getLength() + 1 + aVariants.getLength()); + aBuf.append( aLanguage); + if (!aScript.isEmpty()) + aBuf.append("-" + aScript); + if (!aCountry.isEmpty()) + aBuf.append("-" + aCountry); + if (!aVariants.isEmpty()) + aBuf.append("-" + aVariants); + OUString aStr( aBuf.makeStringAndClear()); + + if (maBcp47 != aStr) + { + maBcp47 = aStr; + bChanged = true; + } + } + if (eExt == EXTRACTED_LSC && aScript.isEmpty()) + { + maLocale.Language = aLanguage; + maLocale.Country = aCountry; + } + else if (eExt == EXTRACTED_C_LOCALE) + { + maLocale.Language = aLanguage; + maLocale.Country = aCountry; + } + else + { + maLocale.Language = I18NLANGTAG_QLT; + maLocale.Country = aCountry; + maLocale.Variant = maBcp47; + } + bTemporaryLocale = mbInitializedLocale = true; + } + } + } + if (mbInitializedLangID && !mbInitializedLocale) + { + // Do not call getLocale() here because that prefers + // convertBcp47ToLocale() which would end up in recursion via + // isIsoLocale()! + + // Prepare to verify that we have a known locale, not just an + // arbitrary MS-LangID. + convertLangToLocale(); + } + if (mbInitializedLocale) + { + if (!mbInitializedLangID) + { + if (convertLocaleToLang( false)) + bChanged = true; + if (bTemporaryLocale || mnLangID == LANGUAGE_DONTKNOW) + bTemporaryLangID = true; + } + if (mnLangID != LANGUAGE_DONTKNOW && mnLangID != LANGUAGE_SYSTEM) + meIsLiblangtagNeeded = DECISION_NO; // known locale + else + { + const KnownTagSet& rKnowns = getKnowns(); + if (rKnowns.find( maBcp47) != rKnowns.end()) + meIsLiblangtagNeeded = DECISION_NO; // known fallback + } + // We may have an internal override "canonicalization". 
+ lang::Locale aNew( MsLangId::Conversion::getOverride( maLocale)); + if (!aNew.Language.isEmpty() && + (aNew.Language != maLocale.Language || + aNew.Country != maLocale.Country || + aNew.Variant != maLocale.Variant)) + { + maBcp47 = LanguageTagImpl::convertToBcp47( aNew); + bChanged = true; + meIsIsoLocale = DECISION_DONTKNOW; + meIsIsoODF = DECISION_DONTKNOW; + meIsLiblangtagNeeded = DECISION_NO; // known locale + } + } + if (bTemporaryLocale) + { + mbInitializedLocale = false; + maLocale = lang::Locale(); + } + if (bTemporaryLangID) + { + mbInitializedLangID = false; + mnLangID = LANGUAGE_DONTKNOW; + } + } + if (meIsLiblangtagNeeded == DECISION_NO) + { + meIsValid = DECISION_YES; // really, known must be valid ... + return bChanged; // that's it + } + + meIsLiblangtagNeeded = DECISION_YES; + SAL_INFO( "i18nlangtag", "LanguageTagImpl::canonicalize: using liblangtag for '" << maBcp47 << "'"); + + if (!mpImplLangtag) + { + theDataRef().init(); + mpImplLangtag = lt_tag_new(); + } + + myLtError aError; + + if (!lt_tag_parse_disabled && lt_tag_parse(mpImplLangtag, OUStringToOString(maBcp47, RTL_TEXTENCODING_UTF8).getStr(), &aError.p)) + { + if (aError.p) + { + SAL_WARN("i18nlangtag", "LanguageTagImpl::canonicalize: could not parse '" << maBcp47 << "'"); + } + else + { + char* pTag = lt_tag_canonicalize(mpImplLangtag, &aError.p); + SAL_WARN_IF(!pTag, "i18nlangtag", "LanguageTagImpl::canonicalize: could not canonicalize '" << maBcp47 << "'"); + if (pTag) + { + OUString aNew(OUString::createFromAscii(pTag)); + // Make the lt_tag_t follow the new string if different, which + // removes default script and such. 
+ if (maBcp47 != aNew) + { + maBcp47 = aNew; + bChanged = true; + meIsIsoLocale = DECISION_DONTKNOW; + meIsIsoODF = DECISION_DONTKNOW; + if (!lt_tag_parse(mpImplLangtag, pTag, &aError.p)) + { + SAL_WARN("i18nlangtag", "LanguageTagImpl::canonicalize: could not reparse '" + << maBcp47 << "'"); + free(pTag); + meIsValid = DECISION_NO; + return bChanged; + } + } + free(pTag); + meIsValid = DECISION_YES; + return bChanged; + } + } + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTagImpl::canonicalize: could not parse '" << maBcp47 << "'"); + } + meIsValid = DECISION_NO; + return bChanged; +} + + +bool LanguageTagImpl::synCanonicalize() +{ + bool bChanged = false; + if (meIsLiblangtagNeeded != DECISION_NO && !mpImplLangtag) + { + bChanged = canonicalize(); + if (bChanged) + { + if (mbInitializedLocale) + convertBcp47ToLocale(); + if (mbInitializedLangID) + convertBcp47ToLang(); + } + } + return bChanged; +} + + +void LanguageTag::syncFromImpl() +{ + LanguageTagImpl* pImpl = getImpl(); + bool bRegister = ((mbInitializedBcp47 && maBcp47 != pImpl->maBcp47) || + (mbInitializedLangID && mnLangID != pImpl->mnLangID)); + SAL_INFO_IF( bRegister, "i18nlangtag", + "LanguageTag::syncFromImpl: re-registering, '" << pImpl->maBcp47 << "' vs '" << maBcp47 << + " and 0x" << ::std::hex << pImpl->mnLangID << " vs 0x" << ::std::hex << mnLangID); + syncVarsFromRawImpl(); + if (bRegister) + mpImpl = registerImpl(); +} + + +void LanguageTag::syncVarsFromImpl() const +{ + if (!mpImpl) + getImpl(); // with side effect syncVarsFromRawImpl() + else + syncVarsFromRawImpl(); +} + + +void LanguageTag::syncVarsFromRawImpl() const +{ + // Do not use getImpl() here. + LanguageTagImpl* pImpl = mpImpl.get(); + if (!pImpl) + return; + + // Obviously only mutable variables. 
+ mbInitializedBcp47 = pImpl->mbInitializedBcp47; + maBcp47 = pImpl->maBcp47; + mbInitializedLocale = pImpl->mbInitializedLocale; + maLocale = pImpl->maLocale; + mbInitializedLangID = pImpl->mbInitializedLangID; + mnLangID = pImpl->mnLangID; +} + + +bool LanguageTag::synCanonicalize() +{ + bool bChanged = getImpl()->synCanonicalize(); + if (bChanged) + syncFromImpl(); + return bChanged; +} + + +void LanguageTagImpl::convertLocaleToBcp47() +{ + if (mbSystemLocale && !mbInitializedLocale) + convertLangToLocale(); + + if (maLocale.Language.isEmpty()) + { + // Do not call LanguageTag::convertToBcp47(Locale) that for an empty + // locale via LanguageTag::convertToBcp47(LanguageType) and + // LanguageTag::convertToLocale(LanguageType) would instantiate another + // LanguageTag. + maLocale = MsLangId::Conversion::convertLanguageToLocale( LANGUAGE_SYSTEM ); + } + if (maLocale.Language.isEmpty()) + { + maBcp47.clear(); // bad luck + } + else if (maLocale.Language == I18NLANGTAG_QLT) + { + maBcp47 = maLocale.Variant; + meIsIsoLocale = DECISION_NO; + } + else + { + maBcp47 = LanguageTag::convertToBcp47( maLocale ); + } + mbInitializedBcp47 = true; +} + + +bool LanguageTagImpl::convertLocaleToLang( bool bAllowOnTheFlyID ) +{ + bool bRemapped = false; + if (mbSystemLocale) + { + mnLangID = MsLangId::getRealLanguage( LANGUAGE_SYSTEM); + } + else + { + mnLangID = MsLangId::Conversion::convertLocaleToLanguage( maLocale); + if (mnLangID == LANGUAGE_DONTKNOW) + { + // convertLocaleToLanguage() only searches in ISO and private + // definitions, search in remaining definitions, i.e. for the "C" + // locale and non-standard things like "sr-latin" or "german" to + // resolve to a known locale, skipping ISO lll-CC that were already + // searched. 
+ mnLangID = MsLangId::Conversion::convertIsoNamesToLanguage( maLocale.Language, maLocale.Country, true); + if (mnLangID != LANGUAGE_DONTKNOW) + { + // If one found, convert back and adapt Locale and Bcp47 + // strings so we have a matching entry. + OUString aOrgBcp47( maBcp47); + convertLangToLocale(); + convertLocaleToBcp47(); + bRemapped = (maBcp47 != aOrgBcp47); + } + } + if (mnLangID == LANGUAGE_DONTKNOW && bAllowOnTheFlyID) + { + if (isValidBcp47()) + { + // For language-only (including script) look if we know some + // locale of that language and if so try to use the primary + // language ID of that instead of generating an on-the-fly ID. + if (getCountry().isEmpty() && isIsoODF()) + { + lang::Locale aLoc( MsLangId::Conversion::lookupFallbackLocale( maLocale)); + // 'en-US' is last resort, do not use except when looking + // for 'en'. + if (aLoc.Language != "en" || getLanguage() == "en") + { + mnLangID = MsLangId::Conversion::convertLocaleToLanguage( aLoc); + if (mnLangID != LANGUAGE_DONTKNOW) + mnLangID = MsLangId::getPrimaryLanguage( mnLangID); + } + } + registerOnTheFly( mnLangID); + } + else + { + SAL_WARN( "i18nlangtag", "LanguageTagImpl::convertLocaleToLang: with bAllowOnTheFlyID invalid '" + << maBcp47 << "'"); + } + } + } + mbInitializedLangID = true; + return bRemapped; +} + + +void LanguageTag::convertLocaleToLang() +{ + getImpl()->convertLocaleToLang( true); + syncFromImpl(); +} + + +void LanguageTagImpl::convertBcp47ToLocale() +{ + bool bIso = isIsoLocale(); + if (bIso) + { + maLocale.Language = getLanguageFromLangtag(); + maLocale.Country = getRegionFromLangtag(); + maLocale.Variant.clear(); + } + else + { + maLocale.Language = I18NLANGTAG_QLT; + maLocale.Country = getCountry(); + maLocale.Variant = maBcp47; + } + mbInitializedLocale = true; +} + + +void LanguageTag::convertBcp47ToLocale() +{ + getImpl()->convertBcp47ToLocale(); + syncFromImpl(); +} + + +void LanguageTagImpl::convertBcp47ToLang() +{ + if (mbSystemLocale) + { + mnLangID = 
MsLangId::getRealLanguage( LANGUAGE_SYSTEM); + } + else + { + if (!mbInitializedLocale) + convertBcp47ToLocale(); + convertLocaleToLang( true); + } + mbInitializedLangID = true; +} + + +void LanguageTag::convertBcp47ToLang() +{ + getImpl()->convertBcp47ToLang(); + syncFromImpl(); +} + + +void LanguageTagImpl::convertLangToLocale() +{ + if (mbSystemLocale && !mbInitializedLangID) + { + mnLangID = MsLangId::getRealLanguage( LANGUAGE_SYSTEM); + mbInitializedLangID = true; + } + // Resolve system here! The original is remembered as mbSystemLocale. + maLocale = MsLangId::Conversion::convertLanguageToLocale( mnLangID ); + mbInitializedLocale = true; +} + + +void LanguageTag::convertLangToLocale() +{ + getImpl()->convertLangToLocale(); + syncFromImpl(); +} + + +void LanguageTagImpl::convertLangToBcp47() +{ + if (!mbInitializedLocale) + convertLangToLocale(); + convertLocaleToBcp47(); + mbInitializedBcp47 = true; +} + + +void LanguageTag::convertFromRtlLocale() +{ + // The rtl_Locale follows the Open Group Base Specification, + // 8.2 Internationalization Variables + // language[_territory][.codeset][@modifier] + // On GNU/Linux systems usually being glibc locales. + // sal/osl/unx/nlsupport.c _parse_locale() parses them into + // Language: language 2 or 3 alpha code + // Country: [territory] 2 alpha code + // Variant: [.codeset][@modifier] + // Variant effectively contains anything that follows the territory, not + // looking for '.' dot delimiter or '@' modifier content. + if (maLocale.Variant.isEmpty()) + return; + + OString aStr = OUStringToOString(maLocale.Language, RTL_TEXTENCODING_UTF8) + "_" + OUStringToOString(OUStringConcatenation(maLocale.Country + maLocale.Variant), + RTL_TEXTENCODING_UTF8); + /* FIXME: let liblangtag parse this entirely with + * lt_tag_convert_from_locale() but that needs a patch to pass the + * string. 
*/ +#if 0 + myLtError aError; + theDataRef::get().init(); + mpImplLangtag = lt_tag_convert_from_locale( aStr.getStr(), &aError.p); + maBcp47 = OStringToOUString( lt_tag_get_string( mpImplLangtag), RTL_TEXTENCODING_UTF8); + mbInitializedBcp47 = true; +#else + mnLangID = MsLangId::convertUnxByteStringToLanguage( aStr); + if (mnLangID == LANGUAGE_DONTKNOW) + { + SAL_WARN( "i18nlangtag", "LanguageTag(rtl_Locale) - unknown: " << aStr); + mnLangID = LANGUAGE_ENGLISH_US; // we need _something_ here + } + mbInitializedLangID = true; +#endif + maLocale = lang::Locale(); + mbInitializedLocale = false; +} + + +const OUString & LanguageTagImpl::getBcp47() const +{ + if (!mbInitializedBcp47) + { + if (mbInitializedLocale) + const_cast(this)->convertLocaleToBcp47(); + else + const_cast(this)->convertLangToBcp47(); + } + return maBcp47; +} + + +const OUString & LanguageTag::getBcp47( bool bResolveSystem ) const +{ + static const OUString theEmptyBcp47 = u""; + + if (!bResolveSystem && mbSystemLocale) + return theEmptyBcp47; + if (!mbInitializedBcp47) + syncVarsFromImpl(); + if (!mbInitializedBcp47) + { + getImpl()->getBcp47(); + const_cast(this)->syncFromImpl(); + } + return maBcp47; +} + + +OUString LanguageTagImpl::getLanguageFromLangtag() +{ + OUString aLanguage; + synCanonicalize(); + if (maBcp47.isEmpty()) + return aLanguage; + if (mpImplLangtag) + { + const lt_lang_t* pLangT = lt_tag_get_language( mpImplLangtag); + SAL_WARN_IF( !pLangT, "i18nlangtag", + "LanguageTag::getLanguageFromLangtag: pLangT==NULL for '" << maBcp47 << "'"); + if (!pLangT) + return aLanguage; + const char* pLang = lt_lang_get_tag( pLangT); + SAL_WARN_IF( !pLang, "i18nlangtag", + "LanguageTag::getLanguageFromLangtag: pLang==NULL for '" << maBcp47 << "'"); + if (pLang) + aLanguage = OUString::createFromAscii( pLang); + } + else + { + if (mbCachedLanguage || cacheSimpleLSCV()) + aLanguage = maCachedLanguage; + } + return aLanguage; +} + + +OUString LanguageTagImpl::getScriptFromLangtag() +{ + OUString 
aScript; + synCanonicalize(); + if (maBcp47.isEmpty()) + return aScript; + if (mpImplLangtag) + { + const lt_script_t* pScriptT = lt_tag_get_script( mpImplLangtag); + // pScriptT==NULL is valid for default scripts + if (!pScriptT) + return aScript; + const char* pScript = lt_script_get_tag( pScriptT); + SAL_WARN_IF( !pScript, "i18nlangtag", "LanguageTag::getScriptFromLangtag: pScript==NULL"); + if (pScript) + aScript = OUString::createFromAscii( pScript); + } + else + { + if (mbCachedScript || cacheSimpleLSCV()) + aScript = maCachedScript; + } + return aScript; +} + + +OUString LanguageTagImpl::getRegionFromLangtag() +{ + OUString aRegion; + synCanonicalize(); + if (maBcp47.isEmpty()) + return aRegion; + if (mpImplLangtag) + { + const lt_region_t* pRegionT = lt_tag_get_region( mpImplLangtag); + // pRegionT==NULL is valid for language only tags, rough check here + // that does not take sophisticated tags into account that actually + // should have a region, check for ll, lll, ll-Ssss and lll-Ssss so + // that ll-CC and lll-CC actually fail. 
+ SAL_WARN_IF( !pRegionT && + maBcp47.getLength() != 2 && maBcp47.getLength() != 3 && + maBcp47.getLength() != 7 && maBcp47.getLength() != 8, + "i18nlangtag", "LanguageTag::getRegionFromLangtag: pRegionT==NULL for '" << maBcp47 << "'"); + if (!pRegionT) + return aRegion; + const char* pRegion = lt_region_get_tag( pRegionT); + SAL_WARN_IF( !pRegion, "i18nlangtag", + "LanguageTag::getRegionFromLangtag: pRegion==NULL for'" << maBcp47 << "'"); + if (pRegion) + aRegion = OUString::createFromAscii( pRegion); + } + else + { + if (mbCachedCountry || cacheSimpleLSCV()) + aRegion = maCachedCountry; + } + return aRegion; +} + + +OUString LanguageTagImpl::getVariantsFromLangtag() +{ + OUStringBuffer aVariants; + synCanonicalize(); + if (maBcp47.isEmpty()) + return OUString(); + if (mpImplLangtag) + { + const lt_list_t* pVariantsT = lt_tag_get_variants( mpImplLangtag); + for (const lt_list_t* pE = pVariantsT; pE; pE = lt_list_next( pE)) + { + const lt_variant_t* pVariantT = static_cast(lt_list_value( pE)); + if (pVariantT) + { + const char* p = lt_variant_get_tag( pVariantT); + if (p) + { + if (!aVariants.isEmpty()) + aVariants.append("-"); + aVariants.appendAscii(p); + } + } + } + } + else + { + if (mbCachedVariants || cacheSimpleLSCV()) + aVariants = maCachedVariants; + } + return aVariants.makeStringAndClear(); +} + + +const css::lang::Locale & LanguageTag::getLocale( bool bResolveSystem ) const +{ + // "static" to be returned as const reference to an empty locale. 
+ static lang::Locale theEmptyLocale; + + if (!bResolveSystem && mbSystemLocale) + return theEmptyLocale; + if (!mbInitializedLocale) + syncVarsFromImpl(); + if (!mbInitializedLocale) + { + if (mbInitializedBcp47) + const_cast(this)->convertBcp47ToLocale(); + else + const_cast(this)->convertLangToLocale(); + } + return maLocale; +} + + +LanguageType LanguageTag::getLanguageType( bool bResolveSystem ) const +{ + if (!bResolveSystem && mbSystemLocale) + return LANGUAGE_SYSTEM; + if (!mbInitializedLangID) + syncVarsFromImpl(); + if (!mbInitializedLangID) + { + if (mbInitializedBcp47) + const_cast(this)->convertBcp47ToLang(); + else + { + const_cast(this)->convertLocaleToLang(); + + /* Resolve a locale only unknown due to some redundant information, + * like 'de-Latn-DE' with script tag. Never call canonicalize() + * from within convert...() methods due to possible recursion, so + * do it here. */ + if ((!mbSystemLocale && mnLangID == LANGUAGE_SYSTEM) || mnLangID == LANGUAGE_DONTKNOW) + const_cast(this)->synCanonicalize(); + } + } + return mnLangID; +} + + +void LanguageTag::getIsoLanguageScriptCountry( OUString& rLanguage, OUString& rScript, OUString& rCountry ) const +{ + // Calling isIsoODF() first is a predicate for getLanguage(), getScript() + // and getCountry() to work correctly in this context. + if (isIsoODF()) + { + rLanguage = getLanguage(); + rScript = getScript(); + rCountry = getCountry(); + } + else + { + rLanguage = (LanguageTag::isIsoLanguage( getLanguage()) ? getLanguage() : OUString()); + rScript = (LanguageTag::isIsoScript( getScript()) ? getScript() : OUString()); + rCountry = (LanguageTag::isIsoCountry( getCountry()) ? getCountry() : OUString()); + } +} + + +namespace +{ + +bool isLowerAscii( sal_Unicode c ) +{ + return 'a' <= c && c <= 'z'; +} + +bool isUpperAscii( sal_Unicode c ) +{ + return 'A' <= c && c <= 'Z'; +} + +} + + +// static +bool LanguageTag::isIsoLanguage( const OUString& rLanguage ) +{ + /* TODO: ignore case? 
For now let's see where rubbish is used. */ + bool b2chars = rLanguage.getLength() == 2; + if ((b2chars || rLanguage.getLength() == 3) && + isLowerAscii( rLanguage[0]) && isLowerAscii( rLanguage[1]) && + (b2chars || isLowerAscii( rLanguage[2]))) + return true; + SAL_WARN_IF( ((rLanguage.getLength() == 2 || rLanguage.getLength() == 3) && + (isUpperAscii( rLanguage[0]) || isUpperAscii( rLanguage[1]))) || + (rLanguage.getLength() == 3 && isUpperAscii( rLanguage[2])), "i18nlangtag", + "LanguageTag::isIsoLanguage: rejecting upper case " << rLanguage); + return false; +} + + +// static +bool LanguageTag::isIsoCountry( const OUString& rRegion ) +{ + /* TODO: ignore case? For now let's see where rubbish is used. */ + if (rRegion.isEmpty() || + (rRegion.getLength() == 2 && isUpperAscii( rRegion[0]) && isUpperAscii( rRegion[1]))) + return true; + SAL_WARN_IF( rRegion.getLength() == 2 && (isLowerAscii( rRegion[0]) || isLowerAscii( rRegion[1])), + "i18nlangtag", "LanguageTag::isIsoCountry: rejecting lower case " << rRegion); + return false; +} + + +// static +bool LanguageTag::isIsoScript( const OUString& rScript ) +{ + /* TODO: ignore case? For now let's see where rubbish is used. 
*/ + if (rScript.isEmpty() || + (rScript.getLength() == 4 && + isUpperAscii( rScript[0]) && isLowerAscii( rScript[1]) && + isLowerAscii( rScript[2]) && isLowerAscii( rScript[3]))) + return true; + SAL_WARN_IF( rScript.getLength() == 4 && + (isLowerAscii( rScript[0]) || isUpperAscii( rScript[1]) || + isUpperAscii( rScript[2]) || isUpperAscii( rScript[3])), + "i18nlangtag", "LanguageTag::isIsoScript: rejecting case mismatch " << rScript); + return false; +} + + +OUString const & LanguageTagImpl::getLanguage() const +{ + if (!mbCachedLanguage) + { + maCachedLanguage = const_cast(this)->getLanguageFromLangtag(); + mbCachedLanguage = true; + } + return maCachedLanguage; +} + + +OUString LanguageTag::getLanguage() const +{ + LanguageTagImpl const* pImpl = getImpl(); + if (pImpl->mbCachedLanguage) + return pImpl->maCachedLanguage; + OUString aRet( pImpl->getLanguage()); + const_cast(this)->syncFromImpl(); + return aRet; +} + + +OUString const & LanguageTagImpl::getScript() const +{ + if (!mbCachedScript) + { + maCachedScript = const_cast(this)->getScriptFromLangtag(); + mbCachedScript = true; + } + return maCachedScript; +} + + +OUString LanguageTag::getScript() const +{ + LanguageTagImpl const* pImpl = getImpl(); + if (pImpl->mbCachedScript) + return pImpl->maCachedScript; + OUString aRet( pImpl->getScript()); + const_cast(this)->syncFromImpl(); + return aRet; +} + + +OUString LanguageTag::getLanguageAndScript() const +{ + OUString aLanguageScript( getLanguage()); + OUString aScript( getScript()); + if (!aScript.isEmpty()) + { + aLanguageScript += "-" + aScript; + } + return aLanguageScript; +} + + +OUString const & LanguageTagImpl::getCountry() const +{ + if (!mbCachedCountry) + { + maCachedCountry = const_cast(this)->getRegionFromLangtag(); + if (!LanguageTag::isIsoCountry( maCachedCountry)) + maCachedCountry.clear(); + mbCachedCountry = true; + } + return maCachedCountry; +} + + +OUString LanguageTag::getCountry() const +{ + LanguageTagImpl const* pImpl = getImpl(); + 
if (pImpl->mbCachedCountry) + return pImpl->maCachedCountry; + OUString aRet( pImpl->getCountry()); + const_cast(this)->syncFromImpl(); + return aRet; +} + + +OUString LanguageTagImpl::getRegion() const +{ + return const_cast(this)->getRegionFromLangtag(); +} + + +OUString const & LanguageTagImpl::getVariants() const +{ + if (!mbCachedVariants) + { + maCachedVariants = const_cast(this)->getVariantsFromLangtag(); + mbCachedVariants = true; + } + return maCachedVariants; +} + + +OUString LanguageTag::getVariants() const +{ + LanguageTagImpl const * pImpl = getImpl(); + if (pImpl->mbCachedVariants) + return pImpl->maCachedVariants; + OUString aRet( pImpl->getVariants()); + const_cast(this)->syncFromImpl(); + return aRet; +} + +OUString const & LanguageTagImpl::getGlibcLocaleString() const +{ + if (mbCachedGlibcString) + return maCachedGlibcString; + + if (!mpImplLangtag) + { + meIsLiblangtagNeeded = DECISION_YES; + const_cast(this)->synCanonicalize(); + } + if (mpImplLangtag) + { + char* pLang = lt_tag_convert_to_locale(mpImplLangtag, nullptr); + if (pLang) + { + maCachedGlibcString = OUString::createFromAscii( pLang); + mbCachedGlibcString = true; + free(pLang); + } + } + return maCachedGlibcString; +} + +OUString LanguageTag::getGlibcLocaleString( std::u16string_view rEncoding ) const +{ + OUString aRet; + if (isIsoLocale()) + { + OUString aCountry( getCountry()); + if (aCountry.isEmpty()) + aRet = getLanguage() + rEncoding; + else + aRet = getLanguage() + "_" + aCountry + rEncoding; + } + else + { + aRet = getImpl()->getGlibcLocaleString(); + sal_Int32 nAt = aRet.indexOf('@'); + if (nAt != -1) + aRet = OUString::Concat(aRet.subView(0, nAt)) + rEncoding + aRet.subView(nAt); + else + aRet += rEncoding; + } + return aRet; +} + +bool LanguageTagImpl::hasScript() const +{ + if (!mbCachedScript) + getScript(); + return !maCachedScript.isEmpty(); +} + + +bool LanguageTag::hasScript() const +{ + bool bRet = getImpl()->hasScript(); + const_cast(this)->syncFromImpl(); + 
return bRet; +} + + +LanguageTag::ScriptType LanguageTagImpl::getScriptType() const +{ + return meScriptType; +} + + +LanguageTag::ScriptType LanguageTag::getScriptType() const +{ + return getImpl()->getScriptType(); +} + + +void LanguageTagImpl::setScriptType(LanguageTag::ScriptType st) +{ + if (meScriptType == LanguageTag::ScriptType::UNKNOWN) // poor man's clash resolution + meScriptType = st; +} + + +void LanguageTag::setScriptType(LanguageTag::ScriptType st) +{ + getImpl()->setScriptType(st); +} + + +bool LanguageTagImpl::cacheSimpleLSCV() +{ + OUString aLanguage, aScript, aCountry, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); + bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV); + if (bRet) + { + maCachedLanguage = aLanguage; + maCachedScript = aScript; + maCachedCountry = aCountry; + maCachedVariants = aVariants; + mbCachedLanguage = mbCachedScript = mbCachedCountry = mbCachedVariants = true; + } + return bRet; +} + + +bool LanguageTagImpl::isIsoLocale() const +{ + if (meIsIsoLocale == DECISION_DONTKNOW) + { + const_cast(this)->synCanonicalize(); + // It must be at most ll-CC or lll-CC + // Do not use getCountry() here, use getRegion() instead. + meIsIsoLocale = ((maBcp47.isEmpty() || + (maBcp47.getLength() <= 6 && LanguageTag::isIsoLanguage( getLanguage()) && + LanguageTag::isIsoCountry( getRegion()))) ? DECISION_YES : DECISION_NO); + } + return meIsIsoLocale == DECISION_YES; +} + + +bool LanguageTag::isIsoLocale() const +{ + bool bRet = getImpl()->isIsoLocale(); + const_cast(this)->syncFromImpl(); + return bRet; +} + + +bool LanguageTagImpl::isIsoODF() const +{ + if (meIsIsoODF == DECISION_DONTKNOW) + { + const_cast(this)->synCanonicalize(); + if (!LanguageTag::isIsoScript( getScript())) + { + meIsIsoODF = DECISION_NO; + return false; + } + // The usual case is lll-CC so simply check that first. 
+ if (isIsoLocale()) + { + meIsIsoODF = DECISION_YES; + return true; + } + // If this is not ISO locale for which script must not exist it can + // still be ISO locale plus ISO script lll-Ssss-CC, but not ll-vvvv ... + // ll-vvvvvvvv + meIsIsoODF = ((maBcp47.getLength() <= 11 && LanguageTag::isIsoLanguage( getLanguage()) && + LanguageTag::isIsoCountry( getRegion()) && LanguageTag::isIsoScript( getScript()) && + getVariants().isEmpty()) ? DECISION_YES : DECISION_NO); + } + return meIsIsoODF == DECISION_YES; +} + + +bool LanguageTag::isIsoODF() const +{ + bool bRet = getImpl()->isIsoODF(); + const_cast(this)->syncFromImpl(); + return bRet; +} + + +bool LanguageTagImpl::isValidBcp47() const +{ + if (meIsValid == DECISION_DONTKNOW) + { + const_cast(this)->synCanonicalize(); + SAL_WARN_IF( meIsValid == DECISION_DONTKNOW, "i18nlangtag", + "LanguageTag::isValidBcp47: canonicalize() didn't set meIsValid"); + } + return meIsValid == DECISION_YES; +} + + +bool LanguageTag::isValidBcp47() const +{ + bool bRet = getImpl()->isValidBcp47(); + const_cast(this)->syncFromImpl(); + return bRet; +} + + +LanguageTag & LanguageTag::makeFallback() +{ + if (!mbIsFallback) + { + const lang::Locale& rLocale1 = getLocale(); + lang::Locale aLocale2( MsLangId::Conversion::lookupFallbackLocale( rLocale1)); + if ( rLocale1.Language != aLocale2.Language || + rLocale1.Country != aLocale2.Country || + rLocale1.Variant != aLocale2.Variant) + { + if (rLocale1.Language != "en" && aLocale2.Language == "en" && aLocale2.Country == "US") + { + // "en-US" is the last resort fallback, try if we get a better + // one for the fallback hierarchy of a non-"en" locale. 
+ ::std::vector< OUString > aFallbacks( getFallbackStrings( false)); + for (auto const& fallback : aFallbacks) + { + lang::Locale aLocale3( LanguageTag(fallback).getLocale()); + aLocale2 = MsLangId::Conversion::lookupFallbackLocale( aLocale3); + if (aLocale2.Language != "en" || aLocale2.Country != "US") + break; // for, success + } + } + SAL_INFO( "i18nlangtag", "LanguageTag::makeFallback - for (" << + rLocale1.Language << "," << rLocale1.Country << "," << rLocale1.Variant << ") to (" << + aLocale2.Language << "," << aLocale2.Country << "," << aLocale2.Variant << ")"); + reset( aLocale2); + } + mbIsFallback = true; + } + return *this; +}


/* TODO: maybe this now could take advantage of the mnOverride field in
 * isolang.cxx entries and search for kSAME instead of hardcoded special
 * fallbacks. Though iterating through those tables would be slower and even
 * then there would be some special cases, but we wouldn't lack entries that
 * were missed out. */

/* Build the ordered list of fallback language tag strings for this tag.
 *
 * @param bIncludeFullBcp47
 *        If true, the tag itself is the first element of the result;
 *        if false, only the fallbacks are listed.
 * @return Vector of tag strings in the order they were appended.
 *
 * Two paths:
 *  - isIsoLocale(): only language and country are used, with hard-coded
 *    legacy aliases (zh, sh/sr, ca-XV, ku/kmr/ckb, pli/pi). Returns early.
 *  - otherwise: combinations of language, script, country and variants are
 *    composed from most to fewest components; a combination equal to
 *    maBcp47 is skipped so the full tag is never listed twice.
 */
::std::vector< OUString > LanguageTag::getFallbackStrings( bool bIncludeFullBcp47 ) const
{
    ::std::vector< OUString > aVec;
    OUString aLanguage( getLanguage());
    OUString aCountry( getCountry());
    if (isIsoLocale())
    {
        if (!aCountry.isEmpty())
        {
            if (bIncludeFullBcp47)
                aVec.emplace_back(aLanguage + "-" + aCountry);
            if (aLanguage == "zh")
            {
                // For zh-HK or zh-MO also list zh-TW, for all other zh-XX also
                // list zh-CN.
                if (aCountry == "HK" || aCountry == "MO")
                    aVec.emplace_back(aLanguage + "-TW");
                else if (aCountry != "CN")
                    aVec.emplace_back(aLanguage + "-CN");
                aVec.push_back( aLanguage);
            }
            else if (aLanguage == "sh")
            {
                // Manual list instead of calling
                // LanguageTag( "sr-Latn-" + aCountry).getFallbackStrings( true)
                // that would also include "sh-*" again.
                aVec.emplace_back("sr-Latn-" + aCountry);
                aVec.emplace_back("sr-Latn");
                aVec.emplace_back("sh");    // legacy with script, before default script with country
                aVec.emplace_back("sr-" + aCountry);
                aVec.emplace_back("sr");
            }
            else if (aLanguage == "ca" && aCountry == "XV")
            {
                ::std::vector< OUString > aRep( LanguageTag( "ca-ES-valencia").getFallbackStrings( true));
                aVec.insert( aVec.end(), aRep.begin(), aRep.end());
                // Already includes 'ca' language fallback.
            }
            else if (aLanguage == "ku")
            {
                // NOTE(review): for 'ku' with any country other than
                // TR/SY/IQ/IR nothing is appended here, not even the
                // language-only 'ku' - confirm this is intended.
                if (aCountry == "TR" || aCountry == "SY")
                {
                    aVec.emplace_back("kmr-Latn-" + aCountry);
                    aVec.emplace_back("kmr-" + aCountry);
                    aVec.emplace_back("kmr-Latn");
                    aVec.emplace_back("kmr");
                    aVec.push_back( aLanguage);
                }
                else if (aCountry == "IQ" || aCountry == "IR")
                {
                    aVec.emplace_back("ckb-" + aCountry);
                    aVec.emplace_back("ckb");
                }
            }
            else if (aLanguage == "kmr" && (aCountry == "TR" || aCountry == "SY"))
            {
                aVec.emplace_back("ku-Latn-" + aCountry);
                aVec.emplace_back("ku-" + aCountry);
                aVec.push_back( aLanguage);
                aVec.emplace_back("ku");
            }
            else if (aLanguage == "ckb" && (aCountry == "IQ" || aCountry == "IR"))
            {
                aVec.emplace_back("ku-Arab-" + aCountry);
                aVec.emplace_back("ku-" + aCountry);
                aVec.push_back( aLanguage);
                // not 'ku' only, that was used for Latin script
            }
            else
                aVec.push_back( aLanguage);
        }
        else
        {
            if (bIncludeFullBcp47)
                aVec.push_back( aLanguage);
            if (aLanguage == "sh")
            {
                aVec.emplace_back("sr-Latn");
                aVec.emplace_back("sr");
            }
            else if (aLanguage == "pli")
            {
                // a special case for Pali dictionary, see fdo#41599
                aVec.emplace_back("pi-Latn");
                aVec.emplace_back("pi");
            }
        }
        return aVec;
    }

    getBcp47();     // have maBcp47 now
    if (bIncludeFullBcp47)
        aVec.push_back( maBcp47);

    // Special cases for deprecated tags and their replacements, include both
    // in fallbacks in a sensible order.
    /* TODO: could such things be generalized and automated with liblangtag? */
    if (maBcp47 == "en-GB-oed")
        aVec.emplace_back("en-GB-oxendict");
    else if (maBcp47 == "en-GB-oxendict")
        aVec.emplace_back("en-GB-oed");

    OUString aVariants( getVariants());
    // A "language-variants" fallback is only emitted if the variant is not
    // from a grandfathered tag that wouldn't match the rules, i.e. "de-1901"
    // is fine but "en-oed" is not: at least 5 characters, or exactly 4
    // characters starting with a digit. Evaluated once; the index access is
    // only reached when the length is 4.
    const bool bVariantFitsRules = aVariants.getLength() >= 5 ||
        (aVariants.getLength() == 4 && '0' <= aVariants[0] && aVariants[0] <= '9');
    OUString aTmp;
    if (hasScript())
    {
        OUString aScript = getScript();
        bool bHaveLanguageScriptVariant = false;
        if (!aCountry.isEmpty())
        {
            if (!aVariants.isEmpty())
            {
                aTmp = aLanguage + "-" + aScript + "-" + aCountry + "-" + aVariants;
                if (aTmp != maBcp47)
                    aVec.push_back( aTmp);
                // Language with variant but without country before language
                // without variant but with country.
                aTmp = aLanguage + "-" + aScript + "-" + aVariants;
                if (aTmp != maBcp47)
                    aVec.push_back( aTmp);
                bHaveLanguageScriptVariant = true;
            }
            aTmp = aLanguage + "-" + aScript + "-" + aCountry;
            if (aTmp != maBcp47)
                aVec.push_back( aTmp);
            if (aLanguage == "sr" && aScript == "Latn")
            {
                // sr-Latn-CS => sr-Latn-YU, sh-CS, sh-YU
                if (aCountry == "CS")
                {
                    aVec.emplace_back("sr-Latn-YU");
                    aVec.emplace_back("sh-CS");
                    aVec.emplace_back("sh-YU");
                }
                else
                    aVec.emplace_back("sh-" + aCountry);
            }
            else if (aLanguage == "pi" && aScript == "Latn")
                aVec.emplace_back("pli");   // a special case for Pali dictionary, see fdo#41599
            // Was "krm", which never matched: the ISO path above maps
            // ku-TR/ku-SY to and from "kmr", so the reverse legacy fallback
            // here has to test the same code.
            else if (aLanguage == "kmr" && aScript == "Latn" && (aCountry == "TR" || aCountry == "SY"))
                aVec.emplace_back("ku-" + aCountry);
        }
        if (!aVariants.isEmpty() && !bHaveLanguageScriptVariant)
        {
            aTmp = aLanguage + "-" + aScript + "-" + aVariants;
            if (aTmp != maBcp47)
                aVec.push_back( aTmp);
        }
        aTmp = aLanguage + "-" + aScript;
        if (aTmp != maBcp47)
            aVec.push_back( aTmp);

        // 'sh' actually denoted a script, so have it here instead of appended
        // at the end as language-only.
        if (aLanguage == "sr" && aScript == "Latn")
            aVec.emplace_back("sh");
        else if (aLanguage == "ku" && aScript == "Arab")
            aVec.emplace_back("ckb");
        // 'ku' only denoted Latin script
        // (was "krm", same typo as above; "kmr" is what the ISO path uses)
        else if (aLanguage == "kmr" && aScript == "Latn" && aCountry.isEmpty())
            aVec.emplace_back("ku");
    }
    bool bHaveLanguageVariant = false;
    if (!aCountry.isEmpty())
    {
        if (!aVariants.isEmpty())
        {
            aTmp = aLanguage + "-" + aCountry + "-" + aVariants;
            if (aTmp != maBcp47)
                aVec.push_back( aTmp);
            if (maBcp47 == "ca-ES-valencia")
                aVec.emplace_back("ca-XV");
            // Language with variant but without country before language
            // without variant but with country.
            // But only if variant is not from a grandfathered tag that
            // wouldn't match the rules, see bVariantFitsRules.
            if (bVariantFitsRules)
            {
                aTmp = aLanguage + "-" + aVariants;
                if (aTmp != maBcp47)
                    aVec.push_back( aTmp);
                bHaveLanguageVariant = true;
            }
        }
        aTmp = aLanguage + "-" + aCountry;
        if (aTmp != maBcp47)
            aVec.push_back( aTmp);
    }
    if (!aVariants.isEmpty() && !bHaveLanguageVariant)
    {
        // Only if variant is not from a grandfathered tag that wouldn't match
        // the rules, see bVariantFitsRules.
        if (bVariantFitsRules)
        {
            aTmp = aLanguage + "-" + aVariants;
            if (aTmp != maBcp47)
                aVec.push_back( aTmp);
        }
    }

    // Insert legacy fallbacks with country before language-only, but only
    // default script, script was handled already above.
    if (!aCountry.isEmpty())
    {
        if (aLanguage == "sr" && aCountry == "CS")
            aVec.emplace_back("sr-YU");
    }

    // Original language-only.
    if (aLanguage != maBcp47)
        aVec.push_back( aLanguage);

    return aVec;
}


/* Return the tag string in the form used for MS formats: the literal
 * "es-ES_tradnl" if getLanguageType() yields LANGUAGE_SPANISH_DATED,
 * otherwise whatever getBcp47() returns. */
OUString LanguageTag::getBcp47MS() const
{
    if (getLanguageType() == LANGUAGE_SPANISH_DATED)
        return "es-ES_tradnl";
    return getBcp47();
}


/* Compare two tags. If both or neither are the SYSTEM locale this defers to
 * operator==(); if exactly one is SYSTEM the getBcp47() strings are compared
 * (default argument, presumably resolving SYSTEM - declared outside this
 * chunk). */
bool LanguageTag::equals( const LanguageTag & rLanguageTag ) const
{
    // If SYSTEM is not to be resolved or either both are SYSTEM or none, we
    // can use the operator==() optimization.
    if (isSystemLocale() == rLanguageTag.isSystemLocale())
        return operator==( rLanguageTag);

    // Compare full language tag strings.
    return getBcp47() == rLanguageTag.getBcp47();
}


/* Equality without resolving SYSTEM: two SYSTEM tags are equal; otherwise
 * the language IDs are compared if both sides already have one, else the
 * unresolved BCP 47 strings are compared. */
bool LanguageTag::operator==( const LanguageTag & rLanguageTag ) const
{
    if (isSystemLocale() && rLanguageTag.isSystemLocale())
        return true;    // both SYSTEM

    // No need to convert to BCP47 if both Lang-IDs are available.
    if (mbInitializedLangID && rLanguageTag.mbInitializedLangID)
    {
        // Equal if same ID and no SYSTEM is involved or both are SYSTEM.
        return mnLangID == rLanguageTag.mnLangID && isSystemLocale() == rLanguageTag.isSystemLocale();
    }

    // Compare full language tag strings but SYSTEM unresolved.
    return getBcp47( false) == rLanguageTag.getBcp47( false);
}


/* Negation of operator==(). */
bool LanguageTag::operator!=( const LanguageTag & rLanguageTag ) const
{
    return !operator==( rLanguageTag);
}


/* Ordering by the unresolved BCP 47 strings, ignoring ASCII case. */
bool LanguageTag::operator<( const LanguageTag & rLanguageTag ) const
{
    return getBcp47( false).compareToIgnoreAsciiCase( rLanguageTag.getBcp47( false)) < 0;
}


+// static +LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp47, + OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants ) +{ + Extraction eRet = EXTRACTED_NONE; + const sal_Int32 nLen = rBcp47.getLength(); + const sal_Int32 nHyph1 = rBcp47.indexOf( '-'); + sal_Int32 nHyph2 = (nHyph1 < 0 ? -1 : rBcp47.indexOf( '-', nHyph1 + 1)); + sal_Int32 nHyph3 = (nHyph2 < 0 ? -1 : rBcp47.indexOf( '-', nHyph2 + 1)); + sal_Int32 nHyph4 = (nHyph3 < 0 ?
-1 : rBcp47.indexOf( '-', nHyph3 + 1)); + if (nLen == 1 && rBcp47[0] == '*') // * the dreaded jolly joker + { + // It's f*d up but we need to recognize this. + eRet = EXTRACTED_X_JOKER; + } + else if (nHyph1 == 1 && rBcp47[0] == 'x') // x-... privateuse + { + // x-... privateuse tags MUST be known to us by definition. + eRet = EXTRACTED_X; + } + else if (nLen == 1 && rBcp47[0] == 'C') // the 'C' locale + { + eRet = EXTRACTED_C_LOCALE; + rLanguage = "C"; + rScript.clear(); + rCountry.clear(); + rVariants.clear(); + } + else if (nLen == 2 || nLen == 3) // ll or lll + { + if (nHyph1 < 0) + { + rLanguage = rBcp47.toAsciiLowerCase(); + rScript.clear(); + rCountry.clear(); + rVariants.clear(); + eRet = EXTRACTED_LSC; + } + } + else if ( (nHyph1 == 2 && nLen == 5) // ll-CC + || (nHyph1 == 3 && nLen == 6)) // lll-CC + { + if (nHyph2 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rScript.clear(); + rVariants.clear(); + eRet = EXTRACTED_LSC; + } + } + else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss or ll-vvvv + || (nHyph1 == 3 && nLen == 8)) // lll-Ssss or lll-vvvv + { + if (nHyph2 < 0) + { + sal_Unicode c = rBcp47[nHyph1+1]; + if ('0' <= c && c <= '9') + { + // (DIGIT 3ALNUM) vvvv variant instead of Ssss script + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript.clear(); + rCountry.clear(); + rVariants = rBcp47.copy( nHyph1 + 1); + eRet = EXTRACTED_LV; + } + else + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry.clear(); + rVariants.clear(); + eRet = EXTRACTED_LSC; + } + } + } + else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 10) // ll-Ssss-CC + || (nHyph1 == 3 && nHyph2 == 8 && nLen == 11)) // lll-Ssss-CC + { + if (nHyph3 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 
1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rVariants.clear(); + eRet = EXTRACTED_LSC; + } + } + else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...] + { + if (nHyph4 < 0) + nHyph4 = rBcp47.getLength(); + if (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rVariants = rBcp47.copy( nHyph3 + 1); + eRet = EXTRACTED_LV; + } + } + else if ( (nHyph1 == 2 && nHyph2 == 5 && nHyph3 == 7) // ll-CC-u-... + || (nHyph1 == 3 && nHyph2 == 6 && nHyph3 == 8)) // lll-CC-u-... + { + if (rBcp47[nHyph3-1] == 'u') + { + // Need to recognize as known, otherwise getLanguage() and + // getCountry() return empty string because mpImplLangtag is not + // used with a known mapping. + /* TODO: if there were more this would get ugly and needed some + * table driven approach via isolang.cxx instead. */ + if (rBcp47.equalsIgnoreAsciiCase( "es-ES-u-co-trad")) + { + rLanguage = "es"; + rScript.clear(); + rCountry = "ES"; + rVariants = "u-co-trad"; // not strictly a variant, but used to reconstruct the tag. + eRet = EXTRACTED_LV; + } + } + } + else if ( (nHyph1 == 2 && nHyph2 == 5 && nLen >= 10) // ll-CC-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 6 && nLen >= 11)) // lll-CC-vvvv[vvvv][-...] 
+ { + if (nHyph3 < 0) + nHyph3 = rBcp47.getLength(); + if (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript.clear(); + rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rVariants = rBcp47.copy( nHyph2 + 1); + eRet = EXTRACTED_LV; + } + } + else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...] + || (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...] + { + if (nHyph2 < 0) + nHyph2 = rBcp47.getLength(); + if (nHyph2 - nHyph1 > 5 && nHyph2 - nHyph1 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript.clear(); + rCountry.clear(); + rVariants = rBcp47.copy( nHyph1 + 1); + eRet = EXTRACTED_LV; + } + else + { + // Known and handled grandfathered; ugly but effective ... + // Note that nLen must have matched above. + // Strictly not a variant, but so far we treat it as such. + if (rBcp47.equalsIgnoreAsciiCase( "en-GB-oed")) + { + rLanguage = "en"; + rScript.clear(); + rCountry = "GB"; + rVariants = "oed"; + eRet = EXTRACTED_LV; + } + // Other known and handled odd cases. + else if (rBcp47.equalsIgnoreAsciiCase( "es-ES_tradnl")) + { + // Will get overridden, but needs to be recognized as known. + rLanguage = "es"; + rScript.clear(); + rCountry = "ES"; + rVariants = "tradnl"; // this is nonsense, but... ignored. + eRet = EXTRACTED_KNOWN_BAD; + } + } + } + if (eRet == EXTRACTED_NONE) + { + SAL_INFO( "i18nlangtag", "LanguageTagImpl::simpleExtract: did not extract '" << rBcp47 << "'"); + rLanguage.clear(); + rScript.clear(); + rCountry.clear(); + rVariants.clear(); + } + return eRet; +} + + +// static +::std::vector< OUString >::const_iterator LanguageTag::getFallback( + const ::std::vector< OUString > & rList, const OUString & rReference ) +{ + if (rList.empty()) + return rList.end(); + + // Try the simple case first without constructing fallbacks. 
+ ::std::vector< OUString >::const_iterator it = std::find(rList.begin(), rList.end(), rReference); + if (it != rList.end()) + return it; // exact match + + ::std::vector< OUString > aFallbacks( LanguageTag( rReference).getFallbackStrings( false)); + if (rReference != "en-US") + { + aFallbacks.emplace_back("en-US"); + if (rReference != "en") + aFallbacks.emplace_back("en"); + } + if (rReference != "x-default") + aFallbacks.emplace_back("x-default"); + if (rReference != "x-no-translate") + aFallbacks.emplace_back("x-no-translate"); + /* TODO: the original comphelper::Locale::getFallback() code had + * "x-notranslate" instead of "x-no-translate", but all .xcu files use + * "x-no-translate" and "x-notranslate" apparently was never used anywhere. + * Did that ever work? Was it supposed to work at all like this? */ + + for (const auto& fb : aFallbacks) + { + it = std::find(rList.begin(), rList.end(), fb); + if (it != rList.end()) + return it; // fallback found + } + + // Did not find anything so return something of the list, the first value + // will do as well as any other as none did match any of the possible + // fallbacks. + return rList.begin(); +} + + +// static +::std::vector< css::lang::Locale >::const_iterator LanguageTag::getMatchingFallback( + const ::std::vector< css::lang::Locale > & rList, + const css::lang::Locale & rReference ) +{ + if (rList.empty()) + return rList.end(); + + // Try the simple case first without constructing fallbacks. + ::std::vector< lang::Locale >::const_iterator it = std::find_if(rList.begin(), rList.end(), + [&rReference](const lang::Locale& rLocale) { + return rLocale.Language == rReference.Language + && rLocale.Country == rReference.Country + && rLocale.Variant == rReference.Variant; }); + if (it != rList.end()) + return it; // exact match + + // Now for each reference fallback test the fallbacks of the list in order. 
+ ::std::vector< OUString > aFallbacks( LanguageTag( rReference).getFallbackStrings( false)); + ::std::vector< ::std::vector< OUString > > aListFallbacks( rList.size()); + size_t i = 0; + for (auto const& elem : rList) + { + ::std::vector< OUString > aTmp( LanguageTag(elem).getFallbackStrings( true)); + aListFallbacks[i++] = aTmp; + } + for (auto const& rfb : aFallbacks) + { + size_t nPosFb = 0; + for (auto const& lfb : aListFallbacks) + { + for (auto const& fb : lfb) + { + if (rfb == fb) + return rList.begin() + nPosFb; + } + ++nPosFb; + } + } + + // No match found. + return rList.end(); +} + + +static bool lcl_isSystem( LanguageType nLangID ) +{ + if (nLangID == LANGUAGE_SYSTEM) + return true; + // There are some special values that simplify to SYSTEM, + // getRealLanguage() catches and resolves them. + LanguageType nNewLangID = MsLangId::getRealLanguage( nLangID); + return nNewLangID != nLangID; +} + + +// static +css::lang::Locale LanguageTag::convertToLocale( LanguageType nLangID, bool bResolveSystem ) +{ + if (!bResolveSystem && lcl_isSystem( nLangID)) + return lang::Locale(); + + return LanguageTag( nLangID).getLocale( bResolveSystem); +} + + +// static +LanguageType LanguageTag::convertToLanguageType( const css::lang::Locale& rLocale, bool bResolveSystem ) +{ + if (rLocale.Language.isEmpty() && !bResolveSystem) + return LANGUAGE_SYSTEM; + + return LanguageTag( rLocale).getLanguageType( bResolveSystem); +} + + +// static +OUString LanguageTagImpl::convertToBcp47( const css::lang::Locale& rLocale ) +{ + OUString aBcp47; + if (rLocale.Language.isEmpty()) + { + // aBcp47 stays empty + } + else if (rLocale.Language == I18NLANGTAG_QLT) + { + aBcp47 = rLocale.Variant; + } + else + { + /* XXX NOTE: most legacy code never evaluated the Variant field, so for + * now just concatenate language and country. In case we stumbled over + * variant aware code we'd have to take care of that. 
*/ + if (rLocale.Country.isEmpty()) + aBcp47 = rLocale.Language; + else + { + aBcp47 = rLocale.Language + "-" + rLocale.Country; + } + } + return aBcp47; +} + + +// static +OUString LanguageTag::convertToBcp47( const css::lang::Locale& rLocale, bool bResolveSystem ) +{ + OUString aBcp47; + if (rLocale.Language.isEmpty()) + { + if (bResolveSystem) + aBcp47 = LanguageTag::convertToBcp47( LANGUAGE_SYSTEM ); + // else aBcp47 stays empty + } + else + { + aBcp47 = LanguageTagImpl::convertToBcp47( rLocale); + } + return aBcp47; +} + + +// static +OUString LanguageTag::convertToBcp47( LanguageType nLangID ) +{ + lang::Locale aLocale( LanguageTag::convertToLocale( nLangID )); + // If system for some reason (should not happen... haha) could not be + // resolved DO NOT CALL LanguageTag::convertToBcp47(Locale) because that + // would recurse into this method here! + if (aLocale.Language.isEmpty()) + return OUString(); // bad luck, bail out + return LanguageTagImpl::convertToBcp47( aLocale); +} + + +// static +css::lang::Locale LanguageTag::convertToLocale( const OUString& rBcp47, bool bResolveSystem ) +{ + if (rBcp47.isEmpty() && !bResolveSystem) + return lang::Locale(); + + return LanguageTag( rBcp47).getLocale( bResolveSystem); +} + + +// static +LanguageType LanguageTag::convertToLanguageType( const OUString& rBcp47 ) +{ + return LanguageTag( rBcp47).getLanguageType(); +} + + +// static +LanguageType LanguageTag::convertToLanguageTypeWithFallback( const OUString& rBcp47 ) +{ + return LanguageTag( rBcp47).makeFallback().getLanguageType(); +} + + +// static +css::lang::Locale LanguageTag::convertToLocaleWithFallback( const OUString& rBcp47 ) +{ + return LanguageTag( rBcp47).makeFallback().getLocale(); +} + + +// static +LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale ) +{ + if (rLocale.Language.isEmpty()) + return LANGUAGE_SYSTEM; + + return LanguageTag( rLocale).makeFallback().getLanguageType(); +} + + +// static +bool 
LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicalized, bool bDisallowPrivate ) +{ + bool bValid = false; + + struct guard + { + lt_tag_t* mpLangtag; + guard() + { + theDataRef().init(); + mpLangtag = lt_tag_new(); + } + ~guard() + { + lt_tag_unref( mpLangtag); + } + } aVar; + + myLtError aError; + + if (!lt_tag_parse_disabled && lt_tag_parse(aVar.mpLangtag, OUStringToOString(rString, RTL_TEXTENCODING_UTF8).getStr(), &aError.p)) + { + char* pTag = lt_tag_canonicalize( aVar.mpLangtag, &aError.p); + SAL_WARN_IF( !pTag, "i18nlangtag", "LanguageTag:isValidBcp47: could not canonicalize '" << rString << "'"); + if (pTag) + { + bValid = true; + if (bDisallowPrivate) + { + const lt_string_t* pPrivate = lt_tag_get_privateuse( aVar.mpLangtag); + if (pPrivate && lt_string_length( pPrivate) > 0) + bValid = false; + else + { + const lt_lang_t* pLangT = lt_tag_get_language( aVar.mpLangtag); + if (pLangT) + { + const char* pLang = lt_lang_get_tag( pLangT); + if (pLang && strcmp( pLang, I18NLANGTAG_QLT_ASCII) == 0) + { + // Disallow 'qlt' privateuse code to prevent + // confusion with our internal usage. 
+ bValid = false; + } + } + } + } + if (o_pCanonicalized) + *o_pCanonicalized = OUString::createFromAscii( pTag); + free( pTag); + return bValid; + } + } + else + { + SAL_INFO( "i18nlangtag", "LanguageTag:isValidBcp47: could not parse '" << rString << "'"); + } + return bValid; +} + +LanguageTag makeLanguageTagFromAppleLanguageId(AppleLanguageId nLanguage) +{ + //map the simple ones via LanguageTypes, and the hard ones explicitly + LanguageType nLang(LANGUAGE_DONTKNOW); + + switch (nLanguage) + { + case AppleLanguageId::ENGLISH: + nLang = LANGUAGE_ENGLISH_US; + break; + case AppleLanguageId::FRENCH: + nLang = LANGUAGE_FRENCH; + break; + case AppleLanguageId::GERMAN: + nLang = LANGUAGE_GERMAN; + break; + case AppleLanguageId::ITALIAN: + nLang = LANGUAGE_ITALIAN; + break; + case AppleLanguageId::DUTCH: + nLang = LANGUAGE_DUTCH; + break; + case AppleLanguageId::SWEDISH: + nLang = LANGUAGE_SWEDISH; + break; + case AppleLanguageId::SPANISH: + nLang = LANGUAGE_SPANISH; + break; + case AppleLanguageId::DANISH: + nLang = LANGUAGE_DANISH; + break; + case AppleLanguageId::PORTUGUESE: + nLang = LANGUAGE_PORTUGUESE; + break; + case AppleLanguageId::NORWEGIAN: + nLang = LANGUAGE_NORWEGIAN; + break; + case AppleLanguageId::HEBREW: + nLang = LANGUAGE_HEBREW; + break; + case AppleLanguageId::JAPANESE: + nLang = LANGUAGE_JAPANESE; + break; + case AppleLanguageId::ARABIC: + nLang = LANGUAGE_ARABIC_PRIMARY_ONLY; + break; + case AppleLanguageId::FINNISH: + nLang = LANGUAGE_FINNISH; + break; + case AppleLanguageId::GREEK: + nLang = LANGUAGE_GREEK; + break; + case AppleLanguageId::ICELANDIC: + nLang = LANGUAGE_ICELANDIC; + break; + case AppleLanguageId::MALTESE: + nLang = LANGUAGE_MALTESE; + break; + case AppleLanguageId::TURKISH: + nLang = LANGUAGE_TURKISH; + break; + case AppleLanguageId::CROATIAN: + nLang = LANGUAGE_CROATIAN; + break; + case AppleLanguageId::CHINESE_TRADITIONAL: + nLang = LANGUAGE_CHINESE_TRADITIONAL; + break; + case AppleLanguageId::URDU: + nLang = 
LANGUAGE_URDU_PAKISTAN; //probably, otherwise we need a LANGUAGE_URDU_PRIMARY_ONLY + break; + case AppleLanguageId::HINDI: + nLang = LANGUAGE_HINDI; + break; + case AppleLanguageId::THAI: + nLang = LANGUAGE_THAI; + break; + case AppleLanguageId::KOREAN: + nLang = LANGUAGE_KOREAN; + break; + case AppleLanguageId::LITHUANIAN: + nLang = LANGUAGE_LITHUANIAN; + break; + case AppleLanguageId::POLISH: + nLang = LANGUAGE_POLISH; + break; + case AppleLanguageId::HUNGARIAN: + nLang = LANGUAGE_HUNGARIAN; + break; + case AppleLanguageId::ESTONIAN: + nLang = LANGUAGE_ESTONIAN; + break; + case AppleLanguageId::LATVIAN: + nLang = LANGUAGE_LATVIAN; + break; + case AppleLanguageId::SAMI: + nLang = LANGUAGE_SAMI_NORTHERN_NORWAY; //maybe + break; + case AppleLanguageId::FAROESE: + nLang = LANGUAGE_FAEROESE; + break; + case AppleLanguageId::FARSI: + nLang = LANGUAGE_FARSI; + break; + case AppleLanguageId::RUSSIAN: + nLang = LANGUAGE_RUSSIAN; + break; + case AppleLanguageId::CHINESE_SIMPLIFIED: + nLang = LANGUAGE_CHINESE_SIMPLIFIED; + break; + case AppleLanguageId::FLEMISH: + nLang = LANGUAGE_DUTCH_BELGIAN; + break; + case AppleLanguageId::IRISH_GAELIC: + nLang = LANGUAGE_GAELIC_IRELAND; + break; + case AppleLanguageId::ALBANIAN: + nLang = LANGUAGE_ALBANIAN; + break; + case AppleLanguageId::ROMANIAN: + nLang = LANGUAGE_ROMANIAN; + break; + case AppleLanguageId::CZECH: + nLang = LANGUAGE_CZECH; + break; + case AppleLanguageId::SLOVAK: + nLang = LANGUAGE_SLOVAK; + break; + case AppleLanguageId::SLOVENIAN: + nLang = LANGUAGE_SLOVENIAN; + break; + case AppleLanguageId::YIDDISH: + nLang = LANGUAGE_YIDDISH; + break; + case AppleLanguageId::SERBIAN: + nLang = LANGUAGE_SERBIAN_CYRILLIC_SERBIA; //maybe + break; + case AppleLanguageId::MACEDONIAN: + nLang = LANGUAGE_MACEDONIAN; + break; + case AppleLanguageId::BULGARIAN: + nLang = LANGUAGE_BULGARIAN; + break; + case AppleLanguageId::UKRAINIAN: + nLang = LANGUAGE_UKRAINIAN; + break; + case AppleLanguageId::BYELORUSSIAN: + nLang = 
LANGUAGE_BELARUSIAN; + break; + case AppleLanguageId::UZBEK: + nLang = LANGUAGE_UZBEK_CYRILLIC; //maybe + break; + case AppleLanguageId::KAZAKH: + nLang = LANGUAGE_KAZAKH; + break; + case AppleLanguageId::AZERI_CYRILLIC: + nLang = LANGUAGE_AZERI_CYRILLIC; + break; + case AppleLanguageId::AZERI_ARABIC: + return LanguageTag("az-Arab"); + case AppleLanguageId::ARMENIAN: + nLang = LANGUAGE_ARMENIAN; + break; + case AppleLanguageId::GEORGIAN: + nLang = LANGUAGE_GEORGIAN; + break; + case AppleLanguageId::MOLDAVIAN: + nLang = LANGUAGE_ROMANIAN_MOLDOVA; + break; + case AppleLanguageId::KIRGHIZ: + nLang = LANGUAGE_KIRGHIZ; + break; + case AppleLanguageId::TAJIKI: + nLang = LANGUAGE_TAJIK; + break; + case AppleLanguageId::TURKMEN: + nLang = LANGUAGE_TURKMEN; + break; + case AppleLanguageId::MONGOLIAN_MONGOLIAN: + nLang = LANGUAGE_MONGOLIAN_MONGOLIAN_MONGOLIA; + break; + case AppleLanguageId::MONGOLIAN_CYRILLIC: + nLang = LANGUAGE_MONGOLIAN_CYRILLIC_MONGOLIA; + break; + case AppleLanguageId::PASHTO: + nLang = LANGUAGE_PASHTO; + break; + case AppleLanguageId::KURDISH: + nLang = LANGUAGE_USER_KURDISH_TURKEY; //maybe + break; + case AppleLanguageId::KASHMIRI: + nLang = LANGUAGE_KASHMIRI; + break; + case AppleLanguageId::SINDHI: + nLang = LANGUAGE_SINDHI; + break; + case AppleLanguageId::TIBETAN: + nLang = LANGUAGE_TIBETAN; + break; + case AppleLanguageId::NEPALI: + nLang = LANGUAGE_NEPALI; + break; + case AppleLanguageId::SANSKRIT: + nLang = LANGUAGE_SANSKRIT; + break; + case AppleLanguageId::MARATHI: + nLang = LANGUAGE_MARATHI; + break; + case AppleLanguageId::BENGALI: + nLang = LANGUAGE_BENGALI; + break; + case AppleLanguageId::ASSAMESE: + nLang = LANGUAGE_ASSAMESE; + break; + case AppleLanguageId::GUJARATI: + nLang = LANGUAGE_GUJARATI; + break; + case AppleLanguageId::PUNJABI: + nLang = LANGUAGE_PUNJABI; + break; + case AppleLanguageId::ORIYA: + nLang = LANGUAGE_ODIA; + break; + case AppleLanguageId::MALAYALAM: + nLang = LANGUAGE_MALAYALAM; + break; + case 
AppleLanguageId::KANNADA: + nLang = LANGUAGE_KANNADA; + break; + case AppleLanguageId::TAMIL: + nLang = LANGUAGE_TAMIL; + break; + case AppleLanguageId::TELUGU: + nLang = LANGUAGE_TELUGU; + break; + case AppleLanguageId::SINHALESE: + nLang = LANGUAGE_SINHALESE_SRI_LANKA; + break; + case AppleLanguageId::BURMESE: + nLang = LANGUAGE_BURMESE; + break; + case AppleLanguageId::KHMER: + nLang = LANGUAGE_KHMER; + break; + case AppleLanguageId::LAO: + nLang = LANGUAGE_LAO; + break; + case AppleLanguageId::VIETNAMESE: + nLang = LANGUAGE_VIETNAMESE; + break; + case AppleLanguageId::INDONESIAN: + nLang = LANGUAGE_INDONESIAN; + break; + case AppleLanguageId::TAGALONG: + nLang = LANGUAGE_USER_TAGALOG; + break; + case AppleLanguageId::MALAY_LATIN: + nLang = LANGUAGE_MALAY_MALAYSIA; + break; + case AppleLanguageId::MALAY_ARABIC: + nLang = LANGUAGE_USER_MALAY_ARABIC_MALAYSIA; + break; + case AppleLanguageId::AMHARIC: + nLang = LANGUAGE_AMHARIC_ETHIOPIA; + break; + case AppleLanguageId::TIGRINYA: + nLang = LANGUAGE_TIGRIGNA_ETHIOPIA; + break; + case AppleLanguageId::GALLA: + nLang = LANGUAGE_OROMO; + break; + case AppleLanguageId::SOMALI: + nLang = LANGUAGE_SOMALI; + break; + case AppleLanguageId::SWAHILI: + nLang = LANGUAGE_SWAHILI; + break; + case AppleLanguageId::KINYARWANDA: + nLang = LANGUAGE_KINYARWANDA_RWANDA; + break; + case AppleLanguageId::RUNDI: + return LanguageTag("rn"); + case AppleLanguageId::NYANJA: + nLang = LANGUAGE_USER_NYANJA; + break; + case AppleLanguageId::MALAGASY: + nLang = LANGUAGE_MALAGASY_PLATEAU; + break; + case AppleLanguageId::ESPERANTO: + nLang = LANGUAGE_USER_ESPERANTO; + break; + case AppleLanguageId::WELSH: + nLang = LANGUAGE_WELSH; + break; + case AppleLanguageId::BASQUE: + nLang = LANGUAGE_BASQUE; + break; + case AppleLanguageId::CATALAN: + nLang = LANGUAGE_CATALAN; + break; + case AppleLanguageId::LATIN: + nLang = LANGUAGE_LATIN; + break; + case AppleLanguageId::QUENCHUA: + nLang = LANGUAGE_QUECHUA_BOLIVIA; //maybe + break; + case 
AppleLanguageId::GUARANI: + nLang = LANGUAGE_GUARANI_PARAGUAY; + break; + case AppleLanguageId::AYMARA: + return LanguageTag("ay"); + case AppleLanguageId::TATAR: + nLang = LANGUAGE_TATAR; + break; + case AppleLanguageId::UIGHUR: + nLang = LANGUAGE_UIGHUR_CHINA; + break; + case AppleLanguageId::DZONGKHA: + nLang = LANGUAGE_DZONGKHA_BHUTAN; + break; + case AppleLanguageId::JAVANESE_LATIN: + return LanguageTag("jv-Latn"); + case AppleLanguageId::SUNDANESE_LATIN: + return LanguageTag("su-Latn"); + case AppleLanguageId::GALICIAN: + nLang = LANGUAGE_GALICIAN; + break; + case AppleLanguageId::AFRIKAANS: + nLang = LANGUAGE_AFRIKAANS; + break; + case AppleLanguageId::BRETON: + nLang = LANGUAGE_BRETON_FRANCE; + break; + case AppleLanguageId::INUKTITUT: + nLang = LANGUAGE_INUKTITUT_LATIN_CANADA; //probably + break; + case AppleLanguageId::SCOTTISH_GAELIC: + nLang = LANGUAGE_GAELIC_SCOTLAND; + break; + case AppleLanguageId::MANX_GAELIC: + nLang = LANGUAGE_USER_MANX; + break; + case AppleLanguageId::IRISH_GAELIC_WITH_DOT_ABOVE: + return LanguageTag("ga-Latg"); + case AppleLanguageId::TONGAN: + return LanguageTag("to"); + case AppleLanguageId::GREEK_POLYTONIC: + nLang = LANGUAGE_USER_ANCIENT_GREEK; + break; + case AppleLanguageId::GREENLANDIC: + nLang = LANGUAGE_KALAALLISUT_GREENLAND; + break; + case AppleLanguageId::AZERI_LATIN: + nLang = LANGUAGE_AZERI_LATIN; + break; + } + + return LanguageTag(nLang); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18nlangtag/source/languagetag/languagetagicu.cxx b/i18nlangtag/source/languagetag/languagetagicu.cxx new file mode 100644 index 000000000..fd1c9bc75 --- /dev/null +++ b/i18nlangtag/source/languagetag/languagetagicu.cxx @@ -0,0 +1,71 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include + + +// static +icu::Locale LanguageTagIcu::getIcuLocale( const LanguageTag & rLanguageTag ) +{ + if (rLanguageTag.isIsoLocale()) + { + // The simple case. + const css::lang::Locale& rLocale = rLanguageTag.getLocale(); + if (rLocale.Country.isEmpty()) + return icu::Locale( OUStringToOString( rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr()); + return icu::Locale( + OUStringToOString( rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(), + OUStringToOString( rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr()); + } + + /* TODO: could we optimize this for the isIsoODF() case where only a script + * is added? */ + + // Let ICU decide how it wants a BCP47 string stuffed into its Locale. + return icu::Locale::createFromName( + OUStringToOString( rLanguageTag.getBcp47(), RTL_TEXTENCODING_ASCII_US).getStr()); +} + + +// static +icu::Locale LanguageTagIcu::getIcuLocale( const LanguageTag & rLanguageTag, std::u16string_view rVariant, std::u16string_view rKeywords ) +{ + /* FIXME: how should this work with any BCP47? */ + return icu::Locale( + OUStringToOString( rLanguageTag.getLanguage(), RTL_TEXTENCODING_ASCII_US).getStr(), + OUStringToOString( rLanguageTag.getCountry(), RTL_TEXTENCODING_ASCII_US).getStr(), + OUStringToOString( rVariant, RTL_TEXTENCODING_ASCII_US).getStr(), + OUStringToOString( rKeywords, RTL_TEXTENCODING_ASCII_US).getStr() + ); +} + +// static +OUString LanguageTagIcu::getDisplayName( const LanguageTag & rLanguageTag, const LanguageTag & rDisplayLanguage ) +{ + // This will be initialized by the first call; as the UI language doesn't + // change the tag mostly stays the same, unless someone overrides it for a + // call here, and thus obtaining the UI icu::Locale has to be done only + // once. 
+ static thread_local LanguageTag aUITag( LANGUAGE_SYSTEM); + static thread_local icu::Locale aUILocale; + + if (aUITag != rDisplayLanguage) + { + aUITag = rDisplayLanguage; + aUILocale = getIcuLocale( rDisplayLanguage); + } + + icu::Locale aLocale( getIcuLocale( rLanguageTag)); + icu::UnicodeString aResult; + aLocale.getDisplayName( aUILocale, aResult); + return OUString( reinterpret_cast(aResult.getBuffer()), aResult.length()); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ -- cgit v1.2.3