diff options
Diffstat (limited to 'include/i18nlangtag/languagetag.hxx')
-rw-r--r-- | include/i18nlangtag/languagetag.hxx | 592 |
1 files changed, 592 insertions, 0 deletions
diff --git a/include/i18nlangtag/languagetag.hxx b/include/i18nlangtag/languagetag.hxx new file mode 100644 index 000000000..3b5fb3d6d --- /dev/null +++ b/include/i18nlangtag/languagetag.hxx @@ -0,0 +1,592 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_I18NLANGTAG_LANGUAGETAG_HXX +#define INCLUDED_I18NLANGTAG_LANGUAGETAG_HXX + +#include <sal/config.h> +#include <rtl/locale.h> +#include <rtl/ustring.hxx> +#include <com/sun/star/lang/Locale.hpp> +#include <i18nlangtag/i18nlangtagdllapi.h> +#include <i18nlangtag/lang.h> + +#include <memory> +#include <string_view> +#include <vector> + +/** The ISO 639-2 code reserved for local use used to indicate that a + css::Locale contains a BCP 47 string in its Variant field. The + Locale's Language field then will contain this language code. + + @see LanguageTag::getLocale() + + Avoid use, only needed internally or if conversion from Locale to + LanguageTag is not wanted, i.e. during ODF import. To check whether a + LanguageTag contains a plain language/country combination or a more + detailed BCP 47 language tag use LanguageTag::isIsoLocale() instead. + */ +#define I18NLANGTAG_QLT_ASCII "qlt" +inline constexpr OUStringLiteral I18NLANGTAG_QLT = u"qlt"; + + +class LanguageTagImpl; + + +/** Wrapper for liblangtag BCP 47 language tags, MS-LangIDs, locales and + conversions in between. + + Note that member variables are mutable and may change their values even in + const methods. Getter methods return either the original value or matching + converted values. + + For standalone conversions if no LanguageTag instance is at hand, static + convertTo...() methods exist. + */ +class SAL_WARN_UNUSED I18NLANGTAG_DLLPUBLIC LanguageTag +{ + friend class LanguageTagImpl; + +public: + + /** ScriptType for a language. + + Used only in onTheFly languages as a way of marking key script behaviours + for the script of the language without having to store and analyse the + script each time. Used primarily from msLangId. + + These need to correspond to the ExtraLanguages.ScriptType template + property in officecfg/registry/schema/org/openoffice/VCL.xcs + */ + enum class ScriptType + { + UNKNOWN = 0, + WESTERN = 1, // Copies css::i18n::ScriptType for strong types + CJK = 2, + CTL = 3, + RTL = 4 // implies CTL + }; + + /** Init LanguageTag with existing BCP 47 language tag string. + + @param bCanonicalize + If TRUE, canonicalize tag and reparse, the resulting tag string may + be different. + IF FALSE, the tag is simply stored and can be retrieved with + getBcp47(). + + Note that conversions to ISO codes, locales or LanguageType or + obtaining language or script will canonicalize the tag string anyway, + so specifying bCanonicalize=false is not a guarantee that the tag will + stay identical to what was passed. + */ + explicit LanguageTag( const OUString & rBcp47LanguageTag, bool bCanonicalize = false ); + + /** Init LanguageTag with Locale. */ + explicit LanguageTag( const css::lang::Locale & rLocale ); + + /** Init LanguageTag with LanguageType MS-LangID. */ + explicit LanguageTag( LanguageType nLanguage ); + + /** Init LanguageTag with either BCP 47 language tag (precedence if not + empty), or a combination of language, script and country. + + This is a convenience ctor to be used in ODF import where these are + distinct attributes. + */ + explicit LanguageTag( const OUString& rBcp47, const OUString& rLanguage, + std::u16string_view rScript, const OUString& rCountry ); + + /** Init LanguageTag with rtl_Locale. + + This is a convenience ctor. + */ + explicit LanguageTag( const rtl_Locale & rLocale ); + + ~LanguageTag(); + + LanguageTag(LanguageTag const &) = default; + LanguageTag(LanguageTag &&) = default; + LanguageTag & operator =(LanguageTag const &) = default; + LanguageTag & operator =(LanguageTag &&) = default; + + /** Obtain BCP 47 language tag. + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return an empty OUString for such a tag. + */ + const OUString & getBcp47( bool bResolveSystem = true ) const; + + /** Obtain BCP 47 language tag, but with MS malformed exceptions. + + To be used *only* in OOXML filter context. + For example, es-ES-u-co-trad is stored as es-ES_tradnl which is not a + valid BCP 47 language tag. + */ + OUString getBcp47MS() const; + + /** Obtain language tag as Locale. + + As a convention, language tags that can not be expressed as "pure" + css::lang::Locale content using Language and Country fields + store "qlt" (ISO 639 reserved for local use) in the Language field and + the entire BCP 47 language tag in the Variant field. The Country field + contains the corresponding ISO 3166 country code _if_ there is one, or + otherwise is empty. + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return an empty Locale for such a tag. + */ + const css::lang::Locale & getLocale( bool bResolveSystem = true ) const; + + /** Obtain mapping to MS-LangID. + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return LANGUAGE_SYSTEM for such a tag. + */ + LanguageType getLanguageType( bool bResolveSystem = true ) const; + + /** Obtain ISO strings for language, script and country. + + This is a convenience method for ODF export places only. Avoid use in + other code. + + ATTENTION! May return empty strings if the language tag is not + expressible in valid ISO codes! + + @see isIsoODF() + + Always resolves an empty tag to the system locale. + */ + void getIsoLanguageScriptCountry( OUString& rLanguage, + OUString& rScript, OUString& rCountry ) const; + + /** Get ISO 639 language code, or BCP 47 language. + + Always resolves an empty tag to the system locale. + */ + OUString getLanguage() const; + + /** Get ISO 15924 script code, if not the default script according to + BCP 47. For default script an empty string is returned. + + @see hasScript() + + Always resolves an empty tag to the system locale. + */ + OUString getScript() const; + + /** Get combined language and script code, separated by '-' if + non-default script, if default script only language. + + @see hasScript() + + Always resolves an empty tag to the system locale. + */ + OUString getLanguageAndScript() const; + + /** Get ISO 3166 country alpha code. Empty if the BCP 47 tags denote a + region not expressible as 2 character country code. + + Always resolves an empty tag to the system locale. + */ + OUString getCountry() const; + + /** Get BCP 47 variant subtags, of the IANA Language Subtag Registry. + + If there are multiple variant subtags they are separated by '-'. + + This is NOT related to Locale.Variant! + + Always resolves an empty tag to the system locale. + */ + OUString getVariants() const; + + /** Get a GLIBC locale string. + + Always resolves an empty tag to the system locale. + + @param rEncoding + An encoding to be appended to language_country, for example + ".UTF-8" including the dot. + + @return The resulting GLIBC locale string if it could be constructed, + if not an empty string is returned. + */ + OUString getGlibcLocaleString( std::u16string_view rEncoding ) const; + + /** If language tag has a non-default script specified. + */ + bool hasScript() const; + + /** If language tag is a locale that can be expressed using only ISO 639 + language codes and ISO 3166 country codes, thus is convertible to a + conforming Locale struct without using extension mechanisms. + + Note that an empty language tag or empty Locale::Language field or + LanguageType LANGUAGE_SYSTEM could be treated as a valid ISO locale in + some context, but here is not. If you want that ask for + aTag.isSystemLocale() || aTag.isIsoLocale() + + Always resolves an empty tag to the system locale. + */ + bool isIsoLocale() const; + + /** If language tag is a locale that can be expressed using only ISO 639 + language codes and ISO 15924 script codes and ISO 3166 country codes, + thus can be stored in an ODF document using only fo:language, fo:script + and fo:country attributes. If this is FALSE, the locale must be stored + as a <*:rfc-language-tag> element. + + Always resolves an empty tag to the system locale. + */ + bool isIsoODF() const; + + /** If this is a valid BCP 47 language tag. + + Always resolves an empty tag to the system locale. + + @seealso static bool isValidBcp47(const OUString&) + */ + bool isValidBcp47() const; + + /** If this tag was constructed as an empty tag denoting the system locale. + */ + bool isSystemLocale() const { return mbSystemLocale;} + + /** Returns the script type for this language, UNKNOWN if not set */ + ScriptType getScriptType() const; + + /** Sets the script type for this language */ + void setScriptType(ScriptType st); + + /** Reset with existing BCP 47 language tag string. See ctor. */ + LanguageTag & reset( const OUString & rBcp47LanguageTag ); + + /** Reset with Locale. */ + LanguageTag & reset( const css::lang::Locale & rLocale ); + + /** Reset with LanguageType MS-LangID. */ + LanguageTag & reset( LanguageType nLanguage ); + + + /** Fall back to a known locale. + + If the current tag does not represent a known (by us) locale, fall back + to the most likely locale possible known. + If the current tag is known, no change occurs. + */ + LanguageTag & makeFallback(); + + /** Return a vector of fall-back strings. + + In order: + full BCP 47 tag, same as getBcp47() + lll-Ssss-CC + lll-Ssss + lll-CC + lll + + If the tag includes variants the order is: + full BCP 47 tag, same as getBcp47() + lll-Ssss-CC-vvvvvvvv + lll-Ssss-vvvvvvvv + lll-Ssss-CC + lll-Ssss + lll-CC-vvvvvvvv + lll-vvvvvvvv + lll-CC + lll + + Only strings that differ from a higher order are included, for example + if there is no script the elements will be bcp47, lll-CC, lll; if the + bcp47 string is identical to lll-CC then only lll-CC, lll. + + Note that lll is only ISO 639-1/2 alpha code and CC is only ISO 3166 + alpha code. If the region can not be expressed as ISO 3166 then no -CC + tags are included. + + @param bIncludeFullBcp47 + If TRUE, the full BCP 47 tag is included as first element. + If FALSE, the full tag is not included; used if the caller + obtains the fallbacks only if the full tag did not lead to a + match, so subsequent tries need not to include it again. + */ + ::std::vector< OUString > getFallbackStrings( bool bIncludeFullBcp47 ) const; + + + /** @short Search for an equal or at least for a similar locale in a list + of possible ones. + + @descr First search for a locale that is equal to the reference + locale. (means: same BCP47 string) + + If the reference locale could not be located, check for + "similar" locales, in the same order as obtained by + getFallbackStrings(). + + If no similar locale could be located, we search for a locale + "en-US" inside the given locale list. + + If "en-US" could not be located, we search for a locale "en" + inside the given list. + + If no "same" nor any "similar" locale could be found, we try + "x-default" and "x-no-translate" explicitly. Sometimes + variables don't use real localization. For example, in case the + localized value is a fix product name. + + If no locale matched until then, we use any other locale that + exists inside the set of given ones, namely the first + encountered! + + @param rList + the vector of possible locales as BCP47 strings. + + @param rReference + the reference locale, BCP47 string. + + @return An iterator that points to the found element inside the given + locale list. If no matching locale could be found it points to + the beginning of the list. + */ + static ::std::vector< OUString >::const_iterator getFallback( const ::std::vector< OUString > & rList, + const OUString & rReference ); + + + /** @short Search for an equal or for a similar locale in a list + of possible ones where at least the language matches. + + @descr First search for a locale that is equal to the reference + locale. + + If the reference locale could not be located, check for + "similar" locales, in the same order as obtained by + getFallbackStrings(). + + If no locale matches, rList.end() is returned. + + @param rList + the vector of possible locales. + + @param rReference + the reference locale. + + @return An iterator that points to the found element inside the given + locale list. If no matching locale could be found it points to + the end of the list. + */ + static ::std::vector< css::lang::Locale >::const_iterator getMatchingFallback( + const ::std::vector< css::lang::Locale > & rList, + const css::lang::Locale & rReference ); + + + /** Test equality of two LanguageTag, possibly resolving system locale. + + Resolve empty language tags denoting the system + locale to the real locale used before comparing. + */ + bool equals( const LanguageTag & rLanguageTag ) const; + + /** Test equality of two LanguageTag. + + Does NOT resolve system, i.e. if the system locale is en-US + LanguageTag("")==LanguageTag("en-US") returns false! Use + equals(...) instead if system locales shall be resolved. + */ + bool operator==( const LanguageTag & rLanguageTag ) const; + + /** Test inequality of two LanguageTag. + + Does NOT resolve system, i.e. if the system locale is en-US + LanguageTag("")!=LanguageTag("en-US") returns true! Use + !equals(,..) instead if system locales shall be resolved. + */ + bool operator!=( const LanguageTag & rLanguageTag ) const; + + /** Test this LanguageTag less than that LanguageTag. + + For sorted containers. Does NOT resolve system. + */ + bool operator<( const LanguageTag & rLanguageTag ) const; + + /** Convert MS-LangID to Locale. + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return an empty Locale for such a tag. + */ + static css::lang::Locale convertToLocale( LanguageType nLangID, bool bResolveSystem = true ); + + /** Convert Locale to MS-LangID. + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return LANGUAGE_SYSTEM for such a tag. + */ + static LanguageType convertToLanguageType( const css::lang::Locale& rLocale, bool bResolveSystem = true ); + + /** Convert MS-LangID to BCP 47 string. + + Resolve an empty language tag denoting the system + locale to the real locale used. + */ + static OUString convertToBcp47( LanguageType nLangID ); + + /** Convert Locale to BCP 47 string. + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return an empty OUString for such a tag. + */ + static OUString convertToBcp47( const css::lang::Locale& rLocale, bool bResolveSystem = true ); + + /** Convert BCP 47 string to Locale, convenience method. + + NOTE: exists only for consistency with the other convertTo...() + methods, internally uses a temporary LanguageTag instance for + conversion so does not save anything compared to + LanguageTag(rBcp47).getLocale(bResolveSystem). + + @param bResolveSystem + If TRUE, resolve an empty language tag denoting the system + locale to the real locale used. + If FALSE, return an empty Locale for such a tag. + */ + static css::lang::Locale convertToLocale( const OUString& rBcp47, bool bResolveSystem = true ); + + /** Convert BCP 47 string to MS-LangID, convenience method. + + NOTE: exists only for consistency with the other convertTo...() + methods, internally uses a temporary LanguageTag instance for + conversion so does not save anything compared to + LanguageTag(rBcp47).getLanguageType(bResolveSystem). + + Resolve an empty language tag denoting the system + locale to the real locale used. + */ + static LanguageType convertToLanguageType( const OUString& rBcp47 ); + + /** Convert BCP 47 string to MS-LangID with fallback, convenience method. + + NOTE: exists only for consistency with the other convertTo...() + methods, internally uses a temporary LanguageTag instance for + conversion so does not save anything compared to + LanguageTag(rBcp47).makeFallback().getLanguageType(bResolveSystem). + + @see makeFallback() + + Always resolves an empty tag to the system locale. + */ + static LanguageType convertToLanguageTypeWithFallback( const OUString& rBcp47 ); + + /** Convert BCP 47 string to Locale with fallback, convenience method. + + NOTE: exists only for consistency with the other convertTo...() + methods, internally uses a temporary LanguageTag instance for + conversion so does not save anything compared to + LanguageTag(rBcp47).makeFallback().getLocale(bResolveSystem). + + @see makeFallback() + + Always resolves an empty tag to the system locale. + */ + static css::lang::Locale convertToLocaleWithFallback( const OUString& rBcp47 ); + + /** Convert Locale to MS-LangID with fallback. + + Resolves an empty language tag denoting the system + locale to LANGUAGE_SYSTEM and does not fallback. + */ + static LanguageType convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale ); + + /** If rString represents a valid BCP 47 language tag. + + Never resolves an empty tag to the system locale, in fact an empty + string is invalid here. Does not create an instance to be registered + with a conversion to Locale or LanguageType. + + @param o_pCanonicalized + If given and rString is a valid BCP 47 language tag, the + canonicalized form is assigned, which may differ from the + original string even if that was a valid tag. If rString is not + a valid tag, nothing is assigned. + + @param bDisallowPrivate + If TRUE, valid tags according to BCP 47 but reserved for + private use, like 'x-...', are not allowed and FALSE is + returned in this case. + */ + static bool isValidBcp47( const OUString& rString, OUString* o_pCanonicalized, + bool bDisallowPrivate = false ); + + /** If nLang is a generated on-the-fly LangID */ + static bool isOnTheFlyID( LanguageType nLang ); + static ScriptType getOnTheFlyScriptType( LanguageType nLang ); + + /** @ATTENTION: _ONLY_ to be called by the application's configuration! */ + static void setConfiguredSystemLanguage( LanguageType nLang ); + + /** @ATTENTION: _ONLY_ to be called by fuzzing setup */ + static void disable_lt_tag_parse(); + + typedef std::shared_ptr< LanguageTagImpl > ImplPtr; + +private: + + mutable css::lang::Locale maLocale; + mutable OUString maBcp47; + mutable LanguageType mnLangID; + mutable ImplPtr mpImpl; + bool mbSystemLocale : 1; + mutable bool mbInitializedBcp47 : 1; + mutable bool mbInitializedLocale : 1; + mutable bool mbInitializedLangID : 1; + bool mbIsFallback : 1; + + LanguageTagImpl* getImpl(); + LanguageTagImpl const* getImpl() const; + ImplPtr registerImpl() const; + void syncFromImpl(); + void syncVarsFromRawImpl() const; + void syncVarsFromImpl() const; + + void convertLocaleToLang(); + void convertBcp47ToLocale(); + void convertBcp47ToLang(); + void convertLangToLocale(); + + void convertFromRtlLocale(); + + /** Canonicalize if not yet done and synchronize initialized conversions. + + @return whether BCP 47 language tag string was changed. + */ + bool synCanonicalize(); + + void resetVars(); + + static bool isIsoLanguage( const OUString& rLanguage ); + static bool isIsoScript( const OUString& rScript ); + static bool isIsoCountry( const OUString& rRegion ); + +}; + +#endif // INCLUDED_I18NLANGTAG_LANGUAGETAG_HXX + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |