diff options
Diffstat (limited to 'intl/icu/source/common/loclikely.cpp')
-rw-r--r-- | intl/icu/source/common/loclikely.cpp | 1415 |
1 files changed, 1415 insertions, 0 deletions
diff --git a/intl/icu/source/common/loclikely.cpp b/intl/icu/source/common/loclikely.cpp new file mode 100644 index 0000000000..d2a05c6364 --- /dev/null +++ b/intl/icu/source/common/loclikely.cpp @@ -0,0 +1,1415 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1997-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: loclikely.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010feb25 +* created by: Markus W. Scherer +* +* Code for likely and minimized locale subtags, separated out from other .cpp files +* that then do not depend on resource bundle code and likely-subtags data. +*/ + +#include "unicode/bytestream.h" +#include "unicode/utypes.h" +#include "unicode/locid.h" +#include "unicode/putil.h" +#include "unicode/uchar.h" +#include "unicode/uloc.h" +#include "unicode/ures.h" +#include "unicode/uscript.h" +#include "bytesinkutil.h" +#include "charstr.h" +#include "cmemory.h" +#include "cstring.h" +#include "ulocimp.h" +#include "ustr_imp.h" + +/** + * These are the canonical strings for unknown languages, scripts and regions. + **/ +static const char* const unknownLanguage = "und"; +static const char* const unknownScript = "Zzzz"; +static const char* const unknownRegion = "ZZ"; + +/** + * This function looks for the localeID in the likelySubtags resource. + * + * @param localeID The tag to find. + * @param buffer A buffer to hold the matching entry + * @param bufferLength The length of the output buffer + * @return A pointer to "buffer" if found, or a null pointer if not. + */ +static const char* U_CALLCONV +findLikelySubtags(const char* localeID, + char* buffer, + int32_t bufferLength, + UErrorCode* err) { + const char* result = nullptr; + + if (!U_FAILURE(*err)) { + int32_t resLen = 0; + const char16_t* s = nullptr; + UErrorCode tmpErr = U_ZERO_ERROR; + icu::LocalUResourceBundlePointer subtags(ures_openDirect(nullptr, "likelySubtags", &tmpErr)); + if (U_SUCCESS(tmpErr)) { + icu::CharString und; + if (localeID != nullptr) { + if (*localeID == '\0') { + localeID = unknownLanguage; + } else if (*localeID == '_') { + und.append(unknownLanguage, *err); + und.append(localeID, *err); + if (U_FAILURE(*err)) { + return nullptr; + } + localeID = und.data(); + } + } + s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr); + + if (U_FAILURE(tmpErr)) { + /* + * If a resource is missing, it's not really an error, it's + * just that we don't have any data for that particular locale ID. + */ + if (tmpErr != U_MISSING_RESOURCE_ERROR) { + *err = tmpErr; + } + } + else if (resLen >= bufferLength) { + /* The buffer should never overflow. */ + *err = U_INTERNAL_PROGRAM_ERROR; + } + else { + u_UCharsToChars(s, buffer, resLen + 1); + if (resLen >= 3 && + uprv_strnicmp(buffer, unknownLanguage, 3) == 0 && + (resLen == 3 || buffer[3] == '_')) { + uprv_memmove(buffer, buffer + 3, resLen - 3 + 1); + } + result = buffer; + } + } else { + *err = tmpErr; + } + } + + return result; +} + +/** + * Append a tag to a buffer, adding the separator if necessary. The buffer + * must be large enough to contain the resulting tag plus any separator + * necessary. The tag must not be a zero-length string. + * + * @param tag The tag to add. + * @param tagLength The length of the tag. + * @param buffer The output buffer. + * @param bufferLength The length of the output buffer. This is an input/output parameter. + **/ +static void U_CALLCONV +appendTag( + const char* tag, + int32_t tagLength, + char* buffer, + int32_t* bufferLength, + UBool withSeparator) { + + if (withSeparator) { + buffer[*bufferLength] = '_'; + ++(*bufferLength); + } + + uprv_memmove( + &buffer[*bufferLength], + tag, + tagLength); + + *bufferLength += tagLength; +} + +/** + * Create a tag string from the supplied parameters. The lang, script and region + * parameters may be nullptr pointers. If they are, their corresponding length parameters + * must be less than or equal to 0. + * + * If any of the language, script or region parameters are empty, and the alternateTags + * parameter is not nullptr, it will be parsed for potential language, script and region tags + * to be used when constructing the new tag. If the alternateTags parameter is nullptr, or + * it contains no language tag, the default tag for the unknown language is used. + * + * If the length of the new string exceeds the capacity of the output buffer, + * the function copies as many bytes to the output buffer as it can, and returns + * the error U_BUFFER_OVERFLOW_ERROR. + * + * If an illegal argument is provided, the function returns the error + * U_ILLEGAL_ARGUMENT_ERROR. + * + * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if + * the tag string fits in the output buffer, but the null terminator doesn't. + * + * @param lang The language tag to use. + * @param langLength The length of the language tag. + * @param script The script tag to use. + * @param scriptLength The length of the script tag. + * @param region The region tag to use. + * @param regionLength The length of the region tag. + * @param trailing Any trailing data to append to the new tag. + * @param trailingLength The length of the trailing data. + * @param alternateTags A string containing any alternate tags. + * @param sink The output sink receiving the tag string. + * @param err A pointer to a UErrorCode for error reporting. + **/ +static void U_CALLCONV +createTagStringWithAlternates( + const char* lang, + int32_t langLength, + const char* script, + int32_t scriptLength, + const char* region, + int32_t regionLength, + const char* trailing, + int32_t trailingLength, + const char* alternateTags, + icu::ByteSink& sink, + UErrorCode* err) { + + if (U_FAILURE(*err)) { + goto error; + } + else if (langLength >= ULOC_LANG_CAPACITY || + scriptLength >= ULOC_SCRIPT_CAPACITY || + regionLength >= ULOC_COUNTRY_CAPACITY) { + goto error; + } + else { + /** + * ULOC_FULLNAME_CAPACITY will provide enough capacity + * that we can build a string that contains the language, + * script and region code without worrying about overrunning + * the user-supplied buffer. + **/ + char tagBuffer[ULOC_FULLNAME_CAPACITY]; + int32_t tagLength = 0; + UBool regionAppended = false; + + if (langLength > 0) { + appendTag( + lang, + langLength, + tagBuffer, + &tagLength, + /*withSeparator=*/false); + } + else if (alternateTags == nullptr) { + /* + * Use the empty string for an unknown language, if + * we found no language. + */ + } + else { + /* + * Parse the alternateTags string for the language. + */ + char alternateLang[ULOC_LANG_CAPACITY]; + int32_t alternateLangLength = sizeof(alternateLang); + + alternateLangLength = + uloc_getLanguage( + alternateTags, + alternateLang, + alternateLangLength, + err); + if(U_FAILURE(*err) || + alternateLangLength >= ULOC_LANG_CAPACITY) { + goto error; + } + else if (alternateLangLength == 0) { + /* + * Use the empty string for an unknown language, if + * we found no language. + */ + } + else { + appendTag( + alternateLang, + alternateLangLength, + tagBuffer, + &tagLength, + /*withSeparator=*/false); + } + } + + if (scriptLength > 0) { + appendTag( + script, + scriptLength, + tagBuffer, + &tagLength, + /*withSeparator=*/true); + } + else if (alternateTags != nullptr) { + /* + * Parse the alternateTags string for the script. + */ + char alternateScript[ULOC_SCRIPT_CAPACITY]; + + const int32_t alternateScriptLength = + uloc_getScript( + alternateTags, + alternateScript, + sizeof(alternateScript), + err); + + if (U_FAILURE(*err) || + alternateScriptLength >= ULOC_SCRIPT_CAPACITY) { + goto error; + } + else if (alternateScriptLength > 0) { + appendTag( + alternateScript, + alternateScriptLength, + tagBuffer, + &tagLength, + /*withSeparator=*/true); + } + } + + if (regionLength > 0) { + appendTag( + region, + regionLength, + tagBuffer, + &tagLength, + /*withSeparator=*/true); + + regionAppended = true; + } + else if (alternateTags != nullptr) { + /* + * Parse the alternateTags string for the region. + */ + char alternateRegion[ULOC_COUNTRY_CAPACITY]; + + const int32_t alternateRegionLength = + uloc_getCountry( + alternateTags, + alternateRegion, + sizeof(alternateRegion), + err); + if (U_FAILURE(*err) || + alternateRegionLength >= ULOC_COUNTRY_CAPACITY) { + goto error; + } + else if (alternateRegionLength > 0) { + appendTag( + alternateRegion, + alternateRegionLength, + tagBuffer, + &tagLength, + /*withSeparator=*/true); + + regionAppended = true; + } + } + + /** + * Copy the partial tag from our internal buffer to the supplied + * target. + **/ + sink.Append(tagBuffer, tagLength); + + if (trailingLength > 0) { + if (*trailing != '@') { + sink.Append("_", 1); + if (!regionAppended) { + /* extra separator is required */ + sink.Append("_", 1); + } + } + + /* + * Copy the trailing data into the supplied buffer. + */ + sink.Append(trailing, trailingLength); + } + + return; + } + +error: + + /** + * An overflow indicates the locale ID passed in + * is ill-formed. If we got here, and there was + * no previous error, it's an implicit overflow. + **/ + if (*err == U_BUFFER_OVERFLOW_ERROR || + U_SUCCESS(*err)) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } +} + +/** + * Create a tag string from the supplied parameters. The lang, script and region + * parameters may be nullptr pointers. If they are, their corresponding length parameters + * must be less than or equal to 0. If the lang parameter is an empty string, the + * default value for an unknown language is written to the output buffer. + * + * If the length of the new string exceeds the capacity of the output buffer, + * the function copies as many bytes to the output buffer as it can, and returns + * the error U_BUFFER_OVERFLOW_ERROR. + * + * If an illegal argument is provided, the function returns the error + * U_ILLEGAL_ARGUMENT_ERROR. + * + * @param lang The language tag to use. + * @param langLength The length of the language tag. + * @param script The script tag to use. + * @param scriptLength The length of the script tag. + * @param region The region tag to use. + * @param regionLength The length of the region tag. + * @param trailing Any trailing data to append to the new tag. + * @param trailingLength The length of the trailing data. + * @param sink The output sink receiving the tag string. + * @param err A pointer to a UErrorCode for error reporting. + **/ +static void U_CALLCONV +createTagString( + const char* lang, + int32_t langLength, + const char* script, + int32_t scriptLength, + const char* region, + int32_t regionLength, + const char* trailing, + int32_t trailingLength, + icu::ByteSink& sink, + UErrorCode* err) +{ + createTagStringWithAlternates( + lang, + langLength, + script, + scriptLength, + region, + regionLength, + trailing, + trailingLength, + nullptr, + sink, + err); +} + +/** + * Parse the language, script, and region subtags from a tag string, and copy the + * results into the corresponding output parameters. The buffers are null-terminated, + * unless overflow occurs. + * + * The langLength, scriptLength, and regionLength parameters are input/output + * parameters, and must contain the capacity of their corresponding buffers on + * input. On output, they will contain the actual length of the buffers, not + * including the null terminator. + * + * If the length of any of the output subtags exceeds the capacity of the corresponding + * buffer, the function copies as many bytes to the output buffer as it can, and returns + * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow + * occurs. + * + * If an illegal argument is provided, the function returns the error + * U_ILLEGAL_ARGUMENT_ERROR. + * + * @param localeID The locale ID to parse. + * @param lang The language tag buffer. + * @param langLength The length of the language tag. + * @param script The script tag buffer. + * @param scriptLength The length of the script tag. + * @param region The region tag buffer. + * @param regionLength The length of the region tag. + * @param err A pointer to a UErrorCode for error reporting. + * @return The number of chars of the localeID parameter consumed. + **/ +static int32_t U_CALLCONV +parseTagString( + const char* localeID, + char* lang, + int32_t* langLength, + char* script, + int32_t* scriptLength, + char* region, + int32_t* regionLength, + UErrorCode* err) +{ + const char* position = localeID; + int32_t subtagLength = 0; + + if(U_FAILURE(*err) || + localeID == nullptr || + lang == nullptr || + langLength == nullptr || + script == nullptr || + scriptLength == nullptr || + region == nullptr || + regionLength == nullptr) { + goto error; + } + + subtagLength = ulocimp_getLanguage(position, &position, *err).extract(lang, *langLength, *err); + + /* + * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING + * to be an error, because it indicates the user-supplied tag is + * not well-formed. + */ + if(U_FAILURE(*err)) { + goto error; + } + + *langLength = subtagLength; + + /* + * If no language was present, use the empty string instead. + * Otherwise, move past any separator. + */ + if (_isIDSeparator(*position)) { + ++position; + } + + subtagLength = ulocimp_getScript(position, &position, *err).extract(script, *scriptLength, *err); + + if(U_FAILURE(*err)) { + goto error; + } + + *scriptLength = subtagLength; + + if (*scriptLength > 0) { + if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) { + /** + * If the script part is the "unknown" script, then don't return it. + **/ + *scriptLength = 0; + } + + /* + * Move past any separator. + */ + if (_isIDSeparator(*position)) { + ++position; + } + } + + subtagLength = ulocimp_getCountry(position, &position, *err).extract(region, *regionLength, *err); + + if(U_FAILURE(*err)) { + goto error; + } + + *regionLength = subtagLength; + + if (*regionLength > 0) { + if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) { + /** + * If the region part is the "unknown" region, then don't return it. + **/ + *regionLength = 0; + } + } else if (*position != 0 && *position != '@') { + /* back up over consumed trailing separator */ + --position; + } + +exit: + + return (int32_t)(position - localeID); + +error: + + /** + * If we get here, we have no explicit error, it's the result of an + * illegal argument. + **/ + if (!U_FAILURE(*err)) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } + + goto exit; +} + +static UBool U_CALLCONV +createLikelySubtagsString( + const char* lang, + int32_t langLength, + const char* script, + int32_t scriptLength, + const char* region, + int32_t regionLength, + const char* variants, + int32_t variantsLength, + icu::ByteSink& sink, + UErrorCode* err) { + /** + * ULOC_FULLNAME_CAPACITY will provide enough capacity + * that we can build a string that contains the language, + * script and region code without worrying about overrunning + * the user-supplied buffer. + **/ + char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY]; + + if(U_FAILURE(*err)) { + goto error; + } + + /** + * Try the language with the script and region first. + **/ + if (scriptLength > 0 && regionLength > 0) { + + const char* likelySubtags = nullptr; + + icu::CharString tagBuffer; + { + icu::CharStringByteSink sink(&tagBuffer); + createTagString( + lang, + langLength, + script, + scriptLength, + region, + regionLength, + nullptr, + 0, + sink, + err); + } + if(U_FAILURE(*err)) { + goto error; + } + + likelySubtags = + findLikelySubtags( + tagBuffer.data(), + likelySubtagsBuffer, + sizeof(likelySubtagsBuffer), + err); + if(U_FAILURE(*err)) { + goto error; + } + + if (likelySubtags != nullptr) { + /* Always use the language tag from the + maximal string, since it may be more + specific than the one provided. */ + createTagStringWithAlternates( + nullptr, + 0, + nullptr, + 0, + nullptr, + 0, + variants, + variantsLength, + likelySubtags, + sink, + err); + return true; + } + } + + /** + * Try the language with just the script. + **/ + if (scriptLength > 0) { + + const char* likelySubtags = nullptr; + + icu::CharString tagBuffer; + { + icu::CharStringByteSink sink(&tagBuffer); + createTagString( + lang, + langLength, + script, + scriptLength, + nullptr, + 0, + nullptr, + 0, + sink, + err); + } + if(U_FAILURE(*err)) { + goto error; + } + + likelySubtags = + findLikelySubtags( + tagBuffer.data(), + likelySubtagsBuffer, + sizeof(likelySubtagsBuffer), + err); + if(U_FAILURE(*err)) { + goto error; + } + + if (likelySubtags != nullptr) { + /* Always use the language tag from the + maximal string, since it may be more + specific than the one provided. */ + createTagStringWithAlternates( + nullptr, + 0, + nullptr, + 0, + region, + regionLength, + variants, + variantsLength, + likelySubtags, + sink, + err); + return true; + } + } + + /** + * Try the language with just the region. + **/ + if (regionLength > 0) { + + const char* likelySubtags = nullptr; + + icu::CharString tagBuffer; + { + icu::CharStringByteSink sink(&tagBuffer); + createTagString( + lang, + langLength, + nullptr, + 0, + region, + regionLength, + nullptr, + 0, + sink, + err); + } + if(U_FAILURE(*err)) { + goto error; + } + + likelySubtags = + findLikelySubtags( + tagBuffer.data(), + likelySubtagsBuffer, + sizeof(likelySubtagsBuffer), + err); + if(U_FAILURE(*err)) { + goto error; + } + + if (likelySubtags != nullptr) { + /* Always use the language tag from the + maximal string, since it may be more + specific than the one provided. */ + createTagStringWithAlternates( + nullptr, + 0, + script, + scriptLength, + nullptr, + 0, + variants, + variantsLength, + likelySubtags, + sink, + err); + return true; + } + } + + /** + * Finally, try just the language. + **/ + { + const char* likelySubtags = nullptr; + + icu::CharString tagBuffer; + { + icu::CharStringByteSink sink(&tagBuffer); + createTagString( + lang, + langLength, + nullptr, + 0, + nullptr, + 0, + nullptr, + 0, + sink, + err); + } + if(U_FAILURE(*err)) { + goto error; + } + + likelySubtags = + findLikelySubtags( + tagBuffer.data(), + likelySubtagsBuffer, + sizeof(likelySubtagsBuffer), + err); + if(U_FAILURE(*err)) { + goto error; + } + + if (likelySubtags != nullptr) { + /* Always use the language tag from the + maximal string, since it may be more + specific than the one provided. */ + createTagStringWithAlternates( + nullptr, + 0, + script, + scriptLength, + region, + regionLength, + variants, + variantsLength, + likelySubtags, + sink, + err); + return true; + } + } + + return false; + +error: + + if (!U_FAILURE(*err)) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } + + return false; +} + +#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \ + int32_t count = 0; \ + int32_t i; \ + for (i = 0; i < trailingLength; i++) { \ + if (trailing[i] == '-' || trailing[i] == '_') { \ + count = 0; \ + if (count > 8) { \ + goto error; \ + } \ + } else if (trailing[i] == '@') { \ + break; \ + } else if (count > 8) { \ + goto error; \ + } else { \ + count++; \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +static UBool +_uloc_addLikelySubtags(const char* localeID, + icu::ByteSink& sink, + UErrorCode* err) { + char lang[ULOC_LANG_CAPACITY]; + int32_t langLength = sizeof(lang); + char script[ULOC_SCRIPT_CAPACITY]; + int32_t scriptLength = sizeof(script); + char region[ULOC_COUNTRY_CAPACITY]; + int32_t regionLength = sizeof(region); + const char* trailing = ""; + int32_t trailingLength = 0; + int32_t trailingIndex = 0; + UBool success = false; + + if(U_FAILURE(*err)) { + goto error; + } + if (localeID == nullptr) { + goto error; + } + + trailingIndex = parseTagString( + localeID, + lang, + &langLength, + script, + &scriptLength, + region, + ®ionLength, + err); + if(U_FAILURE(*err)) { + /* Overflow indicates an illegal argument error */ + if (*err == U_BUFFER_OVERFLOW_ERROR) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } + + goto error; + } + + /* Find the length of the trailing portion. */ + while (_isIDSeparator(localeID[trailingIndex])) { + trailingIndex++; + } + trailing = &localeID[trailingIndex]; + trailingLength = (int32_t)uprv_strlen(trailing); + + CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength); + + success = + createLikelySubtagsString( + lang, + langLength, + script, + scriptLength, + region, + regionLength, + trailing, + trailingLength, + sink, + err); + + if (!success) { + const int32_t localIDLength = (int32_t)uprv_strlen(localeID); + + /* + * If we get here, we need to return localeID. + */ + sink.Append(localeID, localIDLength); + } + + return success; + +error: + + if (!U_FAILURE(*err)) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } + return false; +} + +// Add likely subtags to the sink +// return true if the value in the sink is produced by a match during the lookup +// return false if the value in the sink is the same as input because there are +// no match after the lookup. +static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*); + +static void +_uloc_minimizeSubtags(const char* localeID, + icu::ByteSink& sink, + UErrorCode* err) { + icu::CharString maximizedTagBuffer; + + char lang[ULOC_LANG_CAPACITY]; + int32_t langLength = sizeof(lang); + char script[ULOC_SCRIPT_CAPACITY]; + int32_t scriptLength = sizeof(script); + char region[ULOC_COUNTRY_CAPACITY]; + int32_t regionLength = sizeof(region); + const char* trailing = ""; + int32_t trailingLength = 0; + int32_t trailingIndex = 0; + UBool successGetMax = false; + + if(U_FAILURE(*err)) { + goto error; + } + else if (localeID == nullptr) { + goto error; + } + + trailingIndex = + parseTagString( + localeID, + lang, + &langLength, + script, + &scriptLength, + region, + ®ionLength, + err); + if(U_FAILURE(*err)) { + + /* Overflow indicates an illegal argument error */ + if (*err == U_BUFFER_OVERFLOW_ERROR) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } + + goto error; + } + + /* Find the spot where the variants or the keywords begin, if any. */ + while (_isIDSeparator(localeID[trailingIndex])) { + trailingIndex++; + } + trailing = &localeID[trailingIndex]; + trailingLength = (int32_t)uprv_strlen(trailing); + + CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength); + + { + icu::CharString base; + { + icu::CharStringByteSink baseSink(&base); + createTagString( + lang, + langLength, + script, + scriptLength, + region, + regionLength, + nullptr, + 0, + baseSink, + err); + } + + /** + * First, we need to first get the maximization + * from AddLikelySubtags. + **/ + { + icu::CharStringByteSink maxSink(&maximizedTagBuffer); + successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err); + } + } + + if(U_FAILURE(*err)) { + goto error; + } + + if (!successGetMax) { + /** + * If we got here, return the locale ID parameter unchanged. + **/ + const int32_t localeIDLength = (int32_t)uprv_strlen(localeID); + sink.Append(localeID, localeIDLength); + return; + } + + // In the following, the lang, script, region are referring to those in + // the maximizedTagBuffer, not the one in the localeID. + langLength = sizeof(lang); + scriptLength = sizeof(script); + regionLength = sizeof(region); + parseTagString( + maximizedTagBuffer.data(), + lang, + &langLength, + script, + &scriptLength, + region, + ®ionLength, + err); + if(U_FAILURE(*err)) { + goto error; + } + + /** + * Start first with just the language. + **/ + { + icu::CharString tagBuffer; + { + icu::CharStringByteSink tagSink(&tagBuffer); + createLikelySubtagsString( + lang, + langLength, + nullptr, + 0, + nullptr, + 0, + nullptr, + 0, + tagSink, + err); + } + + if(U_FAILURE(*err)) { + goto error; + } + else if (!tagBuffer.isEmpty() && + uprv_strnicmp( + maximizedTagBuffer.data(), + tagBuffer.data(), + tagBuffer.length()) == 0) { + + createTagString( + lang, + langLength, + nullptr, + 0, + nullptr, + 0, + trailing, + trailingLength, + sink, + err); + return; + } + } + + /** + * Next, try the language and region. + **/ + if (regionLength > 0) { + + icu::CharString tagBuffer; + { + icu::CharStringByteSink tagSink(&tagBuffer); + createLikelySubtagsString( + lang, + langLength, + nullptr, + 0, + region, + regionLength, + nullptr, + 0, + tagSink, + err); + } + + if(U_FAILURE(*err)) { + goto error; + } + else if (!tagBuffer.isEmpty() && + uprv_strnicmp( + maximizedTagBuffer.data(), + tagBuffer.data(), + tagBuffer.length()) == 0) { + + createTagString( + lang, + langLength, + nullptr, + 0, + region, + regionLength, + trailing, + trailingLength, + sink, + err); + return; + } + } + + /** + * Finally, try the language and script. This is our last chance, + * since trying with all three subtags would only yield the + * maximal version that we already have. + **/ + if (scriptLength > 0) { + icu::CharString tagBuffer; + { + icu::CharStringByteSink tagSink(&tagBuffer); + createLikelySubtagsString( + lang, + langLength, + script, + scriptLength, + nullptr, + 0, + nullptr, + 0, + tagSink, + err); + } + + if(U_FAILURE(*err)) { + goto error; + } + else if (!tagBuffer.isEmpty() && + uprv_strnicmp( + maximizedTagBuffer.data(), + tagBuffer.data(), + tagBuffer.length()) == 0) { + + createTagString( + lang, + langLength, + script, + scriptLength, + nullptr, + 0, + trailing, + trailingLength, + sink, + err); + return; + } + } + + { + /** + * If we got here, return the max + trail. + **/ + createTagString( + lang, + langLength, + script, + scriptLength, + region, + regionLength, + trailing, + trailingLength, + sink, + err); + return; + } + +error: + + if (!U_FAILURE(*err)) { + *err = U_ILLEGAL_ARGUMENT_ERROR; + } +} + +static int32_t +do_canonicalize(const char* localeID, + char* buffer, + int32_t bufferCapacity, + UErrorCode* err) +{ + int32_t canonicalizedSize = uloc_canonicalize( + localeID, + buffer, + bufferCapacity, + err); + + if (*err == U_STRING_NOT_TERMINATED_WARNING || + *err == U_BUFFER_OVERFLOW_ERROR) { + return canonicalizedSize; + } + else if (U_FAILURE(*err)) { + + return -1; + } + else { + return canonicalizedSize; + } +} + +U_CAPI int32_t U_EXPORT2 +uloc_addLikelySubtags(const char* localeID, + char* maximizedLocaleID, + int32_t maximizedLocaleIDCapacity, + UErrorCode* status) { + if (U_FAILURE(*status)) { + return 0; + } + + icu::CheckedArrayByteSink sink( + maximizedLocaleID, maximizedLocaleIDCapacity); + + ulocimp_addLikelySubtags(localeID, sink, status); + int32_t reslen = sink.NumberOfBytesAppended(); + + if (U_FAILURE(*status)) { + return sink.Overflowed() ? reslen : -1; + } + + if (sink.Overflowed()) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + u_terminateChars( + maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status); + } + + return reslen; +} + +static UBool +_ulocimp_addLikelySubtags(const char* localeID, + icu::ByteSink& sink, + UErrorCode* status) { + PreflightingLocaleIDBuffer localeBuffer; + do { + localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(), + localeBuffer.getCapacity(), status); + } while (localeBuffer.needToTryAgain(status)); + + if (U_SUCCESS(*status)) { + return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status); + } else { + return false; + } +} + +U_CAPI void U_EXPORT2 +ulocimp_addLikelySubtags(const char* localeID, + icu::ByteSink& sink, + UErrorCode* status) { + _ulocimp_addLikelySubtags(localeID, sink, status); +} + +U_CAPI int32_t U_EXPORT2 +uloc_minimizeSubtags(const char* localeID, + char* minimizedLocaleID, + int32_t minimizedLocaleIDCapacity, + UErrorCode* status) { + if (U_FAILURE(*status)) { + return 0; + } + + icu::CheckedArrayByteSink sink( + minimizedLocaleID, minimizedLocaleIDCapacity); + + ulocimp_minimizeSubtags(localeID, sink, status); + int32_t reslen = sink.NumberOfBytesAppended(); + + if (U_FAILURE(*status)) { + return sink.Overflowed() ? reslen : -1; + } + + if (sink.Overflowed()) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + u_terminateChars( + minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status); + } + + return reslen; +} + +U_CAPI void U_EXPORT2 +ulocimp_minimizeSubtags(const char* localeID, + icu::ByteSink& sink, + UErrorCode* status) { + PreflightingLocaleIDBuffer localeBuffer; + do { + localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(), + localeBuffer.getCapacity(), status); + } while (localeBuffer.needToTryAgain(status)); + + _uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status); +} + +// Pairs of (language subtag, + or -) for finding out fast if common languages +// are LTR (minus) or RTL (plus). +static const char LANG_DIR_STRING[] = + "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-"; + +// Implemented here because this calls ulocimp_addLikelySubtags(). +U_CAPI UBool U_EXPORT2 +uloc_isRightToLeft(const char *locale) { + UErrorCode errorCode = U_ZERO_ERROR; + char script[8]; + int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode); + if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING || + scriptLength == 0) { + // Fastpath: We know the likely scripts and their writing direction + // for some common languages. + errorCode = U_ZERO_ERROR; + char lang[8]; + int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode); + if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) { + return false; + } + if (langLength > 0) { + const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang); + if (langPtr != nullptr) { + switch (langPtr[langLength]) { + case '-': return false; + case '+': return true; + default: break; // partial match of a longer code + } + } + } + // Otherwise, find the likely script. + errorCode = U_ZERO_ERROR; + icu::CharString likely; + { + icu::CharStringByteSink sink(&likely); + ulocimp_addLikelySubtags(locale, sink, &errorCode); + } + if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) { + return false; + } + scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode); + if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING || + scriptLength == 0) { + return false; + } + } + UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script); + return uscript_isRightToLeft(scriptCode); +} + +U_NAMESPACE_BEGIN + +UBool +Locale::isRightToLeft() const { + return uloc_isRightToLeft(getBaseName()); +} + +U_NAMESPACE_END + +// The following must at least allow for rg key value (6) plus terminator (1). +#define ULOC_RG_BUFLEN 8 + +U_CAPI int32_t U_EXPORT2 +ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion, + char *region, int32_t regionCapacity, UErrorCode* status) { + if (U_FAILURE(*status)) { + return 0; + } + char rgBuf[ULOC_RG_BUFLEN]; + UErrorCode rgStatus = U_ZERO_ERROR; + + // First check for rg keyword value + int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus); + if (U_FAILURE(rgStatus) || rgLen != 6) { + rgLen = 0; + } else { + // rgBuf guaranteed to be zero terminated here, with text len 6 + char *rgPtr = rgBuf; + for (; *rgPtr!= 0; rgPtr++) { + *rgPtr = uprv_toupper(*rgPtr); + } + rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0; + } + + if (rgLen == 0) { + // No valid rg keyword value, try for unicode_region_subtag + rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status); + if (U_FAILURE(*status)) { + rgLen = 0; + } else if (rgLen == 0 && inferRegion) { + // no unicode_region_subtag but inferRegion true, try likely subtags + rgStatus = U_ZERO_ERROR; + icu::CharString locBuf; + { + icu::CharStringByteSink sink(&locBuf); + ulocimp_addLikelySubtags(localeID, sink, &rgStatus); + } + if (U_SUCCESS(rgStatus)) { + rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status); + if (U_FAILURE(*status)) { + rgLen = 0; + } + } + } + } + + rgBuf[rgLen] = 0; + uprv_strncpy(region, rgBuf, regionCapacity); + return u_terminateChars(region, regionCapacity, rgLen, status); +} + |