diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /intl/components/src | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr. (upstream/115.7.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/components/src')
65 files changed, 15246 insertions, 0 deletions
diff --git a/intl/components/src/Bidi.cpp b/intl/components/src/Bidi.cpp new file mode 100644 index 0000000000..2ce355c8eb --- /dev/null +++ b/intl/components/src/Bidi.cpp @@ -0,0 +1,138 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/Bidi.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" + +#include "unicode/ubidi.h" + +namespace mozilla::intl { + +Bidi::Bidi() { mBidi = ubidi_open(); } +Bidi::~Bidi() { ubidi_close(mBidi.GetMut()); } + +ICUResult Bidi::SetParagraph(Span<const char16_t> aParagraph, + BidiEmbeddingLevel aLevel) { + // Do not allow any reordering of the runs, as this can change the + // performance characteristics of working with runs. In the default mode, + // the levels can be iterated over directly, rather than relying on computing + // logical runs on the fly. This can have negative performance characteristics + // compared to iterating over the levels. + // + // In the UBIDI_REORDER_RUNS_ONLY the levels are encoded with additional + // information which can be safely ignored in this Bidi implementation. + // Note that this check is here since setting the mode must be done before + // calls to setting the paragraph. 
+ MOZ_ASSERT(ubidi_getReorderingMode(mBidi.GetMut()) == UBIDI_REORDER_DEFAULT); + + UErrorCode status = U_ZERO_ERROR; + ubidi_setPara(mBidi.GetMut(), aParagraph.Elements(), + AssertedCast<int32_t>(aParagraph.Length()), aLevel, nullptr, + &status); + + mLevels = nullptr; + + return ToICUResult(status); +} + +Bidi::ParagraphDirection Bidi::GetParagraphDirection() const { + switch (ubidi_getDirection(mBidi.GetConst())) { + case UBIDI_LTR: + return Bidi::ParagraphDirection::LTR; + case UBIDI_RTL: + return Bidi::ParagraphDirection::RTL; + case UBIDI_MIXED: + return Bidi::ParagraphDirection::Mixed; + case UBIDI_NEUTRAL: + // This is only used in `ubidi_getBaseDirection` which is unused in this + // API. + MOZ_ASSERT_UNREACHABLE("Unexpected UBiDiDirection value."); + }; + return Bidi::ParagraphDirection::Mixed; +} + +/* static */ +void Bidi::ReorderVisual(const BidiEmbeddingLevel* aLevels, int32_t aLength, + int32_t* aIndexMap) { + ubidi_reorderVisual(reinterpret_cast<const uint8_t*>(aLevels), aLength, + aIndexMap); +} + +/* static */ +Bidi::BaseDirection Bidi::GetBaseDirection(Span<const char16_t> aParagraph) { + UBiDiDirection direction = ubidi_getBaseDirection( + aParagraph.Elements(), AssertedCast<int32_t>(aParagraph.Length())); + + switch (direction) { + case UBIDI_LTR: + return Bidi::BaseDirection::LTR; + case UBIDI_RTL: + return Bidi::BaseDirection::RTL; + case UBIDI_NEUTRAL: + return Bidi::BaseDirection::Neutral; + case UBIDI_MIXED: + MOZ_ASSERT_UNREACHABLE("Unexpected UBiDiDirection value."); + } + + return Bidi::BaseDirection::Neutral; +} + +static BidiDirection ToBidiDirection(UBiDiDirection aDirection) { + switch (aDirection) { + case UBIDI_LTR: + return BidiDirection::LTR; + case UBIDI_RTL: + return BidiDirection::RTL; + case UBIDI_MIXED: + case UBIDI_NEUTRAL: + MOZ_ASSERT_UNREACHABLE("Unexpected UBiDiDirection value."); + } + return BidiDirection::LTR; +} + +Result<int32_t, ICUError> Bidi::CountRuns() { + UErrorCode status = U_ZERO_ERROR; + int32_t runCount 
= ubidi_countRuns(mBidi.GetMut(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + mLength = ubidi_getProcessedLength(mBidi.GetConst()); + mLevels = mLength > 0 ? reinterpret_cast<const BidiEmbeddingLevel*>( + ubidi_getLevels(mBidi.GetMut(), &status)) + : nullptr; + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return runCount; +} + +void Bidi::GetLogicalRun(int32_t aLogicalStart, int32_t* aLogicalLimitOut, + BidiEmbeddingLevel* aLevelOut) { + MOZ_ASSERT(mLevels, "CountRuns hasn't been run?"); + MOZ_RELEASE_ASSERT(aLogicalStart < mLength, "Out of bound"); + BidiEmbeddingLevel level = mLevels[aLogicalStart]; + int32_t limit; + for (limit = aLogicalStart + 1; limit < mLength; limit++) { + if (mLevels[limit] != level) { + break; + } + } + *aLogicalLimitOut = limit; + *aLevelOut = level; +} + +BidiEmbeddingLevel Bidi::GetParagraphEmbeddingLevel() const { + return BidiEmbeddingLevel(ubidi_getParaLevel(mBidi.GetConst())); +} + +BidiDirection Bidi::GetVisualRun(int32_t aRunIndex, int32_t* aLogicalStart, + int32_t* aLength) { + return ToBidiDirection( + ubidi_getVisualRun(mBidi.GetMut(), aRunIndex, aLogicalStart, aLength)); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Bidi.h b/intl/components/src/Bidi.h new file mode 100644 index 0000000000..9b7fba73e2 --- /dev/null +++ b/intl/components/src/Bidi.h @@ -0,0 +1,160 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Bidi_h_ +#define intl_components_Bidi_h_ + +#include "mozilla/intl/BidiEmbeddingLevel.h" +#include "mozilla/intl/ICU4CGlue.h" + +struct UBiDi; + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with bidirectional (bidi) + * text. Text is commonly displayed left to right (LTR), especially for + * Latin-based alphabets. 
However, languages like Arabic and Hebrew display + * text right to left (RTL). When displaying text, LTR and RTL text can be + * combined together in the same paragraph. This class gives tools for working + * with unidirectional and mixed-direction paragraphs. + * + * See the Unicode Bidirectional Algorithm document for implementation details: + * https://unicode.org/reports/tr9/ + */ +class Bidi final { + public: + Bidi(); + ~Bidi(); + + // Not copyable or movable + Bidi(const Bidi&) = delete; + Bidi& operator=(const Bidi&) = delete; + + /** + * This enum indicates the text direction for the set paragraph. Some + * paragraphs are unidirectional, where they only have one direction, or a + * paragraph could use both LTR and RTL. In this case the paragraph's + * direction would be mixed. + */ + enum class ParagraphDirection { LTR, RTL, Mixed }; + + /** + * Set the current paragraph of text to analyze for its bidi properties. This + * performs the Unicode bidi algorithm as specified by: + * https://unicode.org/reports/tr9/ + * + * After setting the text, the other getter methods can be used to find out + * the directionality of the paragraph text. + */ + ICUResult SetParagraph(Span<const char16_t> aParagraph, + BidiEmbeddingLevel aLevel); + + /** + * Get the embedding level for the paragraph that was set by SetParagraph. + */ + BidiEmbeddingLevel GetParagraphEmbeddingLevel() const; + + /** + * Get the directionality of the paragraph text that was set by SetParagraph. + */ + ParagraphDirection GetParagraphDirection() const; + + /** + * Get the number of runs. This function may invoke the actual reordering on + * the Bidi object, after SetParagraph may have resolved only the levels of + * the text. Therefore, `CountRuns` may have to allocate memory, and may fail + * doing so. + */ + Result<int32_t, ICUError> CountRuns(); + + /** + * Get the next logical run. A logical run is a run of text that has the + * same directionality and embedding level. 
These runs are in memory order, + * and not in display order. + * + * Important! `Bidi::CountRuns` must be called before calling this method. + * + * @param aLogicalStart is the offset into the paragraph text that marks the + * logical start of the text. + * @param aLogicalLimitOut is an out param that is the offset just past the + * end of the logical run (a limit, not the run's length). + * @param aLevelOut is an out parameter that returns the embedding level for + * the run + */ + void GetLogicalRun(int32_t aLogicalStart, int32_t* aLogicalLimitOut, + BidiEmbeddingLevel* aLevelOut); + + /** + * This is a convenience function that does not use the ICU Bidi object. + * It is intended to be used when an application has determined the + * embedding levels of objects (character sequences) and just needs to have + * them reordered (L2). + * + * @param aLevels is an array with `aLength` levels that have been + * determined by the application. + * + * @param aLength is the number of levels in the array, or, semantically, + * the number of objects to be reordered. It must be greater than 0. + * + * @param aIndexMap is a pointer to an array of `aLength` + * indexes which will reflect the reordering of the characters. + * The array does not need to be initialized. + * The index map will result in + * `aIndexMap[aVisualIndex]==aLogicalIndex`. + */ + static void ReorderVisual(const BidiEmbeddingLevel* aLevels, int32_t aLength, + int32_t* aIndexMap); + + /** + * This enum indicates the bidi character type of the first strong character + * for the set paragraph. + * LTR: bidi character type 'L'. + * RTL: bidi character type 'R' or 'AL'. + * Neutral: The rest of bidi character types. + */ + enum class BaseDirection { LTR, RTL, Neutral }; + + /** + * Get the base direction of the paragraph. + */ + static BaseDirection GetBaseDirection(Span<const char16_t> aParagraph); + + /** + * Get one run's logical start, length, and directionality. 
In an RTL run, the + * character at the logical start is visually on the right of the displayed + * run. The length is the number of characters in the run. + * `Bidi::CountRuns` should be called before the runs are retrieved. + * + * @param aRunIndex is the number of the run in visual order, in the + * range `[0..CountRuns-1]`. + * + * @param aLogicalStart is the first logical character index in the text. + * The pointer may be `nullptr` if this index is not needed. + * + * @param aLength is the number of characters (at least one) in the run. + * The pointer may be `nullptr` if this is not needed. + * + * Note that in right-to-left runs, the code places modifier letters before + * base characters and second surrogates before first ones. + */ + BidiDirection GetVisualRun(int32_t aRunIndex, int32_t* aLogicalStart, + int32_t* aLength); + + private: + ICUPointer<UBiDi> mBidi = ICUPointer<UBiDi>(nullptr); + + /** + * An array of levels that is the same length as the paragraph from + * `Bidi::SetParagraph`. + */ + const BidiEmbeddingLevel* mLevels = nullptr; + + /** + * The length of the paragraph from `Bidi::SetParagraph`. + */ + int32_t mLength = 0; +}; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/BidiClass.h b/intl/components/src/BidiClass.h new file mode 100644 index 0000000000..04a861a382 --- /dev/null +++ b/intl/components/src/BidiClass.h @@ -0,0 +1,49 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef intl_components_BidiClass_h_ +#define intl_components_BidiClass_h_ + +#include <cstdint> + +namespace mozilla::intl { + +/** + * Read ftp://ftp.unicode.org/Public/UNIDATA/ReadMe-Latest.txt + * section BIDIRECTIONAL PROPERTIES + * for the detailed definition of the following categories + * + * The values here must match the equivalents in %bidicategorycode in + * mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl, + * and must also match the values used by ICU's UCharDirection. + */ +enum class BidiClass : uint8_t { + LeftToRight = 0, + RightToLeft = 1, + EuropeanNumber = 2, + EuropeanNumberSeparator = 3, + EuropeanNumberTerminator = 4, + ArabicNumber = 5, + CommonNumberSeparator = 6, + BlockSeparator = 7, + SegmentSeparator = 8, + WhiteSpaceNeutral = 9, + OtherNeutral = 10, + LeftToRightEmbedding = 11, + LeftToRightOverride = 12, + RightToLeftArabic = 13, + RightToLeftEmbedding = 14, + RightToLeftOverride = 15, + PopDirectionalFormat = 16, + DirNonSpacingMark = 17, + BoundaryNeutral = 18, + FirstStrongIsolate = 19, + LeftToRightIsolate = 20, + RightToLeftIsolate = 21, + PopDirectionalIsolate = 22, + BidiClassCount +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/BidiEmbeddingLevel.cpp b/intl/components/src/BidiEmbeddingLevel.cpp new file mode 100644 index 0000000000..d3ef5da937 --- /dev/null +++ b/intl/components/src/BidiEmbeddingLevel.cpp @@ -0,0 +1,53 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/BidiEmbeddingLevel.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" + +#include "unicode/ubidi.h" + +namespace mozilla::intl { + +bool BidiEmbeddingLevel::IsDefaultLTR() const { + return mValue == UBIDI_DEFAULT_LTR; +}; + +bool BidiEmbeddingLevel::IsDefaultRTL() const { + return mValue == UBIDI_DEFAULT_RTL; +}; + +bool BidiEmbeddingLevel::IsRTL() const { + // If the least significant bit is 1, then the embedding level + // is right-to-left. + // If the least significant bit is 0, then the embedding level + // is left-to-right. + return (mValue & 0x1) == 1; +}; + +bool BidiEmbeddingLevel::IsLTR() const { return !IsRTL(); }; + +bool BidiEmbeddingLevel::IsSameDirection(BidiEmbeddingLevel aOther) const { + return (((mValue ^ aOther) & 1) == 0); +} + +BidiEmbeddingLevel BidiEmbeddingLevel::LTR() { return BidiEmbeddingLevel(0); }; + +BidiEmbeddingLevel BidiEmbeddingLevel::RTL() { return BidiEmbeddingLevel(1); }; + +BidiEmbeddingLevel BidiEmbeddingLevel::DefaultLTR() { + return BidiEmbeddingLevel(UBIDI_DEFAULT_LTR); +}; + +BidiEmbeddingLevel BidiEmbeddingLevel::DefaultRTL() { + return BidiEmbeddingLevel(UBIDI_DEFAULT_RTL); +}; + +BidiDirection BidiEmbeddingLevel::Direction() { + return IsRTL() ? BidiDirection::RTL : BidiDirection::LTR; +}; + +uint8_t BidiEmbeddingLevel::Value() const { return mValue; } + +} // namespace mozilla::intl diff --git a/intl/components/src/BidiEmbeddingLevel.h b/intl/components/src/BidiEmbeddingLevel.h new file mode 100644 index 0000000000..1628b6392f --- /dev/null +++ b/intl/components/src/BidiEmbeddingLevel.h @@ -0,0 +1,113 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef intl_components_BidiEmbeddingLevel_h_ +#define intl_components_BidiEmbeddingLevel_h_ + +#include <cstdint> + +/** + * This file has the BidiEmbeddingLevel and BidiDirection enum broken out from + * the main Bidi class for faster includes. This code is used in Layout which + * could trigger long build times when changing core mozilla::intl files. + */ +namespace mozilla::intl { + +/** + * This enum unambiguously classifies text runs as either being left to right, + * or right to left. + */ +enum class BidiDirection : uint8_t { + // Left to right text. + LTR = 0, + // Right to left text. + RTL = 1, +}; + +/** + * Embedding levels are numbers that indicate how deeply the bidi text is + * embedded, and the direction of text on that embedding level. When switching + * between strongly LTR code points and strongly RTL code points the embedding + * level normally switches between an embedding level of 0 (LTR) and 1 (RTL). + * The only time the embedding level increases is if the embedding code points + * are used. This is the Left-to-Right Embedding (LRE) code point (U+202A), or + * the Right-to-Left Embedding (RLE) code point (U+202B). The minimum + * embedding level of text is zero, and the maximum explicit depth is 125. + * + * The most significant bit is reserved for additional meaning. It can be used + * to signify in certain APIs that the text should by default be LTR or RTL if + * no strongly directional code points are found. + * + * Bug 1736595: At the time of this writing, some places in Gecko code use a 1 + * in the most significant bit to indicate that an embedding level has not + * been set. This leads to an ambiguous understanding of what the most + * significant bit actually means. 
+ */ +class BidiEmbeddingLevel { + public: + explicit BidiEmbeddingLevel(uint8_t aValue) : mValue(aValue) {} + explicit BidiEmbeddingLevel(int aValue) + : mValue(static_cast<uint8_t>(aValue)) {} + + BidiEmbeddingLevel() = default; + + // Enable the copy operators, but disable move as this is only a uint8_t. + BidiEmbeddingLevel(const BidiEmbeddingLevel& other) = default; + BidiEmbeddingLevel& operator=(const BidiEmbeddingLevel& other) = default; + + /** + * Determine the direction of the embedding level by looking at the least + * significant bit. If it is 0, then it is LTR. If it is 1, then it is RTL. + */ + BidiDirection Direction(); + + /** + * Create a left-to-right embedding level. + */ + static BidiEmbeddingLevel LTR(); + + /** + * Create a right-to-left embedding level. + */ + static BidiEmbeddingLevel RTL(); + + /** + * When passed into `SetParagraph`, the direction is determined by the first + * strongly directional character, with the default set to left-to-right if + * none is found. + * + * This is encoded with the highest bit set to 1. + */ + static BidiEmbeddingLevel DefaultLTR(); + + /** + * When passed into `SetParagraph`, the direction is determined by the first + * strongly directional character, with the default set to right-to-left if + * none is found. + * + * This is encoded with the highest and lowest bits set to 1. + */ + static BidiEmbeddingLevel DefaultRTL(); + + bool IsDefaultLTR() const; + bool IsDefaultRTL() const; + bool IsLTR() const; + bool IsRTL() const; + bool IsSameDirection(BidiEmbeddingLevel aOther) const; + + /** + * Get the underlying value as a uint8_t. + */ + uint8_t Value() const; + + /** + * Implicitly convert to the underlying value. 
+ */ + operator uint8_t() const { return mValue; } + + private: + uint8_t mValue = 0; +}; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/Calendar.cpp b/intl/components/src/Calendar.cpp new file mode 100644 index 0000000000..d44dedaaae --- /dev/null +++ b/intl/components/src/Calendar.cpp @@ -0,0 +1,172 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/Calendar.h" + +#include "unicode/ucal.h" +#include "unicode/uloc.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +/* static */ +Result<UniquePtr<Calendar>, ICUError> Calendar::TryCreate( + const char* aLocale, Maybe<Span<const char16_t>> aTimeZoneOverride) { + UErrorCode status = U_ZERO_ERROR; + const UChar* zoneID = nullptr; + int32_t zoneIDLen = 0; + if (aTimeZoneOverride) { + zoneIDLen = static_cast<int32_t>(aTimeZoneOverride->Length()); + zoneID = aTimeZoneOverride->Elements(); + } + + UCalendar* calendar = + ucal_open(zoneID, zoneIDLen, aLocale, UCAL_DEFAULT, &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return MakeUnique<Calendar>(calendar); +} + +Result<Span<const char>, ICUError> Calendar::GetBcp47Type() { + UErrorCode status = U_ZERO_ERROR; + const char* oldType = ucal_getType(mCalendar, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + const char* bcp47Type = uloc_toUnicodeLocaleType("calendar", oldType); + + if (!bcp47Type) { + return Err(ICUError::InternalError); + } + + return MakeStringSpan(bcp47Type); +} + +static Weekday WeekdayFromDaysOfWeek(UCalendarDaysOfWeek weekday) { + switch (weekday) { + case UCAL_MONDAY: + return Weekday::Monday; + case UCAL_TUESDAY: + return Weekday::Tuesday; + case UCAL_WEDNESDAY: + return Weekday::Wednesday; + case UCAL_THURSDAY: + return Weekday::Thursday; + case UCAL_FRIDAY: + return 
Weekday::Friday; + case UCAL_SATURDAY: + return Weekday::Saturday; + case UCAL_SUNDAY: + return Weekday::Sunday; + } + MOZ_CRASH("unexpected weekday value"); +} + +Result<EnumSet<Weekday>, ICUError> Calendar::GetWeekend() { + static_assert(static_cast<int32_t>(UCAL_SUNDAY) == 1); + static_assert(static_cast<int32_t>(UCAL_SATURDAY) == 7); + + UErrorCode status = U_ZERO_ERROR; + + EnumSet<Weekday> weekend; + for (int32_t i = UCAL_SUNDAY; i <= UCAL_SATURDAY; i++) { + auto dayOfWeek = static_cast<UCalendarDaysOfWeek>(i); + auto type = ucal_getDayOfWeekType(mCalendar, dayOfWeek, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + switch (type) { + case UCAL_WEEKEND_ONSET: + // Treat days which start as a weekday as weekdays. + [[fallthrough]]; + case UCAL_WEEKDAY: + break; + + case UCAL_WEEKEND_CEASE: + // Treat days which start as a weekend day as weekend days. + [[fallthrough]]; + case UCAL_WEEKEND: + weekend += WeekdayFromDaysOfWeek(dayOfWeek); + break; + } + } + + return weekend; +} + +Weekday Calendar::GetFirstDayOfWeek() { + int32_t firstDayOfWeek = ucal_getAttribute(mCalendar, UCAL_FIRST_DAY_OF_WEEK); + MOZ_ASSERT(UCAL_SUNDAY <= firstDayOfWeek && firstDayOfWeek <= UCAL_SATURDAY); + + return WeekdayFromDaysOfWeek( + static_cast<UCalendarDaysOfWeek>(firstDayOfWeek)); +} + +int32_t Calendar::GetMinimalDaysInFirstWeek() { + int32_t minimalDays = + ucal_getAttribute(mCalendar, UCAL_MINIMAL_DAYS_IN_FIRST_WEEK); + MOZ_ASSERT(1 <= minimalDays && minimalDays <= 7); + + return minimalDays; +} + +Result<Ok, ICUError> Calendar::SetTimeInMs(double aUnixEpoch) { + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(mCalendar, aUnixEpoch, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return Ok{}; +} + +/* static */ +Result<SpanEnumeration<char>, ICUError> +Calendar::GetLegacyKeywordValuesForLocale(const char* aLocale) { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucal_getKeywordValuesForLocale( + 
"calendar", aLocale, /* commonlyUsed */ false, &status); + + if (U_SUCCESS(status)) { + return SpanEnumeration<char>(enumeration); + } + + return Err(ToICUError(status)); +} + +/* static */ +SpanResult<char> Calendar::LegacyIdentifierToBcp47(const char* aIdentifier, + int32_t aLength) { + if (aIdentifier == nullptr) { + return Err(InternalError{}); + } + // aLength is not needed here, as the ICU call uses the null terminated + // string. + return MakeStringSpan(uloc_toUnicodeLocaleType("ca", aIdentifier)); +} + +/* static */ +Result<Calendar::Bcp47IdentifierEnumeration, ICUError> +Calendar::GetBcp47KeywordValuesForLocale(const char* aLocale, + CommonlyUsed aCommonlyUsed) { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucal_getKeywordValuesForLocale( + "calendar", aLocale, static_cast<bool>(aCommonlyUsed), &status); + + if (U_SUCCESS(status)) { + return Bcp47IdentifierEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +Calendar::~Calendar() { + MOZ_ASSERT(mCalendar); + ucal_close(mCalendar); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Calendar.h b/intl/components/src/Calendar.h new file mode 100644 index 0000000000..32975bc376 --- /dev/null +++ b/intl/components/src/Calendar.h @@ -0,0 +1,133 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Calendar_h_ +#define intl_components_Calendar_h_ + +#include "mozilla/Assertions.h" +#include "mozilla/EnumSet.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +using UCalendar = void*; + +namespace mozilla::intl { + +/** + * Weekdays in the ISO-8601 calendar. 
+ */ +enum class Weekday : uint8_t { + Monday = 1, + Tuesday, + Wednesday, + Thursday, + Friday, + Saturday, + Sunday, +}; + +/** + * This component is a Mozilla-focused API for working with calendar systems in + * internationalization code. It is used in coordination with other operations + * such as datetime formatting. + */ +class Calendar final { + public: + explicit Calendar(UCalendar* aCalendar) : mCalendar(aCalendar) { + MOZ_ASSERT(aCalendar); + }; + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. + Calendar(const Calendar&) = delete; + Calendar& operator=(const Calendar&) = delete; + + /** + * Create a Calendar. + */ + static Result<UniquePtr<Calendar>, ICUError> TryCreate( + const char* aLocale, + Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); + + /** + * Get the BCP 47 keyword value string designating the calendar type. For + * instance "gregory", "chinese", "islamic-civil", etc. + */ + Result<Span<const char>, ICUError> GetBcp47Type(); + + /** + * Return the set of weekdays which are considered as part of the weekend. + */ + Result<EnumSet<Weekday>, ICUError> GetWeekend(); + + /** + * Return the weekday which is considered the first day of the week. + */ + Weekday GetFirstDayOfWeek(); + + /** + * Return the minimal number of days in the first week of a year. + */ + int32_t GetMinimalDaysInFirstWeek(); + + /** + * Set the time for the calendar relative to the number of milliseconds since + * 1 January 1970, UTC. + */ + Result<Ok, ICUError> SetTimeInMs(double aUnixEpoch); + + /** + * Return ICU legacy keywords, such as "gregorian", "islamic", + * "islamic-civil", "hebrew", etc. + */ + static Result<SpanEnumeration<char>, ICUError> + GetLegacyKeywordValuesForLocale(const char* aLocale); + + private: + /** + * Internal function to convert a legacy calendar identifier to the newer + * BCP 47 identifier. 
+ */ + static SpanResult<char> LegacyIdentifierToBcp47(const char* aIdentifier, + int32_t aLength); + + public: + enum class CommonlyUsed : bool { + /** + * Select all possible values, even when not commonly used by a locale. + */ + No, + + /** + * Only select the values which are commonly used by a locale. + */ + Yes, + }; + + using Bcp47IdentifierEnumeration = + Enumeration<char, SpanResult<char>, Calendar::LegacyIdentifierToBcp47>; + + /** + * Return BCP 47 Unicode locale extension type keywords. + */ + static Result<Bcp47IdentifierEnumeration, ICUError> + GetBcp47KeywordValuesForLocale(const char* aLocale, + CommonlyUsed aCommonlyUsed = CommonlyUsed::No); + + ~Calendar(); + + private: + friend class DateIntervalFormat; + UCalendar* GetUCalendar() const { return mCalendar; } + + UCalendar* mCalendar = nullptr; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/Collator.cpp b/intl/components/src/Collator.cpp new file mode 100644 index 0000000000..93052932de --- /dev/null +++ b/intl/components/src/Collator.cpp @@ -0,0 +1,305 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <algorithm> +#include <string.h> +#include "mozilla/intl/Collator.h" + +namespace mozilla::intl { + +Collator::Collator(UCollator* aCollator) : mCollator(aCollator) { + MOZ_ASSERT(aCollator); +} + +Collator::~Collator() { + if (mCollator.GetMut()) { + ucol_close(mCollator.GetMut()); + } +} + +Result<UniquePtr<Collator>, ICUError> Collator::TryCreate(const char* aLocale) { + UErrorCode status = U_ZERO_ERROR; + UCollator* collator = ucol_open(IcuLocale(aLocale), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return MakeUnique<Collator>(collator); +}; + +int32_t Collator::CompareStrings(Span<const char16_t> aSource, + Span<const char16_t> aTarget) const { + switch (ucol_strcoll(mCollator.GetConst(), aSource.data(), + static_cast<int32_t>(aSource.size()), aTarget.data(), + static_cast<int32_t>(aTarget.size()))) { + case UCOL_LESS: + return -1; + case UCOL_EQUAL: + return 0; + case UCOL_GREATER: + return 1; + } + MOZ_ASSERT_UNREACHABLE("ucol_strcoll returned bad UCollationResult"); + return 0; +} + +int32_t Collator::CompareSortKeys(Span<const uint8_t> aKey1, + Span<const uint8_t> aKey2) const { + size_t minLength = std::min(aKey1.Length(), aKey2.Length()); + int32_t tmpResult = strncmp((const char*)aKey1.Elements(), + (const char*)aKey2.Elements(), minLength); + if (tmpResult < 0) { + return -1; + } + if (tmpResult > 0) { + return 1; + } + if (aKey1.Length() > minLength) { + // First string contains second one, so comes later, hence return > 0. + return 1; + } + if (aKey2.Length() > minLength) { + // First string is a substring of second one, so comes earlier, + // hence return < 0. 
+ return -1; + } + return 0; +} + +static UColAttributeValue CaseFirstToICU(Collator::CaseFirst caseFirst) { + switch (caseFirst) { + case Collator::CaseFirst::False: + return UCOL_OFF; + case Collator::CaseFirst::Upper: + return UCOL_UPPER_FIRST; + case Collator::CaseFirst::Lower: + return UCOL_LOWER_FIRST; + } + + MOZ_ASSERT_UNREACHABLE(); + return UCOL_DEFAULT; +} + +// Define this as a macro to work around exposing the UColAttributeValue type to +// the header file. Collation::Feature is private to the class. +#define FEATURE_TO_ICU(featureICU, feature) \ + switch (feature) { \ + case Collator::Feature::On: \ + (featureICU) = UCOL_ON; \ + break; \ + case Collator::Feature::Off: \ + (featureICU) = UCOL_OFF; \ + break; \ + case Collator::Feature::Default: \ + (featureICU) = UCOL_DEFAULT; \ + break; \ + } + +void Collator::SetStrength(Collator::Strength aStrength) { + UColAttributeValue strength; + switch (aStrength) { + case Collator::Strength::Default: + strength = UCOL_DEFAULT_STRENGTH; + break; + case Collator::Strength::Primary: + strength = UCOL_PRIMARY; + break; + case Collator::Strength::Secondary: + strength = UCOL_SECONDARY; + break; + case Collator::Strength::Tertiary: + strength = UCOL_TERTIARY; + break; + case Collator::Strength::Quaternary: + strength = UCOL_QUATERNARY; + break; + case Collator::Strength::Identical: + strength = UCOL_IDENTICAL; + break; + } + + ucol_setStrength(mCollator.GetMut(), strength); +} + +ICUResult Collator::SetCaseLevel(Collator::Feature aFeature) { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue featureICU; + FEATURE_TO_ICU(featureICU, aFeature); + ucol_setAttribute(mCollator.GetMut(), UCOL_CASE_LEVEL, featureICU, &status); + return ToICUResult(status); +} + +ICUResult Collator::SetAlternateHandling( + Collator::AlternateHandling aAlternateHandling) { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue handling; + switch (aAlternateHandling) { + case Collator::AlternateHandling::NonIgnorable: + handling = 
UCOL_NON_IGNORABLE; + break; + case Collator::AlternateHandling::Shifted: + handling = UCOL_SHIFTED; + break; + case Collator::AlternateHandling::Default: + handling = UCOL_DEFAULT; + break; + } + + ucol_setAttribute(mCollator.GetMut(), UCOL_ALTERNATE_HANDLING, handling, + &status); + return ToICUResult(status); +} + +ICUResult Collator::SetNumericCollation(Collator::Feature aFeature) { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue featureICU; + FEATURE_TO_ICU(featureICU, aFeature); + + ucol_setAttribute(mCollator.GetMut(), UCOL_NUMERIC_COLLATION, featureICU, + &status); + return ToICUResult(status); +} + +ICUResult Collator::SetNormalizationMode(Collator::Feature aFeature) { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue featureICU; + FEATURE_TO_ICU(featureICU, aFeature); + ucol_setAttribute(mCollator.GetMut(), UCOL_NORMALIZATION_MODE, featureICU, + &status); + return ToICUResult(status); +} + +ICUResult Collator::SetCaseFirst(Collator::CaseFirst aCaseFirst) { + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(mCollator.GetMut(), UCOL_CASE_FIRST, + CaseFirstToICU(aCaseFirst), &status); + return ToICUResult(status); +} + +ICUResult Collator::SetOptions(const Options& aOptions, + const Maybe<Options&> aPrevOptions) { + if (aPrevOptions && + // Check the equality of the previous options. 
+ aPrevOptions->sensitivity == aOptions.sensitivity && + aPrevOptions->caseFirst == aOptions.caseFirst && + aPrevOptions->ignorePunctuation == aOptions.ignorePunctuation && + aPrevOptions->numeric == aOptions.numeric) { + return Ok(); + } + + Collator::Strength strength = Collator::Strength::Default; + Collator::Feature caseLevel = Collator::Feature::Off; + switch (aOptions.sensitivity) { + case Collator::Sensitivity::Base: + strength = Collator::Strength::Primary; + break; + case Collator::Sensitivity::Accent: + strength = Collator::Strength::Secondary; + break; + case Collator::Sensitivity::Case: + caseLevel = Collator::Feature::On; + strength = Collator::Strength::Primary; + break; + case Collator::Sensitivity::Variant: + strength = Collator::Strength::Tertiary; + break; + } + + SetStrength(strength); + + ICUResult result = Ok(); + + // According to the ICU team, UCOL_SHIFTED causes punctuation to be + // ignored. Looking at Unicode Technical Report 35, Unicode Locale Data + // Markup Language, "shifted" causes whitespace and punctuation to be + // ignored - that's a bit more than asked for, but there's no way to get + // less. + result = this->SetAlternateHandling( + aOptions.ignorePunctuation ? Collator::AlternateHandling::Shifted + : Collator::AlternateHandling::Default); + if (result.isErr()) { + return result; + } + + result = SetCaseLevel(caseLevel); + if (result.isErr()) { + return result; + } + + result = SetNumericCollation(aOptions.numeric ? Collator::Feature::On + : Collator::Feature::Off); + if (result.isErr()) { + return result; + } + + // Normalization is always on to meet the canonical equivalence requirement. 
+ result = SetNormalizationMode(Collator::Feature::On); + if (result.isErr()) { + return result; + } + + result = SetCaseFirst(aOptions.caseFirst); + if (result.isErr()) { + return result; + } + return Ok(); +} + +#undef FEATURE_TO_ICU + +Result<Collator::CaseFirst, ICUError> Collator::GetCaseFirst() const { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue caseFirst = + ucol_getAttribute(mCollator.GetConst(), UCOL_CASE_FIRST, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + if (caseFirst == UCOL_OFF) { + return CaseFirst::False; + } + if (caseFirst == UCOL_UPPER_FIRST) { + return CaseFirst::Upper; + } + MOZ_ASSERT(caseFirst == UCOL_LOWER_FIRST); + return CaseFirst::Lower; +} + +/* static */ +Result<Collator::Bcp47ExtEnumeration, ICUError> +Collator::GetBcp47KeywordValuesForLocale(const char* aLocale, + CommonlyUsed aCommonlyUsed) { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucol_getKeywordValuesForLocale( + "collation", aLocale, static_cast<bool>(aCommonlyUsed), &status); + + if (U_SUCCESS(status)) { + return Bcp47ExtEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +/* static */ +Result<Collator::Bcp47ExtEnumeration, ICUError> +Collator::GetBcp47KeywordValues() { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucol_getKeywordValues("collation", &status); + + if (U_SUCCESS(status)) { + return Bcp47ExtEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +/* static */ +SpanResult<char> Collator::KeywordValueToBcp47Extension(const char* aKeyword, + int32_t aLength) { + if (aKeyword == nullptr) { + return Err(InternalError{}); + } + return MakeStringSpan(uloc_toUnicodeLocaleType("co", aKeyword)); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Collator.h b/intl/components/src/Collator.h new file mode 100644 index 0000000000..dcb5a12a4f --- /dev/null +++ b/intl/components/src/Collator.h @@ -0,0 +1,322 @@ +/* This Source Code Form is 
subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Collator_h_ +#define intl_components_Collator_h_ + +#ifndef JS_STANDALONE +# include "gtest/MozGtestFriend.h" +#endif + +#include "unicode/ucol.h" + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" + +namespace mozilla::intl { + +class Collator final { + public: + /** + * Construct from a raw UCollator. This is public so that the UniquePtr can + * access it. + */ + explicit Collator(UCollator* aCollator); + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. + Collator(const Collator&) = delete; + Collator& operator=(const Collator&) = delete; + + /** + * Attempt to initialize a new collator. + */ + static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale); + + ~Collator(); + + /** + * Get a sort key with the provided UTF-16 string, and store the sort key into + * the provided buffer of byte array. + * Every sort key ends with 0x00, and the terminating 0x00 byte is counted + * when calculating the length of buffer. For the purpose of other byte + * values, check the "Special Byte Values" document from ICU. + * + * https://icu.unicode.org/design/collation/bytes + */ + template <typename B> + ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const { + return FillBufferWithICUCall( + aBuffer, + [this, aString](uint8_t* target, int32_t length, UErrorCode* status) { + // ucol_getSortKey doesn't use the error code to report + // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to + // indicate the desired length to store the key. So we update the + // UErrorCode accordingly to let FillBufferWithICUCall resize the + // buffer. 
+ int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(), + static_cast<int32_t>(aString.size()), + target, length); + if (len == 0) { + // Returns 0 means there's an internal error. + *status = U_INTERNAL_PROGRAM_ERROR; + } else if (len > length) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + *status = U_ZERO_ERROR; + } + return len; + }); + } + + int32_t CompareStrings(Span<const char16_t> aSource, + Span<const char16_t> aTarget) const; + + int32_t CompareSortKeys(Span<const uint8_t> aKey1, + Span<const uint8_t> aKey2) const; + + /** + * Determine how casing affects sorting. These options map to ECMA 402 + * collator options. + * + * https://tc39.es/ecma402/#sec-initializecollator + */ + enum class CaseFirst { + // Sort upper case first. + Upper, + // Sort lower case first. + Lower, + // Orders upper and lower case letters in accordance to their tertiary + // weights. + False, + }; + + /** + * Which differences in the strings should lead to differences in collation + * comparisons. + * + * This setting needs to be ECMA 402 compliant. + * https://tc39.es/ecma402/#sec-collator-comparestrings + */ + enum class Sensitivity { + // Only strings that differ in base letters compare as unequal. + // Examples: a ≠ b, a = á, a = A. + Base, + // Only strings that differ in base letters or accents and other diacritic + // marks compare as unequal. + // Examples: a ≠ b, a ≠ á, a = A. + Accent, + // Only strings that differ in base letters or case compare as unequal. + // Examples: a ≠ b, a = á, a ≠ A. + Case, + // Strings that differ in base letters, accents and other diacritic marks, + // or case compare as unequal. Other differences may also be taken into + // consideration. + // Examples: a ≠ b, a ≠ á, a ≠ A. + Variant, + }; + + /** + * These options map to ECMA 402 collator options. Make sure the defaults map + * to the default initialized values of ECMA 402. 
+ * + * https://tc39.es/ecma402/#sec-initializecollator + */ + struct Options { + Sensitivity sensitivity = Sensitivity::Variant; + CaseFirst caseFirst = CaseFirst::False; + bool ignorePunctuation = false; + bool numeric = false; + }; + + /** + * Change the configuration of the options. + */ + ICUResult SetOptions(const Options& aOptions, + const Maybe<Options&> aPrevOptions = Nothing()); + + /** + * Return the case first option of this collator. + */ + Result<CaseFirst, ICUError> GetCaseFirst() const; + + /** + * Map keywords to their BCP 47 equivalents. + */ + static SpanResult<char> KeywordValueToBcp47Extension(const char* aKeyword, + int32_t aLength); + + enum class CommonlyUsed : bool { + /** + * Select all possible values, even when not commonly used by a locale. + */ + No, + + /** + * Only select the values which are commonly used by a locale. + */ + Yes, + }; + + using Bcp47ExtEnumeration = + Enumeration<char, SpanResult<char>, + Collator::KeywordValueToBcp47Extension>; + + /** + * Returns an iterator of collator locale extensions in the preferred order. + * These extensions can be used in BCP 47 locales. For instance this + * iterator could return "phonebk" and could be applied to the German locale + * "de" as "de-co-phonebk" for a phonebook-style collation. + * + * The collation extensions can be found here: + * http://cldr.unicode.org/core-spec/#Key_Type_Definitions + */ + static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValuesForLocale( + const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No); + + /** + * Returns an iterator over all possible collator locale extensions. + * These extensions can be used in BCP 47 locales. For instance this + * iterator could return "phonebk" and could be applied to the German locale + * "de" as "de-co-phonebk" for a phonebook-style collation. 
+ * + * The collation extensions can be found here: + * http://cldr.unicode.org/core-spec/#Key_Type_Definitions + */ + static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValues(); + + /** + * Returns an iterator over all supported collator locales. + * + * The returned strings are ICU locale identifiers and NOT BCP 47 language + * tags. + * + * Also see <https://unicode-org.github.io/icu/userguide/locale>. + */ + static auto GetAvailableLocales() { + return AvailableLocalesEnumeration<ucol_countAvailable, + ucol_getAvailable>(); + } + + private: + /** + * Toggle features, or use the default setting. + */ + enum class Feature { + // Turn the feature on. + On, + // Turn the feature off. + Off, + // Use the default setting for the feature. + Default, + }; + + /** + * Attribute for handling variable elements. + */ + enum class AlternateHandling { + // Treats all the codepoints with non-ignorable primary weights in the + // same way (default) + NonIgnorable, + // Causes codepoints with primary weights that are equal or below the + // variable top value to be ignored on primary level and moved to the + // quaternary level. + Shifted, + // Use the default setting for alternate handling. + Default, + }; + + /** + * The strength attribute. + * + * The usual strength for most locales (except Japanese) is tertiary. + * + * Quaternary strength is useful when combined with shifted setting for + * alternate handling attribute and for JIS X 4061 collation, when it is used + * to distinguish between Katakana and Hiragana. Otherwise, quaternary level + * is affected only by the number of non-ignorable code points in the string. + * + * Identical strength is rarely useful, as it amounts to codepoints of the NFD + * form of the string. + */ + enum class Strength { + // Primary collation strength. + Primary, + // Secondary collation strength. + Secondary, + // Tertiary collation strength. + Tertiary, + // Quaternary collation strength. + Quaternary, + // Identical collation strength. 
+ Identical, + // Use the default collation strength. + Default, + }; + + /** + * Configure the Collator::Strength. + */ + void SetStrength(Strength strength); + + /** + * Configure Collator::AlternateHandling. + */ + ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling); + + /** + * Controls whether an extra case level (positioned before the third level) is + * generated or not. + * + * Contents of the case level are affected by the value of CaseFirst + * attribute. A simple way to ignore accent differences in a string is to set + * the strength to Primary and enable case level. + */ + ICUResult SetCaseLevel(Feature aFeature); + + /** + * When turned on, this attribute makes substrings of digits sort according to + * their numeric values. + * + * This is a way to get '100' to sort AFTER '2'. Note that the longest digit + * substring that can be treated as a single unit is 254 digits (not counting + * leading zeros). If a digit substring is longer than that, the digits beyond + * the limit will be treated as a separate digit substring. + * + * A "digit" in this sense is a code point with General_Category=Nd, which + * does not include circled numbers, roman numerals, etc. Only a contiguous + * digit substring is considered, that is, non-negative integers without + * separators. There is no support for plus/minus signs, decimals, exponents, + * etc. + */ + ICUResult SetNumericCollation(Feature aFeature); + + /** + * Controls whether the normalization check and necessary normalizations are + * performed. + * + * When off (default), no normalization check is performed. The correctness of + * the result is guaranteed only if the input data is in so-called FCD form. + * When set to on, an incremental check is performed to see whether the input + * data is in the FCD form. If the data is not in the FCD form, incremental + * NFD normalization is performed. + */ + ICUResult SetNormalizationMode(Feature aFeature); + + /** + * Configure Collator::CaseFirst. 
+ */ + ICUResult SetCaseFirst(CaseFirst aCaseFirst); + +#ifndef JS_STANDALONE + FRIEND_TEST(IntlCollator, SetAttributesInternal); +#endif + + ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr); + Maybe<Sensitivity> mLastStrategy = Nothing(); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/Currency.cpp b/intl/components/src/Currency.cpp new file mode 100644 index 0000000000..4db8e0919c --- /dev/null +++ b/intl/components/src/Currency.cpp @@ -0,0 +1,22 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/Currency.h" + +#include "unicode/ucurr.h" +#include "unicode/uenum.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +Result<SpanEnumeration<char>, ICUError> Currency::GetISOCurrencies() { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucurr_openISOCurrencies(UCURR_ALL, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return SpanEnumeration<char>(enumeration); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Currency.h b/intl/components/src/Currency.h new file mode 100644 index 0000000000..d0f8eb6ee8 --- /dev/null +++ b/intl/components/src/Currency.h @@ -0,0 +1,30 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_Currency_h_ +#define intl_components_Currency_h_ + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with currencies in + * internationalization code. 
+ */ +class Currency final { + public: + Currency() = delete; + + /** + * Returns an enumeration of all supported ISO currency codes. + */ + static Result<SpanEnumeration<char>, ICUError> GetISOCurrencies(); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/DateIntervalFormat.cpp b/intl/components/src/DateIntervalFormat.cpp new file mode 100644 index 0000000000..0097668f8b --- /dev/null +++ b/intl/components/src/DateIntervalFormat.cpp @@ -0,0 +1,266 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "DateTimeFormat.h" // for DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES +#include "DateTimeFormatUtils.h" +#include "ScopedICUObject.h" + +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateIntervalFormat.h" + +namespace mozilla::intl { + +/** + * PartitionDateTimeRangePattern ( dateTimeFormat, x, y ), steps 9-11. + * + * Examine the formatted value to see if any interval span field is present. + * + * https://tc39.es/ecma402/#sec-partitiondatetimerangepattern + */ +static ICUResult DateFieldsPracticallyEqual( + const UFormattedValue* aFormattedValue, bool* aEqual) { + if (!aFormattedValue) { + return Err(ICUError::InternalError); + } + + MOZ_ASSERT(aEqual); + *aEqual = false; + UErrorCode status = U_ZERO_ERROR; + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); + + // We're only interested in UFIELD_CATEGORY_DATE_INTERVAL_SPAN fields. 
+ ucfpos_constrainCategory(fpos, UFIELD_CATEGORY_DATE_INTERVAL_SPAN, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + bool hasSpan = ufmtval_nextPosition(aFormattedValue, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // When no date interval span field was found, both dates are "practically + // equal" per PartitionDateTimeRangePattern. + *aEqual = !hasSpan; + return Ok(); +} + +/* static */ +Result<UniquePtr<DateIntervalFormat>, ICUError> DateIntervalFormat::TryCreate( + Span<const char> aLocale, Span<const char16_t> aSkeleton, + Span<const char16_t> aTimeZone) { + UErrorCode status = U_ZERO_ERROR; + UDateIntervalFormat* dif = + udtitvfmt_open(IcuLocale(aLocale), aSkeleton.data(), + AssertedCast<int32_t>(aSkeleton.size()), aTimeZone.data(), + AssertedCast<int32_t>(aTimeZone.size()), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return UniquePtr<DateIntervalFormat>(new DateIntervalFormat(dif)); +} + +DateIntervalFormat::~DateIntervalFormat() { + MOZ_ASSERT(mDateIntervalFormat); + udtitvfmt_close(mDateIntervalFormat.GetMut()); +} + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES +// We reach inside the UFormattedValue and modify its internal string. 
(It's +// crucial that this is just an in-place replacement that doesn't alter any +// field positions, etc.) +static void ReplaceSpecialSpaces(const UFormattedValue* aValue) { + UErrorCode status = U_ZERO_ERROR; + int32_t len; + const UChar* str = ufmtval_getString(aValue, &len, &status); + if (U_FAILURE(status)) { + // Nothing to patch if the string can't be retrieved. + return; + } + + for (const auto& c : Span(str, len)) { + if (IsSpecialSpace(c)) { + // The string is exposed as const; cast away constness to overwrite the + // character in place. + const_cast<UChar&>(c) = ' '; + } + } +} +#endif + +ICUResult DateIntervalFormat::TryFormatCalendar( + const Calendar& aStart, const Calendar& aEnd, + AutoFormattedDateInterval& aFormatted, bool* aPracticallyEqual) const { + MOZ_ASSERT(aFormatted.IsValid()); + + UErrorCode status = U_ZERO_ERROR; + udtitvfmt_formatCalendarToResult(mDateIntervalFormat.GetConst(), + aStart.GetUCalendar(), aEnd.GetUCalendar(), + aFormatted.GetFormatted(), &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + ReplaceSpecialSpaces(aFormatted.Value()); +#endif + + MOZ_TRY(DateFieldsPracticallyEqual(aFormatted.Value(), aPracticallyEqual)); + return Ok(); +} + +ICUResult DateIntervalFormat::TryFormatDateTime( + double aStart, double aEnd, AutoFormattedDateInterval& aFormatted, + bool* aPracticallyEqual) const { + MOZ_ASSERT(aFormatted.IsValid()); + + UErrorCode status = U_ZERO_ERROR; + udtitvfmt_formatToResult(mDateIntervalFormat.GetConst(), aStart, aEnd, + aFormatted.GetFormatted(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + ReplaceSpecialSpaces(aFormatted.Value()); +#endif + + MOZ_TRY(DateFieldsPracticallyEqual(aFormatted.Value(), aPracticallyEqual)); + return Ok(); +} + +ICUResult DateIntervalFormat::TryFormattedToParts( + const AutoFormattedDateInterval& aFormatted, + DateTimePartVector& aParts) const { + MOZ_ASSERT(aFormatted.IsValid()); + const UFormattedValue* value = aFormatted.Value(); + if (!value) { + return 
Err(ICUError::InternalError); + } + + size_t lastEndIndex = 0; + auto AppendPart = [&](DateTimePartType type, size_t endIndex, + DateTimePartSource source) { + if (!aParts.emplaceBack(type, endIndex, source)) { + return false; + } + + lastEndIndex = endIndex; + return true; + }; + + UErrorCode status = U_ZERO_ERROR; + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); + + size_t categoryEndIndex = 0; + DateTimePartSource source = DateTimePartSource::Shared; + + while (true) { + bool hasMore = ufmtval_nextPosition(value, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + if (!hasMore) { + break; + } + + int32_t category = ucfpos_getCategory(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t field = ucfpos_getField(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t beginIndexInt, endIndexInt; + ucfpos_getIndexes(fpos, &beginIndexInt, &endIndexInt, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + MOZ_ASSERT(beginIndexInt <= endIndexInt, + "field iterator returning invalid range"); + + size_t beginIndex = AssertedCast<size_t>(beginIndexInt); + size_t endIndex = AssertedCast<size_t>(endIndexInt); + + // Indices are guaranteed to be returned in order (from left to right). + MOZ_ASSERT(lastEndIndex <= beginIndex, + "field iteration didn't return fields in order start to " + "finish as expected"); + + if (category == UFIELD_CATEGORY_DATE_INTERVAL_SPAN) { + // Append any remaining literal parts before changing the source kind. 
+ if (lastEndIndex < beginIndex) { + if (!AppendPart(DateTimePartType::Literal, beginIndex, source)) { + return Err(ICUError::InternalError); + } + } + + // The special field category UFIELD_CATEGORY_DATE_INTERVAL_SPAN has only + // two allowed values (0 or 1), indicating the begin of the start- resp. + // end-date. + MOZ_ASSERT(field == 0 || field == 1, + "span category has unexpected value"); + + source = field == 0 ? DateTimePartSource::StartRange + : DateTimePartSource::EndRange; + categoryEndIndex = endIndex; + continue; + } + + // Ignore categories other than UFIELD_CATEGORY_DATE. + if (category != UFIELD_CATEGORY_DATE) { + continue; + } + + DateTimePartType type = + ConvertUFormatFieldToPartType(static_cast<UDateFormatField>(field)); + if (lastEndIndex < beginIndex) { + if (!AppendPart(DateTimePartType::Literal, beginIndex, source)) { + return Err(ICUError::InternalError); + } + } + + if (!AppendPart(type, endIndex, source)) { + return Err(ICUError::InternalError); + } + + if (endIndex == categoryEndIndex) { + // Append any remaining literal parts before changing the source kind. + if (lastEndIndex < endIndex) { + if (!AppendPart(DateTimePartType::Literal, endIndex, source)) { + return Err(ICUError::InternalError); + } + } + + source = DateTimePartSource::Shared; + } + } + + // Append any final literal. 
+ auto spanResult = aFormatted.ToSpan(); + if (spanResult.isErr()) { + return spanResult.propagateErr(); + } + size_t formattedSize = spanResult.unwrap().size(); + if (lastEndIndex < formattedSize) { + if (!AppendPart(DateTimePartType::Literal, formattedSize, source)) { + return Err(ICUError::InternalError); + } + } + + return Ok(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/DateIntervalFormat.h b/intl/components/src/DateIntervalFormat.h new file mode 100644 index 0000000000..c4dbce807a --- /dev/null +++ b/intl/components/src/DateIntervalFormat.h @@ -0,0 +1,107 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_DateIntervalFormat_h_ +#define intl_components_DateIntervalFormat_h_ + +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateTimePart.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +#include "unicode/udateintervalformat.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { +class Calendar; + +using AutoFormattedDateInterval = + AutoFormattedResult<UFormattedDateInterval, udtitvfmt_openResult, + udtitvfmt_resultAsValue, udtitvfmt_closeResult>; + +/** + * This component is a Mozilla-focused API for the date range formatting + * provided by ICU. This DateIntervalFormat class helps to format the range + * between two date-time values. + * + * https://tc39.es/ecma402/#sec-formatdatetimerange + * https://tc39.es/ecma402/#sec-formatdatetimerangetoparts + */ +class DateIntervalFormat final { + public: + /** + * Create a DateIntervalFormat object from locale, skeleton and time zone. + * The format of skeleton can be found in [1]. + * + * Note: Skeleton will be removed in the future. 
+ * + * [1]: https://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns + */ + static Result<UniquePtr<DateIntervalFormat>, ICUError> TryCreate( + Span<const char> aLocale, Span<const char16_t> aSkeleton, + Span<const char16_t> aTimeZone); + + ~DateIntervalFormat(); + + /** + * Format a date-time range between two Calendar objects. + * + * DateIntervalFormat cannot be changed to use a proleptic Gregorian + * calendar, so use this method if the start date is before the Gregorian + * calendar was introduced (October 15, 1582), otherwise use TryFormatDateTime + * instead. + * + * The result will be stored in aFormatted; the caller can use + * AutoFormattedDateInterval::ToSpan() to get the formatted string, or pass + * the aFormatted to TryFormattedToParts to get the parts vector. + * + * aPracticallyEqual will be set to true if the date times of the two + * calendars are practically equal, i.e. the formatted result contains no + * date interval span field. + */ + ICUResult TryFormatCalendar(const Calendar& aStart, const Calendar& aEnd, + AutoFormattedDateInterval& aFormatted, + bool* aPracticallyEqual) const; + + /** + * Format a date-time range between two Unix epoch times in milliseconds. + * + * The result will be stored in aFormatted; the caller can use + * AutoFormattedDateInterval::ToSpan() to get the formatted string, or pass + * the aFormatted to TryFormattedToParts to get the parts vector. + * + * aPracticallyEqual will be set to true if the date times of the two + * Unix epoch times are practically equal, i.e. the formatted result contains + * no date interval span field. + */ + ICUResult TryFormatDateTime(double aStart, double aEnd, + AutoFormattedDateInterval& aFormatted, + bool* aPracticallyEqual) const; + + /** + * Convert the formatted DateIntervalFormat into several parts. + * + * The caller gets the formatted result from either the TryFormatCalendar or + * TryFormatDateTime method, and instantiates the DateTimePartVector. This + * method will generate the parts and insert them into the vector. 
+ * + * See: + * https://tc39.es/ecma402/#sec-formatdatetimerangetoparts + */ + ICUResult TryFormattedToParts(const AutoFormattedDateInterval& aFormatted, + DateTimePartVector& aParts) const; + + private: + DateIntervalFormat() = delete; + explicit DateIntervalFormat(UDateIntervalFormat* aDif) + : mDateIntervalFormat(aDif) {} + DateIntervalFormat(const DateIntervalFormat&) = delete; + DateIntervalFormat& operator=(const DateIntervalFormat&) = delete; + + ICUPointer<UDateIntervalFormat> mDateIntervalFormat = + ICUPointer<UDateIntervalFormat>(nullptr); +}; +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/DateTimeFormat.cpp b/intl/components/src/DateTimeFormat.cpp new file mode 100644 index 0000000000..5a6429e976 --- /dev/null +++ b/intl/components/src/DateTimeFormat.cpp @@ -0,0 +1,1140 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <cstring> + +#include "unicode/ucal.h" +#include "unicode/udat.h" +#include "unicode/udatpg.h" +#include "unicode/ures.h" + +#include "DateTimeFormatUtils.h" +#include "ScopedICUObject.h" + +#include "mozilla/EnumSet.h" +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateTimeFormat.h" +#include "mozilla/intl/DateTimePatternGenerator.h" + +namespace mozilla::intl { + +DateTimeFormat::~DateTimeFormat() { + MOZ_ASSERT(mDateFormat); + udat_close(mDateFormat); +} + +static UDateFormatStyle ToUDateFormatStyle( + Maybe<DateTimeFormat::Style> aLength) { + if (!aLength) { + return UDAT_NONE; + } + switch (*aLength) { + case DateTimeFormat::Style::Full: + return UDAT_FULL; + case DateTimeFormat::Style::Long: + return UDAT_LONG; + case DateTimeFormat::Style::Medium: + return UDAT_MEDIUM; + case DateTimeFormat::Style::Short: + return UDAT_SHORT; + } + MOZ_ASSERT_UNREACHABLE(); + // Do not use the default: branch so that the enum is exhaustively checked. + return UDAT_NONE; +} + +/** + * Parse a pattern according to the format specified in + * <https://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns>. 
+ */ +template <typename CharT> +class PatternIterator { + CharT* iter; + const CharT* const end; + + public: + explicit PatternIterator(mozilla::Span<CharT> aPattern) + : iter(aPattern.data()), end(aPattern.data() + aPattern.size()) {} + + CharT* next() { + MOZ_ASSERT(iter != nullptr); + + bool inQuote = false; + while (iter < end) { + CharT* cur = iter++; + if (*cur == '\'') { + inQuote = !inQuote; + } else if (!inQuote) { + return cur; + } + } + + iter = nullptr; + return nullptr; + } +}; + +Maybe<DateTimeFormat::HourCycle> DateTimeFormat::HourCycleFromPattern( + Span<const char16_t> aPattern) { + PatternIterator<const char16_t> iter(aPattern); + while (const auto* ptr = iter.next()) { + switch (*ptr) { + case 'K': + return Some(DateTimeFormat::HourCycle::H11); + case 'h': + return Some(DateTimeFormat::HourCycle::H12); + case 'H': + return Some(DateTimeFormat::HourCycle::H23); + case 'k': + return Some(DateTimeFormat::HourCycle::H24); + } + } + return Nothing(); +} + +static bool IsHour12(DateTimeFormat::HourCycle aHourCycle) { + return aHourCycle == DateTimeFormat::HourCycle::H11 || + aHourCycle == DateTimeFormat::HourCycle::H12; +} + +static char16_t HourSymbol(DateTimeFormat::HourCycle aHourCycle) { + switch (aHourCycle) { + case DateTimeFormat::HourCycle::H11: + return 'K'; + case DateTimeFormat::HourCycle::H12: + return 'h'; + case DateTimeFormat::HourCycle::H23: + return 'H'; + case DateTimeFormat::HourCycle::H24: + return 'k'; + } + MOZ_CRASH("unexpected hour cycle"); +} + +enum class PatternField { Hour, Minute, Second, Other }; + +template <typename CharT> +static PatternField ToPatternField(CharT aCh) { + if (aCh == 'K' || aCh == 'h' || aCh == 'H' || aCh == 'k' || aCh == 'j') { + return PatternField::Hour; + } + if (aCh == 'm') { + return PatternField::Minute; + } + if (aCh == 's') { + return PatternField::Second; + } + return PatternField::Other; +} + +/** + * Replaces all hour pattern characters in |patternOrSkeleton| to use the + * matching hour 
representation for |hourCycle|. + */ +/* static */ +void DateTimeFormat::ReplaceHourSymbol( + mozilla::Span<char16_t> aPatternOrSkeleton, + DateTimeFormat::HourCycle aHourCycle) { + char16_t replacement = HourSymbol(aHourCycle); + PatternIterator<char16_t> iter(aPatternOrSkeleton); + while (auto* ptr = iter.next()) { + auto field = ToPatternField(*ptr); + if (field == PatternField::Hour) { + *ptr = replacement; + } + } +} + +/** + * Find a matching pattern using the requested hour-12 options. + * + * This function is needed to work around the following two issues. + * - https://unicode-org.atlassian.net/browse/ICU-21023 + * - https://unicode-org.atlassian.net/browse/CLDR-13425 + * + * We're currently using a relatively simple workaround, which doesn't give the + * most accurate results. For example: + * + * ``` + * var dtf = new Intl.DateTimeFormat("en", { + * timeZone: "UTC", + * dateStyle: "long", + * timeStyle: "long", + * hourCycle: "h12", + * }); + * print(dtf.format(new Date("2020-01-01T00:00Z"))); + * ``` + * + * Returns the pattern "MMMM d, y 'at' h:mm:ss a z", but when going through + * |DateTimePatternGenerator::GetSkeleton| and then + * |DateTimePatternGenerator::GetBestPattern| to find an equivalent pattern for + * "h23", we'll end up with the pattern "MMMM d, y, HH:mm:ss z", so the + * combinator element " 'at' " was lost in the process. + */ +/* static */ +ICUResult DateTimeFormat::FindPatternWithHourCycle( + DateTimePatternGenerator& aDateTimePatternGenerator, + DateTimeFormat::PatternVector& aPattern, bool aHour12, + DateTimeFormat::SkeletonVector& aSkeleton) { + MOZ_TRY(mozilla::intl::DateTimePatternGenerator::GetSkeleton(aPattern, + aSkeleton)); + + // Input skeletons don't differentiate between "K" and "h" resp. "k" and "H". + DateTimeFormat::ReplaceHourSymbol(aSkeleton, + aHour12 ? 
DateTimeFormat::HourCycle::H12 + : DateTimeFormat::HourCycle::H23); + + MOZ_TRY(aDateTimePatternGenerator.GetBestPattern(aSkeleton, aPattern)); + + return Ok(); +} + +static auto PatternMatchOptions(mozilla::Span<const char16_t> aSkeleton) { + // Values for hour, minute, and second are: + // - absent: 0 + // - numeric: 1 + // - 2-digit: 2 + int32_t hour = 0; + int32_t minute = 0; + int32_t second = 0; + + PatternIterator<const char16_t> iter(aSkeleton); + while (const auto* ptr = iter.next()) { + switch (ToPatternField(*ptr)) { + case PatternField::Hour: + MOZ_ASSERT(hour < 2); + hour += 1; + break; + case PatternField::Minute: + MOZ_ASSERT(minute < 2); + minute += 1; + break; + case PatternField::Second: + MOZ_ASSERT(second < 2); + second += 1; + break; + case PatternField::Other: + break; + } + } + + // Adjust the field length when the user requested '2-digit' representation. + // + // We can't just always adjust the field length, because + // 1. The default value for hour, minute, and second fields is 'numeric'. If + // the length is always adjusted, |date.toLocaleTime()| will start to + // return strings like "1:5:9 AM" instead of "1:05:09 AM". + // 2. ICU doesn't support to adjust the field length to 'numeric' in certain + // cases. For example when the locale is "de" (German): + // a. hour='numeric' and minute='2-digit' will return "1:05". + // b. whereas hour='numeric' and minute='numeric' will return "01:05". + // + // Therefore we only support adjusting the field length when the user + // explicitly requested the '2-digit' representation. 
+ + using PatternMatchOption = + mozilla::intl::DateTimePatternGenerator::PatternMatchOption; + mozilla::EnumSet<PatternMatchOption> options; + if (hour == 2) { + options += PatternMatchOption::HourField; + } + if (minute == 2) { + options += PatternMatchOption::MinuteField; + } + if (second == 2) { + options += PatternMatchOption::SecondField; + } + return options; +} + +/* static */ +Result<UniquePtr<DateTimeFormat>, ICUError> DateTimeFormat::TryCreateFromStyle( + Span<const char> aLocale, const StyleBag& aStyleBag, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe<Span<const char16_t>> aTimeZoneOverride) { + auto dateStyle = ToUDateFormatStyle(aStyleBag.date); + auto timeStyle = ToUDateFormatStyle(aStyleBag.time); + + if (dateStyle == UDAT_NONE && timeStyle == UDAT_NONE) { + dateStyle = UDAT_DEFAULT; + timeStyle = UDAT_DEFAULT; + } + + // The time zone is optional. + int32_t tzIDLength = -1; + const UChar* tzID = nullptr; + if (aTimeZoneOverride) { + tzIDLength = static_cast<int32_t>(aTimeZoneOverride->size()); + tzID = aTimeZoneOverride->Elements(); + } + + UErrorCode status = U_ZERO_ERROR; + UDateFormat* dateFormat = + udat_open(timeStyle, dateStyle, IcuLocale(aLocale), tzID, tzIDLength, + /* pattern */ nullptr, /* pattern length */ -1, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + auto df = UniquePtr<DateTimeFormat>(new DateTimeFormat(dateFormat)); + + if (aStyleBag.time && (aStyleBag.hour12 || aStyleBag.hourCycle)) { + // Only adjust the style pattern for time if there is an override. + // Extract the pattern and adjust it for the preferred hour cycle. + DateTimeFormat::PatternVector pattern{}; + + VectorToBufferAdaptor buffer(pattern); + MOZ_TRY(df->GetPattern(buffer)); + + Maybe<DateTimeFormat::HourCycle> hcPattern = HourCycleFromPattern(pattern); + DateTimeFormat::SkeletonVector skeleton{}; + + if (hcPattern) { + bool wantHour12 = + aStyleBag.hour12 ? 
*aStyleBag.hour12 : IsHour12(*aStyleBag.hourCycle); + if (wantHour12 == IsHour12(*hcPattern)) { + // Return the date-time format when its hour-cycle settings match the + // requested options. + if (aStyleBag.hour12 || *hcPattern == *aStyleBag.hourCycle) { + return df; + } + } else { + MOZ_ASSERT(aDateTimePatternGenerator); + MOZ_TRY(DateTimeFormat::FindPatternWithHourCycle( + *aDateTimePatternGenerator, pattern, wantHour12, skeleton)); + } + // Replace the hourCycle, if present, in the pattern string. But only do + // this if no hour12 option is present, because the latter takes + // precedence over hourCycle. + if (!aStyleBag.hour12) { + DateTimeFormat::ReplaceHourSymbol(pattern, *aStyleBag.hourCycle); + } + + auto result = DateTimeFormat::TryCreateFromPattern(aLocale, pattern, + aTimeZoneOverride); + if (result.isErr()) { + return Err(result.unwrapErr()); + } + auto dateTimeFormat = result.unwrap(); + MOZ_TRY(dateTimeFormat->CacheSkeleton(skeleton)); + return dateTimeFormat; + } + } + + return df; +} + +DateTimeFormat::DateTimeFormat(UDateFormat* aDateFormat) { + MOZ_RELEASE_ASSERT(aDateFormat, "Expected aDateFormat to not be a nullptr."); + mDateFormat = aDateFormat; +} + +// A helper to ergonomically push a string onto a string vector. +template <typename V, size_t N> +static ICUResult PushString(V& aVec, const char16_t (&aString)[N]) { + if (!aVec.append(aString, N - 1)) { + return Err(ICUError::OutOfMemory); + } + return Ok(); +} + +// A helper to ergonomically push a char onto a string vector. +template <typename V> +static ICUResult PushChar(V& aVec, char16_t aCh) { + if (!aVec.append(aCh)) { + return Err(ICUError::OutOfMemory); + } + return Ok(); +} + +/** + * Returns an ICU skeleton string representing the specified options. 
+ * http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + */ +ICUResult ToICUSkeleton(const DateTimeFormat::ComponentsBag& aBag, + DateTimeFormat::SkeletonVector& aSkeleton) { + // Create an ICU skeleton representing the specified aBag. See + if (aBag.weekday) { + switch (*aBag.weekday) { + case DateTimeFormat::Text::Narrow: + MOZ_TRY(PushString(aSkeleton, u"EEEEE")); + break; + case DateTimeFormat::Text::Short: + MOZ_TRY(PushString(aSkeleton, u"E")); + break; + case DateTimeFormat::Text::Long: + MOZ_TRY(PushString(aSkeleton, u"EEEE")); + } + } + if (aBag.era) { + switch (*aBag.era) { + case DateTimeFormat::Text::Narrow: + MOZ_TRY(PushString(aSkeleton, u"GGGGG")); + break; + case DateTimeFormat::Text::Short: + // Use "GGG" instead of "G" to return the same results as other + // browsers. This is exploiting the following ICU bug + // <https://unicode-org.atlassian.net/browse/ICU-22138>. As soon as that + // bug has been fixed, we can change this back to "G". + // + // In practice the bug only affects "G", so we only apply it for "G" + // and not for other symbols like "B" or "z". 
+ MOZ_TRY(PushString(aSkeleton, u"GGG")); + break; + case DateTimeFormat::Text::Long: + MOZ_TRY(PushString(aSkeleton, u"GGGG")); + break; + } + } + if (aBag.year) { + switch (*aBag.year) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"yy")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"y")); + break; + } + } + if (aBag.month) { + switch (*aBag.month) { + case DateTimeFormat::Month::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"MM")); + break; + case DateTimeFormat::Month::Numeric: + MOZ_TRY(PushString(aSkeleton, u"M")); + break; + case DateTimeFormat::Month::Narrow: + MOZ_TRY(PushString(aSkeleton, u"MMMMM")); + break; + case DateTimeFormat::Month::Short: + MOZ_TRY(PushString(aSkeleton, u"MMM")); + break; + case DateTimeFormat::Month::Long: + MOZ_TRY(PushString(aSkeleton, u"MMMM")); + break; + } + } + if (aBag.day) { + switch (*aBag.day) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"dd")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"d")); + break; + } + } + + // If hour12 and hourCycle are both present, hour12 takes precedence. + char16_t hourSkeletonChar = 'j'; + if (aBag.hour12) { + if (*aBag.hour12) { + hourSkeletonChar = 'h'; + } else { + hourSkeletonChar = 'H'; + } + } else if (aBag.hourCycle) { + switch (*aBag.hourCycle) { + case DateTimeFormat::HourCycle::H11: + case DateTimeFormat::HourCycle::H12: + hourSkeletonChar = 'h'; + break; + case DateTimeFormat::HourCycle::H23: + case DateTimeFormat::HourCycle::H24: + hourSkeletonChar = 'H'; + break; + } + } + if (aBag.hour) { + switch (*aBag.hour) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushChar(aSkeleton, hourSkeletonChar)); + MOZ_TRY(PushChar(aSkeleton, hourSkeletonChar)); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushChar(aSkeleton, hourSkeletonChar)); + break; + } + } + // ICU requires that "B" is set after the "j" hour skeleton symbol. 
+ // https://unicode-org.atlassian.net/browse/ICU-20731 + if (aBag.dayPeriod) { + switch (*aBag.dayPeriod) { + case DateTimeFormat::Text::Narrow: + MOZ_TRY(PushString(aSkeleton, u"BBBBB")); + break; + case DateTimeFormat::Text::Short: + MOZ_TRY(PushString(aSkeleton, u"B")); + break; + case DateTimeFormat::Text::Long: + MOZ_TRY(PushString(aSkeleton, u"BBBB")); + break; + } + } + if (aBag.minute) { + switch (*aBag.minute) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"mm")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"m")); + break; + } + } + if (aBag.second) { + switch (*aBag.second) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"ss")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"s")); + break; + } + } + if (aBag.fractionalSecondDigits) { + switch (*aBag.fractionalSecondDigits) { + case 1: + MOZ_TRY(PushString(aSkeleton, u"S")); + break; + case 2: + MOZ_TRY(PushString(aSkeleton, u"SS")); + break; + default: + MOZ_TRY(PushString(aSkeleton, u"SSS")); + break; + } + } + if (aBag.timeZoneName) { + switch (*aBag.timeZoneName) { + case DateTimeFormat::TimeZoneName::Short: + MOZ_TRY(PushString(aSkeleton, u"z")); + break; + case DateTimeFormat::TimeZoneName::Long: + MOZ_TRY(PushString(aSkeleton, u"zzzz")); + break; + case DateTimeFormat::TimeZoneName::ShortOffset: + MOZ_TRY(PushString(aSkeleton, u"O")); + break; + case DateTimeFormat::TimeZoneName::LongOffset: + MOZ_TRY(PushString(aSkeleton, u"OOOO")); + break; + case DateTimeFormat::TimeZoneName::ShortGeneric: + MOZ_TRY(PushString(aSkeleton, u"v")); + break; + case DateTimeFormat::TimeZoneName::LongGeneric: + MOZ_TRY(PushString(aSkeleton, u"vvvv")); + break; + } + } + return Ok(); +} + +/* static */ +Result<UniquePtr<DateTimeFormat>, ICUError> +DateTimeFormat::TryCreateFromComponents( + Span<const char> aLocale, const DateTimeFormat::ComponentsBag& aBag, + DateTimePatternGenerator* 
aDateTimePatternGenerator, + Maybe<Span<const char16_t>> aTimeZoneOverride) { + DateTimeFormat::SkeletonVector skeleton; + MOZ_TRY(ToICUSkeleton(aBag, skeleton)); + return TryCreateFromSkeleton(aLocale, skeleton, aDateTimePatternGenerator, + aBag.hourCycle, aTimeZoneOverride); +} + +/* static */ +Result<UniquePtr<DateTimeFormat>, ICUError> +DateTimeFormat::TryCreateFromPattern( + Span<const char> aLocale, Span<const char16_t> aPattern, + Maybe<Span<const char16_t>> aTimeZoneOverride) { + UErrorCode status = U_ZERO_ERROR; + + // The time zone is optional. + int32_t tzIDLength = -1; + const UChar* tzID = nullptr; + if (aTimeZoneOverride) { + tzIDLength = static_cast<int32_t>(aTimeZoneOverride->size()); + tzID = aTimeZoneOverride->data(); + } + + // Create the date formatter. + UDateFormat* dateFormat = udat_open( + UDAT_PATTERN, UDAT_PATTERN, IcuLocale(aLocale), tzID, tzIDLength, + aPattern.data(), static_cast<int32_t>(aPattern.size()), &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // The DateTimeFormat wrapper will control the life cycle of the ICU + // dateFormat object. + return UniquePtr<DateTimeFormat>(new DateTimeFormat(dateFormat)); +} + +/* static */ +Result<UniquePtr<DateTimeFormat>, ICUError> +DateTimeFormat::TryCreateFromSkeleton( + Span<const char> aLocale, Span<const char16_t> aSkeleton, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe<DateTimeFormat::HourCycle> aHourCycle, + Maybe<Span<const char16_t>> aTimeZoneOverride) { + if (!aDateTimePatternGenerator) { + return Err(ICUError::InternalError); + } + + // Compute the best pattern for the skeleton. 
+ DateTimeFormat::PatternVector pattern; + auto options = PatternMatchOptions(aSkeleton); + MOZ_TRY( + aDateTimePatternGenerator->GetBestPattern(aSkeleton, pattern, options)); + + if (aHourCycle) { + DateTimeFormat::ReplaceHourSymbol(pattern, *aHourCycle); + } + + auto result = + DateTimeFormat::TryCreateFromPattern(aLocale, pattern, aTimeZoneOverride); + if (result.isErr()) { + return Err(result.unwrapErr()); + } + auto dateTimeFormat = result.unwrap(); + MOZ_TRY(dateTimeFormat->CacheSkeleton(aSkeleton)); + return dateTimeFormat; +} + +ICUResult DateTimeFormat::CacheSkeleton(Span<const char16_t> aSkeleton) { + if (mOriginalSkeleton.append(aSkeleton.Elements(), aSkeleton.Length())) { + return Ok(); + } + return Err(ICUError::OutOfMemory); +} + +void DateTimeFormat::SetStartTimeIfGregorian(double aTime) { + UErrorCode status = U_ZERO_ERROR; + UCalendar* cal = const_cast<UCalendar*>(udat_getCalendar(mDateFormat)); + ucal_setGregorianChange(cal, aTime, &status); + // An error here means the calendar is not Gregorian, and can be ignored. +} + +/* static */ +Result<UniquePtr<Calendar>, ICUError> DateTimeFormat::CloneCalendar( + double aUnixEpoch) const { + UErrorCode status = U_ZERO_ERROR; + UCalendar* calendarRaw = ucal_clone(udat_getCalendar(mDateFormat), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + auto calendar = MakeUnique<Calendar>(calendarRaw); + + MOZ_TRY(calendar->SetTimeInMs(aUnixEpoch)); + + return calendar; +} + +/** + * ICU locale identifier consisting of a language and a region subtag. + */ +class LanguageRegionLocaleId { + // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + static constexpr size_t LanguageLength = 8; + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + static constexpr size_t RegionLength = 3; + + // Add +1 to account for the separator. + static constexpr size_t LRLength = LanguageLength + RegionLength + 1; + + // Add +1 to zero terminate the string. 
+ char mLocale[LRLength + 1] = {}; + + // Pointer to the start of the region subtag within |locale_|. + char* mRegion = nullptr; + + public: + LanguageRegionLocaleId(Span<const char> aLanguage, + Maybe<Span<const char>> aRegion); + + const char* languageRegion() const { return mLocale; } + const char* region() const { return mRegion; } +}; + +LanguageRegionLocaleId::LanguageRegionLocaleId( + Span<const char> aLanguage, Maybe<Span<const char>> aRegion) { + MOZ_RELEASE_ASSERT(aLanguage.Length() <= LanguageLength); + MOZ_RELEASE_ASSERT(!aRegion || aRegion->Length() <= RegionLength); + + size_t languageLength = aLanguage.Length(); + + std::memcpy(mLocale, aLanguage.Elements(), languageLength); + + // ICU locale identifiers are separated by underscores. + mLocale[languageLength] = '_'; + + mRegion = mLocale + languageLength + 1; + if (aRegion) { + std::memcpy(mRegion, aRegion->Elements(), aRegion->Length()); + } else { + // Use "001" (UN M.49 code for the World) as the fallback to match ICU. + std::strcpy(mRegion, "001"); + } +} + +/* static */ +Result<DateTimeFormat::HourCyclesVector, ICUError> +DateTimeFormat::GetAllowedHourCycles(Span<const char> aLanguage, + Maybe<Span<const char>> aRegion) { + // ICU doesn't expose a public API to retrieve the hour cyles for a locale, so + // we have to reconstruct |DateTimePatternGenerator::getAllowedHourFormats()| + // using the public UResourceBundle API. + // + // The time data format is specified in UTS 35 at [1] and the data itself is + // located at [2]. + // + // [1] https://unicode.org/reports/tr35/tr35-dates.html#Time_Data + // [2] + // https://github.com/unicode-org/cldr/blob/master/common/supplemental/supplementalData.xml + + HourCyclesVector result; + + // Reserve space for the maximum number of hour cycles. This call always + // succeeds because it matches the inline capacity. We can now infallibly + // append all hour cycles to the vector. 
+ MOZ_ALWAYS_TRUE(result.reserve(HourCyclesVector::InlineLength)); + + LanguageRegionLocaleId localeId(aLanguage, aRegion); + + // First open the "supplementalData" resource bundle. + UErrorCode status = U_ZERO_ERROR; + UResourceBundle* res = ures_openDirect(nullptr, "supplementalData", &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UResourceBundle, ures_close> closeRes(res); + MOZ_ASSERT(ures_getType(res) == URES_TABLE); + + // Locate "timeDate" within the "supplementalData" resource bundle. + UResourceBundle* timeData = ures_getByKey(res, "timeData", nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UResourceBundle, ures_close> closeTimeData(timeData); + MOZ_ASSERT(ures_getType(timeData) == URES_TABLE); + + // Try to find a matching resource within "timeData". The two possible keys + // into the "timeData" resource bundle are `language_region` and `region`. + // Prefer `language_region` and otherwise fallback to `region`. + UResourceBundle* hclocale = + ures_getByKey(timeData, localeId.languageRegion(), nullptr, &status); + if (status == U_MISSING_RESOURCE_ERROR) { + status = U_ZERO_ERROR; + hclocale = ures_getByKey(timeData, localeId.region(), nullptr, &status); + } + if (status == U_MISSING_RESOURCE_ERROR) { + // Default to "h23" if no resource was found at all. This matches ICU. + result.infallibleAppend(HourCycle::H23); + return result; + } + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UResourceBundle, ures_close> closeHcLocale(hclocale); + MOZ_ASSERT(ures_getType(hclocale) == URES_TABLE); + + EnumSet<HourCycle> added{}; + + auto addToResult = [&](const UChar* str, int32_t len) { + // An hour cycle strings is one of "K", "h", "H", or "k"; optionally + // followed by the suffix "b" or "B". We ignore the suffix because day + // periods can't be expressed in the "hc" Unicode extension. 
+ MOZ_ASSERT(len == 1 || len == 2); + + // Default to "h23" for unsupported hour cycle strings. + HourCycle hc = HourCycle::H23; + switch (str[0]) { + case 'K': + hc = HourCycle::H11; + break; + case 'h': + hc = HourCycle::H12; + break; + case 'H': + hc = HourCycle::H23; + break; + case 'k': + hc = HourCycle::H24; + break; + } + + // Add each unique hour cycle to the result array. + if (!added.contains(hc)) { + added += hc; + + result.infallibleAppend(hc); + } + }; + + // Determine the preferred hour cycle for the locale. + int32_t len = 0; + const UChar* hc = ures_getStringByKey(hclocale, "preferred", &len, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + addToResult(hc, len); + + // Find any additionally allowed hour cycles of the locale. + UResourceBundle* allowed = + ures_getByKey(hclocale, "allowed", nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UResourceBundle, ures_close> closeAllowed(allowed); + MOZ_ASSERT(ures_getType(allowed) == URES_ARRAY || + ures_getType(allowed) == URES_STRING); + + while (ures_hasNext(allowed)) { + int32_t len = 0; + const UChar* hc = ures_getNextString(allowed, &len, nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + addToResult(hc, len); + } + + return result; +} + +Result<DateTimeFormat::ComponentsBag, ICUError> +DateTimeFormat::ResolveComponents() { + // Maps an ICU pattern string to a corresponding set of date-time components + // and their values, and adds properties for these components to the result + // object, which will be returned by the resolvedOptions method. 
For the + // interpretation of ICU pattern characters, see + // http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + + DateTimeFormat::PatternVector pattern{}; + VectorToBufferAdaptor buffer(pattern); + MOZ_TRY(GetPattern(buffer)); + + DateTimeFormat::ComponentsBag bag{}; + + using Text = DateTimeFormat::Text; + using HourCycle = DateTimeFormat::HourCycle; + using Numeric = DateTimeFormat::Numeric; + using Month = DateTimeFormat::Month; + + auto text = Text::Long; + auto numeric = Numeric::Numeric; + auto month = Month::Long; + uint8_t fractionalSecondDigits = 0; + + for (size_t i = 0, len = pattern.length(); i < len;) { + char16_t c = pattern[i++]; + if (c == u'\'') { + // Skip past string literals. + while (i < len && pattern[i] != u'\'') { + i++; + } + i++; + continue; + } + + // Count how many times the character is repeated. + size_t count = 1; + while (i < len && pattern[i] == c) { + i++; + count++; + } + + // Determine the enum case of the field. + switch (c) { + // "text" cases + case u'G': + case u'E': + case u'c': + case u'B': + case u'z': + case u'O': + case u'v': + case u'V': + if (count <= 3) { + text = Text::Short; + } else if (count == 4) { + text = Text::Long; + } else { + text = Text::Narrow; + } + break; + // "number" cases + case u'y': + case u'd': + case u'h': + case u'H': + case u'm': + case u's': + case u'k': + case u'K': + if (count == 2) { + numeric = Numeric::TwoDigit; + } else { + numeric = Numeric::Numeric; + } + break; + // "text & number" cases + case u'M': + case u'L': + if (count == 1) { + month = Month::Numeric; + } else if (count == 2) { + month = Month::TwoDigit; + } else if (count == 3) { + month = Month::Short; + } else if (count == 4) { + month = Month::Long; + } else { + month = Month::Narrow; + } + break; + case u'S': + fractionalSecondDigits = count; + break; + default: { + // skip other pattern characters and literal text + } + } + + // Map ICU pattern characters back to the corresponding date-time + // 
components of DateTimeFormat. See + // http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + switch (c) { + case u'E': + case u'c': + bag.weekday = Some(text); + break; + case u'G': + bag.era = Some(text); + break; + case u'y': + bag.year = Some(numeric); + break; + case u'M': + case u'L': + bag.month = Some(month); + break; + case u'd': + bag.day = Some(numeric); + break; + case u'B': + bag.dayPeriod = Some(text); + break; + case u'K': + bag.hourCycle = Some(HourCycle::H11); + bag.hour = Some(numeric); + bag.hour12 = Some(true); + break; + case u'h': + bag.hourCycle = Some(HourCycle::H12); + bag.hour = Some(numeric); + bag.hour12 = Some(true); + break; + case u'H': + bag.hourCycle = Some(HourCycle::H23); + bag.hour = Some(numeric); + bag.hour12 = Some(false); + break; + case u'k': + bag.hourCycle = Some(HourCycle::H24); + bag.hour = Some(numeric); + bag.hour12 = Some(false); + break; + case u'm': + bag.minute = Some(numeric); + break; + case u's': + bag.second = Some(numeric); + break; + case u'S': + bag.fractionalSecondDigits = Some(fractionalSecondDigits); + break; + case u'z': + switch (text) { + case Text::Long: + bag.timeZoneName = Some(TimeZoneName::Long); + break; + case Text::Short: + case Text::Narrow: + bag.timeZoneName = Some(TimeZoneName::Short); + break; + } + break; + case u'O': + switch (text) { + case Text::Long: + bag.timeZoneName = Some(TimeZoneName::LongOffset); + break; + case Text::Short: + case Text::Narrow: + bag.timeZoneName = Some(TimeZoneName::ShortOffset); + break; + } + break; + case u'v': + case u'V': + switch (text) { + case Text::Long: + bag.timeZoneName = Some(TimeZoneName::LongGeneric); + break; + case Text::Short: + case Text::Narrow: + bag.timeZoneName = Some(TimeZoneName::ShortGeneric); + break; + } + break; + } + } + return bag; +} + +const char* DateTimeFormat::ToString( + DateTimeFormat::TimeZoneName aTimeZoneName) { + switch (aTimeZoneName) { + case TimeZoneName::Long: + return "long"; + case 
TimeZoneName::Short: + return "short"; + case TimeZoneName::ShortOffset: + return "shortOffset"; + case TimeZoneName::LongOffset: + return "longOffset"; + case TimeZoneName::ShortGeneric: + return "shortGeneric"; + case TimeZoneName::LongGeneric: + return "longGeneric"; + } + MOZ_CRASH("Unexpected DateTimeFormat::TimeZoneName"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Month aMonth) { + switch (aMonth) { + case Month::Numeric: + return "numeric"; + case Month::TwoDigit: + return "2-digit"; + case Month::Long: + return "long"; + case Month::Short: + return "short"; + case Month::Narrow: + return "narrow"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Month"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Text aText) { + switch (aText) { + case Text::Long: + return "long"; + case Text::Short: + return "short"; + case Text::Narrow: + return "narrow"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Text"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Numeric aNumeric) { + switch (aNumeric) { + case Numeric::Numeric: + return "numeric"; + case Numeric::TwoDigit: + return "2-digit"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Numeric"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Style aStyle) { + switch (aStyle) { + case Style::Full: + return "full"; + case Style::Long: + return "long"; + case Style::Medium: + return "medium"; + case Style::Short: + return "short"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Style"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::HourCycle aHourCycle) { + switch (aHourCycle) { + case HourCycle::H11: + return "h11"; + case HourCycle::H12: + return "h12"; + case HourCycle::H23: + return "h23"; + case HourCycle::H24: + return "h24"; + } + MOZ_CRASH("Unexpected DateTimeFormat::HourCycle"); +} + +ICUResult DateTimeFormat::TryFormatToParts( + UFieldPositionIterator* aFieldPositionIterator, size_t aSpanSize, + DateTimePartVector& aParts) const { + 
ScopedICUObject<UFieldPositionIterator, ufieldpositer_close> toClose( + aFieldPositionIterator); + + size_t lastEndIndex = 0; + auto AppendPart = [&](DateTimePartType type, size_t endIndex) { + // For the part defined in FormatDateTimeToParts, it doesn't have ||Source|| + // property, we store Shared for simplicity, + if (!aParts.emplaceBack(type, endIndex, DateTimePartSource::Shared)) { + return false; + } + + lastEndIndex = endIndex; + return true; + }; + + int32_t fieldInt, beginIndexInt, endIndexInt; + while ((fieldInt = ufieldpositer_next(aFieldPositionIterator, &beginIndexInt, + &endIndexInt)) >= 0) { + MOZ_ASSERT(beginIndexInt <= endIndexInt, + "field iterator returning invalid range"); + + size_t beginIndex = AssertedCast<size_t>(beginIndexInt); + size_t endIndex = AssertedCast<size_t>(endIndexInt); + + // Technically this isn't guaranteed. But it appears true in pratice, + // and http://bugs.icu-project.org/trac/ticket/12024 is expected to + // correct the documentation lapse. + MOZ_ASSERT(lastEndIndex <= beginIndex, + "field iteration didn't return fields in order start to " + "finish as expected"); + + DateTimePartType type = + ConvertUFormatFieldToPartType(static_cast<UDateFormatField>(fieldInt)); + if (lastEndIndex < beginIndex) { + if (!AppendPart(DateTimePartType::Literal, beginIndex)) { + return Err(ICUError::InternalError); + } + } + + if (!AppendPart(type, endIndex)) { + return Err(ICUError::InternalError); + } + } + + // Append any final literal. + if (lastEndIndex < aSpanSize) { + if (!AppendPart(DateTimePartType::Literal, aSpanSize)) { + return Err(ICUError::InternalError); + } + } + + return Ok(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/DateTimeFormat.h b/intl/components/src/DateTimeFormat.h new file mode 100644 index 0000000000..b3e32cd276 --- /dev/null +++ b/intl/components/src/DateTimeFormat.h @@ -0,0 +1,593 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_DateTimeFormat_h_ +#define intl_components_DateTimeFormat_h_ +#include <functional> +#include "unicode/udat.h" + +#include "mozilla/Assertions.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" + +#include "mozilla/intl/DateTimePart.h" +#include "mozilla/intl/DateTimePatternGenerator.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Utf8.h" +#include "mozilla/Variant.h" +#include "mozilla/Vector.h" + +/* + * To work around webcompat problems caused by Narrow No-Break Space in + * formatted date/time output, where existing code on the web naively + * assumes there will be a normal Space, we replace any occurrences of + * U+202F in the formatted results with U+0020. + * + * The intention is to undo this hack once other major browsers are also + * ready to ship with the updated (ICU72) i18n data that uses NNBSP. + * + * See https://bugzilla.mozilla.org/show_bug.cgi?id=1806042 for details, + * and see DateIntervalFormat.cpp for the other piece of this hack. + */ +#define DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 1 + +namespace mozilla::intl { + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES +static inline bool IsSpecialSpace(char16_t c) { + // NARROW NO-BREAK SPACE and THIN SPACE + return c == 0x202F || c == 0x2009; +} +#endif + +class Calendar; + +/** + * Intro to mozilla::intl::DateTimeFormat + * ====================================== + * + * This component is a Mozilla-focused API for the date formatting provided by + * ICU. The methods internally call out to ICU4C. This is responsible for and + * owns any resources opened through ICU, through RAII. + * + * The construction of a DateTimeFormat contains the majority of the cost + * of the DateTimeFormat operation. 
DateTimeFormat::TryFormat should be + * relatively inexpensive after the initial construction. + * + * This class supports creating from Styles (a fixed set of options) and from a + * components bag (a list of components and their lengths). + * + * This API serves to back the ECMA-402 Intl.DateTimeFormat API. + * https://tc39.es/ecma402/#datetimeformat-objects + * + * + * ECMA-402 Intl.DateTimeFormat API and implementation details with ICU + * skeletons and patterns. + * ==================================================================== + * + * Different locales have different ways to display dates using the same + * basic components. For example, en-US might use "Sept. 24, 2012" while + * fr-FR might use "24 Sept. 2012". The intent of Intl.DateTimeFormat is to + * permit production of a format for the locale that best matches the + * set of date-time components and their desired representation as specified + * by the API client. + * + * ICU4C supports specification of date and time formats in three ways: + * + * 1) A style is just one of the identifiers FULL, LONG, MEDIUM, or SHORT. + * The date-time components included in each style and their representation + * are defined by ICU using CLDR locale data (CLDR is the Unicode + * Consortium's Common Locale Data Repository). + * + * 2) A skeleton is a string specifying which date-time components to include, + * and which representations to use for them. For example, "yyyyMMMMdd" + * specifies a year with at least four digits, a full month name, and a + * two-digit day. It does not specify in which order the components appear, + * how they are separated, the localized strings for textual components + * (such as weekday or month), whether the month is in format or + * stand-alone form¹, or the numbering system used for numeric components. + * All that information is filled in by ICU using CLDR locale data. 
+ * ¹ The format form is the one used in formatted strings that include a + * day; the stand-alone form is used when not including days, e.g., in + * calendar headers. The two forms differ at least in some Slavic languages, + * e.g. Russian: "22 марта 2013 г." vs. "Март 2013". + * + * 3) A pattern is a string specifying which date-time components to include, + * in which order, with which separators, in which grammatical case. For + * example, "EEEE, d MMMM y" specifies the full localized weekday name, + * followed by comma and space, followed by the day, followed by space, + * followed by the full month name in format form, followed by space, + * followed by the full year. It + * still does not specify localized strings for textual components and the + * numbering system - these are determined by ICU using CLDR locale data or + * possibly API parameters. + * + * All actual formatting in ICU4C is done with patterns; styles and skeletons + * have to be mapped to patterns before processing. + * + * The options of Intl.DateTimeFormat most closely correspond to ICU skeletons. + * This implementation therefore converts DateTimeFormat options to ICU + * skeletons, and then lets ICU map skeletons to actual ICU patterns. The + * pattern may not directly correspond to what the skeleton requests, as the + * mapper (UDateTimePatternGenerator) is constrained by the available locale + * data for the locale. 
+ * + * An ICU pattern represents the information of the following DateTimeFormat + * internal properties described in the specification, which therefore don't + * exist separately in the implementation: + * - [[weekday]], [[era]], [[year]], [[month]], [[day]], [[hour]], [[minute]], + * [[second]], [[timeZoneName]] + * - [[hour12]] + * - [[hourCycle]] + * - [[hourNo0]] + * When needed for the resolvedOptions method, the resolveICUPattern function + * queries the UDateFormat's internal pattern and then maps it back to the + * specified properties of the object returned by resolvedOptions. + * + * ICU date-time skeletons and patterns aren't fully documented in the ICU + * documentation (see http://bugs.icu-project.org/trac/ticket/9627). The best + * documentation at this point is in UTR 35: + * http://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns + * + * Future support for ICU4X + * ======================== + * This implementation exposes a components bag, and internally handles the + * complexity of working with skeletons and patterns to generate the correct + * results. In the future, if and when we switch to ICU4X, the complexities of + * manipulating patterns will be able to be removed, as ICU4X will directly know + * how to apply the components bag. + */ +class DateTimeFormat final { + public: + /** + * The hour cycle for components. + */ + enum class HourCycle { + H11, + H12, + H23, + H24, + }; + + /** + * The style for dates or times. + */ + enum class Style { + Full, + Long, + Medium, + Short, + }; + + /** + * A bag of options to determine the length of the time and date styles. The + * hour cycle can be overridden. + */ + struct StyleBag { + Maybe<Style> date = Nothing(); + Maybe<Style> time = Nothing(); + Maybe<HourCycle> hourCycle = Nothing(); + Maybe<bool> hour12 = Nothing(); + }; + + /** + * How to display numeric components such as the year and the day.
+ */ + enum class Numeric { + Numeric, + TwoDigit, + }; + + /** + * How to display the text components, such as the weekday or day period. + */ + enum class Text { + Long, + Short, + Narrow, + }; + + /** + * How to display the month. + */ + enum class Month { + Numeric, + TwoDigit, + Long, + Short, + Narrow, + }; + + /** + * How to display the time zone name. + */ + enum class TimeZoneName { + Long, + Short, + ShortOffset, + LongOffset, + ShortGeneric, + LongGeneric, + }; + + /** + * Get static strings representing the enums. These match ECMA-402's resolved + * options. + * https://tc39.es/ecma402/#sec-intl.datetimeformat.prototype.resolvedoptions + */ + static const char* ToString(DateTimeFormat::HourCycle aHourCycle); + static const char* ToString(DateTimeFormat::Style aStyle); + static const char* ToString(DateTimeFormat::Numeric aNumeric); + static const char* ToString(DateTimeFormat::Text aText); + static const char* ToString(DateTimeFormat::Month aMonth); + static const char* ToString(DateTimeFormat::TimeZoneName aTimeZoneName); + + /** + * A components bag specifies the components used to display a DateTime. Each + * component can be styled individually, and ICU will attempt to create a best + * match for a given locale. + */ + struct ComponentsBag { + Maybe<Text> era = Nothing(); + Maybe<Numeric> year = Nothing(); + Maybe<Month> month = Nothing(); + Maybe<Numeric> day = Nothing(); + Maybe<Text> weekday = Nothing(); + Maybe<Numeric> hour = Nothing(); + Maybe<Numeric> minute = Nothing(); + Maybe<Numeric> second = Nothing(); + Maybe<TimeZoneName> timeZoneName = Nothing(); + Maybe<bool> hour12 = Nothing(); + Maybe<HourCycle> hourCycle = Nothing(); + Maybe<Text> dayPeriod = Nothing(); + Maybe<uint8_t> fractionalSecondDigits = Nothing(); + }; + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. 
+ DateTimeFormat(const DateTimeFormat&) = delete; + DateTimeFormat& operator=(const DateTimeFormat&) = delete; + + // mozilla::Vector can avoid heap allocations for small transient buffers. + using PatternVector = Vector<char16_t, 128>; + using SkeletonVector = Vector<char16_t, 16>; + + /** + * Create a DateTimeFormat from styles. + * + * The "style" model uses different options for formatting a date or time + * based on how the result will be styled, rather than picking specific + * fields or lengths. + * + * Takes an optional time zone which will override the user's default + * time zone. This is a UTF-16 string that takes the form "GMT±hh:mm", or + * an IANA time zone identifier, e.g. "America/Chicago". + */ + static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromStyle( + Span<const char> aLocale, const StyleBag& aStyleBag, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); + + private: + /** + * Create a DateTimeFormat from a UTF-16 skeleton. + * + * A skeleton is an unordered list of fields that are used to find an + * appropriate date time format pattern. Example skeletons would be "yMd", + * "yMMMd", "EBhm". If the skeleton includes string literals or other + * information, it will be discarded when matching against skeletons. + * + * Takes an optional time zone which will override the user's default + * time zone. This is a string that takes the form "GMT±hh:mm", or + * an IANA time zone identifier, e.g. "America/Chicago". + */ + static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromSkeleton( + Span<const char> aLocale, Span<const char16_t> aSkeleton, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe<DateTimeFormat::HourCycle> aHourCycle, + Maybe<Span<const char16_t>> aTimeZoneOverride); + + public: + /** + * Create a DateTimeFormat from a ComponentsBag. + * + * See the ComponentsBag for additional documentation. 
+ * + * Takes an optional time zone which will override the user's default + * time zone. This is a string that takes the form "GMT±hh:mm", or + * an IANA time zone identifier, e.g. "America/Chicago". + */ + static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromComponents( + Span<const char> aLocale, const ComponentsBag& bag, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); + + /** + * Create a DateTimeFormat from a raw pattern. + * + * Warning: This method should not be added to new code. In the near future we + * plan to remove it. + */ + static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromPattern( + Span<const char> aLocale, Span<const char16_t> aPattern, + Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); + + /** + * Use the format settings to format a date time into a string. The non-null + * terminated string will be placed into the provided buffer. The idea behind + * this API is that the constructor is expensive, and then the format + * operation is cheap. + * + * aUnixEpoch is the number of milliseconds since 1 January 1970, UTC. + */ + template <typename B> + ICUResult TryFormat(double aUnixEpoch, B& aBuffer) const { + static_assert( + std::is_same_v<typename B::CharType, unsigned char> || + std::is_same_v<typename B::CharType, char> || + std::is_same_v<typename B::CharType, char16_t>, + "The only buffer CharTypes supported by DateTimeFormat are char " + "(for UTF-8 support) and char16_t (for UTF-16 support)."); + + if constexpr (std::is_same_v<typename B::CharType, char> || + std::is_same_v<typename B::CharType, unsigned char>) { + // The output buffer is UTF-8, but ICU uses UTF-16 internally. + + // Write the formatted date into the u16Buffer. 
+ PatternVector u16Vec; + + auto result = FillBufferWithICUCall( + u16Vec, [this, &aUnixEpoch](UChar* target, int32_t length, + UErrorCode* status) { + return udat_format(mDateFormat, aUnixEpoch, target, length, + /* UFieldPosition* */ nullptr, status); + }); + if (result.isErr()) { + return result; + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + for (auto& c : u16Vec) { + if (IsSpecialSpace(c)) { + c = ' '; + } + } +#endif + + if (!FillBuffer(u16Vec, aBuffer)) { + return Err(ICUError::OutOfMemory); + } + return Ok{}; + } else { + static_assert(std::is_same_v<typename B::CharType, char16_t>); + + // The output buffer is UTF-16. ICU can output directly into this buffer. + auto result = FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return udat_format(mDateFormat, aUnixEpoch, target, length, nullptr, + status); + }); + if (result.isErr()) { + return result; + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + for (auto& c : Span(aBuffer.data(), aBuffer.length())) { + if (IsSpecialSpace(c)) { + c = ' '; + } + } +#endif + + return Ok{}; + } + }; + + /** + * Format the Unix epoch time into a DateTimePartVector. + * + * The caller has to create the buffer and the vector and pass to this method. + * The formatted string will be stored in the buffer and formatted parts in + * the vector. + * + * aUnixEpoch is the number of milliseconds since 1 January 1970, UTC. 
+ * + * See: + * https://tc39.es/ecma402/#sec-formatdatetimetoparts + */ + template <typename B> + ICUResult TryFormatToParts(double aUnixEpoch, B& aBuffer, + DateTimePartVector& aParts) const { + static_assert(std::is_same_v<typename B::CharType, char16_t>, + "Only char16_t is supported (for UTF-16 support) now."); + + UErrorCode status = U_ZERO_ERROR; + UFieldPositionIterator* fpositer = ufieldpositer_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + auto result = FillBufferWithICUCall( + aBuffer, [this, aUnixEpoch, fpositer](UChar* chars, int32_t size, + UErrorCode* status) { + return udat_formatForFields(mDateFormat, aUnixEpoch, chars, size, + fpositer, status); + }); + if (result.isErr()) { + ufieldpositer_close(fpositer); + return result.propagateErr(); + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + for (auto& c : Span(aBuffer.data(), aBuffer.length())) { + if (IsSpecialSpace(c)) { + c = ' '; + } + } +#endif + + return TryFormatToParts(fpositer, aBuffer.length(), aParts); + } + + /** + * Copies the pattern for the current DateTimeFormat to a buffer. + * + * Warning: This method should not be added to new code. In the near future we + * plan to remove it. + */ + template <typename B> + ICUResult GetPattern(B& aBuffer) const { + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return udat_toPattern(mDateFormat, /* localized*/ false, target, + length, status); + }); + } + + /** + * Copies the skeleton that was used to generate the current DateTimeFormat to + * the given buffer. If no skeleton was used, then a skeleton is generated + * from the resolved pattern. Note that going from skeleton -> resolved + * pattern -> skeleton is not a 1:1 mapping, as the resolved pattern can + * contain different symbols than the requested skeleton. + * + * Warning: This method should not be added to new code. In the near future we + * plan to remove it. 
+ */ + template <typename B> + ICUResult GetOriginalSkeleton(B& aBuffer) { + static_assert(std::is_same_v<typename B::CharType, char16_t>); + if (mOriginalSkeleton.length() == 0) { + // Generate a skeleton from the resolved pattern, there was no originally + // cached skeleton. + PatternVector pattern{}; + VectorToBufferAdaptor buffer(pattern); + MOZ_TRY(GetPattern(buffer)); + + VectorToBufferAdaptor skeleton(mOriginalSkeleton); + MOZ_TRY(DateTimePatternGenerator::GetSkeleton(pattern, skeleton)); + } + + if (!FillBuffer(mOriginalSkeleton, aBuffer)) { + return Err(ICUError::OutOfMemory); + } + return Ok(); + } + /** + * Set the start time of the Gregorian calendar. This is useful for + * ensuring the consistent use of a proleptic Gregorian calendar for ECMA-402. + * https://en.wikipedia.org/wiki/Proleptic_Gregorian_calendar + */ + void SetStartTimeIfGregorian(double aTime); + + /** + * Determines the resolved components for the current DateTimeFormat. + * + * When a DateTimeFormat is created, even from a components bag, the resolved + * formatter may tweak the resolved components depending on the configuration + * and the locale. + * + * For the implementation, with ICU4C, this takes a string pattern and maps it + * back to a ComponentsBag. + */ + Result<ComponentsBag, ICUError> ResolveComponents(); + + ~DateTimeFormat(); + + /** + * Clones the Calendar from a DateTimeFormat, and sets its time with the + * relative milliseconds since 1 January 1970, UTC. + */ + Result<UniquePtr<Calendar>, ICUError> CloneCalendar(double aUnixEpoch) const; + + /** + * Return the hour cycle used in the input pattern or Nothing if none was + * found. + */ + static Maybe<DateTimeFormat::HourCycle> HourCycleFromPattern( + Span<const char16_t> aPattern); + + using HourCyclesVector = Vector<HourCycle, 4>; + + /** + * Returns the allowed hour cycles for the input locale. + * + * NOTE: This function currently takes a language subtag and an optional + * region subtag. 
This is a restriction until bug 1719746 has migrated + * language tag processing into the unified Intl component. After bug 1719746, + * this function should be changed to accept a single locale tag. + */ + static Result<HourCyclesVector, ICUError> GetAllowedHourCycles( + Span<const char> aLanguage, Maybe<Span<const char>> aRegion); + + /** + * Returns an iterator over all supported date-time formatter locales. + * + * The returned strings are ICU locale identifiers and NOT BCP 47 language + * tags. + * + * Also see <https://unicode-org.github.io/icu/userguide/locale>. + */ + static auto GetAvailableLocales() { + return AvailableLocalesEnumeration<udat_countAvailable, + udat_getAvailable>(); + } + + private: + explicit DateTimeFormat(UDateFormat* aDateFormat); + + ICUResult CacheSkeleton(Span<const char16_t> aSkeleton); + + ICUResult TryFormatToParts(UFieldPositionIterator* aFieldPositionIterator, + size_t aSpanSize, + DateTimePartVector& aParts) const; + /** + * Replaces all hour pattern characters in |aPatternOrSkeleton| to use the + * matching hour representation for |aHourCycle|. + */ + static void ReplaceHourSymbol(Span<char16_t> aPatternOrSkeleton, + DateTimeFormat::HourCycle aHourCycle); + + /** + * Find a matching pattern using the requested hour-12 options. + * + * This function is needed to work around the following two issues. + * - https://unicode-org.atlassian.net/browse/ICU-21023 + * - https://unicode-org.atlassian.net/browse/CLDR-13425 + * + * We're currently using a relatively simple workaround, which doesn't give + * the most accurate results.
For example: + * + * ``` + * var dtf = new Intl.DateTimeFormat("en", { + * timeZone: "UTC", + * dateStyle: "long", + * timeStyle: "long", + * hourCycle: "h12", + * }); + * print(dtf.format(new Date("2020-01-01T00:00Z"))); + * ``` + * + * Returns the pattern "MMMM d, y 'at' h:mm:ss a z", but when going through + * |DateTimePatternGenerator::GetSkeleton| and then + * |DateTimePatternGenerator::GetBestPattern| to find an equivalent pattern + * for "h23", we'll end up with the pattern "MMMM d, y, HH:mm:ss z", so the + * combinator element " 'at' " was lost in the process. + */ + static ICUResult FindPatternWithHourCycle( + DateTimePatternGenerator& aDateTimePatternGenerator, + DateTimeFormat::PatternVector& aPattern, bool aHour12, + DateTimeFormat::SkeletonVector& aSkeleton); + + UDateFormat* mDateFormat = nullptr; + + SkeletonVector mOriginalSkeleton; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/DateTimeFormatUtils.cpp b/intl/components/src/DateTimeFormatUtils.cpp new file mode 100644 index 0000000000..fd0649461e --- /dev/null +++ b/intl/components/src/DateTimeFormatUtils.cpp @@ -0,0 +1,104 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/Assertions.h" + +#include "DateTimeFormatUtils.h" + +namespace mozilla::intl { + +DateTimePartType ConvertUFormatFieldToPartType(UDateFormatField fieldName) { + // See intl/icu/source/i18n/unicode/udat.h for a detailed field list. This + // switch is deliberately exhaustive: cases might have to be added/removed + // if this code is compiled with a different ICU with more + // UDateFormatField enum initializers. Please guard such cases with + // appropriate ICU version-testing #ifdefs, should cross-version divergence + // occur. 
+ switch (fieldName) { + case UDAT_ERA_FIELD: + return DateTimePartType::Era; + + case UDAT_YEAR_FIELD: + case UDAT_YEAR_WOY_FIELD: + case UDAT_EXTENDED_YEAR_FIELD: + return DateTimePartType::Year; + + case UDAT_YEAR_NAME_FIELD: + return DateTimePartType::YearName; + + case UDAT_MONTH_FIELD: + case UDAT_STANDALONE_MONTH_FIELD: + return DateTimePartType::Month; + + case UDAT_DATE_FIELD: + case UDAT_JULIAN_DAY_FIELD: + return DateTimePartType::Day; + + case UDAT_HOUR_OF_DAY1_FIELD: + case UDAT_HOUR_OF_DAY0_FIELD: + case UDAT_HOUR1_FIELD: + case UDAT_HOUR0_FIELD: + return DateTimePartType::Hour; + + case UDAT_MINUTE_FIELD: + return DateTimePartType::Minute; + + case UDAT_SECOND_FIELD: + return DateTimePartType::Second; + + case UDAT_DAY_OF_WEEK_FIELD: + case UDAT_STANDALONE_DAY_FIELD: + case UDAT_DOW_LOCAL_FIELD: + case UDAT_DAY_OF_WEEK_IN_MONTH_FIELD: + return DateTimePartType::Weekday; + + case UDAT_AM_PM_FIELD: + case UDAT_FLEXIBLE_DAY_PERIOD_FIELD: + return DateTimePartType::DayPeriod; + + case UDAT_TIMEZONE_FIELD: + case UDAT_TIMEZONE_GENERIC_FIELD: + case UDAT_TIMEZONE_LOCALIZED_GMT_OFFSET_FIELD: + return DateTimePartType::TimeZoneName; + + case UDAT_FRACTIONAL_SECOND_FIELD: + return DateTimePartType::FractionalSecondDigits; + +#ifndef U_HIDE_INTERNAL_API + case UDAT_RELATED_YEAR_FIELD: + return DateTimePartType::RelatedYear; +#endif + + case UDAT_DAY_OF_YEAR_FIELD: + case UDAT_WEEK_OF_YEAR_FIELD: + case UDAT_WEEK_OF_MONTH_FIELD: + case UDAT_MILLISECONDS_IN_DAY_FIELD: + case UDAT_TIMEZONE_RFC_FIELD: + case UDAT_QUARTER_FIELD: + case UDAT_STANDALONE_QUARTER_FIELD: + case UDAT_TIMEZONE_SPECIAL_FIELD: + case UDAT_TIMEZONE_ISO_FIELD: + case UDAT_TIMEZONE_ISO_LOCAL_FIELD: + case UDAT_AM_PM_MIDNIGHT_NOON_FIELD: +#ifndef U_HIDE_INTERNAL_API + case UDAT_TIME_SEPARATOR_FIELD: +#endif + // These fields are all unsupported. 
+ return DateTimePartType::Unknown; + +#ifndef U_HIDE_DEPRECATED_API + case UDAT_FIELD_COUNT: + MOZ_ASSERT_UNREACHABLE( + "format field sentinel value returned by " + "iterator!"); +#endif + } + + MOZ_ASSERT_UNREACHABLE( + "unenumerated, undocumented format field returned " + "by iterator"); + return DateTimePartType::Unknown; +} + +} // namespace mozilla::intl diff --git a/intl/components/src/DateTimeFormatUtils.h b/intl/components/src/DateTimeFormatUtils.h new file mode 100644 index 0000000000..89187b9871 --- /dev/null +++ b/intl/components/src/DateTimeFormatUtils.h @@ -0,0 +1,14 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_DateTimeFormatUtils_h_ +#define intl_components_DateTimeFormatUtils_h_ +#include "unicode/udat.h" + +#include "mozilla/intl/DateTimePart.h" + +namespace mozilla::intl { +DateTimePartType ConvertUFormatFieldToPartType(UDateFormatField fieldName); +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/DateTimePart.h b/intl/components/src/DateTimePart.h new file mode 100644 index 0000000000..4de2c22996 --- /dev/null +++ b/intl/components/src/DateTimePart.h @@ -0,0 +1,84 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef intl_components_DateTimePart_h_ +#define intl_components_DateTimePart_h_ + +#include <cstddef> +#include <cstdint> + +#include "mozilla/Vector.h" + +namespace mozilla::intl { + +enum class DateTimePartType : int16_t { + Literal, + Weekday, + Era, + Year, + YearName, + RelatedYear, + Month, + Day, + DayPeriod, + Hour, + Minute, + Second, + FractionalSecondDigits, + TimeZoneName, + Unknown +}; + +enum class DateTimePartSource : int16_t { Shared, StartRange, EndRange }; + +/** + * The 'Part' object defined in FormatDateTimeToParts and + * FormatDateTimeRangeToParts + * + * Each part consists of three properties: ||Type||, ||Value|| and ||Source||, + * with the ||Source|| property set to DateTimePartSource::Shared by default. + * (Note: From the spec, the part from FormatDateTimeToParts doesn't have the + * ||Source|| property, so if the caller is FormatDateTimeToParts, it should + * ignore the ||Source|| property). + * + * To store DateTimePart more efficiently, it doesn't store the ||Value|| of + * type string in this struct. Instead, it stores the end index of the string + * in the buffer (which is passed to DateTimeFormat::TryFormatToParts() or + * can be obtained by calling AutoFormattedDateInterval::ToSpan()). The begin index + * of the ||Value|| is the mEndIndex of the previous part. + * + * Buffer + * 0 i j + * +---------------+---------------+---------------+ + * | Part[0].Value | Part[1].Value | Part[2].Value | .... + * +---------------+---------------+---------------+ + * + * Part[0].mEndIndex is i. Part[0].Value is stored in the Buffer[0..i]. + * Part[1].mEndIndex is j. Part[1].Value is stored in the Buffer[i..j].
+ * + * See: + * https://tc39.es/ecma402/#sec-formatdatetimetoparts + * https://tc39.es/ecma402/#sec-formatdatetimerangetoparts + */ +struct DateTimePart { + DateTimePart(DateTimePartType type, size_t endIndex, + DateTimePartSource source) + : mEndIndex(endIndex), mType(type), mSource(source) {} + + // See the above comments for details, mEndIndex is placed first for reducing + // padding. + size_t mEndIndex; + DateTimePartType mType; + DateTimePartSource mSource; +}; + +// The common parts are 'month', 'literal', 'day', 'literal', 'year', 'literal', +// 'hour', 'literal', 'minute', 'literal', which are 10 parts, for DateTimeRange +// the number will be doubled, so choosing 32 as the initial length to prevent +// heap allocation. +constexpr size_t INITIAL_DATETIME_PART_VECTOR_SIZE = 32; +using DateTimePartVector = + mozilla::Vector<DateTimePart, INITIAL_DATETIME_PART_VECTOR_SIZE>; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/DateTimePatternGenerator.cpp b/intl/components/src/DateTimePatternGenerator.cpp new file mode 100644 index 0000000000..4362061172 --- /dev/null +++ b/intl/components/src/DateTimePatternGenerator.cpp @@ -0,0 +1,49 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "mozilla/intl/DateTimePatternGenerator.h" + +namespace mozilla::intl { + +DateTimePatternGenerator::~DateTimePatternGenerator() { + // The mGenerator will not exist when the DateTimePatternGenerator is being + // moved. 
+ if (mGenerator) { + udatpg_close(mGenerator.GetMut()); + } +} + +/* static */ +Result<UniquePtr<DateTimePatternGenerator>, ICUError> +DateTimePatternGenerator::TryCreate(const char* aLocale) { + UErrorCode status = U_ZERO_ERROR; + UDateTimePatternGenerator* generator = + udatpg_open(IcuLocale(aLocale), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return MakeUnique<DateTimePatternGenerator>(generator); +}; + +DateTimePatternGenerator::DateTimePatternGenerator( + DateTimePatternGenerator&& other) noexcept + : mGenerator(other.mGenerator.GetMut()) { + other.mGenerator = nullptr; +} + +DateTimePatternGenerator& DateTimePatternGenerator::operator=( + DateTimePatternGenerator&& other) noexcept { + if (this == &other) { + return *this; + } + + if (mGenerator) { + udatpg_close(mGenerator.GetMut()); + } + mGenerator = other.mGenerator.GetMut(); + other.mGenerator = nullptr; + + return *this; +} + +} // namespace mozilla::intl diff --git a/intl/components/src/DateTimePatternGenerator.h b/intl/components/src/DateTimePatternGenerator.h new file mode 100644 index 0000000000..d9d6de3928 --- /dev/null +++ b/intl/components/src/DateTimePatternGenerator.h @@ -0,0 +1,161 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_DateTimePatternGenerator_h_ +#define intl_components_DateTimePatternGenerator_h_ + +#include "unicode/udatpg.h" +#include "mozilla/EnumSet.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" + +namespace mozilla::intl { + +class DisplayNames; + +/** + * The DateTimePatternGenerator is the machinery used to work with DateTime + * pattern manipulation. It is expensive to create one, and so generally it is + * created once and then cached. 
It may need to be passed in as an argument + * for different mozilla::intl APIs. + */ +class DateTimePatternGenerator final { + public: + explicit DateTimePatternGenerator(UDateTimePatternGenerator* aGenerator) + : mGenerator(aGenerator) { + MOZ_ASSERT(aGenerator); + }; + + // Transfer ownership of the UDateTimePatternGenerator in the move + // constructor. + DateTimePatternGenerator(DateTimePatternGenerator&& other) noexcept; + + // Transfer ownership of the UDateTimePatternGenerator in the move + // assignment operator. + DateTimePatternGenerator& operator=( + DateTimePatternGenerator&& other) noexcept; + + // Disallow copy. + DateTimePatternGenerator(const DateTimePatternGenerator&) = delete; + DateTimePatternGenerator& operator=(const DateTimePatternGenerator&) = delete; + + ~DateTimePatternGenerator(); + + static Result<UniquePtr<DateTimePatternGenerator>, ICUError> TryCreate( + const char* aLocale); + + enum class PatternMatchOption { + /** + * Adjust the 'hour' field in the resolved pattern to match the input + * skeleton width. + */ + HourField, + + /** + * Adjust the 'minute' field in the resolved pattern to match the input + * skeleton width. + */ + MinuteField, + + /** + * Adjust the 'second' field in the resolved pattern to match the input + * skeleton width. + */ + SecondField, + }; + + /** + * Given a skeleton (a string with unordered datetime fields), get a best + * pattern that will fit for that locale. This pattern will be filled into the + * buffer. e.g. The skeleton "yMd" would return the pattern "M/d/y" for en-US, + * or "dd/MM/y" for en-GB.
+ */ + template <typename B> + ICUResult GetBestPattern(Span<const char16_t> aSkeleton, B& aBuffer, + EnumSet<PatternMatchOption> options = {}) { + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return udatpg_getBestPatternWithOptions( + mGenerator.GetMut(), aSkeleton.data(), + static_cast<int32_t>(aSkeleton.Length()), + toUDateTimePatternMatchOptions(options), target, length, status); + }); + } + + /** + * Get a skeleton (a string with unordered datetime fields) from a pattern. + * For example, both "MMM-dd" and "dd/MMM" produce the skeleton "MMMdd". + */ + template <typename B> + static ICUResult GetSkeleton(Span<const char16_t> aPattern, B& aBuffer) { + // At one time udatpg_getSkeleton required a UDateTimePatternGenerator*, but + // now it is valid to pass in a nullptr. + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return udatpg_getSkeleton(nullptr, aPattern.data(), + static_cast<int32_t>(aPattern.Length()), + target, length, status); + }); + } + + /** + * Get a pattern of the form "{1} {0}" to combine separate date and time + * patterns into a single pattern. The "{0}" part is the placeholder for the + * time pattern and "{1}" is the placeholder for the date pattern. + * + * See dateTimeFormat from + * https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat + * + * Note: + * In CLDR, it's called Date-Time Combined Format + * https://cldr.unicode.org/translation/date-time/datetime-patterns#h.x7ca7qwzh4m + * + * The naming 'placeholder pattern' is from ICU4X. 
+ * https://unicode-org.github.io/icu4x-docs/doc/icu_pattern/index.html + */ + Span<const char16_t> GetPlaceholderPattern() const { + int32_t length; + const char16_t* combined = + udatpg_getDateTimeFormat(mGenerator.GetConst(), &length); + return Span{combined, static_cast<size_t>(length)}; + } + + private: + // Allow other mozilla::intl components to access the underlying + // UDateTimePatternGenerator. + friend class DisplayNames; + + UDateTimePatternGenerator* GetUDateTimePatternGenerator() { + return mGenerator.GetMut(); + } + + ICUPointer<UDateTimePatternGenerator> mGenerator = + ICUPointer<UDateTimePatternGenerator>(nullptr); + + static UDateTimePatternMatchOptions toUDateTimePatternMatchOptions( + EnumSet<PatternMatchOption> options) { + struct OptionMap { + PatternMatchOption from; + UDateTimePatternMatchOptions to; + } static constexpr map[] = { + {PatternMatchOption::HourField, UDATPG_MATCH_HOUR_FIELD_LENGTH}, +#ifndef U_HIDE_INTERNAL_API + {PatternMatchOption::MinuteField, UDATPG_MATCH_MINUTE_FIELD_LENGTH}, + {PatternMatchOption::SecondField, UDATPG_MATCH_SECOND_FIELD_LENGTH}, +#endif + }; + + UDateTimePatternMatchOptions result = UDATPG_MATCH_NO_OPTIONS; + for (const auto& entry : map) { + if (options.contains(entry.from)) { + result = UDateTimePatternMatchOptions(result | entry.to); + } + } + return result; + } +}; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/DisplayNames.cpp b/intl/components/src/DisplayNames.cpp new file mode 100644 index 0000000000..252969ccbb --- /dev/null +++ b/intl/components/src/DisplayNames.cpp @@ -0,0 +1,234 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "mozilla/intl/DisplayNames.h" +#include "ScopedICUObject.h" + +namespace mozilla::intl { + +DisplayNames::~DisplayNames() { + // The mDisplayNames will not exist when the DisplayNames is being + // moved. + if (auto* uldn = mULocaleDisplayNames.GetMut()) { + uldn_close(uldn); + } +} + +DisplayNamesError DisplayNames::ToError(ICUError aError) const { + switch (aError) { + case ICUError::InternalError: + case ICUError::OverflowError: + return DisplayNamesError::InternalError; + case ICUError::OutOfMemory: + return DisplayNamesError::OutOfMemory; + } + MOZ_ASSERT_UNREACHABLE(); + return DisplayNamesError::InternalError; +} + +DisplayNamesError DisplayNames::ToError( + Locale::CanonicalizationError aError) const { + switch (aError) { + case Locale::CanonicalizationError::DuplicateVariant: + return DisplayNamesError::DuplicateVariantSubtag; + case Locale::CanonicalizationError::InternalError: + return DisplayNamesError::InternalError; + case Locale::CanonicalizationError::OutOfMemory: + return DisplayNamesError::OutOfMemory; + } + MOZ_ASSERT_UNREACHABLE(); + return DisplayNamesError::InternalError; +} + +/* static */ +Result<UniquePtr<DisplayNames>, ICUError> DisplayNames::TryCreate( + const char* aLocale, Options aOptions) { + UErrorCode status = U_ZERO_ERROR; + UDisplayContext contexts[] = { + // Use either standard or dialect names. + // For example either "English (GB)" or "British English". + aOptions.languageDisplay == DisplayNames::LanguageDisplay::Standard + ? UDISPCTX_STANDARD_NAMES + : UDISPCTX_DIALECT_NAMES, + + // Assume the display names are used in a stand-alone context. + UDISPCTX_CAPITALIZATION_FOR_STANDALONE, + + // Select either the long or short form. There's no separate narrow form + // available in ICU, therefore we equate "narrow"/"short" styles here. + aOptions.style == DisplayNames::Style::Long ? UDISPCTX_LENGTH_FULL + : UDISPCTX_LENGTH_SHORT, + + // Don't apply substitutes, because we need to apply our own fallbacks. 
+ UDISPCTX_NO_SUBSTITUTE, + }; + + const char* locale = IcuLocale(aLocale); + + ULocaleDisplayNames* uLocaleDisplayNames = + uldn_openForContext(locale, contexts, std::size(contexts), &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return MakeUnique<DisplayNames>(uLocaleDisplayNames, MakeStringSpan(locale), + aOptions); +}; + +#ifdef DEBUG +static bool IsStandaloneMonth(UDateFormatSymbolType symbolType) { + switch (symbolType) { + case UDAT_STANDALONE_MONTHS: + case UDAT_STANDALONE_SHORT_MONTHS: + case UDAT_STANDALONE_NARROW_MONTHS: + return true; + + case UDAT_ERAS: + case UDAT_MONTHS: + case UDAT_SHORT_MONTHS: + case UDAT_WEEKDAYS: + case UDAT_SHORT_WEEKDAYS: + case UDAT_AM_PMS: + case UDAT_LOCALIZED_CHARS: + case UDAT_ERA_NAMES: + case UDAT_NARROW_MONTHS: + case UDAT_NARROW_WEEKDAYS: + case UDAT_STANDALONE_WEEKDAYS: + case UDAT_STANDALONE_SHORT_WEEKDAYS: + case UDAT_STANDALONE_NARROW_WEEKDAYS: + case UDAT_QUARTERS: + case UDAT_SHORT_QUARTERS: + case UDAT_STANDALONE_QUARTERS: + case UDAT_STANDALONE_SHORT_QUARTERS: + case UDAT_SHORTER_WEEKDAYS: + case UDAT_STANDALONE_SHORTER_WEEKDAYS: + case UDAT_CYCLIC_YEARS_WIDE: + case UDAT_CYCLIC_YEARS_ABBREVIATED: + case UDAT_CYCLIC_YEARS_NARROW: + case UDAT_ZODIAC_NAMES_WIDE: + case UDAT_ZODIAC_NAMES_ABBREVIATED: + case UDAT_ZODIAC_NAMES_NARROW: + case UDAT_NARROW_QUARTERS: + case UDAT_STANDALONE_NARROW_QUARTERS: + return false; + } + + MOZ_ASSERT_UNREACHABLE("unenumerated, undocumented symbol type"); + return false; +} +#endif + +Result<Ok, DisplayNamesError> DisplayNames::ComputeDateTimeDisplayNames( + UDateFormatSymbolType symbolType, mozilla::Span<const int32_t> indices, + Span<const char> aCalendar) { + if (!mDateTimeDisplayNames.empty()) { + // No need to re-compute the display names. + return Ok(); + } + mozilla::intl::Locale tag; + // Do not use mLocale.AsSpan() as it includes the null terminator inside the + // span. 
+ if (LocaleParser::TryParse(Span(mLocale.Elements(), mLocale.Length() - 1), + tag) + .isErr()) { + return Err(DisplayNamesError::InvalidLanguageTag); + } + + if (!aCalendar.empty()) { + // Add the calendar extension to the locale. This is only available via + // the MozExtension. + Vector<char, 32> extension; + Span<const char> prefix = MakeStringSpan("u-ca-"); + if (!extension.append(prefix.data(), prefix.size()) || + !extension.append(aCalendar.data(), aCalendar.size())) { + return Err(DisplayNamesError::OutOfMemory); + } + // This overwrites any other Unicode extensions, but should be okay to do + // here. + if (auto result = tag.SetUnicodeExtension(extension); result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + } + + constexpr char16_t* timeZone = nullptr; + constexpr int32_t timeZoneLength = 0; + + constexpr char16_t* pattern = nullptr; + constexpr int32_t patternLength = 0; + + Vector<char, DisplayNames::LocaleVecLength> localeWithCalendar; + VectorToBufferAdaptor buffer(localeWithCalendar); + if (auto result = tag.ToString(buffer); result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + if (!localeWithCalendar.append('\0')) { + return Err(DisplayNamesError::OutOfMemory); + } + + UErrorCode status = U_ZERO_ERROR; + UDateFormat* fmt = udat_open( + UDAT_DEFAULT, UDAT_DEFAULT, + IcuLocale( + // IcuLocale takes a Span that does not include the null terminator. 
+ Span(localeWithCalendar.begin(), localeWithCalendar.length() - 1)), + timeZone, timeZoneLength, pattern, patternLength, &status); + if (U_FAILURE(status)) { + return Err(DisplayNamesError::InternalError); + } + ScopedICUObject<UDateFormat, udat_close> datToClose(fmt); + + Vector<char16_t, DisplayNames::LocaleVecLength> name; + for (int32_t index : indices) { + auto result = FillBufferWithICUCall(name, [&](UChar* target, int32_t length, + UErrorCode* status) { + return udat_getSymbols(fmt, symbolType, index, target, length, status); + }); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + + // Everything except Undecimber should always have a non-empty name. + MOZ_ASSERT_IF(!IsStandaloneMonth(symbolType) || index != UCAL_UNDECIMBER, + !name.empty()); + + if (!mDateTimeDisplayNames.emplaceBack(Span(name.begin(), name.length()))) { + return Err(DisplayNamesError::OutOfMemory); + } + } + return Ok(); +} + +Span<const char> DisplayNames::ToCodeString(Month aMonth) { + switch (aMonth) { + case Month::January: + return MakeStringSpan("1"); + case Month::February: + return MakeStringSpan("2"); + case Month::March: + return MakeStringSpan("3"); + case Month::April: + return MakeStringSpan("4"); + case Month::May: + return MakeStringSpan("5"); + case Month::June: + return MakeStringSpan("6"); + case Month::July: + return MakeStringSpan("7"); + case Month::August: + return MakeStringSpan("8"); + case Month::September: + return MakeStringSpan("9"); + case Month::October: + return MakeStringSpan("10"); + case Month::November: + return MakeStringSpan("11"); + case Month::December: + return MakeStringSpan("12"); + case Month::Undecimber: + return MakeStringSpan("13"); + } + MOZ_ASSERT_UNREACHABLE(); + return MakeStringSpan("1"); +}; + +} // namespace mozilla::intl diff --git a/intl/components/src/DisplayNames.h b/intl/components/src/DisplayNames.h new file mode 100644 index 0000000000..ae519f61ce --- /dev/null +++ b/intl/components/src/DisplayNames.h @@ 
-0,0 +1,971 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_DisplayNames_h_ +#define intl_components_DisplayNames_h_ + +#include <string> +#include <string_view> +#include "unicode/udat.h" +#include "unicode/udatpg.h" +#include "unicode/uldnames.h" +#include "unicode/uloc.h" +#include "unicode/ucurr.h" +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateTimePatternGenerator.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/Locale.h" +#include "mozilla/Buffer.h" +#include "mozilla/Casting.h" +#include "mozilla/PodOperations.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" +#include "mozilla/UniquePtr.h" + +namespace mozilla::intl { +/** + * Provide more granular errors for DisplayNames rather than use the generic + * ICUError type. This helps with providing more actionable feedback for + * errors with input validation. + * + * This type can't be nested in the DisplayNames class because it needs the + * UnusedZero and HasFreeLSB definitions. + */ +enum class DisplayNamesError { + // Since we claim UnusedZero<DisplayNamesError>::value and + // HasFreeLSB<DisplayNamesError>::value == true below, we must only use + // positive, even enum values. + InternalError = 2, + OutOfMemory = 4, + InvalidOption = 6, + DuplicateVariantSubtag = 8, + InvalidLanguageTag = 10, +}; +} // namespace mozilla::intl + +namespace mozilla::detail { +// Ensure the efficient packing of the error types into the result. See +// ICUError.h and the ICUError comments for more information. 
+template <> +struct UnusedZero<intl::DisplayNamesError> + : UnusedZeroEnum<intl::DisplayNamesError> {}; + +template <> +struct HasFreeLSB<intl::DisplayNamesError> { + static constexpr bool value = true; +}; +} // namespace mozilla::detail + +namespace mozilla::intl { + +// NOTE: The UTS-35 canonical "code" values for months and quarters are 1-based +// integers, so some of the following enums are 1-based for consistency with +// that. For simplicity, we make all of the following enums 1-based, but use +// `EnumToIndex` (see below) to convert to zero based if indexing into internal +// (non-ICU) tables. + +/** + * Month choices for display names. + */ +enum class Month : uint8_t { + January = 1, + February, + March, + April, + May, + June, + July, + August, + September, + October, + November, + December, + // Some calendar systems feature a 13th month. + // https://en.wikipedia.org/wiki/Undecimber + Undecimber +}; + +/** + * Quarter choices for display names. + */ +enum class Quarter : uint8_t { + Q1 = 1, + Q2, + Q3, + Q4, +}; + +/** + * Day period choices for display names. + */ +enum class DayPeriod : uint8_t { + AM = 1, + PM, +}; + +/** + * DateTimeField choices for display names. + */ +enum class DateTimeField : uint8_t { + Era = 1, + Year, + Quarter, + Month, + WeekOfYear, + Weekday, + Day, + DayPeriod, + Hour, + Minute, + Second, + TimeZoneName, +}; + +/** + * DisplayNames provide a way to get the localized names of various types of + * information such as the names of the day of the week, months, currency etc. + * + * This class backs SpiderMonkey's implementation of Intl.DisplayNames + * https://tc39.es/ecma402/#intl-displaynames-objects + */ +class DisplayNames final { + public: + /** + * The style of the display name, specified by the amount of space available + * for displaying the text. + */ + enum class Style { + Narrow, + Short, + Long, + // Note: Abbreviated is not part of ECMA-402, but it is available for + // internal Mozilla usage. 
+ Abbreviated, + }; + + /** + * Use either standard or dialect names for the "Language" type. + */ + enum class LanguageDisplay { + Standard, + Dialect, + }; + + /** + * Determines the fallback behavior if no match is found. + */ + enum class Fallback { + // The buffer will contain an empty string. + None, + // The buffer will contain the code, but typically in a canonicalized form. + Code + }; + + /** + * These options correlate to the ECMA-402 DisplayNames options. The default + * values must match the default initialized values of ECMA-402. The type + * option is omitted as the C++ API relies on directly calling the + * DisplayNames::Get* methods. + * + * https://tc39.es/ecma402/#intl-displaynames-objects + * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/DisplayNames + */ + struct Options { + Style style = Style::Long; + LanguageDisplay languageDisplay = LanguageDisplay::Standard; + }; + + /** + * Wraps an already opened ULocaleDisplayNames handle and stores the options + * together with a null-terminated copy of aLocale. + */ + DisplayNames(ULocaleDisplayNames* aDisplayNames, Span<const char> aLocale, + Options aOptions) + : mOptions(aOptions), mULocaleDisplayNames(aDisplayNames) { + MOZ_ASSERT(aDisplayNames); + + // Copy the span and ensure null termination. + mLocale = Buffer<char>(aLocale.size() + 1); + PodCopy(mLocale.begin(), aLocale.data(), aLocale.size()); + mLocale[aLocale.size()] = '\0'; + } + + /** + * Initialize a new DisplayNames for the provided locale and using the + * provided options. + * + * https://tc39.es/ecma402/#sec-Intl.DisplayNames + */ + static Result<UniquePtr<DisplayNames>, ICUError> TryCreate( + const char* aLocale, Options aOptions); + + // Not copyable or movable + DisplayNames(const DisplayNames&) = delete; + DisplayNames& operator=(const DisplayNames&) = delete; + + ~DisplayNames(); + + /** + * Easily convert to a more specific DisplayNames error. + */ + DisplayNamesError ToError(ICUError aError) const; + + /** + * Easily convert to a more specific DisplayNames error. 
+ */ + DisplayNamesError ToError(Locale::CanonicalizationError aError) const; + + private: + /** + * A helper function to handle the fallback behavior, where if there is a + * fallback the buffer is filled with the "code", often in canonicalized form. + */ + template <typename B, typename Fn> + static Result<Ok, DisplayNamesError> HandleFallback(B& aBuffer, + Fallback aFallback, + Fn aGetFallbackSpan) { + if (aBuffer.length() == 0 && + aFallback == mozilla::intl::DisplayNames::Fallback::Code) { + if (!FillBuffer(aGetFallbackSpan(), aBuffer)) { + return Err(DisplayNamesError::OutOfMemory); + } + } + return Ok(); + } + + /** + * This is a specialized form of the FillBufferWithICUCall for DisplayNames. + * Different APIs report that no display name is found with different + * statuses. This method signals no display name was found by setting the + * buffer to 0. + * + * The display name APIs such as `uldn_scriptDisplayName`, + * `uloc_getDisplayScript`, and `uldn_regionDisplayName` report + * U_ILLEGAL_ARGUMENT_ERROR when no display name was found. In order to + * accomodate fallbacking, return an empty string in this case. + */ + template <typename B, typename F> + static ICUResult FillBufferWithICUDisplayNames( + B& aBuffer, UErrorCode aNoDisplayNameStatus, F aCallback) { + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + int32_t res = aCallback(target, length, status); + + if (*status == aNoDisplayNameStatus) { + *status = U_ZERO_ERROR; + res = 0; + } + return res; + }); + } + + /** + * An internal helper to compute the list of display names for various + * DateTime options. + */ + Result<Ok, DisplayNamesError> ComputeDateTimeDisplayNames( + UDateFormatSymbolType symbolType, mozilla::Span<const int32_t> indices, + Span<const char> aCalendar); + + // The following are the stack-allocated sizes for various strings using the + // mozilla::Vector. 
The numbers should be large enough to fit the common + // cases, and when the strings are too large they will fall back to heap + // allocations. + + // Fit BCP 47 locales such as "en-US", "zh-Hant". Locales can get quite long, + // but 32 should fit most smaller locales without a lot of extensions. + static constexpr size_t LocaleVecLength = 32; + // Fit calendar names such as "gregory", "buddhist", "islamic-civil". + // "islamic-umalqura" is 16 bytes + 1 for null termination, so round up to 32. + static constexpr size_t CalendarVecLength = 32; + + /** + * Given an ASCII alpha, convert it to upper case. + */ + static inline char16_t AsciiAlphaToUpperCase(char16_t aCh) { + MOZ_ASSERT(IsAsciiAlpha(aCh)); + return AsciiToUpperCase(aCh); + }; + + /** + * Attempt to use enums to safely index into an array. + * + * Note: The enums we support here are all defined starting from 1. + */ + template <typename T> + inline int32_t EnumToIndex(size_t aSize, T aEnum) { + size_t index = static_cast<size_t>(aEnum) - 1; + MOZ_RELEASE_ASSERT(index < aSize, + "Enum indexing mismatch for display names."); + return index; + } + + /** + * Convert the month to a numeric code as a string. + */ + static Span<const char> ToCodeString(Month aMonth); + + public: + /** + * Get the localized name of a language. Part of ECMA-402. + * + * Accepts: + * languageCode ["-" scriptCode] ["-" regionCode ] *("-" variant ) + * Where the language code is: + * 1. A two letters ISO 639-1 language code + * https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes + * 2. 
A three letters ISO 639-2 language code + * https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes + * + * Examples: + * "es-ES" => "European Spanish" (en-US), "español de España" (es-ES) + * "zh-Hant" => "Traditional Chinese" (en-US), "chino tradicional" (es-ES) + */ + template <typename B> + Result<Ok, DisplayNamesError> GetLanguage( + B& aBuffer, Span<const char> aLanguage, + Fallback aFallback = Fallback::None) const { + static_assert(std::is_same<typename B::CharType, char16_t>::value); + mozilla::intl::Locale tag; + if (LocaleParser::TryParseBaseName(aLanguage, tag).isErr()) { + return Err(DisplayNamesError::InvalidOption); + } + + { + // ICU always canonicalizes the input locale, but since we know that ICU's + // canonicalization is incomplete, we need to perform our own + // canonicalization to ensure consistent result. + auto result = tag.CanonicalizeBaseName(); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + } + + Vector<char, DisplayNames::LocaleVecLength> tagVec; + { + VectorToBufferAdaptor tagBuffer(tagVec); + auto result = tag.ToString(tagBuffer); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + if (!tagVec.append('\0')) { + // The tag should be null terminated. + return Err(DisplayNamesError::OutOfMemory); + } + } + + auto result = FillBufferWithICUDisplayNames( + aBuffer, U_ILLEGAL_ARGUMENT_ERROR, + [&](UChar* target, int32_t length, UErrorCode* status) { + return uldn_localeDisplayName(mULocaleDisplayNames.GetConst(), + tagVec.begin(), target, length, status); + }); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + + return HandleFallback(aBuffer, aFallback, [&] { + // Remove the null terminator. + return Span(tagVec.begin(), tagVec.length() - 1); + }); + }; + + /** + * Get the localized name of a region. Part of ECMA-402. + * + * Accepts: + * 1. an ISO-3166 two letters: + * https://www.iso.org/iso-3166-country-codes.html + * 2. 
region code, or a three digits UN M49 Geographic Regions. + * https://unstats.un.org/unsd/methodology/m49/ + * + * Examples + * "US" => "United States" (en-US), "Estados Unidos", (es-ES) + * "158" => "Taiwan" (en-US), "Taiwán", (es-ES) + */ + template <typename B> + Result<Ok, DisplayNamesError> GetRegion( + B& aBuffer, Span<const char> aCode, + Fallback aFallback = Fallback::None) const { + static_assert(std::is_same<typename B::CharType, char16_t>::value); + + mozilla::intl::RegionSubtag region; + if (!IsStructurallyValidRegionTag(aCode)) { + return Err(DisplayNamesError::InvalidOption); + } + region.Set(aCode); + + mozilla::intl::Locale tag; + tag.SetLanguage("und"); + tag.SetRegion(region); + + { + // ICU always canonicalizes the input locale, but since we know that ICU's + // canonicalization is incomplete, we need to perform our own + // canonicalization to ensure consistent result. + auto result = tag.CanonicalizeBaseName(); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + } + + MOZ_ASSERT(tag.Region().Present()); + + // Note: ICU requires the region subtag to be in canonical case. + const mozilla::intl::RegionSubtag& canonicalRegion = tag.Region(); + + char regionChars[mozilla::intl::LanguageTagLimits::RegionLength + 1] = {}; + std::copy_n(canonicalRegion.Span().data(), canonicalRegion.Length(), + regionChars); + + auto result = FillBufferWithICUDisplayNames( + aBuffer, U_ILLEGAL_ARGUMENT_ERROR, + [&](UChar* chars, uint32_t size, UErrorCode* status) { + return uldn_regionDisplayName( + mULocaleDisplayNames.GetConst(), regionChars, chars, + AssertedCast<int32_t, uint32_t>(size), status); + }); + + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + + return HandleFallback(aBuffer, aFallback, [&] { + region.ToUpperCase(); + return region.Span(); + }); + } + + /** + * Get the localized name of a currency. Part of ECMA-402. + * + * Accepts: + * A 3-letter ISO 4217 currency code. 
+ * https://en.wikipedia.org/wiki/ISO_4217 + * + * Examples: + * "EUR" => "Euro" (en-US), "euro" (es_ES), "欧元", (zh) + * "JPY" => "Japanese Yen" (en-US), "yen" (es_ES), "日元", (zh) + */ + template <typename B> + Result<Ok, DisplayNamesError> GetCurrency( + B& aBuffer, Span<const char> aCurrency, + Fallback aFallback = Fallback::None) const { + static_assert(std::is_same<typename B::CharType, char16_t>::value); + if (aCurrency.size() != 3) { + return Err(DisplayNamesError::InvalidOption); + } + + if (!mozilla::IsAsciiAlpha(aCurrency[0]) || + !mozilla::IsAsciiAlpha(aCurrency[1]) || + !mozilla::IsAsciiAlpha(aCurrency[2])) { + return Err(DisplayNamesError::InvalidOption); + } + + // Normally this type of operation wouldn't be safe, but ASCII characters + // all take 1 byte in UTF-8 encoding, and can be zero padded to be valid + // UTF-16. Currency codes are all three ASCII letters. + char16_t currency[] = {static_cast<char16_t>(aCurrency[0]), + static_cast<char16_t>(aCurrency[1]), + static_cast<char16_t>(aCurrency[2]), u'\0'}; + + UCurrNameStyle style; + switch (mOptions.style) { + case Style::Long: + style = UCURR_LONG_NAME; + break; + case Style::Abbreviated: + case Style::Short: + style = UCURR_SYMBOL_NAME; + break; + case Style::Narrow: + style = UCURR_NARROW_SYMBOL_NAME; + break; + } + + int32_t length = 0; + UErrorCode status = U_ZERO_ERROR; + const char16_t* name = ucurr_getName(currency, IcuLocale(mLocale), style, + nullptr, &length, &status); + if (U_FAILURE(status)) { + return Err(DisplayNamesError::InternalError); + } + + if (status == U_USING_DEFAULT_WARNING) { + // A resource bundle lookup returned a result from the root locale. + if (aFallback == DisplayNames::Fallback::Code) { + // Return the canonicalized input when no localized currency name was + // found. Canonical case for currency is upper case. 
+ if (!aBuffer.reserve(3)) { + return Err(DisplayNamesError::OutOfMemory); + } + aBuffer.data()[0] = AsciiAlphaToUpperCase(currency[0]); + aBuffer.data()[1] = AsciiAlphaToUpperCase(currency[1]); + aBuffer.data()[2] = AsciiAlphaToUpperCase(currency[2]); + aBuffer.written(3); + } else if (aBuffer.length() != 0) { + // Ensure an empty string is in the buffer when there is no fallback. + aBuffer.written(0); + } + return Ok(); + } + + if (!FillBuffer(Span(name, length), aBuffer)) { + return Err(DisplayNamesError::OutOfMemory); + } + + return Ok(); + } + + /** + * Get the localized name of a script. Part of ECMA-402. + * + * Accepts: + * ECMA-402 expects the ISO-15924 four letters script code. + * https://unicode.org/iso15924/iso15924-codes.html + * e.g. "Latn" + * + * Examples: + * "Cher" => "Cherokee" (en-US), "cherokee" (es-ES) + * "Latn" => "Latin" (en-US), "latino" (es-ES) + */ + template <typename B> + Result<Ok, DisplayNamesError> GetScript( + B& aBuffer, Span<const char> aScript, + Fallback aFallback = Fallback::None) const { + static_assert(std::is_same<typename B::CharType, char16_t>::value); + mozilla::intl::ScriptSubtag script; + if (!IsStructurallyValidScriptTag(aScript)) { + return Err(DisplayNamesError::InvalidOption); + } + script.Set(aScript); + + mozilla::intl::Locale tag; + tag.SetLanguage("und"); + + tag.SetScript(script); + + { + // ICU always canonicalizes the input locale, but since we know that ICU's + // canonicalization is incomplete, we need to perform our own + // canonicalization to ensure consistent result. 
+ auto result = tag.CanonicalizeBaseName(); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + } + + MOZ_ASSERT(tag.Script().Present()); + mozilla::Vector<char, DisplayNames::LocaleVecLength> tagString; + VectorToBufferAdaptor buffer(tagString); + + switch (mOptions.style) { + case Style::Long: { + // |uldn_scriptDisplayName| doesn't use the stand-alone form for script + // subtags, so we're using |uloc_getDisplayScript| instead. (This only + // applies to the long form.) + // + // ICU bug: https://unicode-org.atlassian.net/browse/ICU-9301 + + // |uloc_getDisplayScript| expects a full locale identifier as its + // input. + if (auto result = tag.ToString(buffer); result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + + // Null terminate the tag string. + if (!tagString.append('\0')) { + return Err(DisplayNamesError::OutOfMemory); + } + + auto result = FillBufferWithICUDisplayNames( + aBuffer, U_USING_DEFAULT_WARNING, + [&](UChar* target, int32_t length, UErrorCode* status) { + return uloc_getDisplayScript(tagString.begin(), + IcuLocale(mLocale), target, length, + status); + }); + + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + break; + } + case Style::Abbreviated: + case Style::Short: + case Style::Narrow: { + // Note: ICU requires the script subtag to be in canonical case. 
+ const mozilla::intl::ScriptSubtag& canonicalScript = tag.Script(); + + char scriptChars[mozilla::intl::LanguageTagLimits::ScriptLength + 1] = + {}; + MOZ_ASSERT(canonicalScript.Length() <= + mozilla::intl::LanguageTagLimits::ScriptLength + 1); + std::copy_n(canonicalScript.Span().data(), canonicalScript.Length(), + scriptChars); + + auto result = FillBufferWithICUDisplayNames( + aBuffer, U_ILLEGAL_ARGUMENT_ERROR, + [&](UChar* target, int32_t length, UErrorCode* status) { + return uldn_scriptDisplayName(mULocaleDisplayNames.GetConst(), + scriptChars, target, length, + status); + }); + + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + break; + } + } + + return HandleFallback(aBuffer, aFallback, [&] { + script.ToTitleCase(); + return script.Span(); + }); + }; + + /** + * Get the localized name of a calendar. + * Part of Intl.DisplayNames V2. https://tc39.es/intl-displaynames-v2/ + * Accepts: + * Unicode calendar key: + * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Locale/calendar#unicode_calendar_keys + */ + template <typename B> + Result<Ok, DisplayNamesError> GetCalendar( + B& aBuffer, Span<const char> aCalendar, + Fallback aFallback = Fallback::None) const { + if (aCalendar.empty() || !IsAscii(aCalendar)) { + return Err(DisplayNamesError::InvalidOption); + } + + if (LocaleParser::CanParseUnicodeExtensionType(aCalendar).isErr()) { + return Err(DisplayNamesError::InvalidOption); + } + + // Convert into canonical case before searching for replacements. 
+ Vector<char, DisplayNames::CalendarVecLength> lowerCaseCalendar; + for (size_t i = 0; i < aCalendar.size(); i++) { + if (!lowerCaseCalendar.append(AsciiToLowerCase(aCalendar[i]))) { + return Err(DisplayNamesError::OutOfMemory); + } + } + if (!lowerCaseCalendar.append('\0')) { + return Err(DisplayNamesError::OutOfMemory); + } + + Span<const char> canonicalCalendar = mozilla::Span( + lowerCaseCalendar.begin(), lowerCaseCalendar.length() - 1); + + // Search if there's a replacement for the Unicode calendar keyword. + { + Span<const char> key = mozilla::MakeStringSpan("ca"); + Span<const char> type = canonicalCalendar; + if (const char* replacement = + mozilla::intl::Locale::ReplaceUnicodeExtensionType(key, type)) { + canonicalCalendar = MakeStringSpan(replacement); + } + } + + // The input calendar name is user-controlled, so be extra cautious before + // passing arbitrarily large strings to ICU. + static constexpr size_t maximumCalendarLength = 100; + + if (canonicalCalendar.size() <= maximumCalendarLength) { + // |uldn_keyValueDisplayName| expects old-style keyword values. + if (const char* legacyCalendar = + uloc_toLegacyType("calendar", canonicalCalendar.Elements())) { + auto result = FillBufferWithICUDisplayNames( + aBuffer, U_ILLEGAL_ARGUMENT_ERROR, + [&](UChar* chars, uint32_t size, UErrorCode* status) { + // |uldn_keyValueDisplayName| expects old-style keyword values. + return uldn_keyValueDisplayName(mULocaleDisplayNames.GetConst(), + "calendar", legacyCalendar, chars, + size, status); + }); + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + } else { + aBuffer.written(0); + } + } else { + aBuffer.written(0); + } + + return HandleFallback(aBuffer, aFallback, + [&] { return canonicalCalendar; }); + } + + /** + * Get the localized name of a weekday. This is a MozExtension, and not + * currently part of ECMA-402. 
+ */ + template <typename B> + Result<Ok, DisplayNamesError> GetWeekday( + B& aBuffer, Weekday aWeekday, Span<const char> aCalendar, + Fallback aFallback = Fallback::None) { + // SpiderMonkey static casts the enum, so ensure it is correctly in range. + MOZ_ASSERT(aWeekday >= Weekday::Monday && aWeekday <= Weekday::Sunday); + + UDateFormatSymbolType symbolType; + switch (mOptions.style) { + case DisplayNames::Style::Long: + symbolType = UDAT_STANDALONE_WEEKDAYS; + break; + + case DisplayNames::Style::Abbreviated: + // ICU "short" is CLDR "abbreviated" format. + symbolType = UDAT_STANDALONE_SHORT_WEEKDAYS; + break; + + case DisplayNames::Style::Short: + // ICU "shorter" is CLDR "short" format. + symbolType = UDAT_STANDALONE_SHORTER_WEEKDAYS; + break; + + case DisplayNames::Style::Narrow: + symbolType = UDAT_STANDALONE_NARROW_WEEKDAYS; + break; + } + + static constexpr int32_t indices[] = { + UCAL_MONDAY, UCAL_TUESDAY, UCAL_WEDNESDAY, UCAL_THURSDAY, + UCAL_FRIDAY, UCAL_SATURDAY, UCAL_SUNDAY}; + + if (auto result = ComputeDateTimeDisplayNames( + symbolType, mozilla::Span(indices), aCalendar); + result.isErr()) { + return result.propagateErr(); + } + MOZ_ASSERT(mDateTimeDisplayNames.length() == std::size(indices)); + + auto& name = + mDateTimeDisplayNames[EnumToIndex(std::size(indices), aWeekday)]; + if (!FillBuffer(name.AsSpan(), aBuffer)) { + return Err(DisplayNamesError::OutOfMemory); + } + + // There is no need to fallback, as invalid options are + // DisplayNamesError::InvalidOption. + return Ok(); + } + + /** + * Get the localized name of a month. This is a MozExtension, and not + * currently part of ECMA-402. + */ + template <typename B> + Result<Ok, DisplayNamesError> GetMonth(B& aBuffer, Month aMonth, + Span<const char> aCalendar, + Fallback aFallback = Fallback::None) { + // SpiderMonkey static casts the enum, so ensure it is correctly in range. 
+ MOZ_ASSERT(aMonth >= Month::January && aMonth <= Month::Undecimber); + + UDateFormatSymbolType symbolType; + switch (mOptions.style) { + case DisplayNames::Style::Long: + symbolType = UDAT_STANDALONE_MONTHS; + break; + + case DisplayNames::Style::Abbreviated: + case DisplayNames::Style::Short: + symbolType = UDAT_STANDALONE_SHORT_MONTHS; + break; + + case DisplayNames::Style::Narrow: + symbolType = UDAT_STANDALONE_NARROW_MONTHS; + break; + } + + static constexpr int32_t indices[] = { + UCAL_JANUARY, UCAL_FEBRUARY, UCAL_MARCH, UCAL_APRIL, + UCAL_MAY, UCAL_JUNE, UCAL_JULY, UCAL_AUGUST, + UCAL_SEPTEMBER, UCAL_OCTOBER, UCAL_NOVEMBER, UCAL_DECEMBER, + UCAL_UNDECIMBER}; + + if (auto result = ComputeDateTimeDisplayNames( + symbolType, mozilla::Span(indices), aCalendar); + result.isErr()) { + return result.propagateErr(); + } + MOZ_ASSERT(mDateTimeDisplayNames.length() == std::size(indices)); + auto& name = mDateTimeDisplayNames[EnumToIndex(std::size(indices), aMonth)]; + if (!FillBuffer(Span(name.AsSpan()), aBuffer)) { + return Err(DisplayNamesError::OutOfMemory); + } + + return HandleFallback(aBuffer, aFallback, + [&] { return ToCodeString(aMonth); }); + } + + /** + * Get the localized name of a quarter. This is a MozExtension, and not + * currently part of ECMA-402. + */ + template <typename B> + Result<Ok, DisplayNamesError> GetQuarter( + B& aBuffer, Quarter aQuarter, Span<const char> aCalendar, + Fallback aFallback = Fallback::None) { + // SpiderMonkey static casts the enum, so ensure it is correctly in range. 
+ MOZ_ASSERT(aQuarter >= Quarter::Q1 && aQuarter <= Quarter::Q4); + + UDateFormatSymbolType symbolType; + switch (mOptions.style) { + case DisplayNames::Style::Long: + symbolType = UDAT_STANDALONE_QUARTERS; + break; + + case DisplayNames::Style::Abbreviated: + case DisplayNames::Style::Short: + symbolType = UDAT_STANDALONE_SHORT_QUARTERS; + break; + + case DisplayNames::Style::Narrow: + symbolType = UDAT_STANDALONE_NARROW_QUARTERS; + break; + } + + // ICU doesn't provide an enum for quarters. + static constexpr int32_t indices[] = {0, 1, 2, 3}; + + if (auto result = ComputeDateTimeDisplayNames( + symbolType, mozilla::Span(indices), aCalendar); + result.isErr()) { + return result.propagateErr(); + } + MOZ_ASSERT(mDateTimeDisplayNames.length() == std::size(indices)); + + auto& name = + mDateTimeDisplayNames[EnumToIndex(std::size(indices), aQuarter)]; + if (!FillBuffer(Span(name.AsSpan()), aBuffer)) { + return Err(DisplayNamesError::OutOfMemory); + } + + // There is no need to fallback, as invalid options are + // DisplayNamesError::InvalidOption. + return Ok(); + } + + /** + * Get the localized name of a day period. This is a MozExtension, and not + * currently part of ECMA-402. + */ + template <typename B> + Result<Ok, DisplayNamesError> GetDayPeriod( + B& aBuffer, DayPeriod aDayPeriod, Span<const char> aCalendar, + Fallback aFallback = Fallback::None) { + UDateFormatSymbolType symbolType = UDAT_AM_PMS; + + static constexpr int32_t indices[] = {UCAL_AM, UCAL_PM}; + + if (auto result = ComputeDateTimeDisplayNames( + symbolType, mozilla::Span(indices), aCalendar); + result.isErr()) { + return result.propagateErr(); + } + MOZ_ASSERT(mDateTimeDisplayNames.length() == std::size(indices)); + + auto& name = + mDateTimeDisplayNames[EnumToIndex(std::size(indices), aDayPeriod)]; + if (!FillBuffer(name.AsSpan(), aBuffer)) { + return Err(DisplayNamesError::OutOfMemory); + } + + // There is no need to fallback, as invalid options are + // DisplayNamesError::InvalidOption. 
+ return Ok(); + } + + /** + * Get the localized name of a date time field. + * Part of Intl.DisplayNames V2. https://tc39.es/intl-displaynames-v2/ + * Accepts: + * "era", "year", "quarter", "month", "weekOfYear", "weekday", "day", + * "dayPeriod", "hour", "minute", "second", "timeZoneName" + * Examples: + * "weekday" => "day of the week" + * "dayPeriod" => "AM/PM" + */ + template <typename B> + Result<Ok, DisplayNamesError> GetDateTimeField( + B& aBuffer, DateTimeField aField, + DateTimePatternGenerator& aDateTimePatternGen, + Fallback aFallback = Fallback::None) { + UDateTimePatternField field; + switch (aField) { + case DateTimeField::Era: + field = UDATPG_ERA_FIELD; + break; + case DateTimeField::Year: + field = UDATPG_YEAR_FIELD; + break; + case DateTimeField::Quarter: + field = UDATPG_QUARTER_FIELD; + break; + case DateTimeField::Month: + field = UDATPG_MONTH_FIELD; + break; + case DateTimeField::WeekOfYear: + field = UDATPG_WEEK_OF_YEAR_FIELD; + break; + case DateTimeField::Weekday: + field = UDATPG_WEEKDAY_FIELD; + break; + case DateTimeField::Day: + field = UDATPG_DAY_FIELD; + break; + case DateTimeField::DayPeriod: + field = UDATPG_DAYPERIOD_FIELD; + break; + case DateTimeField::Hour: + field = UDATPG_HOUR_FIELD; + break; + case DateTimeField::Minute: + field = UDATPG_MINUTE_FIELD; + break; + case DateTimeField::Second: + field = UDATPG_SECOND_FIELD; + break; + case DateTimeField::TimeZoneName: + field = UDATPG_ZONE_FIELD; + break; + } + + UDateTimePGDisplayWidth width; + switch (mOptions.style) { + case DisplayNames::Style::Long: + width = UDATPG_WIDE; + break; + case DisplayNames::Style::Abbreviated: + case DisplayNames::Style::Short: + width = UDATPG_ABBREVIATED; + break; + case DisplayNames::Style::Narrow: + width = UDATPG_NARROW; + break; + } + + auto result = FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return udatpg_getFieldDisplayName( + aDateTimePatternGen.GetUDateTimePatternGenerator(), field, 
width, + target, length, status); + }); + + if (result.isErr()) { + return Err(ToError(result.unwrapErr())); + } + // There is no need to fallback, as invalid options are + // DisplayNamesError::InvalidOption. + return Ok(); + } + + Options mOptions; + Buffer<char> mLocale; + Vector<Buffer<char16_t>> mDateTimeDisplayNames; + ICUPointer<ULocaleDisplayNames> mULocaleDisplayNames = + ICUPointer<ULocaleDisplayNames>(nullptr); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/FormatBuffer.h b/intl/components/src/FormatBuffer.h new file mode 100644 index 0000000000..774e74d2ba --- /dev/null +++ b/intl/components/src/FormatBuffer.h @@ -0,0 +1,77 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_FormatBuffer_h +#define intl_components_FormatBuffer_h + +/** + * This file contains public adaptors for the mozilla::intl Buffer template + * argument. Adaptors that can automatically be deduced are kept as private + * in ICU4CGlue.h. There is also the SpiderMonkey specific adaptor + * js::intl::FormatBuffer in js/src/builtin/intl/FormatBuffer.h. + */ + +#include "nsTString.h" + +namespace mozilla::intl { + +/** + * mozilla::intl APIs require sizeable buffers. This class abstracts over + * the nsTSubstring. + */ +template <typename T> +class nsTStringToBufferAdapter { + public: + using CharType = T; + + // Do not allow copy or move. Move could be added in the future if needed. + nsTStringToBufferAdapter(const nsTStringToBufferAdapter&) = delete; + nsTStringToBufferAdapter& operator=(const nsTStringToBufferAdapter&) = delete; + + explicit nsTStringToBufferAdapter(nsTSubstring<CharType>& aString) + : mString(aString) {} + + /** + * Ensures the buffer has enough space to accommodate |size| elements. 
+ */ + [[nodiscard]] bool reserve(size_t size) { + return mString.SetLength(size, fallible); + } + + /** + * Returns the raw data inside the buffer. + */ + CharType* data() { return mString.BeginWriting(); } + + /** + * Returns the count of elements written into the buffer. + */ + size_t length() const { return mString.Length(); } + + /** + * Returns the buffer's overall capacity. + */ + size_t capacity() const { + // nsString's Capacity() method is protected, so just return the length. + return mString.Length(); + } + + /** + * Resizes the buffer to the given amount of written elements. + */ + void written(size_t amount) { + MOZ_ASSERT(amount <= mString.Length()); + // This sets |mString|'s internal size so that it matches how much was + // written. This is necessary because the write happens across FFI + // boundaries. + mString.SetLength(amount); + } + + private: + nsTSubstring<CharType>& mString; +}; + +} // namespace mozilla::intl + +#endif /* intl_components_FormatBuffer_h */ diff --git a/intl/components/src/GeneralCategory.h b/intl/components/src/GeneralCategory.h new file mode 100644 index 0000000000..99603ce4b9 --- /dev/null +++ b/intl/components/src/GeneralCategory.h @@ -0,0 +1,52 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_GeneralCategory_h_ +#define intl_components_GeneralCategory_h_ + +#include <cstdint> + +namespace mozilla::intl { + +// See https://www.unicode.org/reports/tr44/#General_Category_Values +// for details of these values. + +// The values here must match the values used by ICU's UCharCategory. 
+ +enum class GeneralCategory : uint8_t { + Unassigned = 0, + Uppercase_Letter = 1, + Lowercase_Letter = 2, + Titlecase_Letter = 3, + Modifier_Letter = 4, + Other_Letter = 5, + Nonspacing_Mark = 6, + Enclosing_Mark = 7, + Spacing_Mark = 8, + Decimal_Number = 9, + Letter_Number = 10, + Other_Number = 11, + Space_Separator = 12, + Line_Separator = 13, + Paragraph_Separator = 14, + Control = 15, + Format = 16, + Private_Use = 17, + Surrogate = 18, + Dash_Punctuation = 19, + Open_Punctuation = 20, + Close_Punctuation = 21, + Connector_Punctuation = 22, + Other_Punctuation = 23, + Math_Symbol = 24, + Currency_Symbol = 25, + Modifier_Symbol = 26, + Other_Symbol = 27, + Initial_Punctuation = 28, + Final_Punctuation = 29, + GeneralCategoryCount +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/ICU4CGlue.cpp b/intl/components/src/ICU4CGlue.cpp new file mode 100644 index 0000000000..6b9e0c0c58 --- /dev/null +++ b/intl/components/src/ICU4CGlue.cpp @@ -0,0 +1,44 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/ICU4CGlue.h" +#include "unicode/uformattedvalue.h" + +namespace mozilla::intl { + +// Starting with ICU 59, UChar defaults to char16_t. 
+static_assert(std::is_same_v<UChar, char16_t>, + "Gecko doesn't support redefining UChar to a different type"); + +ICUError ToICUError(UErrorCode status) { + MOZ_ASSERT(!U_SUCCESS(status)); + switch (status) { + case U_MEMORY_ALLOCATION_ERROR: + return ICUError::OutOfMemory; + default: + return ICUError::InternalError; + } +} + +ICUResult ToICUResult(UErrorCode status) { + if (U_SUCCESS(status)) { + return Ok(); + } + return Err(ToICUError(status)); +} + +// static +Result<Span<const char16_t>, ICUError> FormattedResult::ToSpanImpl( + const UFormattedValue* value) { + UErrorCode status = U_ZERO_ERROR; + int32_t strLength; + const char16_t* str = ufmtval_getString(value, &strLength, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return Span{str, AssertedCast<size_t>(strLength)}; +} + +} // namespace mozilla::intl diff --git a/intl/components/src/ICU4CGlue.h b/intl/components/src/ICU4CGlue.h new file mode 100644 index 0000000000..af1590680b --- /dev/null +++ b/intl/components/src/ICU4CGlue.h @@ -0,0 +1,722 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_ICUUtils_h +#define intl_components_ICUUtils_h + +#include "unicode/uenum.h" +#include "unicode/utypes.h" +#include "mozilla/Buffer.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/Utf8.h" +#include "mozilla/Vector.h" +#include "mozilla/intl/ICUError.h" + +// When building standalone js shell, it will include headers from +// intl/components if JS_HAS_INTL_API is true (the default value), but js shell +// won't include headers from XPCOM, so don't include nsTArray.h when building +// standalone js shell. 
+#ifndef JS_STANDALONE +# include "nsTArray.h" +#endif + +#include <cstring> +#include <iterator> +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <string_view> + +struct UFormattedValue; +namespace mozilla::intl { + +template <typename CharType> +static inline CharType* AssertNullTerminatedString(Span<CharType> aSpan) { + // Intentionally check one past the last character, because we expect that the + // NUL character isn't part of the string. + MOZ_ASSERT(*(aSpan.data() + aSpan.size()) == '\0'); + + // Also ensure there aren't any other NUL characters within the string. + MOZ_ASSERT(std::char_traits<CharType>::length(aSpan.data()) == aSpan.size()); + + return aSpan.data(); +} + +static inline const char* AssertNullTerminatedString(std::string_view aView) { + // Intentionally check one past the last character, because we expect that the + // NUL character isn't part of the string. + MOZ_ASSERT(*(aView.data() + aView.size()) == '\0'); + + // Also ensure there aren't any other NUL characters within the string. + MOZ_ASSERT(std::strlen(aView.data()) == aView.size()); + + return aView.data(); +} + +/** + * Map the "und" locale to an empty string, which ICU uses internally. + */ +static inline const char* IcuLocale(const char* aLocale) { + // Return the empty string if the input is exactly equal to the string "und". + const char* locale = aLocale; + if (!std::strcmp(locale, "und")) { + locale = ""; // ICU root locale + } + return locale; +} + +/** + * Ensure a locale is null-terminated, and map the "und" locale to an empty + * string, which ICU uses internally. + */ +static inline const char* IcuLocale(Span<const char> aLocale) { + return IcuLocale(AssertNullTerminatedString(aLocale)); +} + +/** + * Ensure a locale in the buffer is null-terminated, and map the "und" locale to + * an empty string, which ICU uses internally. 
+ */ +static inline const char* IcuLocale(const Buffer<char>& aLocale) { + return IcuLocale(Span(aLocale.begin(), aLocale.Length() - 1)); +} + +using ICUResult = Result<Ok, ICUError>; + +/** + * Convert a UErrorCode to ICUError. This will correctly apply the OutOfMemory + * case. + */ +ICUError ToICUError(UErrorCode status); + +/** + * Convert a UErrorCode to ICUResult. This will correctly apply the OutOfMemory + * case. + */ +ICUResult ToICUResult(UErrorCode status); + +/** + * The ICU status can complain about a string not being terminated, but this + * is fine for this API, as it deals with the mozilla::Span that has a pointer + * and a length. + */ +static inline bool ICUSuccessForStringSpan(UErrorCode status) { + return U_SUCCESS(status) || status == U_STRING_NOT_TERMINATED_WARNING; +} + +/** + * This class enforces that the unified mozilla::intl methods match the + * const-ness of the underlying ICU4C API calls. const ICU4C APIs take a const + * pointer, while mutable ones take a non-const pointer. + * + * For const ICU4C calls use: + * ICUPointer::GetConst(). + * + * For non-const ICU4C calls use: + * ICUPointer::GetMut(). + * + * This will propagate the `const` specifier from the ICU4C API call to the + * unified method, and it will be enforced by the compiler. This helps ensures + * a consistence and correct implementation. + */ +template <typename T> +class ICUPointer { + public: + explicit ICUPointer(T* aPointer) : mPointer(aPointer) {} + + // Only allow moves of ICUPointers, no copies. + ICUPointer(ICUPointer&& other) noexcept = default; + ICUPointer& operator=(ICUPointer&& other) noexcept = default; + + // Implicitly take ownership of a raw pointer through copy assignment. 
+ ICUPointer& operator=(T* aPointer) noexcept { + mPointer = aPointer; + return *this; + }; + + const T* GetConst() const { return const_cast<const T*>(mPointer); } + T* GetMut() { return mPointer; } + + explicit operator bool() const { return !!mPointer; } + + private: + T* mPointer; +}; + +/** + * Calling into ICU with the C-API can be a bit tricky. This function wraps up + * the relatively risky operations involving pointers, lengths, and buffers into + * a simpler call. This function accepts a lambda that performs the ICU call, + * and returns the length of characters in the buffer. When using a temporary + * stack-based buffer, the calls can often be done in one trip. However, if + * additional memory is needed, this function will call the C-API twice, in + * order to first get the size of the result, and then second to copy the result + * over to the buffer. + */ +template <typename ICUStringFunction, typename Buffer> +static ICUResult FillBufferWithICUCall(Buffer& buffer, + const ICUStringFunction& strFn) { + static_assert(std::is_same_v<typename Buffer::CharType, char16_t> || + std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, uint8_t>); + + UErrorCode status = U_ZERO_ERROR; + int32_t length = strFn(buffer.data(), buffer.capacity(), &status); + if (status == U_BUFFER_OVERFLOW_ERROR) { + MOZ_ASSERT(length >= 0); + + if (!buffer.reserve(length)) { + return Err(ICUError::OutOfMemory); + } + + status = U_ZERO_ERROR; + mozilla::DebugOnly<int32_t> length2 = strFn(buffer.data(), length, &status); + MOZ_ASSERT(length == length2); + } + if (!ICUSuccessForStringSpan(status)) { + return Err(ToICUError(status)); + } + + buffer.written(length); + + return Ok{}; +} + +/** + * Adaptor for mozilla::Vector to implement the Buffer interface. 
+ */ +template <typename T, size_t N> +class VectorToBufferAdaptor { + mozilla::Vector<T, N>& vector; + + public: + using CharType = T; + + explicit VectorToBufferAdaptor(mozilla::Vector<T, N>& vector) + : vector(vector) {} + + T* data() { return vector.begin(); } + + size_t capacity() const { return vector.capacity(); } + + bool reserve(size_t length) { return vector.reserve(length); } + + void written(size_t length) { + mozilla::DebugOnly<bool> result = vector.resizeUninitialized(length); + MOZ_ASSERT(result); + } +}; + +/** + * An overload of FillBufferWithICUCall that accepts a mozilla::Vector rather + * than a Buffer. + */ +template <typename ICUStringFunction, size_t InlineSize, typename CharType> +static ICUResult FillBufferWithICUCall(Vector<CharType, InlineSize>& vector, + const ICUStringFunction& strFn) { + VectorToBufferAdaptor buffer(vector); + return FillBufferWithICUCall(buffer, strFn); +} + +#ifndef JS_STANDALONE +/** + * mozilla::intl APIs require sizeable buffers. This class abstracts over + * the nsTArray. + */ +template <typename T> +class nsTArrayToBufferAdapter { + public: + using CharType = T; + + // Do not allow copy or move. Move could be added in the future if needed. + nsTArrayToBufferAdapter(const nsTArrayToBufferAdapter&) = delete; + nsTArrayToBufferAdapter& operator=(const nsTArrayToBufferAdapter&) = delete; + + explicit nsTArrayToBufferAdapter(nsTArray<CharType>& aArray) + : mArray(aArray) {} + + /** + * Ensures the buffer has enough space to accommodate |size| elements. + */ + [[nodiscard]] bool reserve(size_t size) { + // Use fallible behavior here. + return mArray.SetCapacity(size, fallible); + } + + /** + * Returns the raw data inside the buffer. + */ + CharType* data() { return mArray.Elements(); } + + /** + * Returns the count of elements written into the buffer. + */ + size_t length() const { return mArray.Length(); } + + /** + * Returns the buffer's overall capacity. 
+ */ + size_t capacity() const { return mArray.Capacity(); } + + /** + * Resizes the buffer to the given amount of written elements. + */ + void written(size_t amount) { + MOZ_ASSERT(amount <= mArray.Capacity()); + // This sets |mArray|'s internal size so that it matches how much was + // written. This is necessary because the write happens across FFI + // boundaries. + mArray.SetLengthAndRetainStorage(amount); + } + + private: + nsTArray<CharType>& mArray; +}; + +template <typename T, size_t N> +class AutoTArrayToBufferAdapter : public nsTArrayToBufferAdapter<T> { + using nsTArrayToBufferAdapter<T>::nsTArrayToBufferAdapter; +}; + +/** + * An overload of FillBufferWithICUCall that accepts a nsTArray. + */ +template <typename ICUStringFunction, typename CharType> +static ICUResult FillBufferWithICUCall(nsTArray<CharType>& array, + const ICUStringFunction& strFn) { + nsTArrayToBufferAdapter<CharType> buffer(array); + return FillBufferWithICUCall(buffer, strFn); +} + +template <typename ICUStringFunction, typename CharType, size_t N> +static ICUResult FillBufferWithICUCall(AutoTArray<CharType, N>& array, + const ICUStringFunction& strFn) { + AutoTArrayToBufferAdapter<CharType, N> buffer(array); + return FillBufferWithICUCall(buffer, strFn); +} +#endif + +/** + * Fill a UTF-8 or a UTF-16 buffer with a UTF-16 span. ICU4C mostly uses UTF-16 + * internally, but different consumers may have different situations with their + * buffers. 
+ */ +template <typename Buffer> +[[nodiscard]] bool FillBuffer(Span<const char16_t> utf16Span, + Buffer& targetBuffer) { + static_assert(std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char> || + std::is_same_v<typename Buffer::CharType, char16_t>); + + if constexpr (std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char>) { + if (utf16Span.Length() & mozilla::tl::MulOverflowMask<3>::value) { + // Tripling the size of the buffer overflows the size_t. + return false; + } + + if (!targetBuffer.reserve(3 * utf16Span.Length())) { + return false; + } + + size_t amount = ConvertUtf16toUtf8( + utf16Span, Span(reinterpret_cast<char*>(targetBuffer.data()), + targetBuffer.capacity())); + + targetBuffer.written(amount); + } + if constexpr (std::is_same_v<typename Buffer::CharType, char16_t>) { + size_t amount = utf16Span.Length(); + if (!targetBuffer.reserve(amount)) { + return false; + } + for (size_t i = 0; i < amount; i++) { + targetBuffer.data()[i] = utf16Span[i]; + } + targetBuffer.written(amount); + } + + return true; +} + +/** + * Fill a UTF-8 or a UTF-16 buffer with a UTF-8 span. ICU4C mostly uses UTF-16 + * internally, but different consumers may have different situations with their + * buffers. 
+ */ +template <typename Buffer> +[[nodiscard]] bool FillBuffer(Span<const char> utf8Span, Buffer& targetBuffer) { + static_assert(std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char> || + std::is_same_v<typename Buffer::CharType, char16_t>); + + if constexpr (std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char>) { + size_t amount = utf8Span.Length(); + if (!targetBuffer.reserve(amount)) { + return false; + } + for (size_t i = 0; i < amount; i++) { + targetBuffer.data()[i] = + // Static cast in case of a mismatch between `unsigned char` and + // `char` + static_cast<typename Buffer::CharType>(utf8Span[i]); + } + targetBuffer.written(amount); + } + if constexpr (std::is_same_v<typename Buffer::CharType, char16_t>) { + if (!targetBuffer.reserve(utf8Span.Length() + 1)) { + return false; + } + + size_t amount = ConvertUtf8toUtf16( + utf8Span, Span(targetBuffer.data(), targetBuffer.capacity())); + + targetBuffer.written(amount); + } + + return true; +} + +/** + * It is convenient for callers to be able to pass in UTF-8 strings to the API. + * This function can be used to convert that to a stack-allocated UTF-16 + * mozilla::Vector that can then be passed into ICU calls. The string will be + * null terminated. + */ +template <size_t StackSize> +[[nodiscard]] static bool FillUTF16Vector( + Span<const char> utf8Span, + mozilla::Vector<char16_t, StackSize>& utf16TargetVec) { + // Per ConvertUtf8toUtf16: The length of aDest must be at least one greater + // than the length of aSource. This additional length will be used for null + // termination. + if (!utf16TargetVec.reserve(utf8Span.Length() + 1)) { + return false; + } + + // ConvertUtf8toUtf16 fills the buffer with the data, but the length of the + // vector is unchanged. 
+ size_t length = ConvertUtf8toUtf16( + utf8Span, Span(utf16TargetVec.begin(), utf16TargetVec.capacity())); + + // Assert that the last element is free for writing a null terminator. + MOZ_ASSERT(length < utf16TargetVec.capacity()); + utf16TargetVec.begin()[length] = '\0'; + + // The call to resizeUninitialized notifies the vector of how much was written + // exclusive of the null terminated character. + return utf16TargetVec.resizeUninitialized(length); +} + +/** + * An iterable class that wraps calls to the ICU UEnumeration C API. + * + * Usage: + * + * // Make sure the range expression is non-temporary, otherwise there is a + * // risk of undefined behavior: + * auto result = Calendar::GetBcp47KeywordValuesForLocale("en-US"); + * + * for (auto name : result.unwrap()) { + * MOZ_ASSERT(name.unwrap(), "An iterable value exists".); + * } + */ +template <typename CharType, typename T, T(Mapper)(const CharType*, int32_t)> +class Enumeration { + public: + class Iterator; + friend class Iterator; + + // Transfer ownership of the UEnumeration in the move constructor. + Enumeration(Enumeration&& other) noexcept + : mUEnumeration(other.mUEnumeration) { + other.mUEnumeration = nullptr; + } + + // Transfer ownership of the UEnumeration in the move assignment operator. + Enumeration& operator=(Enumeration&& other) noexcept { + if (this == &other) { + return *this; + } + if (mUEnumeration) { + uenum_close(mUEnumeration); + } + mUEnumeration = other.mUEnumeration; + other.mUEnumeration = nullptr; + return *this; + } + + class Iterator { + Enumeration& mEnumeration; + // `Nothing` signifies that no enumeration has been loaded through ICU yet. 
+ Maybe<int32_t> mIteration = Nothing{}; + const CharType* mNext = nullptr; + int32_t mNextLength = 0; + + public: + using value_type = const CharType*; + using reference = T; + using iterator_category = std::input_iterator_tag; + + explicit Iterator(Enumeration& aEnumeration, bool aIsBegin) + : mEnumeration(aEnumeration) { + if (aIsBegin) { + AdvanceUEnum(); + } + } + + Iterator& operator++() { + AdvanceUEnum(); + return *this; + } + + Iterator operator++(int) { + Iterator retval = *this; + ++(*this); + return retval; + } + + bool operator==(Iterator other) const { + return mIteration == other.mIteration; + } + + bool operator!=(Iterator other) const { return !(*this == other); } + + T operator*() const { + // Map the iterated value to something new. + return Mapper(mNext, mNextLength); + } + + private: + void AdvanceUEnum() { + if (mIteration.isNothing()) { + mIteration = Some(-1); + } + UErrorCode status = U_ZERO_ERROR; + if constexpr (std::is_same_v<CharType, char16_t>) { + mNext = uenum_unext(mEnumeration.mUEnumeration, &mNextLength, &status); + } else { + static_assert(std::is_same_v<CharType, char>, + "Only char16_t and char are supported by " + "mozilla::intl::Enumeration."); + mNext = uenum_next(mEnumeration.mUEnumeration, &mNextLength, &status); + } + if (U_FAILURE(status)) { + mNext = nullptr; + } + + if (mNext) { + (*mIteration)++; + } else { + // The iterator is complete. + mIteration = Nothing{}; + } + } + }; + + Iterator begin() { return Iterator(*this, true); } + Iterator end() { return Iterator(*this, false); } + + explicit Enumeration(UEnumeration* aUEnumeration) + : mUEnumeration(aUEnumeration) {} + + ~Enumeration() { + if (mUEnumeration) { + // Only close when the object is being destructed, not moved. 
+ uenum_close(mUEnumeration); + } + } + + private: + UEnumeration* mUEnumeration = nullptr; +}; + +template <typename CharType> +Result<Span<const CharType>, InternalError> SpanMapper(const CharType* string, + int32_t length) { + // Return the raw value from this Iterator. + if (string == nullptr) { + return Err(InternalError{}); + } + MOZ_ASSERT(length >= 0); + return Span<const CharType>(string, static_cast<size_t>(length)); +} + +template <typename CharType> +using SpanResult = Result<Span<const CharType>, InternalError>; + +template <typename CharType> +using SpanEnumeration = Enumeration<CharType, SpanResult<CharType>, SpanMapper>; + +/** + * An iterable class that wraps calls to ICU's available locales API. + */ +template <int32_t(CountAvailable)(), const char*(GetAvailable)(int32_t)> +class AvailableLocalesEnumeration final { + // The overall count of available locales. + int32_t mLocalesCount = 0; + + public: + AvailableLocalesEnumeration() { mLocalesCount = CountAvailable(); } + + class Iterator { + public: + // std::iterator traits. + using iterator_category = std::input_iterator_tag; + using value_type = const char*; + using difference_type = ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + private: + // The current position in the list of available locales. + int32_t mLocalesPos = 0; + + public: + explicit Iterator(int32_t aLocalesPos) : mLocalesPos(aLocalesPos) {} + + Iterator& operator++() { + mLocalesPos++; + return *this; + } + + Iterator operator++(int) { + Iterator result = *this; + ++(*this); + return result; + } + + bool operator==(const Iterator& aOther) const { + return mLocalesPos == aOther.mLocalesPos; + } + + bool operator!=(const Iterator& aOther) const { return !(*this == aOther); } + + value_type operator*() const { return GetAvailable(mLocalesPos); } + }; + + // std::iterator begin() and end() methods. + + /** + * Return an iterator pointing to the first available locale. 
+ */ + Iterator begin() const { return Iterator(0); } + + /** + * Return an iterator pointing to one past the last available locale. + */ + Iterator end() const { return Iterator(mLocalesCount); } +}; + +/** + * A helper class to wrap calling ICU function in cpp file so we don't have to + * include the ICU header here. + */ +class FormattedResult { + protected: + static Result<Span<const char16_t>, ICUError> ToSpanImpl( + const UFormattedValue* value); +}; + +/** + * A RAII class to hold the formatted value of format result. + * + * The caller will need to create this AutoFormattedResult on the stack, with + * the following parameters: + * 1. Native ICU type. + * 2. An ICU function which opens the result. + * 3. An ICU function which can get the result as UFormattedValue. + * 4. An ICU function which closes the result. + * + * After the object is created, caller needs to call IsValid() method to check + * if the native object has been created properly, and then passes this + * object to other format interfaces. + * The format result will be stored in this object, the caller can use ToSpan() + * method to get the formatted string. + * + * The methods GetFormatted() and Value() are private methods since they expose + * native ICU types. If the caller wants to call these methods, the caller needs + * to register itself as a friend class in AutoFormattedResult. + * + * The formatted value and the native ICU object will be released once this + * class is destructed. 
+ */ +template <typename T, T*(Open)(UErrorCode*), + const UFormattedValue*(GetValue)(const T*, UErrorCode*), + void(Close)(T*)> +class MOZ_RAII AutoFormattedResult : FormattedResult { + public: + AutoFormattedResult() { + mFormatted = Open(&mError); + if (U_FAILURE(mError)) { + mFormatted = nullptr; + } + } + ~AutoFormattedResult() { + if (mFormatted) { + Close(mFormatted); + } + } + + AutoFormattedResult(const AutoFormattedResult& other) = delete; + AutoFormattedResult& operator=(const AutoFormattedResult& other) = delete; + + AutoFormattedResult(AutoFormattedResult&& other) = delete; + AutoFormattedResult& operator=(AutoFormattedResult&& other) = delete; + + /** + * Check if the native UFormattedDateInterval was created successfully. + */ + bool IsValid() const { return !!mFormatted; } + + /** + * Get error code if IsValid() returns false. + */ + ICUError GetError() const { return ToICUError(mError); } + + /** + * Get the formatted result. + */ + Result<Span<const char16_t>, ICUError> ToSpan() const { + if (!IsValid()) { + return Err(GetError()); + } + + const UFormattedValue* value = Value(); + if (!value) { + return Err(ICUError::InternalError); + } + + return ToSpanImpl(value); + } + + private: + friend class DateIntervalFormat; + friend class ListFormat; + T* GetFormatted() const { return mFormatted; } + + const UFormattedValue* Value() const { + if (!IsValid()) { + return nullptr; + } + + UErrorCode status = U_ZERO_ERROR; + const UFormattedValue* value = GetValue(mFormatted, &status); + if (U_FAILURE(status)) { + return nullptr; + } + + return value; + }; + + T* mFormatted = nullptr; + UErrorCode mError = U_ZERO_ERROR; +}; +} // namespace mozilla::intl + +#endif /* intl_components_ICUUtils_h */ diff --git a/intl/components/src/ICU4CLibrary.cpp b/intl/components/src/ICU4CLibrary.cpp new file mode 100644 index 0000000000..d13bc40ad4 --- /dev/null +++ b/intl/components/src/ICU4CLibrary.cpp @@ -0,0 +1,41 @@ +/* This Source Code Form is subject to the terms of 
the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/ICU4CLibrary.h" + +#include "unicode/putil.h" +#include "unicode/uclean.h" +#include "unicode/utypes.h" +#include "unicode/uversion.h" + +namespace mozilla::intl { + +ICUResult ICU4CLibrary::Initialize() { +#if !MOZ_SYSTEM_ICU + // Explicitly set the data directory to its default value, but only when we're + // sure that we use our in-tree ICU copy. See bug 1527879 and ICU bug + // report <https://unicode-org.atlassian.net/browse/ICU-20491>. + u_setDataDirectory(""); +#endif + + UErrorCode status = U_ZERO_ERROR; + u_init(&status); + return ToICUResult(status); +} + +void ICU4CLibrary::Cleanup() { u_cleanup(); } + +ICUResult ICU4CLibrary::SetMemoryFunctions(MemoryFunctions aMemoryFunctions) { + UErrorCode status = U_ZERO_ERROR; + u_setMemoryFunctions(/* context = */ nullptr, aMemoryFunctions.mAllocFn, + aMemoryFunctions.mReallocFn, aMemoryFunctions.mFreeFn, + &status); + return ToICUResult(status); +} + +Span<const char> ICU4CLibrary::GetVersion() { + return MakeStringSpan(U_ICU_VERSION); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/ICU4CLibrary.h b/intl/components/src/ICU4CLibrary.h new file mode 100644 index 0000000000..67cd1e205f --- /dev/null +++ b/intl/components/src/ICU4CLibrary.h @@ -0,0 +1,74 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_ICU4CLibrary_h +#define intl_components_ICU4CLibrary_h + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/Span.h" + +#include <stddef.h> + +namespace mozilla::intl { +/** + * Wrapper around non-portable, ICU4C specific functions. 
+ */ +class ICU4CLibrary final { + public: + ICU4CLibrary() = delete; + + /** + * Initializes the ICU4C library. + * + * Note: This function should only be called once. + */ + static ICUResult Initialize(); + + /** + * Releases any memory held by ICU. Any open ICU objects and resources are + * left in an undefined state after this operation. + * + * NOTE: This function is not thread-safe. + */ + static void Cleanup(); + + struct MemoryFunctions { + // These are equivalent to ICU's |UMemAllocFn|, |UMemReallocFn|, and + // |UMemFreeFn| types. The first argument (called |context| in the ICU + // docs) will always be nullptr and should be ignored. + using AllocFn = void* (*)(const void*, size_t); + using ReallocFn = void* (*)(const void*, void*, size_t); + using FreeFn = void (*)(const void*, void*); + + /** + * Function called when allocating memory. + */ + AllocFn mAllocFn = nullptr; + + /** + * Function called when reallocating memory. + */ + ReallocFn mReallocFn = nullptr; + + /** + * Function called when freeing memory. + */ + FreeFn mFreeFn = nullptr; + }; + + /** + * Sets the ICU memory functions. + * + * This function can only be called before the initial call to Initialize()! + */ + static ICUResult SetMemoryFunctions(MemoryFunctions aMemoryFunctions); + + /** + * Return the ICU version number. + */ + static Span<const char> GetVersion(); +}; +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/ICUError.h b/intl/components/src/ICUError.h new file mode 100644 index 0000000000..c3ef236210 --- /dev/null +++ b/intl/components/src/ICUError.h @@ -0,0 +1,118 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef intl_components_ICUError_h +#define intl_components_ICUError_h + +#include "mozilla/Attributes.h" +#include "mozilla/Result.h" + +#include <cstdint> +#include <type_traits> + +namespace mozilla::intl { + +/** + * General purpose error type for operations that can result in an ICU error. + */ +enum class ICUError : uint8_t { + // Since we claim UnusedZero<ICUError>::value and + // HasFreeLSB<ICUError>::value == true below, we must only use positive, + // even enum values. + + OutOfMemory = 2, + InternalError = 4, + OverflowError = 6, +}; + +/** + * Error type when a method call can only result in an internal ICU error. + */ +struct InternalError { + // Since we claim UnusedZero<InternalError>::value and + // HasFreeLSB<InternalError>::value == true below, we must only use positive, + // even enum values. + enum class ErrorKind : uint8_t { Unspecified = 2 }; + + const ErrorKind kind = ErrorKind::Unspecified; + + constexpr InternalError() = default; + + private: + friend struct mozilla::detail::UnusedZero<InternalError>; + + constexpr MOZ_IMPLICIT InternalError(ErrorKind aKind) : kind(aKind) {} +}; + +} // namespace mozilla::intl + +namespace mozilla::detail { + +// Provide specializations for UnusedZero and HasFreeLSB to enable more +// efficient packing for mozilla::Result. This also avoids having to include +// the ResultVariant.h header. +// +// UnusedZero specialization: +// +// The UnusedZero specialization makes it possible to use CompactPair as the +// underlying storage type for Result. For this optimization to work, it is +// necessary that a distinct null-value is present for the error type. The +// null-value represents the success case and must be different from all actual +// error values. +// This optimization can be easily enabled when the error type is a scoped enum. +// No enum value must use zero as its value and UnusedZero must be specialized +// through the helper struct UnusedZeroEnum. 
+// For non-enum error types, a more complicated setup is necessary. The +// UnusedZero specialization must implement all necessary interface methods +// (i.e. `Inspect`, `Unwrap`, and `Store`) as well as all necessary constants +// and types (i.e. `StorageType`, `value`, and `nullValue`). +// +// HasFreeLSB specialization: +// +// When the value and the error type are both providing specializations for +// HasFreeLSB, Result uses an optimization to store both types within a single +// storage location. This optimization uses the least significant bit as a tag +// bit to mark the error case. And because the least significant bit is used for +// tagging, it can't be used by the error type. That means for example when the +// error type is an enum, all enum values must be even, because odd integer +// values have the least significant bit set. +// The actual HasFreeLSB specialization just needs to define `value` as a static +// constant with the value `true`. + +template <> +struct UnusedZero<mozilla::intl::ICUError> + : UnusedZeroEnum<mozilla::intl::ICUError> {}; + +template <> +struct UnusedZero<mozilla::intl::InternalError> { + using Error = mozilla::intl::InternalError; + using StorageType = std::underlying_type_t<Error::ErrorKind>; + + static constexpr bool value = true; + static constexpr StorageType nullValue = 0; + + static constexpr Error Inspect(const StorageType& aValue) { + return static_cast<Error::ErrorKind>(aValue); + } + static constexpr Error Unwrap(StorageType aValue) { + return static_cast<Error::ErrorKind>(aValue); + } + static constexpr StorageType Store(Error aValue) { + return static_cast<StorageType>(aValue.kind); + } +}; + +template <> +struct HasFreeLSB<mozilla::intl::ICUError> { + static constexpr bool value = true; +}; + +template <> +struct HasFreeLSB<mozilla::intl::InternalError> { + static constexpr bool value = true; +}; + +} // namespace mozilla::detail + +#endif diff --git a/intl/components/src/IDNA.cpp b/intl/components/src/IDNA.cpp 
new file mode 100644 index 0000000000..9b5303f4e8 --- /dev/null +++ b/intl/components/src/IDNA.cpp @@ -0,0 +1,26 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/IDNA.h" + +namespace mozilla::intl { + +// static +Result<UniquePtr<IDNA>, ICUError> IDNA::TryCreate(ProcessingType aProcessing) { + uint32_t IDNAOptions = UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ; + if (aProcessing == ProcessingType::NonTransitional) { + IDNAOptions |= UIDNA_NONTRANSITIONAL_TO_UNICODE; + } + + UErrorCode status = U_ZERO_ERROR; + UIDNA* idna = uidna_openUTS46(IDNAOptions, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return UniquePtr<IDNA>(new IDNA(idna)); +} + +IDNA::~IDNA() { uidna_close(mIDNA.GetMut()); } +} // namespace mozilla::intl diff --git a/intl/components/src/IDNA.h b/intl/components/src/IDNA.h new file mode 100644 index 0000000000..9f18661403 --- /dev/null +++ b/intl/components/src/IDNA.h @@ -0,0 +1,130 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_IDNA_h_ +#define intl_components_IDNA_h_ + +#include "mozilla/intl/ICU4CGlue.h" + +#include "unicode/uidna.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for the Internationalizing Domain + * Names in Applications (IDNA). + * + * See UTS #46 for details. + * http://unicode.org/reports/tr46/ + */ +class IDNA final { + public: + ~IDNA(); + + /** + * UTS #46 specifies two specific types of processing: Transitional Processing + * and NonTransitional Processing. 
+ * + * See http://unicode.org/reports/tr46/#Compatibility_Processing + */ + enum class ProcessingType { + Transitional, + NonTransitional, + }; + + /** + * Create an IDNA object, with specifying the type of processing by enum + * ProcessingType. + * + * Currently the implementation enables CheckBidi flag and CheckJoiners by + * default. + * + * See UTS #46, '4 Processing' for details. + * http://unicode.org/reports/tr46/#Processing + */ + static Result<UniquePtr<IDNA>, ICUError> TryCreate( + ProcessingType aProcessing); + + /** + * This class contains the error code information of IDNA processing. + */ + class Info final { + public: + /** + * Check if there's any error. + */ + bool HasErrors() const { return mErrorCode != 0; } + + /** + * If the domain name label starts with "xn--", then the label contains + * Punycode. This checks if the domain name label has invalid Punycode. + * + * See https://www.rfc-editor.org/rfc/rfc3492.html + */ + bool HasInvalidPunycode() const { + return (mErrorCode & UIDNA_ERROR_PUNYCODE) != 0; + } + + /* The label was successfully ACE (Punycode) decoded but the resulting + * string had severe validation errors. For example, + * it might contain characters that are not allowed in ACE labels, + * or it might not be normalized. + */ + bool HasInvalidAceLabel() const { + return (mErrorCode & UIDNA_ERROR_INVALID_ACE_LABEL) != 0; + } + + /** + * Checks if the domain name label has any invalid hyphen characters. + * + * See CheckHyphens flag for details in UTS #46[1]. + * - The label must not contain a U+002D HYPHEN-MINUS character in both the + * third and fourth positions. + * - The label must neither begin nor end with a U+002D HYPHEN-MINUS + * character. 
+ * + * [1]: http://unicode.org/reports/tr46/#Validity_Criteria + */ + bool HasInvalidHyphen() const { + uint32_t hyphenErrors = UIDNA_ERROR_LEADING_HYPHEN | + UIDNA_ERROR_TRAILING_HYPHEN | + UIDNA_ERROR_HYPHEN_3_4; + return (mErrorCode & hyphenErrors) != 0; + } + + private: + friend class IDNA; + explicit Info(const UIDNAInfo* aUinfo) : mErrorCode(aUinfo->errors) {} + + uint32_t mErrorCode = 0; + }; + + /** + * Converts a domain name label to its Unicode form for human-readable + * display, and writes the Unicode form into buffer, and returns IDNA::Info + * object. + * The IDNA::Info object contains the detail information about the processing + * result of IDNA call, caller should check the result by calling + * IDNA::Info::HasErrors() as well. + */ + template <typename Buffer> + Result<Info, ICUError> LabelToUnicode(Span<const char16_t> aLabel, + Buffer& aBuffer) { + UIDNAInfo uinfo = UIDNA_INFO_INITIALIZER; + MOZ_TRY(FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return uidna_labelToUnicode(mIDNA.GetConst(), aLabel.data(), + aLabel.size(), target, length, &uinfo, + status); + })); + + return Info{&uinfo}; + } + + private: + explicit IDNA(UIDNA* aIDNA) : mIDNA(aIDNA) {} + + ICUPointer<UIDNA> mIDNA = ICUPointer<UIDNA>(nullptr); +}; +} // namespace mozilla::intl +#endif // intl_components_IDNA_h_ diff --git a/intl/components/src/ListFormat.cpp b/intl/components/src/ListFormat.cpp new file mode 100644 index 0000000000..6d1e10826a --- /dev/null +++ b/intl/components/src/ListFormat.cpp @@ -0,0 +1,132 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "mozilla/intl/ListFormat.h" + +#include "ScopedICUObject.h" + +namespace mozilla::intl { + +/*static*/ Result<UniquePtr<ListFormat>, ICUError> ListFormat::TryCreate( + mozilla::Span<const char> aLocale, const Options& aOptions) { + UListFormatterType utype = ToUListFormatterType(aOptions.mType); + UListFormatterWidth uwidth = ToUListFormatterWidth(aOptions.mStyle); + + UErrorCode status = U_ZERO_ERROR; + UListFormatter* fmt = + ulistfmt_openForType(IcuLocale(aLocale), utype, uwidth, &status); + if (U_FAILURE(status)) { + return Err(ICUError::InternalError); + } + + return UniquePtr<ListFormat>(new ListFormat(fmt)); +} + +ListFormat::~ListFormat() { + if (mListFormatter) { + ulistfmt_close(mListFormatter.GetMut()); + } +} + +/* static */ UListFormatterType ListFormat::ToUListFormatterType(Type type) { + switch (type) { + case Type::Conjunction: + return ULISTFMT_TYPE_AND; + case Type::Disjunction: + return ULISTFMT_TYPE_OR; + case Type::Unit: + return ULISTFMT_TYPE_UNITS; + } + MOZ_ASSERT_UNREACHABLE(); + return ULISTFMT_TYPE_AND; +} + +/* static */ UListFormatterWidth ListFormat::ToUListFormatterWidth( + Style style) { + switch (style) { + case Style::Long: + return ULISTFMT_WIDTH_WIDE; + case Style::Short: + return ULISTFMT_WIDTH_SHORT; + case Style::Narrow: + return ULISTFMT_WIDTH_NARROW; + } + MOZ_ASSERT_UNREACHABLE(); + return ULISTFMT_WIDTH_WIDE; +} + +ICUResult ListFormat::FormattedToParts(const UFormattedValue* formattedValue, + size_t formattedSize, + PartVector& parts) { + size_t lastEndIndex = 0; + + auto AppendPart = [&](PartType type, size_t endIndex) { + if (!parts.emplaceBack(type, endIndex)) { + return false; + } + + lastEndIndex = endIndex; + return true; + }; + + UErrorCode status = U_ZERO_ERROR; + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ICUError::InternalError); + } + ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); + + // We're only interested in 
ULISTFMT_ELEMENT_FIELD fields. + ucfpos_constrainField(fpos, UFIELD_CATEGORY_LIST, ULISTFMT_ELEMENT_FIELD, + &status); + if (U_FAILURE(status)) { + return Err(ICUError::InternalError); + } + + while (true) { + bool hasMore = ufmtval_nextPosition(formattedValue, fpos, &status); + if (U_FAILURE(status)) { + return Err(ICUError::InternalError); + } + if (!hasMore) { + break; + } + + int32_t beginIndexInt, endIndexInt; + ucfpos_getIndexes(fpos, &beginIndexInt, &endIndexInt, &status); + if (U_FAILURE(status)) { + return Err(ICUError::InternalError); + } + + MOZ_ASSERT(beginIndexInt <= endIndexInt, + "field iterator returning invalid range"); + + size_t beginIndex = AssertedCast<size_t>(beginIndexInt); + size_t endIndex = AssertedCast<size_t>(endIndexInt); + + // Indices are guaranteed to be returned in order (from left to right). + MOZ_ASSERT(lastEndIndex <= beginIndex, + "field iteration didn't return fields in order start to " + "finish as expected"); + + if (lastEndIndex < beginIndex) { + if (!AppendPart(PartType::Literal, beginIndex)) { + return Err(ICUError::InternalError); + } + } + + if (!AppendPart(PartType::Element, endIndex)) { + return Err(ICUError::InternalError); + } + } + + // Append any final literal. + if (lastEndIndex < formattedSize) { + if (!AppendPart(PartType::Literal, formattedSize)) { + return Err(ICUError::InternalError); + } + } + + return Ok(); +} +} // namespace mozilla::intl diff --git a/intl/components/src/ListFormat.h b/intl/components/src/ListFormat.h new file mode 100644 index 0000000000..4952512f97 --- /dev/null +++ b/intl/components/src/ListFormat.h @@ -0,0 +1,223 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef intl_components_ListFormat_h_ +#define intl_components_ListFormat_h_ + +#include "mozilla/CheckedInt.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/PodOperations.h" +#include "mozilla/Result.h" +#include "mozilla/Vector.h" +#include "unicode/ulistformatter.h" + +struct UListFormatter; + +namespace mozilla::intl { + +static constexpr size_t DEFAULT_LIST_LENGTH = 8; + +/** + * This component is a Mozilla-focused API for the list formatting provided by + * ICU. It implements the API provided by the ECMA-402 Intl.ListFormat object. + * + * https://tc39.es/ecma402/#listformat-objects + */ +class ListFormat final { + public: + /** + * The [[Type]] and [[Style]] properties of ListFormat instances. + * + * https://tc39.es/ecma402/#sec-properties-of-intl-listformat-instances + */ + // [[Type]] + enum class Type { Conjunction, Disjunction, Unit }; + // [[Style]] + enum class Style { Long, Short, Narrow }; + + /** + * The 'options' object to create Intl.ListFormat instance. + * + * https://tc39.es/ecma402/#sec-Intl.ListFormat + */ + struct Options { + // "conjunction" is the default fallback value. + Type mType = Type::Conjunction; + + // "long" is the default fallback value. + Style mStyle = Style::Long; + }; + + /** + * Create a ListFormat object for the provided locale and options. + * + * https://tc39.es/ecma402/#sec-Intl.ListFormat + */ + static Result<UniquePtr<ListFormat>, ICUError> TryCreate( + mozilla::Span<const char> aLocale, const Options& aOptions); + + ~ListFormat(); + + /** + * The list of String values for FormatList and FormatListToParts. + * + * https://tc39.es/ecma402/#sec-formatlist + * https://tc39.es/ecma402/#sec-formatlisttoparts + */ + using StringList = + mozilla::Vector<mozilla::Span<const char16_t>, DEFAULT_LIST_LENGTH>; + + /** + * Format the list according and write the result in buffer. 
+ * + * https://tc39.es/ecma402/#sec-Intl.ListFormat.prototype.format + * https://tc39.es/ecma402/#sec-formatlist + */ + template <typename Buffer> + ICUResult Format(const StringList& list, Buffer& buffer) const { + static_assert(std::is_same_v<typename Buffer::CharType, char16_t>, + "Currently only UTF-16 buffers are supported."); + + mozilla::Vector<const char16_t*, DEFAULT_LIST_LENGTH> u16strings; + mozilla::Vector<int32_t, DEFAULT_LIST_LENGTH> u16stringLens; + MOZ_TRY(ConvertStringListToVectors(list, u16strings, u16stringLens)); + + int32_t u16stringCount = mozilla::AssertedCast<int32_t>(list.length()); + MOZ_TRY(FillBufferWithICUCall( + buffer, [this, &u16strings, &u16stringLens, u16stringCount]( + char16_t* chars, int32_t size, UErrorCode* status) { + return ulistfmt_format(mListFormatter.GetConst(), u16strings.begin(), + u16stringLens.begin(), u16stringCount, chars, + size, status); + })); + + return Ok{}; + } + + /** + * The corresponding list of parts according to the effective locale and the + * formatting options of ListFormat. + * Each part has a [[Type]] field, which must be "element" or "literal", and a + * [[Value]] field. + * + * To store Part more efficiently, it doesn't store the ||Value|| of type + * string in this struct. Instead, it stores the end index of the string in + * the buffer(which is passed to ListFormat::FormatToParts()). The begin index + * of the ||Value|| is the index of the previous part. + * + * Buffer + * 0 i j + * +---------------+---------------+---------------+ + * | Part[0].Value | Part[1].Value | Part[2].Value | .... + * +---------------+---------------+---------------+ + * + * Part[0].index is i. Part[0].Value is stored in the Buffer[0..i]. + * Part[1].index is j. Part[1].Value is stored in the Buffer[i..j]. + * + * See https://tc39.es/ecma402/#sec-createpartsfromlist + */ + enum class PartType { + Element, + Literal, + }; + // The 2nd field is the end index to the buffer as mentioned above. 
+ using Part = std::pair<PartType, size_t>; + using PartVector = mozilla::Vector<Part, DEFAULT_LIST_LENGTH>; + + /** + * Format the list to a list of parts, and store the formatted result of + * UTF-16 string into buffer, and formatted parts into the vector 'parts'. + * + * See: + * https://tc39.es/ecma402/#sec-Intl.ListFormat.prototype.formatToParts + * https://tc39.es/ecma402/#sec-formatlisttoparts + */ + template <typename Buffer> + ICUResult FormatToParts(const StringList& list, Buffer& buffer, + PartVector& parts) { + static_assert(std::is_same_v<typename Buffer::CharType, char16_t>, + "Currently only UTF-16 buffers are supported."); + + mozilla::Vector<const char16_t*, DEFAULT_LIST_LENGTH> u16strings; + mozilla::Vector<int32_t, DEFAULT_LIST_LENGTH> u16stringLens; + MOZ_TRY(ConvertStringListToVectors(list, u16strings, u16stringLens)); + + AutoFormattedList formatted; + UErrorCode status = U_ZERO_ERROR; + ulistfmt_formatStringsToResult( + mListFormatter.GetConst(), u16strings.begin(), u16stringLens.begin(), + int32_t(list.length()), formatted.GetFormatted(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + auto spanResult = formatted.ToSpan(); + if (spanResult.isErr()) { + return spanResult.propagateErr(); + } + auto formattedSpan = spanResult.unwrap(); + if (!FillBuffer(formattedSpan, buffer)) { + return Err(ICUError::OutOfMemory); + } + + const UFormattedValue* value = formatted.Value(); + if (!value) { + return Err(ICUError::InternalError); + } + return FormattedToParts(value, buffer.length(), parts); + } + + private: + ListFormat() = delete; + explicit ListFormat(UListFormatter* fmt) : mListFormatter(fmt) {} + ListFormat(const ListFormat&) = delete; + ListFormat& operator=(const ListFormat&) = delete; + + ICUPointer<UListFormatter> mListFormatter = + ICUPointer<UListFormatter>(nullptr); + + // Convert StringList to an array of type 'const char16_t*' and an array of + // int32 for ICU-API. 
+ ICUResult ConvertStringListToVectors( + const StringList& list, + mozilla::Vector<const char16_t*, DEFAULT_LIST_LENGTH>& u16strings, + mozilla::Vector<int32_t, DEFAULT_LIST_LENGTH>& u16stringLens) const { + // Keep a conservative running count of overall length. + mozilla::CheckedInt<int32_t> stringLengthTotal(0); + for (const auto& string : list) { + if (!u16strings.append(string.data())) { + return Err(ICUError::InternalError); + } + + int32_t len = mozilla::AssertedCast<int32_t>(string.size()); + if (!u16stringLens.append(len)) { + return Err(ICUError::InternalError); + } + + stringLengthTotal += len; + } + + // Add space for N unrealistically large conjunctions. + constexpr int32_t MaxConjunctionLen = 100; + stringLengthTotal += CheckedInt<int32_t>(list.length()) * MaxConjunctionLen; + // If the overestimate exceeds ICU length limits, don't try to format. + if (!stringLengthTotal.isValid()) { + return Err(ICUError::OverflowError); + } + + return Ok{}; + } + + using AutoFormattedList = + AutoFormattedResult<UFormattedList, ulistfmt_openResult, + ulistfmt_resultAsValue, ulistfmt_closeResult>; + + ICUResult FormattedToParts(const UFormattedValue* formattedValue, + size_t formattedSize, PartVector& parts); + + static UListFormatterType ToUListFormatterType(Type type); + static UListFormatterWidth ToUListFormatterWidth(Style style); +}; + +} // namespace mozilla::intl +#endif // intl_components_ListFormat_h_ diff --git a/intl/components/src/Locale.cpp b/intl/components/src/Locale.cpp new file mode 100644 index 0000000000..9a043518cf --- /dev/null +++ b/intl/components/src/Locale.cpp @@ -0,0 +1,1471 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/Locale.h" + +#include "mozilla/Assertions.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/MathAlgorithms.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" +#include "mozilla/Variant.h" + +#include "ICU4CGlue.h" + +#include <algorithm> +#include <iterator> +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <string.h> +#include <type_traits> +#include <utility> + +#include "unicode/uloc.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +using namespace intl::LanguageTagLimits; + +template <typename CharT> +bool IsStructurallyValidLanguageTag(Span<const CharT> aLanguage) { + // unicode_language_subtag = alpha{2,3} | alpha{5,8}; + size_t length = aLanguage.size(); + const CharT* str = aLanguage.data(); + return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && + std::all_of(str, str + length, IsAsciiAlpha<CharT>); +} + +template bool IsStructurallyValidLanguageTag(Span<const char> aLanguage); +template bool IsStructurallyValidLanguageTag(Span<const Latin1Char> aLanguage); +template bool IsStructurallyValidLanguageTag(Span<const char16_t> aLanguage); + +template <typename CharT> +bool IsStructurallyValidScriptTag(Span<const CharT> aScript) { + // unicode_script_subtag = alpha{4} ; + size_t length = aScript.size(); + const CharT* str = aScript.data(); + return length == 4 && std::all_of(str, str + length, IsAsciiAlpha<CharT>); +} + +template bool IsStructurallyValidScriptTag(Span<const char> aScript); +template bool IsStructurallyValidScriptTag(Span<const Latin1Char> aScript); +template bool IsStructurallyValidScriptTag(Span<const char16_t> aScript); + +template <typename CharT> +bool IsStructurallyValidRegionTag(Span<const CharT> aRegion) { + // unicode_region_subtag = (alpha{2} | digit{3}) ; + size_t length = aRegion.size(); + const CharT* str = aRegion.data(); + return (length == 2 && std::all_of(str, str + length, IsAsciiAlpha<CharT>)) || + (length == 3 && std::all_of(str, 
str + length, IsAsciiDigit<CharT>)); +} + +template bool IsStructurallyValidRegionTag(Span<const char> aRegion); +template bool IsStructurallyValidRegionTag(Span<const Latin1Char> aRegion); +template bool IsStructurallyValidRegionTag(Span<const char16_t> aRegion); + +#ifdef DEBUG +bool IsStructurallyValidVariantTag(Span<const char> aVariant) { + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; + size_t length = aVariant.size(); + const char* str = aVariant.data(); + return ((5 <= length && length <= 8) || + (length == 4 && IsAsciiDigit(str[0]))) && + std::all_of(str, str + length, IsAsciiAlphanumeric<char>); +} + +bool IsStructurallyValidUnicodeExtensionTag(Span<const char> aExtension) { + return LocaleParser::CanParseUnicodeExtension(aExtension).isOk(); +} + +static bool IsStructurallyValidExtensionTag(Span<const char> aExtension) { + // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + // NB: Allow any extension, including Unicode and Transform here, because + // this function is only used for an assertion. + + size_t length = aExtension.size(); + const char* str = aExtension.data(); + const char* const end = aExtension.data() + length; + if (length <= 2) { + return false; + } + if (!IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = + reinterpret_cast<const char*>(memchr(str, '-', end - str)); + size_t len = (sep ? 
sep : end) - str; + if (len < 2 || len > 8 || + !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} + +bool IsStructurallyValidPrivateUseTag(Span<const char> aPrivateUse) { + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + + size_t length = aPrivateUse.size(); + const char* str = aPrivateUse.data(); + const char* const end = aPrivateUse.data() + length; + if (length <= 2) { + return false; + } + if (str[0] != 'x' && str[0] != 'X') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = + reinterpret_cast<const char*>(memchr(str, '-', end - str)); + size_t len = (sep ? sep : end) - str; + if (len == 0 || len > 8 || + !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} +#endif + +ptrdiff_t Locale::UnicodeExtensionIndex() const { + // The extension subtags aren't necessarily sorted, so we can't use binary + // search here. + auto p = std::find_if( + mExtensions.begin(), mExtensions.end(), + [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; }); + if (p != mExtensions.end()) { + return std::distance(mExtensions.begin(), p); + } + return -1; +} + +Maybe<Span<const char>> Locale::GetUnicodeExtension() const { + ptrdiff_t index = UnicodeExtensionIndex(); + if (index >= 0) { + return Some(MakeStringSpan(mExtensions[index].get())); + } + return Nothing(); +} + +ICUResult Locale::SetUnicodeExtension(Span<const char> aExtension) { + MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(aExtension)); + + auto duplicated = DuplicateStringToUniqueChars(aExtension); + + // Replace the existing Unicode extension subtag or append a new one. 
+ ptrdiff_t index = UnicodeExtensionIndex(); + if (index >= 0) { + mExtensions[index] = std::move(duplicated); + return Ok(); + } + if (!mExtensions.append(std::move(duplicated))) { + return Err(ICUError::OutOfMemory); + } + return Ok(); +} + +void Locale::ClearUnicodeExtension() { + ptrdiff_t index = UnicodeExtensionIndex(); + if (index >= 0) { + mExtensions.erase(mExtensions.begin() + index); + } +} + +template <size_t InitialCapacity> +static bool SortAlphabetically(Vector<UniqueChars, InitialCapacity>& aSubtags) { + size_t length = aSubtags.length(); + + // Zero or one element lists are already sorted. + if (length < 2) { + return true; + } + + // Handle two element lists inline. + if (length == 2) { + if (strcmp(aSubtags[0].get(), aSubtags[1].get()) > 0) { + aSubtags[0].swap(aSubtags[1]); + } + return true; + } + + Vector<char*, 8> scratch; + if (!scratch.resizeUninitialized(length)) { + return false; + } + for (size_t i = 0; i < length; i++) { + scratch[i] = aSubtags[i].release(); + } + + std::stable_sort( + scratch.begin(), scratch.end(), + [](const char* a, const char* b) { return strcmp(a, b) < 0; }); + + for (size_t i = 0; i < length; i++) { + aSubtags[i] = UniqueChars(scratch[i]); + } + return true; +} + +Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeBaseName() { + // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to + // canonicalize the syntax by normalizing the case and ordering all subtags. + // The canonical syntax form is specified in UTS 35, 3.2.1. + + // Language codes need to be in lower case. "JA" -> "ja" + mLanguage.ToLowerCase(); + MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); + + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + mScript.ToTitleCase(); + MOZ_ASSERT(Script().Missing() || + IsStructurallyValidScriptTag(Script().Span())); + + // Region codes need to be in upper case. 
"bu" -> "BU" + mRegion.ToUpperCase(); + MOZ_ASSERT(Region().Missing() || + IsStructurallyValidRegionTag(Region().Span())); + + // The canonical case for variant subtags is lowercase. + for (UniqueChars& variant : mVariants) { + char* variantChars = variant.get(); + size_t variantLength = strlen(variantChars); + AsciiToLowerCase(variantChars, variantLength, variantChars); + + MOZ_ASSERT(IsStructurallyValidVariantTag({variantChars, variantLength})); + } + + // Extensions and privateuse subtags are case normalized in the + // |canonicalizeExtensions| method. + + // The second step in UTS 35, 3.2.1, is to order all subtags. + + if (mVariants.length() > 1) { + // 1. Any variants are in alphabetical order. + if (!SortAlphabetically(mVariants)) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Reject the Locale identifier if a duplicate variant was found, e.g. + // "en-variant-Variant". + const UniqueChars* duplicate = std::adjacent_find( + mVariants.begin(), mVariants.end(), [](const auto& a, const auto& b) { + return strcmp(a.get(), b.get()) == 0; + }); + if (duplicate != mVariants.end()) { + return Err(CanonicalizationError::DuplicateVariant); + } + } + + // 2. Any extensions are in alphabetical order by their singleton. + // 3. All attributes are sorted in alphabetical order. + // 4. All keywords and tfields are sorted by alphabetical order of their keys, + // within their respective extensions. + // 5. Any type or tfield value "true" is removed. + // - A subsequent call to canonicalizeExtensions() will perform these steps. + + // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier + // into its canonical form per UTS 3.2.1. + + // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their + // canonical forms. + // - A subsequent call to canonicalizeExtensions() will perform this step. + + // 2. Replace aliases in the unicode_language_id and tlang (if any). + // - tlang is handled in canonicalizeExtensions(). 
+ + // Replace deprecated language, region, and variant subtags with their + // preferred mappings. + + if (!UpdateLegacyMappings()) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Replace deprecated language subtags with their preferred values. + if (!LanguageMapping(mLanguage) && ComplexLanguageMapping(mLanguage)) { + PerformComplexLanguageMappings(); + } + + // Replace deprecated script subtags with their preferred values. + if (Script().Present()) { + ScriptMapping(mScript); + } + + // Replace deprecated region subtags with their preferred values. + if (Region().Present()) { + if (!RegionMapping(mRegion) && ComplexRegionMapping(mRegion)) { + PerformComplexRegionMappings(); + } + } + + // Replace deprecated variant subtags with their preferred values. + if (!PerformVariantMappings()) { + return Err(CanonicalizationError::OutOfMemory); + } + + // No extension replacements are currently present. + // Private use sequences are left as is. + + // 3. Replace aliases in special key values. + // - A subsequent call to canonicalizeExtensions() will perform this step. + + return Ok(); +} + +#ifdef DEBUG +static bool IsAsciiLowercaseAlphanumericOrDash(Span<const char> aSpan) { + const char* ptr = aSpan.data(); + size_t length = aSpan.size(); + return std::all_of(ptr, ptr + length, [](auto c) { + return IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-'; + }); +} +#endif + +Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeExtensions() { + // The canonical case for all extension subtags is lowercase. + for (UniqueChars& extension : mExtensions) { + char* extensionChars = extension.get(); + size_t extensionLength = strlen(extensionChars); + AsciiToLowerCase(extensionChars, extensionLength, extensionChars); + + MOZ_ASSERT( + IsStructurallyValidExtensionTag({extensionChars, extensionLength})); + } + + // Any extensions are in alphabetical order by their singleton. 
+ // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" + if (!SortAlphabetically(mExtensions)) { + return Err(CanonicalizationError::OutOfMemory); + } + + for (UniqueChars& extension : mExtensions) { + if (extension[0] == 'u') { + MOZ_TRY(CanonicalizeUnicodeExtension(extension)); + } else if (extension[0] == 't') { + MOZ_TRY(CanonicalizeTransformExtension(extension)); + } + + MOZ_ASSERT( + IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get()))); + } + + // The canonical case for privateuse subtags is lowercase. + if (char* privateuse = mPrivateUse.get()) { + size_t privateuseLength = strlen(privateuse); + AsciiToLowerCase(privateuse, privateuseLength, privateuse); + + MOZ_ASSERT( + IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); + } + return Ok(); +} + +template <size_t N> +static inline bool AppendSpan(Vector<char, N>& vector, Span<const char> aSpan) { + return vector.append(aSpan.data(), aSpan.size()); +} + +/** + * CanonicalizeUnicodeExtension( attributes, keywords ) + * + * Canonical syntax per + * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All attributes and keywords are in lowercase. + * - Note: The parser already converted keywords to lowercase. + * - All attributes are sorted in alphabetical order. + * - All keywords are sorted by alphabetical order of their keys. + * - Any type value "true" is removed. + * + * Canonical form: + * - All keys and types use the canonical form (from the name attribute; + * see Section 3.6.4 U Extension Data Files). 
+ */ +Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeUnicodeExtension( + UniqueChars& aUnicodeExtension) { + Span<const char> extension = MakeStringSpan(aUnicodeExtension.get()); + MOZ_ASSERT(extension[0] == 'u'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT(IsStructurallyValidExtensionTag(extension)); + + LocaleParser::AttributesVector attributes; + LocaleParser::KeywordsVector keywords; + + using Attribute = LocaleParser::AttributesVector::ElementType; + using Keyword = LocaleParser::KeywordsVector::ElementType; + + if (LocaleParser::ParseUnicodeExtension(extension, attributes, keywords) + .isErr()) { + MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag"); + return Err(CanonicalizationError::InternalError); + } + + auto attributesLess = [extension](const Attribute& a, const Attribute& b) { + auto astr = extension.Subspan(a.Begin(), a.Length()); + auto bstr = extension.Subspan(b.Begin(), b.Length()); + return astr < bstr; + }; + + // All attributes are sorted in alphabetical order. + if (attributes.length() > 1) { + std::stable_sort(attributes.begin(), attributes.end(), attributesLess); + } + + auto keywordsLess = [extension](const Keyword& a, const Keyword& b) { + auto astr = extension.Subspan(a.Begin(), UnicodeKeyLength); + auto bstr = extension.Subspan(b.Begin(), UnicodeKeyLength); + return astr < bstr; + }; + + // All keywords are sorted by alphabetical order of keys. + if (keywords.length() > 1) { + // Using a stable sort algorithm, guarantees that two keywords using the + // same key are never reordered. That means for example + // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to + // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs + // before "nu-latn". + // This is required so that deduplication below preserves the first keyword + // for a given key and discards the rest. 
+ std::stable_sort(keywords.begin(), keywords.end(), keywordsLess); + } + + Vector<char, 32> sb; + if (!sb.append('u')) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Append all Unicode extension attributes. + for (size_t i = 0; i < attributes.length(); i++) { + const auto& attribute = attributes[i]; + auto span = extension.Subspan(attribute.Begin(), attribute.Length()); + + // Skip duplicate attributes. + if (i > 0) { + const auto& lastAttribute = attributes[i - 1]; + if (span == + extension.Subspan(lastAttribute.Begin(), lastAttribute.Length())) { + continue; + } + MOZ_ASSERT(attributesLess(lastAttribute, attribute)); + } + + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + + static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; + + using StringSpan = Span<const char>; + + static constexpr StringSpan True = MakeStringSpan("true"); + + // Append all Unicode extension keywords. + for (size_t i = 0; i < keywords.length(); i++) { + const auto& keyword = keywords[i]; + + // Skip duplicate keywords. + if (i > 0) { + const auto& lastKeyword = keywords[i - 1]; + if (extension.Subspan(keyword.Begin(), UnicodeKeyLength) == + extension.Subspan(lastKeyword.Begin(), UnicodeKeyLength)) { + continue; + } + MOZ_ASSERT(keywordsLess(lastKeyword, keyword)); + } + + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + + StringSpan span = extension.Subspan(keyword.Begin(), keyword.Length()); + if (span.size() == UnicodeKeyLength) { + // Keyword without type value. + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + StringSpan key = span.To(UnicodeKeyLength); + StringSpan type = span.From(UnicodeKeyWithSepLength); + + // Search if there's a replacement for the current Unicode keyword. 
+ if (const char* replacement = ReplaceUnicodeExtensionType(key, type)) { + StringSpan repl = MakeStringSpan(replacement); + if (repl == True) { + // Elide the type "true" if present in the replacement. + if (!AppendSpan(sb, key)) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + // Otherwise append the Unicode key (including the separator) and the + // replaced type. + if (!AppendSpan(sb, span.To(UnicodeKeyWithSepLength))) { + return Err(CanonicalizationError::OutOfMemory); + } + if (!AppendSpan(sb, repl)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + } else { + if (type == True) { + // Elide the Unicode extension type "true". + if (!AppendSpan(sb, key)) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + // Otherwise append the complete Unicode extension keyword. + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (static_cast<Span<const char>>(sb) != extension) { + // Otherwise replace the previous extension with the canonical extension. 
+ UniqueChars canonical = DuplicateStringToUniqueChars(sb); + if (!canonical) { + return Err(CanonicalizationError::OutOfMemory); + } + aUnicodeExtension = std::move(canonical); + } + + return Ok(); +} + +template <class Buffer> +static bool LocaleToString(const Locale& aTag, Buffer& aBuffer) { + auto appendSubtag = [&aBuffer](const auto& subtag) { + auto span = subtag.Span(); + MOZ_ASSERT(!span.empty()); + return aBuffer.append(span.data(), span.size()); + }; + + auto appendSubtagSpan = [&aBuffer](Span<const char> subtag) { + MOZ_ASSERT(!subtag.empty()); + return aBuffer.append(subtag.data(), subtag.size()); + }; + + auto appendSubtags = [&aBuffer, &appendSubtagSpan](const auto& subtags) { + for (const auto& subtag : subtags) { + if (!aBuffer.append('-') || !appendSubtagSpan(subtag)) { + return false; + } + } + return true; + }; + + // Append the language subtag. + if (!appendSubtag(aTag.Language())) { + return false; + } + + // Append the script subtag if present. + if (aTag.Script().Present()) { + if (!aBuffer.append('-') || !appendSubtag(aTag.Script())) { + return false; + } + } + + // Append the region subtag if present. + if (aTag.Region().Present()) { + if (!aBuffer.append('-') || !appendSubtag(aTag.Region())) { + return false; + } + } + + // Append the variant subtags if present. + if (!appendSubtags(aTag.Variants())) { + return false; + } + + // Append the extensions subtags if present. + if (!appendSubtags(aTag.Extensions())) { + return false; + } + + // Append the private-use subtag if present. + if (auto privateuse = aTag.PrivateUse()) { + if (!aBuffer.append('-') || !appendSubtagSpan(privateuse.value())) { + return false; + } + } + + return true; +} + +/** + * CanonicalizeTransformExtension + * + * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: + * + * - These subtags are all in lowercase (that is the canonical casing for these + * subtags), [...]. 
+ * + * And per + * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: + * + * - All keywords and tfields are sorted by alphabetical order of their keys, + * within their respective extensions. + */ +Result<Ok, Locale::CanonicalizationError> +Locale::CanonicalizeTransformExtension(UniqueChars& aTransformExtension) { + Span<const char> extension = MakeStringSpan(aTransformExtension.get()); + MOZ_ASSERT(extension[0] == 't'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT(IsStructurallyValidExtensionTag(extension)); + + Locale tag; + LocaleParser::TFieldVector fields; + + using TField = LocaleParser::TFieldVector::ElementType; + + if (LocaleParser::ParseTransformExtension(extension, tag, fields).isErr()) { + MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag"); + return Err(CanonicalizationError::InternalError); + } + + auto tfieldLess = [extension](const TField& a, const TField& b) { + auto astr = extension.Subspan(a.Begin(), TransformKeyLength); + auto bstr = extension.Subspan(b.Begin(), TransformKeyLength); + return astr < bstr; + }; + + // All tfields are sorted by alphabetical order of their keys. + if (fields.length() > 1) { + std::stable_sort(fields.begin(), fields.end(), tfieldLess); + } + + Vector<char, 32> sb; + if (!sb.append('t')) { + return Err(CanonicalizationError::OutOfMemory); + } + + // Append the language subtag if present. + // + // Replace aliases in tlang per + // <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>. + if (tag.Language().Present()) { + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + + MOZ_TRY(tag.CanonicalizeBaseName()); + + // The canonical case for Transform extensions is lowercase per + // <https://unicode.org/reports/tr35/#BCP47_T_Extension>. Convert the two + // subtags which don't use lowercase for their canonical syntax. 
+ tag.mScript.ToLowerCase(); + tag.mRegion.ToLowerCase(); + + if (!LocaleToString(tag, sb)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + + static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1; + + using StringSpan = Span<const char>; + + // Append all fields. + // + // UTS 35, 3.2.1 specifies: + // - Any type or tfield value "true" is removed. + // + // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore + // this apparently invalid part of the UTS 35 specification and simply + // append all `tfield` subtags. + for (const auto& field : fields) { + if (!sb.append('-')) { + return Err(CanonicalizationError::OutOfMemory); + } + + StringSpan span = extension.Subspan(field.Begin(), field.Length()); + StringSpan key = span.To(TransformKeyLength); + StringSpan value = span.From(TransformKeyWithSepLength); + + // Search if there's a replacement for the current transform keyword. + if (const char* replacement = ReplaceTransformExtensionType(key, value)) { + if (!AppendSpan(sb, span.To(TransformKeyWithSepLength))) { + return Err(CanonicalizationError::OutOfMemory); + } + if (!AppendSpan(sb, MakeStringSpan(replacement))) { + return Err(CanonicalizationError::OutOfMemory); + } + } else { + if (!AppendSpan(sb, span)) { + return Err(CanonicalizationError::OutOfMemory); + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (static_cast<Span<const char>>(sb) != extension) { + // Otherwise replace the previous extension with the canonical extension. + UniqueChars canonical = DuplicateStringToUniqueChars(sb); + if (!canonical) { + return Err(CanonicalizationError::OutOfMemory); + } + aTransformExtension = std::move(canonical); + } + + return Ok(); +} + +// Zero-terminated ICU Locale ID. 
+using LocaleId = + Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>; + +enum class LikelySubtags : bool { Add, Remove }; + +// Return true iff the locale is already maximized resp. minimized. +static bool HasLikelySubtags(LikelySubtags aLikelySubtags, const Locale& aTag) { + // The locale is already maximized if the language, script, and region + // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are + // used. + if (aLikelySubtags == LikelySubtags::Add) { + return !aTag.Language().EqualTo("und") && + (aTag.Script().Present() && !aTag.Script().EqualTo("Zzzz")) && + (aTag.Region().Present() && !aTag.Region().EqualTo("ZZ")); + } + + // The locale is already minimized if it only contains a language + // subtag whose value is not the placeholder value "und". + return !aTag.Language().EqualTo("und") && aTag.Script().Missing() && + aTag.Region().Missing(); +} + +// Create an ICU locale ID from the given locale. +static bool CreateLocaleForLikelySubtags(const Locale& aTag, + LocaleId& aLocale) { + MOZ_ASSERT(aLocale.length() == 0); + + auto appendSubtag = [&aLocale](const auto& subtag) { + auto span = subtag.Span(); + MOZ_ASSERT(!span.empty()); + return aLocale.append(span.data(), span.size()); + }; + + // Append the language subtag. + if (!appendSubtag(aTag.Language())) { + return false; + } + + // Append the script subtag if present. + if (aTag.Script().Present()) { + if (!aLocale.append('_') || !appendSubtag(aTag.Script())) { + return false; + } + } + + // Append the region subtag if present. + if (aTag.Region().Present()) { + if (!aLocale.append('_') || !appendSubtag(aTag.Region())) { + return false; + } + } + + // Zero-terminated for use with ICU. 
+ return aLocale.append('\0'); +} + +static ICUError ParserErrorToICUError(LocaleParser::ParserError aErr) { + using ParserError = LocaleParser::ParserError; + + switch (aErr) { + case ParserError::NotParseable: + return ICUError::InternalError; + case ParserError::OutOfMemory: + return ICUError::OutOfMemory; + } + MOZ_CRASH("Unexpected parser error"); +} + +static ICUError CanonicalizationErrorToICUError( + Locale::CanonicalizationError aErr) { + using CanonicalizationError = Locale::CanonicalizationError; + + switch (aErr) { + case CanonicalizationError::DuplicateVariant: + case CanonicalizationError::InternalError: + return ICUError::InternalError; + case CanonicalizationError::OutOfMemory: + return ICUError::OutOfMemory; + } + MOZ_CRASH("Unexpected canonicalization error"); +} + +// Assign the language, script, and region subtags from an ICU locale ID. +// +// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to +// retrieve these subtags, but unfortunately these functions are rather slow, so +// we use our own implementation. +static ICUResult AssignFromLocaleId(LocaleId& aLocaleId, Locale& aTag) { + // Replace the ICU locale ID separator. + std::replace(aLocaleId.begin(), aLocaleId.end(), '_', '-'); + + // ICU replaces "und" with the empty string, which means "und" becomes "" and + // "und-Latn" becomes "-Latn". Handle this case separately. + if (aLocaleId.empty() || aLocaleId[0] == '-') { + static constexpr auto und = MakeStringSpan("und"); + constexpr size_t length = und.size(); + + // Insert "und" in front of the locale ID. 
+ if (!aLocaleId.growBy(length)) { + return Err(ICUError::OutOfMemory); + } + memmove(aLocaleId.begin() + length, aLocaleId.begin(), aLocaleId.length()); + memmove(aLocaleId.begin(), und.data(), length); + } + + // Retrieve the language, script, and region subtags from the locale ID + Locale localeTag; + MOZ_TRY(LocaleParser::TryParseBaseName(aLocaleId, localeTag) + .mapErr(ParserErrorToICUError)); + + aTag.SetLanguage(localeTag.Language()); + aTag.SetScript(localeTag.Script()); + aTag.SetRegion(localeTag.Region()); + + return Ok(); +} + +template <decltype(uloc_addLikelySubtags) likelySubtagsFn> +static ICUResult CallLikelySubtags(const LocaleId& aLocaleId, + LocaleId& aResult) { + // Locale ID must be zero-terminated before passing it to ICU. + MOZ_ASSERT(aLocaleId.back() == '\0'); + MOZ_ASSERT(aResult.length() == 0); + + // Ensure there's enough room for the result. + MOZ_ALWAYS_TRUE(aResult.resize(LocaleId::InlineLength)); + + return FillBufferWithICUCall( + aResult, [&aLocaleId](char* chars, int32_t size, UErrorCode* status) { + return likelySubtagsFn(aLocaleId.begin(), chars, size, status); + }); +} + +// The canonical way to compute the Unicode BCP 47 locale identifier with likely +// subtags is as follows: +// +// 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU +// locale ID. +// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. +// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into +// a Unicode BCP 47 locale identifier. +// +// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow +// and we know, by construction, that the input Unicode BCP 47 locale identifier +// only contains valid language, script, and region subtags, we can avoid both +// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and +// AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of +// |Intl.Locale.prototype.maximize|.) 
+static ICUResult LikelySubtags(LikelySubtags aLikelySubtags, Locale& aTag) { + // Return early if the input is already maximized/minimized. + if (HasLikelySubtags(aLikelySubtags, aTag)) { + return Ok(); + } + + // Create the locale ID for the input argument. + LocaleId locale; + if (!CreateLocaleForLikelySubtags(aTag, locale)) { + return Err(ICUError::OutOfMemory); + } + + // Either add or remove likely subtags to/from the locale ID. + LocaleId localeLikelySubtags; + if (aLikelySubtags == LikelySubtags::Add) { + MOZ_TRY( + CallLikelySubtags<uloc_addLikelySubtags>(locale, localeLikelySubtags)); + } else { + MOZ_TRY( + CallLikelySubtags<uloc_minimizeSubtags>(locale, localeLikelySubtags)); + } + + // Assign the language, script, and region subtags from the locale ID. + MOZ_TRY(AssignFromLocaleId(localeLikelySubtags, aTag)); + + // Update mappings in case ICU returned a non-canonical locale. + MOZ_TRY(aTag.CanonicalizeBaseName().mapErr(CanonicalizationErrorToICUError)); + + return Ok(); +} + +ICUResult Locale::AddLikelySubtags() { + return LikelySubtags(LikelySubtags::Add, *this); +} + +ICUResult Locale::RemoveLikelySubtags() { + return LikelySubtags(LikelySubtags::Remove, *this); +} + +UniqueChars Locale::DuplicateStringToUniqueChars(const char* aStr) { + size_t length = strlen(aStr) + 1; + auto duplicate = MakeUnique<char[]>(length); + memcpy(duplicate.get(), aStr, length); + return duplicate; +} + +UniqueChars Locale::DuplicateStringToUniqueChars(Span<const char> aStr) { + size_t length = aStr.size(); + auto duplicate = MakeUnique<char[]>(length + 1); + memcpy(duplicate.get(), aStr.data(), length); + duplicate[length] = '\0'; + return duplicate; +} + +size_t Locale::ToStringCapacity() const { + // This is a bit awkward, the buffer class currently does not support + // being resized, so we need to calculate the required size up front and + // reserve it all at once. 
+ auto lengthSubtag = [](const auto& subtag) { + auto span = subtag.Span(); + MOZ_ASSERT(!span.empty()); + return span.size(); + }; + + auto lengthSubtagZ = [](const char* subtag) { + size_t length = strlen(subtag); + MOZ_ASSERT(length > 0); + return length; + }; + + auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) { + size_t length = 0; + for (const auto& subtag : subtags) { + length += lengthSubtagZ(subtag.get()) + 1; + } + return length; + }; + + // First calculate required capacity + size_t capacity = 0; + + capacity += lengthSubtag(mLanguage); + + if (mScript.Present()) { + capacity += lengthSubtag(mScript) + 1; + } + + if (mRegion.Present()) { + capacity += lengthSubtag(mRegion) + 1; + } + + capacity += lengthSubtagsZ(mVariants); + + capacity += lengthSubtagsZ(mExtensions); + + if (mPrivateUse.get()) { + capacity += lengthSubtagZ(mPrivateUse.get()) + 1; + } + + return capacity; +} + +size_t Locale::ToStringAppend(char* aBuffer) const { + // Current write position inside buffer. + size_t offset = 0; + + auto appendHyphen = [&offset, &aBuffer]() { + aBuffer[offset] = '-'; + offset += 1; + }; + + auto appendSubtag = [&offset, &aBuffer](const auto& subtag) { + auto span = subtag.Span(); + memcpy(aBuffer + offset, span.data(), span.size()); + offset += span.size(); + }; + + auto appendSubtagZ = [&offset, &aBuffer](const char* subtag) { + size_t length = strlen(subtag); + memcpy(aBuffer + offset, subtag, length); + offset += length; + }; + + auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) { + for (const auto& subtag : subtags) { + appendHyphen(); + appendSubtagZ(subtag.get()); + } + }; + + // Append the language subtag. + appendSubtag(mLanguage); + + // Append the script subtag if present. + if (mScript.Present()) { + appendHyphen(); + appendSubtag(mScript); + } + + // Append the region subtag if present. + if (mRegion.Present()) { + appendHyphen(); + appendSubtag(mRegion); + } + + // Append the variant subtags if present. 
+ appendSubtagsZ(mVariants); + + // Append the extensions subtags if present. + appendSubtagsZ(mExtensions); + + // Append the private-use subtag if present. + if (mPrivateUse.get()) { + appendHyphen(); + appendSubtagZ(mPrivateUse.get()); + } + + return offset; +} + +LocaleParser::Token LocaleParser::NextToken() { + MOZ_ASSERT(mIndex <= mLength + 1, "called after 'None' token was read"); + + TokenKind kind = TokenKind::None; + size_t tokenLength = 0; + for (size_t i = mIndex; i < mLength; i++) { + // UTS 35, section 3.1. + // alpha = [A-Z a-z] ; + // digit = [0-9] ; + char c = CharAt(i); + if (IsAsciiAlpha(c)) { + kind |= TokenKind::Alpha; + } else if (IsAsciiDigit(c)) { + kind |= TokenKind::Digit; + } else if (c == '-' && i > mIndex && i + 1 < mLength) { + break; + } else { + return {TokenKind::Error, 0, 0}; + } + tokenLength += 1; + } + + Token token{kind, mIndex, tokenLength}; + mIndex += tokenLength + 1; + return token; +} + +UniqueChars LocaleParser::Chars(size_t aIndex, size_t aLength) const { + // Add +1 to null-terminate the string. + auto chars = MakeUnique<char[]>(aLength + 1); + char* dest = chars.get(); + std::copy_n(mLocale + aIndex, aLength, dest); + dest[aLength] = '\0'; + return chars; +} + +// Parse the `unicode_language_id` production. +// +// unicode_language_id = unicode_language_subtag +// (sep unicode_script_subtag)? +// (sep unicode_region_subtag)? +// (sep unicode_variant_subtag)* ; +// +// sep = "-" +// +// Note: Unicode CLDR locale identifier backward compatibility extensions +// removed from `unicode_language_id`. +// +// |tok| is the current token from |ts|. +// +// All subtags will be added unaltered to |tag|, without canonicalizing their +// case or, in the case of variant subtags, detecting and rejecting duplicate +// variants. Users must subsequently |CanonicalizeBaseName| to perform these +// actions. +// +// Do not use this function directly: use |ParseBaseName| or +// |ParseTlangFromTransformExtension| instead. 
+Result<Ok, LocaleParser::ParserError> LocaleParser::InternalParseBaseName( + LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { + if (aLocaleParser.IsLanguage(aTok)) { + aLocaleParser.CopyChars(aTok, aTag.mLanguage); + + aTok = aLocaleParser.NextToken(); + } else { + // The language subtag is mandatory. + return Err(ParserError::NotParseable); + } + + if (aLocaleParser.IsScript(aTok)) { + aLocaleParser.CopyChars(aTok, aTag.mScript); + + aTok = aLocaleParser.NextToken(); + } + + if (aLocaleParser.IsRegion(aTok)) { + aLocaleParser.CopyChars(aTok, aTag.mRegion); + + aTok = aLocaleParser.NextToken(); + } + + auto& variants = aTag.mVariants; + MOZ_ASSERT(variants.length() == 0); + while (aLocaleParser.IsVariant(aTok)) { + auto variant = aLocaleParser.Chars(aTok); + if (!variants.append(std::move(variant))) { + return Err(ParserError::OutOfMemory); + } + + aTok = aLocaleParser.NextToken(); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> LocaleParser::TryParse( + mozilla::Span<const char> aLocale, Locale& aTag) { + // |aTag| must be a new, empty Locale. + MOZ_ASSERT(aTag.Language().Missing()); + MOZ_ASSERT(aTag.Script().Missing()); + MOZ_ASSERT(aTag.Region().Missing()); + MOZ_ASSERT(aTag.Variants().empty()); + MOZ_ASSERT(aTag.Extensions().empty()); + MOZ_ASSERT(aTag.PrivateUse().isNothing()); + + // unicode_locale_id = unicode_language_id + // extensions* + // pu_extensions? ; + + LocaleParser ts(aLocale); + Token tok = ts.NextToken(); + + MOZ_TRY(ParseBaseName(ts, aTag, tok)); + + // extensions = unicode_locale_extensions + // | transformed_extensions + // | other_extensions ; + + // Bit set of seen singletons. + uint64_t seenSingletons = 0; + + auto& extensions = aTag.mExtensions; + while (ts.IsExtensionStart(tok)) { + char singleton = ts.SingletonKey(tok); + + // Reject the input if a duplicate singleton was found. 
+ uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1); + if (seenSingletons & hash) { + return Err(ParserError::NotParseable); + } + seenSingletons |= hash; + + Token start = tok; + tok = ts.NextToken(); + + // We'll check for missing non-singleton subtags after this block by + // comparing |startValue| with the then-current position. + size_t startValue = tok.Index(); + + if (singleton == 'u') { + while (ts.IsUnicodeExtensionPart(tok)) { + tok = ts.NextToken(); + } + } else if (singleton == 't') { + // transformed_extensions = sep [tT] + // ((sep tlang (sep tfield)*) + // | (sep tfield)+) ; + + // tlang = unicode_language_subtag + // (sep unicode_script_subtag)? + // (sep unicode_region_subtag)? + // (sep unicode_variant_subtag)* ; + if (ts.IsLanguage(tok)) { + tok = ts.NextToken(); + + if (ts.IsScript(tok)) { + tok = ts.NextToken(); + } + + if (ts.IsRegion(tok)) { + tok = ts.NextToken(); + } + + while (ts.IsVariant(tok)) { + tok = ts.NextToken(); + } + } + + // tfield = tkey tvalue; + while (ts.IsTransformExtensionKey(tok)) { + tok = ts.NextToken(); + + size_t startTValue = tok.Index(); + while (ts.IsTransformExtensionPart(tok)) { + tok = ts.NextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.Index() <= startTValue) { + return Err(ParserError::NotParseable); + } + } + } else { + while (ts.IsOtherExtensionPart(tok)) { + tok = ts.NextToken(); + } + } + + // Singletons must be followed by a non-singleton subtag, "en-a-b" is not + // allowed. + if (tok.Index() <= startValue) { + return Err(ParserError::NotParseable); + } + + UniqueChars extension = ts.Extension(start, tok); + if (!extensions.append(std::move(extension))) { + return Err(ParserError::OutOfMemory); + } + } + + // Trailing `pu_extension` component of the `unicode_locale_id` production. 
+ if (ts.IsPrivateUseStart(tok)) { + Token start = tok; + tok = ts.NextToken(); + + size_t startValue = tok.Index(); + while (ts.IsPrivateUsePart(tok)) { + tok = ts.NextToken(); + } + + // There must be at least one subtag after the "-x-". + if (tok.Index() <= startValue) { + return Err(ParserError::NotParseable); + } + + UniqueChars privateUse = ts.Extension(start, tok); + aTag.mPrivateUse = std::move(privateUse); + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> LocaleParser::TryParseBaseName( + Span<const char> aLocale, Locale& aTag) { + // |aTag| must be a new, empty Locale. + MOZ_ASSERT(aTag.Language().Missing()); + MOZ_ASSERT(aTag.Script().Missing()); + MOZ_ASSERT(aTag.Region().Missing()); + MOZ_ASSERT(aTag.Variants().empty()); + MOZ_ASSERT(aTag.Extensions().empty()); + MOZ_ASSERT(aTag.PrivateUse().isNothing()); + + LocaleParser ts(aLocale); + Token tok = ts.NextToken(); + + MOZ_TRY(ParseBaseName(ts, aTag, tok)); + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +// Parse |aExtension|, which must be a valid `transformed_extensions` subtag, +// and fill |aTag| and |aFields| from the `tlang` and `tfield` components. +Result<Ok, LocaleParser::ParserError> LocaleParser::ParseTransformExtension( + Span<const char> aExtension, Locale& aTag, TFieldVector& aFields) { + LocaleParser ts(aExtension); + Token tok = ts.NextToken(); + + if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 't') { + return Err(ParserError::NotParseable); + } + + tok = ts.NextToken(); + + if (tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + if (ts.IsLanguage(tok)) { + // We're parsing a possible `tlang` in a known-valid transform extension, so + // use the special-purpose function that takes advantage of this to compute + // lowercased |tag| contents in an optimal manner. 
+ MOZ_TRY(ParseTlangInTransformExtension(ts, aTag, tok)); + + // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end + // of the transform extension. + MOZ_ASSERT(ts.IsTransformExtensionKey(tok) || tok.IsNone()); + } else { + // If there's no `tlang` subtag, at least one `tfield` must be present. + MOZ_ASSERT(ts.IsTransformExtensionKey(tok)); + } + + // Trailing `tfield` subtags. (Any other trailing subtags are an error, + // because we're guaranteed to only see a valid tranform extension here.) + while (ts.IsTransformExtensionKey(tok)) { + size_t begin = tok.Index(); + tok = ts.NextToken(); + + size_t startTValue = tok.Index(); + while (ts.IsTransformExtensionPart(tok)) { + tok = ts.NextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.Index() <= startTValue) { + return Err(ParserError::NotParseable); + } + + size_t length = tok.Index() - 1 - begin; + if (!aFields.emplaceBack(begin, length)) { + return Err(ParserError::OutOfMemory); + } + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +// Parse |aExtension|, which must be a valid `unicode_locale_extensions` subtag, +// and fill |aAttributes| and |aKeywords| from the `attribute` and `keyword` +// components. 
+Result<Ok, LocaleParser::ParserError> LocaleParser::ParseUnicodeExtension( + Span<const char> aExtension, AttributesVector& aAttributes, + KeywordsVector& aKeywords) { + LocaleParser ts(aExtension); + Token tok = ts.NextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { + return Err(ParserError::NotParseable); + } + + tok = ts.NextToken(); + + if (tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + while (ts.IsUnicodeExtensionAttribute(tok)) { + if (!aAttributes.emplaceBack(tok.Index(), tok.Length())) { + return Err(ParserError::OutOfMemory); + } + + tok = ts.NextToken(); + } + + // keyword = key (sep type)? ; + while (ts.IsUnicodeExtensionKey(tok)) { + size_t begin = tok.Index(); + tok = ts.NextToken(); + + while (ts.IsUnicodeExtensionType(tok)) { + tok = ts.NextToken(); + } + + if (tok.IsError()) { + return Err(ParserError::NotParseable); + } + + size_t length = tok.Index() - 1 - begin; + if (!aKeywords.emplaceBack(begin, length)) { + return Err(ParserError::OutOfMemory); + } + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> LocaleParser::CanParseUnicodeExtension( + Span<const char> aExtension) { + LocaleParser ts(aExtension); + Token tok = ts.NextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { + return Err(ParserError::NotParseable); + } + + tok = ts.NextToken(); + + if (tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + while (ts.IsUnicodeExtensionAttribute(tok)) { + tok = ts.NextToken(); + } + + // keyword = key (sep type)? 
; + while (ts.IsUnicodeExtensionKey(tok)) { + tok = ts.NextToken(); + + while (ts.IsUnicodeExtensionType(tok)) { + tok = ts.NextToken(); + } + + if (tok.IsError()) { + return Err(ParserError::NotParseable); + } + } + + if (!tok.IsNone()) { + return Err(ParserError::OutOfMemory); + } + + return Ok(); +} + +Result<Ok, LocaleParser::ParserError> +LocaleParser::CanParseUnicodeExtensionType(Span<const char> aUnicodeType) { + MOZ_ASSERT(!aUnicodeType.empty(), "caller must exclude empty strings"); + + LocaleParser ts(aUnicodeType); + Token tok = ts.NextToken(); + + while (ts.IsUnicodeExtensionType(tok)) { + tok = ts.NextToken(); + } + + if (!tok.IsNone()) { + return Err(ParserError::NotParseable); + } + + return Ok(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Locale.h b/intl/components/src/Locale.h new file mode 100644 index 0000000000..478d5f4a9e --- /dev/null +++ b/intl/components/src/Locale.h @@ -0,0 +1,773 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Structured representation of Unicode locale IDs used with Intl functions. */ + +#ifndef intl_components_Locale_h +#define intl_components_Locale_h + +#include "mozilla/Assertions.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/Maybe.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" +#include "mozilla/TypedEnumBits.h" +#include "mozilla/Variant.h" +#include "mozilla/Vector.h" +#include "mozilla/Result.h" + +#include <algorithm> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <utility> + +#include "unicode/uloc.h" + +namespace mozilla::intl { + +/** + * Return true if |language| is a valid language subtag. 
+ */ +template <typename CharT> +bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> aLanguage); + +/** + * Return true if |script| is a valid script subtag. + */ +template <typename CharT> +bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> aScript); + +/** + * Return true if |region| is a valid region subtag. + */ +template <typename CharT> +bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> aRegion); + +#ifdef DEBUG +/** + * Return true if |variant| is a valid variant subtag. + */ +bool IsStructurallyValidVariantTag(mozilla::Span<const char> aVariant); + +/** + * Return true if |extension| is a valid Unicode extension subtag. + */ +bool IsStructurallyValidUnicodeExtensionTag( + mozilla::Span<const char> aExtension); + +/** + * Return true if |privateUse| is a valid private-use subtag. + */ +bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> aPrivateUse); + +#endif + +template <typename CharT> +char AsciiToLowerCase(CharT aChar) { + MOZ_ASSERT(mozilla::IsAscii(aChar)); + return mozilla::IsAsciiUppercaseAlpha(aChar) ? (aChar + 0x20) : aChar; +} + +template <typename CharT> +char AsciiToUpperCase(CharT aChar) { + MOZ_ASSERT(mozilla::IsAscii(aChar)); + return mozilla::IsAsciiLowercaseAlpha(aChar) ? (aChar - 0x20) : aChar; +} + +template <typename CharT> +void AsciiToLowerCase(CharT* aChars, size_t aLength, char* aDest) { + char (&fn)(CharT) = AsciiToLowerCase; + std::transform(aChars, aChars + aLength, aDest, fn); +} + +template <typename CharT> +void AsciiToUpperCase(CharT* aChars, size_t aLength, char* aDest) { + char (&fn)(CharT) = AsciiToUpperCase; + std::transform(aChars, aChars + aLength, aDest, fn); +} + +template <typename CharT> +void AsciiToTitleCase(CharT* aChars, size_t aLength, char* aDest) { + if (aLength > 0) { + AsciiToUpperCase(aChars, 1, aDest); + AsciiToLowerCase(aChars + 1, aLength - 1, aDest + 1); + } +} + +// Constants for language subtag lengths. 
+namespace LanguageTagLimits { + +// unicode_language_subtag = alpha{2,3} | alpha{5,8} ; +static constexpr size_t LanguageLength = 8; + +// unicode_script_subtag = alpha{4} ; +static constexpr size_t ScriptLength = 4; + +// unicode_region_subtag = (alpha{2} | digit{3}) ; +static constexpr size_t RegionLength = 3; +static constexpr size_t AlphaRegionLength = 2; +static constexpr size_t DigitRegionLength = 3; + +// key = alphanum alpha ; +static constexpr size_t UnicodeKeyLength = 2; + +// tkey = alpha digit ; +static constexpr size_t TransformKeyLength = 2; + +} // namespace LanguageTagLimits + +// Fixed size language subtag which is stored inline in Locale. +template <size_t SubtagLength> +class LanguageTagSubtag final { + uint8_t mLength = 0; + char mChars[SubtagLength] = {}; // zero initialize + + public: + LanguageTagSubtag() = default; + + LanguageTagSubtag(const LanguageTagSubtag& aOther) { + std::copy_n(aOther.mChars, SubtagLength, mChars); + mLength = aOther.mLength; + } + + LanguageTagSubtag& operator=(const LanguageTagSubtag& aOther) { + std::copy_n(aOther.mChars, SubtagLength, mChars); + mLength = aOther.mLength; + return *this; + } + + size_t Length() const { return mLength; } + bool Missing() const { return mLength == 0; } + bool Present() const { return mLength > 0; } + + mozilla::Span<const char> Span() const { return {mChars, mLength}; } + + template <typename CharT> + void Set(mozilla::Span<const CharT> str) { + MOZ_ASSERT(str.size() <= SubtagLength); + std::copy_n(str.data(), str.size(), mChars); + mLength = str.size(); + } + + // The toXYZCase() methods are using |SubtagLength| instead of |length()|, + // because current compilers (tested GCC and Clang) can't infer the maximum + // string length - even when using hints like |std::min| - and instead are + // emitting SIMD optimized code. Using a fixed sized length avoids emitting + // the SIMD code. 
(Emitting SIMD code doesn't make sense here, because the + // SIMD code only kicks in for long strings.) A fixed length will + // additionally ensure the compiler unrolls the loop in the case conversion + // code. + + void ToLowerCase() { AsciiToLowerCase(mChars, SubtagLength, mChars); } + + void ToUpperCase() { AsciiToUpperCase(mChars, SubtagLength, mChars); } + + void ToTitleCase() { AsciiToTitleCase(mChars, SubtagLength, mChars); } + + template <size_t N> + bool EqualTo(const char (&str)[N]) const { + static_assert(N - 1 <= SubtagLength, + "subtag literals must not exceed the maximum subtag length"); + + return mLength == N - 1 && memcmp(mChars, str, N - 1) == 0; + } +}; + +using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>; +using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>; +using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>; + +using Latin1Char = unsigned char; +using UniqueChars = UniquePtr<char[]>; + +/** + * Object representing a Unicode BCP 47 locale identifier. + * + * All subtags are already in canonicalized case. 
 */
class MOZ_STACK_CLASS Locale final {
  // Base-name subtags, stored inline.
  LanguageSubtag mLanguage = {};
  ScriptSubtag mScript = {};
  RegionSubtag mRegion = {};

  using VariantsVector = Vector<UniqueChars, 2>;
  using ExtensionsVector = Vector<UniqueChars, 2>;

  // Variant and extension subtags, each held as an owned C string.
  VariantsVector mVariants;
  ExtensionsVector mExtensions;
  // Private-use subtag, or null when there is none (see PrivateUse()).
  UniqueChars mPrivateUse = nullptr;

  friend class LocaleParser;

 public:
  enum class CanonicalizationError : uint8_t {
    DuplicateVariant,
    InternalError,
    OutOfMemory,
  };

 private:
  Result<Ok, CanonicalizationError> CanonicalizeUnicodeExtension(
      UniqueChars& unicodeExtension);

  Result<Ok, CanonicalizationError> CanonicalizeTransformExtension(
      UniqueChars& transformExtension);

 public:
  static bool LanguageMapping(LanguageSubtag& aLanguage);
  static bool ComplexLanguageMapping(const LanguageSubtag& aLanguage);

 private:
  static bool ScriptMapping(ScriptSubtag& aScript);
  static bool RegionMapping(RegionSubtag& aRegion);
  static bool ComplexRegionMapping(const RegionSubtag& aRegion);

  void PerformComplexLanguageMappings();
  void PerformComplexRegionMappings();
  [[nodiscard]] bool PerformVariantMappings();

  [[nodiscard]] bool UpdateLegacyMappings();

  static bool SignLanguageMapping(LanguageSubtag& aLanguage,
                                  const RegionSubtag& aRegion);

  static const char* ReplaceTransformExtensionType(
      mozilla::Span<const char> aKey, mozilla::Span<const char> aType);

 public:
  /**
   * Given a Unicode key and type, return the null-terminated preferred
   * replacement for that type if there is one, or null if there is none, e.g.
   * in effect
   * |ReplaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
   * and
   * |ReplaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
   */
  static const char* ReplaceUnicodeExtensionType(
      mozilla::Span<const char> aKey, mozilla::Span<const char> aType);

 public:
  // Locale is move-only: copying is deleted, moving is defaulted.
  Locale() = default;
  Locale(const Locale&) = delete;
  Locale& operator=(const Locale&) = delete;
  Locale(Locale&&) = default;
  Locale& operator=(Locale&&) = default;

  // Iterator adaptor over a vector of UniqueChars. Dereferencing passes the
  // current element's C string to MakeStringSpan and returns the resulting
  // span by value.
  template <class Vec>
  class SubtagIterator {
    using Iter = decltype(std::declval<const Vec>().begin());

    Iter mIter;

   public:
    explicit SubtagIterator(Iter iter) : mIter(iter) {}

    // std::iterator traits.
    using iterator_category = std::input_iterator_tag;
    using value_type = Span<const char>;
    using difference_type = ptrdiff_t;
    using pointer = value_type*;
    using reference = value_type&;

    SubtagIterator& operator++() {
      mIter++;
      return *this;
    }

    SubtagIterator operator++(int) {
      SubtagIterator result = *this;
      ++(*this);
      return result;
    }

    bool operator==(const SubtagIterator& aOther) const {
      return mIter == aOther.mIter;
    }

    bool operator!=(const SubtagIterator& aOther) const {
      return !(*this == aOther);
    }

    value_type operator*() const { return MakeStringSpan(mIter->get()); }
  };

  // Read-only view over a subtag vector. It holds a reference to the vector,
  // so it must not outlive the Locale it was obtained from.
  template <size_t N>
  class SubtagEnumeration {
    using Vec = Vector<UniqueChars, N>;

    const Vec& mVector;

   public:
    explicit SubtagEnumeration(const Vec& aVector) : mVector(aVector) {}

    size_t length() const { return mVector.length(); }
    bool empty() const { return mVector.empty(); }

    auto begin() const { return SubtagIterator<Vec>(mVector.begin()); }
    auto end() const { return SubtagIterator<Vec>(mVector.end()); }

    Span<const char> operator[](size_t aIndex) const {
      return MakeStringSpan(mVector[aIndex].get());
    }
  };

  const LanguageSubtag& Language() const { return mLanguage; }
  const ScriptSubtag& Script() const { return mScript; }
  const RegionSubtag& Region() const { return mRegion; }
  auto Variants() const { return SubtagEnumeration(mVariants); }
  auto Extensions() const { return SubtagEnumeration(mExtensions); }

  // Returns the private-use subtag, or Nothing() when |mPrivateUse| is null.
  Maybe<Span<const char>> PrivateUse() const {
    if (const char* p = mPrivateUse.get()) {
      return Some(MakeStringSpan(p));
    }
    return Nothing();
  }

  /**
   * Return the Unicode extension subtag or Nothing if not present.
   */
  Maybe<Span<const char>> GetUnicodeExtension() const;

 private:
  ptrdiff_t UnicodeExtensionIndex() const;

 public:
  /**
   * Set the language subtag. The input must be a valid language subtag.
   */
  template <size_t N>
  void SetLanguage(const char (&aLanguage)[N]) {
    // |N - 1| drops the string literal's terminating null character.
    mozilla::Span<const char> span(aLanguage, N - 1);
    MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
    mLanguage.Set(span);
  }

  /**
   * Set the language subtag. The input must be a valid language subtag.
   */
  void SetLanguage(const LanguageSubtag& aLanguage) {
    MOZ_ASSERT(IsStructurallyValidLanguageTag(aLanguage.Span()));
    mLanguage.Set(aLanguage.Span());
  }

  /**
   * Set the script subtag. The input must be a valid script subtag.
   */
  template <size_t N>
  void SetScript(const char (&aScript)[N]) {
    mozilla::Span<const char> span(aScript, N - 1);
    MOZ_ASSERT(IsStructurallyValidScriptTag(span));
    mScript.Set(span);
  }

  /**
   * Set the script subtag. The input must be a valid script subtag or the empty
   * string.
   */
  void SetScript(const ScriptSubtag& aScript) {
    MOZ_ASSERT(aScript.Missing() ||
               IsStructurallyValidScriptTag(aScript.Span()));
    mScript.Set(aScript.Span());
  }

  /**
   * Set the region subtag. The input must be a valid region subtag.
   */
  template <size_t N>
  void SetRegion(const char (&aRegion)[N]) {
    mozilla::Span<const char> span(aRegion, N - 1);
    MOZ_ASSERT(IsStructurallyValidRegionTag(span));
    mRegion.Set(span);
  }

  /**
   * Set the region subtag. The input must be a valid region subtag or the empty
   * string.
   */
  void SetRegion(const RegionSubtag& aRegion) {
    MOZ_ASSERT(aRegion.Missing() ||
               IsStructurallyValidRegionTag(aRegion.Span()));
    mRegion.Set(aRegion.Span());
  }

  /**
   * Removes all variant subtags.
   */
  void ClearVariants() { mVariants.clearAndFree(); }

  /**
   * Set the Unicode extension subtag. The input must be a valid Unicode
   * extension subtag.
   */
  ICUResult SetUnicodeExtension(Span<const char> aExtension);

  /**
   * Remove any Unicode extension subtag if present.
   */
  void ClearUnicodeExtension();

  /** Canonicalize the base-name (language, script, region, variant) subtags. */
  Result<Ok, CanonicalizationError> CanonicalizeBaseName();

  /**
   * Canonicalize all extension subtags.
   */
  Result<Ok, CanonicalizationError> CanonicalizeExtensions();

  /**
   * Canonicalizes the given structurally valid Unicode BCP 47 locale
   * identifier, including regularized case of subtags. For example, the
   * locale Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
   * where
   *
   *     Zh             ; 2*3ALPHA
   *     -haNS          ; ["-" script]
   *     -bu            ; ["-" region]
   *     -variant2      ; *("-" variant)
   *     -Variant1
   *     -u-ca-chinese  ; *("-" extension)
   *     -t-Zh-laTN
   *     -x-PRIVATE     ; ["-" privateuse]
   *
   * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
   *
   * Spec: ECMAScript Internationalization API Specification, 6.2.3.
   */
  Result<Ok, CanonicalizationError> Canonicalize() {
    // The base name is canonicalized first; its error, if any, is propagated
    // and the extensions are left untouched.
    MOZ_TRY(CanonicalizeBaseName());
    return CanonicalizeExtensions();
  }

  /**
   * Fill the buffer with a string representation of the locale.
   *
   * |B| must be a buffer type whose CharType is |char|. Returns
   * ICUError::OutOfMemory when the buffer cannot reserve the capacity
   * reported by ToStringCapacity().
   */
  template <typename B>
  ICUResult ToString(B& aBuffer) const {
    static_assert(std::is_same_v<typename B::CharType, char>);

    size_t capacity = ToStringCapacity();

    // Attempt to reserve needed capacity
    if (!aBuffer.reserve(capacity)) {
      return Err(ICUError::OutOfMemory);
    }

    size_t offset = ToStringAppend(aBuffer.data());

    // The number of characters written is expected to match the precomputed
    // capacity exactly.
    MOZ_ASSERT(capacity == offset);
    aBuffer.written(offset);

    return Ok();
  }

  /**
   * Add likely-subtags to the locale.
   *
   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
   */
  ICUResult AddLikelySubtags();

  /**
   * Remove likely-subtags from the locale.
   *
   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
   */
  ICUResult RemoveLikelySubtags();

  /**
   * Returns the default locale as an ICU locale identifier. The returned string
   * is NOT a valid BCP 47 locale!
   *
   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
   */
  static const char* GetDefaultLocale() { return uloc_getDefault(); }

  /**
   * Returns an iterator over all supported locales.
   *
   * The returned strings are ICU locale identifiers and NOT BCP 47 language
   * tags.
   *
   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
   */
  static auto GetAvailableLocales() {
    return AvailableLocalesEnumeration<uloc_countAvailable,
                                       uloc_getAvailable>();
  }

 private:
  static UniqueChars DuplicateStringToUniqueChars(const char* aStr);
  static UniqueChars DuplicateStringToUniqueChars(Span<const char> aStr);
  size_t ToStringCapacity() const;
  size_t ToStringAppend(char* aBuffer) const;
};

/**
 * Parser for Unicode BCP 47 locale identifiers.
 *
 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
 */
class MOZ_STACK_CLASS LocaleParser final {
 public:
  enum class ParserError : uint8_t {
    // Input was not parseable as a locale, subtag or extension.
    NotParseable,
    // Unable to allocate memory for the parser to operate.
    OutOfMemory,
  };

  // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
  //
  // The values are bit flags: AlphaDigit (0b011) is the union of Alpha
  // (0b001) and Digit (0b010).
  enum class TokenKind : uint8_t {
    None = 0b000,
    Alpha = 0b001,
    Digit = 0b010,
    AlphaDigit = 0b011,
    Error = 0b100
  };

 private:
  // A token: its kind plus the index and length of its characters within the
  // parser's input.
  class Token final {
    size_t mIndex;
    size_t mLength;
    TokenKind mKind;

   public:
    Token(TokenKind aKind, size_t aIndex, size_t aLength)
        : mIndex(aIndex), mLength(aLength), mKind(aKind) {}

    TokenKind Kind() const { return mKind; }
    size_t Index() const { return mIndex; }
    size_t Length() const { return mLength; }

    bool IsError() const { return mKind == TokenKind::Error; }
    bool IsNone() const { return mKind == TokenKind::None; }
    bool IsAlpha() const { return mKind == TokenKind::Alpha; }
    bool IsDigit() const { return mKind == TokenKind::Digit; }
    bool IsAlphaDigit() const { return mKind == TokenKind::AlphaDigit; }
  };

  // The input being parsed. The parser only stores the pointer and length of
  // the span it was given; it does not copy the characters.
  const char* mLocale;
  size_t mLength;
  size_t mIndex = 0;

  explicit LocaleParser(Span<const char> aLocale)
      : mLocale(aLocale.data()), mLength(aLocale.size()) {}

  // No bounds check is performed here.
  char CharAt(size_t aIndex) const { return mLocale[aIndex]; }

  // Copy the token characters into |aSubtag|.
  template <size_t N>
  void CopyChars(const Token& aTok, LanguageTagSubtag<N>& aSubtag) const {
    aSubtag.Set(mozilla::Span(mLocale + aTok.Index(), aTok.Length()));
  }

  // Create a string copy of |aLength| characters starting at |aIndex|.
  UniqueChars Chars(size_t aIndex, size_t aLength) const;

  // Create a string copy of the token characters.
  UniqueChars Chars(const Token& aTok) const {
    return Chars(aTok.Index(), aTok.Length());
  }

  // Create a string copy of the characters from the start of |aStart| up to,
  // but excluding, the character immediately before |aEnd|.
  // NOTE(review): the excluded character is presumably the separator that
  // precedes |aEnd| - confirm against NextToken().
  UniqueChars Extension(const Token& aStart, const Token& aEnd) const {
    MOZ_ASSERT(aStart.Index() < aEnd.Index());

    size_t length = aEnd.Index() - 1 - aStart.Index();
    return Chars(aStart.Index(), length);
  }

  Token NextToken();

  // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
  //
  // Four character language subtags are not allowed in Unicode BCP 47 locale
  // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
  // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
  bool IsLanguage(const Token& aTok) const {
    return aTok.IsAlpha() && ((2 <= aTok.Length() && aTok.Length() <= 3) ||
                              (5 <= aTok.Length() && aTok.Length() <= 8));
  }

  // unicode_script_subtag = alpha{4} ;
  bool IsScript(const Token& aTok) const {
    return aTok.IsAlpha() && aTok.Length() == 4;
  }

  // unicode_region_subtag = (alpha{2} | digit{3}) ;
  bool IsRegion(const Token& aTok) const {
    return (aTok.IsAlpha() && aTok.Length() == 2) ||
           (aTok.IsDigit() && aTok.Length() == 3);
  }

  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
  //
  // Only the length and, for four character tokens, the leading digit are
  // checked here; the token kind is not inspected.
  bool IsVariant(const Token& aTok) const {
    return (5 <= aTok.Length() && aTok.Length() <= 8) ||
           (aTok.Length() == 4 && mozilla::IsAsciiDigit(CharAt(aTok.Index())));
  }

  // Returns the code unit of the first character at the given singleton token.
  // Always returns the lower case form of an alphabetical character.
  char SingletonKey(const Token& aTok) const {
    MOZ_ASSERT(aTok.Length() == 1);
    return AsciiToLowerCase(CharAt(aTok.Index()));
  }

  // extensions = unicode_locale_extensions |
  //              transformed_extensions |
  //              other_extensions ;
  //
  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;
  //
  // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
  //                                    (sep tfield)+) ;
  //
  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  bool IsExtensionStart(const Token& aTok) const {
    return aTok.Length() == 1 && SingletonKey(aTok) != 'x';
  }

  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  bool IsOtherExtensionPart(const Token& aTok) const {
    return 2 <= aTok.Length() && aTok.Length() <= 8;
  }

  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;
  // keyword = key (sep type)? ;
  bool IsUnicodeExtensionPart(const Token& aTok) const {
    return IsUnicodeExtensionKey(aTok) || IsUnicodeExtensionType(aTok) ||
           IsUnicodeExtensionAttribute(aTok);
  }

  // attribute = alphanum{3,8} ;
  bool IsUnicodeExtensionAttribute(const Token& aTok) const {
    return 3 <= aTok.Length() && aTok.Length() <= 8;
  }

  // key = alphanum alpha ;
  bool IsUnicodeExtensionKey(const Token& aTok) const {
    return aTok.Length() == 2 &&
           mozilla::IsAsciiAlpha(CharAt(aTok.Index() + 1));
  }

  // type = alphanum{3,8} (sep alphanum{3,8})* ;
  bool IsUnicodeExtensionType(const Token& aTok) const {
    return 3 <= aTok.Length() && aTok.Length() <= 8;
  }

  // tkey = alpha digit ;
  bool IsTransformExtensionKey(const Token& aTok) const {
    return aTok.Length() == 2 && mozilla::IsAsciiAlpha(CharAt(aTok.Index())) &&
           mozilla::IsAsciiDigit(CharAt(aTok.Index() + 1));
  }

  // tvalue = (sep alphanum{3,8})+ ;
  bool IsTransformExtensionPart(const Token& aTok) const {
    return 3 <= aTok.Length() && aTok.Length() <= 8;
  }

  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
  bool IsPrivateUseStart(const Token& aTok) const {
    return aTok.Length() == 1 && SingletonKey(aTok) == 'x';
  }

  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
  bool IsPrivateUsePart(const Token& aTok) const {
    return 1 <= aTok.Length() && aTok.Length() <= 8;
  }

  // Helper function for use in |ParseBaseName| and
  // |ParseTlangInTransformExtension|. Do not use this directly!
  static Result<Ok, ParserError> InternalParseBaseName(
      LocaleParser& aLocaleParser, Locale& aTag, Token& aTok);

  // Parse the `unicode_language_id` production, i.e. the
  // language/script/region/variants portion of a locale, into |aTag|.
  // |aTok| must be the current token.
  static Result<Ok, ParserError> ParseBaseName(LocaleParser& aLocaleParser,
                                               Locale& aTag, Token& aTok) {
    return InternalParseBaseName(aLocaleParser, aTag, aTok);
  }

  // Parse the `tlang` production within a parsed 't' transform extension.
  // The precise requirements for "previously parsed" are:
  //
  //   * the input begins from current token |aTok| with a valid `tlang`
  //   * the `tlang` is wholly lowercase (*not* canonical case)
  //   * variant subtags in the `tlang` may contain duplicates and be
  //     unordered
  //
  // Return an error on internal failure. Otherwise, return a success value. If
  // there was no `tlang`, then |aTag.Language().Missing()|. But if there was a
  // `tlang`, then |aTag| is filled with subtags exactly as they appeared in the
  // parse input.
  static Result<Ok, ParserError> ParseTlangInTransformExtension(
      LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) {
    MOZ_ASSERT(aLocaleParser.IsLanguage(aTok));
    return InternalParseBaseName(aLocaleParser, aTag, aTok);
  }

  friend class Locale;

  // A (begin, length) pair describing a sub-range of an extension string.
  class Range final {
    size_t mBegin;
    size_t mLength;

   public:
    Range(size_t aBegin, size_t aLength) : mBegin(aBegin), mLength(aLength) {}

    size_t Begin() const { return mBegin; }
    size_t Length() const { return mLength; }
  };

  using TFieldVector = Vector<Range, 8>;
  using AttributesVector = Vector<Range, 8>;
  using KeywordsVector = Vector<Range, 8>;

  // Parse |aExtension|, which must be a validated, fully lowercase
  // `transformed_extensions` subtag, and fill |aTag| and |aFields| from the
  // `tlang` and `tfield` components. Data in |aTag| is lowercase, consistent
  // with |aExtension|.
  static Result<Ok, ParserError> ParseTransformExtension(
      mozilla::Span<const char> aExtension, Locale& aTag,
      TFieldVector& aFields);

  // Parse |aExtension|, which must be a validated, fully lowercase
  // `unicode_locale_extensions` subtag, and fill |aAttributes| and |aKeywords|
  // from the `attribute` and `keyword` components.
  static Result<Ok, ParserError> ParseUnicodeExtension(
      mozilla::Span<const char> aExtension, AttributesVector& aAttributes,
      KeywordsVector& aKeywords);

 public:
  // Parse the input string as a locale.
  //
  // NOTE: |aTag| must be a new, empty Locale.
  static Result<Ok, ParserError> TryParse(Span<const char> aLocale,
                                          Locale& aTag);

  // Parse the input string as the base-name parts (language, script, region,
  // variants) of a locale.
  //
  // NOTE: |aTag| must be a new, empty Locale.
  static Result<Ok, ParserError> TryParseBaseName(Span<const char> aLocale,
                                                  Locale& aTag);

  // Return Ok() iff |extension| can be parsed as a Unicode extension subtag.
+ static Result<Ok, ParserError> CanParseUnicodeExtension( + Span<const char> aExtension); + + // Return Ok() iff |unicodeType| can be parsed as a Unicode extension type. + static Result<Ok, ParserError> CanParseUnicodeExtensionType( + Span<const char> aUnicodeType); +}; + +MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LocaleParser::TokenKind) + +} // namespace mozilla::intl + +#endif /* intl_components_Locale_h */ diff --git a/intl/components/src/LocaleCanonicalizer.cpp b/intl/components/src/LocaleCanonicalizer.cpp new file mode 100644 index 0000000000..8a83874390 --- /dev/null +++ b/intl/components/src/LocaleCanonicalizer.cpp @@ -0,0 +1,36 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/LocaleCanonicalizer.h" +#include <cstdio> +#include "unicode/uloc.h" + +namespace mozilla::intl { + +/* static */ +ICUResult LocaleCanonicalizer::CanonicalizeICULevel1( + const char* aLocaleIn, LocaleCanonicalizer::Vector& aLocaleOut) { + auto result = FillBufferWithICUCall( + aLocaleOut, + [&aLocaleIn](char* target, int32_t length, UErrorCode* status) { + return uloc_canonicalize(aLocaleIn, target, length, status); + }); + + if (result.isErr()) { + return Err(result.unwrapErr()); + } + + // This step is not included in the normal ICU4C canonicalization step, but + // consumers were expecting the results to actually be ASCII. It seemed safer + // to include it. 
+ for (auto byte : aLocaleOut) { + if (static_cast<unsigned char>(byte) > 127) { + return Err(ICUError::InternalError); + } + } + + return Ok(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/LocaleCanonicalizer.h b/intl/components/src/LocaleCanonicalizer.h new file mode 100644 index 0000000000..bd17c9dfd4 --- /dev/null +++ b/intl/components/src/LocaleCanonicalizer.h @@ -0,0 +1,43 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_LocaleCanonicalizer_h_ +#define intl_components_LocaleCanonicalizer_h_ + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/Span.h" +#include "mozilla/Vector.h" + +namespace mozilla::intl { + +/** + * 32 is somewhat an arbitrary size, but it should fit most locales on the + * stack to avoid heap allocations. + */ +constexpr size_t INITIAL_LOCALE_CANONICALIZER_BUFFER_SIZE = 32; + +/** + * Eventually this class will unify the behaviors of Locale Canonicalization. + * See Bug 1723586. + */ +class LocaleCanonicalizer { + public: + using Vector = + mozilla::Vector<char, INITIAL_LOCALE_CANONICALIZER_BUFFER_SIZE>; + + /** + * This static method will canonicalize a locale string, per the Level 1 + * canonicalization steps outlined in: + * http://userguide.icu-project.org/locale#TOC-Canonicalization + * + * For instance it will turn the string "en-US" to "en_US". It guarantees that + * the string span targeted will be in the ASCII range. The canonicalization + * process on ICU is somewhat permissive in what it accepts as input, but only + * ASCII locales are technically correct. 
+ */ + static ICUResult CanonicalizeICULevel1( + const char* aLocale, LocaleCanonicalizer::Vector& aLocaleOut); +}; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/LocaleGenerated.cpp b/intl/components/src/LocaleGenerated.cpp new file mode 100644 index 0000000000..427a78de72 --- /dev/null +++ b/intl/components/src/LocaleGenerated.cpp @@ -0,0 +1,1208 @@ +// Generated by make_intl_data.py. DO NOT EDIT. +// Version: CLDR-43 +// URL: https://unicode.org/Public/cldr/43/cldr-common-43.0.zip + +#include "mozilla/Assertions.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <string> +#include <type_traits> + +#include "mozilla/intl/Locale.h" + +using namespace mozilla::intl::LanguageTagLimits; + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline bool HasReplacement( + const char (&subtags)[Length][TagLength], + const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.Length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.Span().data(); + return std::binary_search(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); +} + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline const char* SearchReplacement( + const char (&subtags)[Length][TagLength], const char* (&aliases)[Length], + const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.Length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.Span().data(); + auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); + if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { + return 
aliases[std::distance(std::begin(subtags), p)]; + } + return nullptr; +} + +#ifdef DEBUG +static bool IsAsciiLowercaseAlphanumeric(char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); +} + +static bool IsAsciiLowercaseAlphanumericOrDash(char c) { + return IsAsciiLowercaseAlphanumeric(c) || c == '-'; +} + +static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { + return std::all_of(span.begin(), span.end(), + mozilla::IsAsciiLowercaseAlpha<char>); +} + +static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) { + return mozilla::IsAsciiUppercaseAlpha(span[0]) && + std::all_of(span.begin() + 1, span.end(), + mozilla::IsAsciiLowercaseAlpha<char>); +} + +static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { + return std::all_of(span.begin(), span.end(), + mozilla::IsAsciiUppercaseAlpha<char>) || + std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>); +} + +static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), + IsAsciiLowercaseAlphanumericOrDash); +} + +static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), + IsAsciiLowercaseAlphanumericOrDash); +} +#endif + +// Mappings from language subtags to preferred values. +// Derived from CLDR Supplemental Data, version 43. 
+// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +bool mozilla::intl::Locale::LanguageMapping(LanguageSubtag& language) { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.Span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.Span())); + + if (language.Length() == 2) { + static const char languages[8][3] = { + "bh", "in", "iw", "ji", "jw", "mo", "tl", "tw", + }; + static const char* aliases[8] = { + "bho", "id", "he", "yi", "jv", "ro", "fil", "ak", + }; + + if (const char* replacement = SearchReplacement(languages, aliases, language)) { + language.Set(mozilla::MakeStringSpan(replacement)); + return true; + } + return false; + } + + if (language.Length() == 3) { + static const char languages[408][4] = { + "aam", "aar", "abk", "adp", "afr", "agp", "ais", "ajt", "aju", "aka", + "alb", "als", "amh", "ara", "arb", "arg", "arm", "asd", "asm", "aue", + "ava", "ave", "aym", "ayr", "ayx", "aze", "azj", "bak", "bam", "baq", + "baz", "bcc", "bcl", "bel", "ben", "bgm", "bhk", "bic", "bih", "bis", + "bjd", "bjq", "bkb", "blg", "bod", "bos", "bre", "btb", "bul", "bur", + "bxk", "bxr", "cat", "ccq", "ces", "cha", "che", "chi", "chu", "chv", + "cjr", "cka", "cld", "cmk", "cmn", "cor", "cos", "coy", "cqu", "cre", + "cwd", "cym", "cze", "daf", "dan", "dap", "deu", "dgo", "dhd", "dik", + "diq", "dit", "div", "djl", "dkl", "drh", "drr", "dud", "duj", "dut", + "dwl", "dzo", "ekk", "ell", "elp", "emk", "eng", "epo", "esk", "est", + "eus", "ewe", "fao", "fas", "fat", "fij", "fin", "fra", "fre", "fry", + "fuc", "ful", "gav", "gaz", "gbc", "gbo", "geo", "ger", "gfx", "ggn", + "ggo", "ggr", "gio", "gla", "gle", "glg", "gli", "glv", "gno", "gre", + "grn", "gti", "gug", "guj", "guv", "gya", "hat", "hau", "hdn", "hea", + "heb", "her", "him", "hin", "hmo", "hrr", "hrv", "hun", "hye", "ibi", + "ibo", "ice", "ido", "iii", "ike", "iku", "ile", "ill", "ilw", "ina", + "ind", "ipk", "isl", "ita", "izi", "jar", "jav", "jeg", "jpn", "kal", + "kan", "kas", "kat", "kau", "kaz", 
"kdv", "kgc", "kgd", "kgh", "khk", + "khm", "kik", "kin", "kir", "kmr", "knc", "kng", "knn", "koj", "kom", + "kon", "kor", "kpp", "kpv", "krm", "ktr", "kua", "kur", "kvs", "kwq", + "kxe", "kxl", "kzh", "kzj", "kzt", "lak", "lao", "lat", "lav", "lbk", + "leg", "lii", "lim", "lin", "lit", "llo", "lmm", "ltz", "lub", "lug", + "lvs", "mac", "mah", "mal", "mao", "mar", "may", "meg", "mgx", "mhr", + "mkd", "mlg", "mlt", "mnk", "mnt", "mof", "mol", "mon", "mri", "msa", + "mst", "mup", "mwd", "mwj", "mya", "myd", "myt", "nad", "nau", "nav", + "nbf", "nbl", "nbx", "ncp", "nde", "ndo", "nep", "nld", "nln", "nlr", + "nno", "nns", "nnx", "nob", "noo", "nor", "npi", "nts", "nxu", "nya", + "oci", "ojg", "oji", "ori", "orm", "ory", "oss", "oun", "pan", "pat", + "pbu", "pcr", "per", "pes", "pli", "plt", "pmc", "pmu", "pnb", "pol", + "por", "ppa", "ppr", "pry", "pus", "puz", "que", "quz", "rmr", "rmy", + "roh", "ron", "rum", "run", "rus", "sag", "san", "sap", "sca", "scc", + "scr", "sgl", "sin", "skk", "slk", "slo", "slv", "smd", "sme", "smo", + "sna", "snb", "snd", "som", "sot", "spa", "spy", "sqi", "src", "srd", + "srp", "ssw", "sul", "sum", "sun", "swa", "swe", "swh", "tah", "tam", + "tat", "tdu", "tel", "tgg", "tgk", "tgl", "tha", "thc", "thw", "thx", + "tib", "tid", "tie", "tir", "tkk", "tlw", "tmp", "tne", "ton", "tsf", + "tsn", "tso", "ttq", "tuk", "tur", "twi", "uig", "ukr", "umu", "unp", + "uok", "urd", "uzb", "uzn", "ven", "vie", "vol", "wel", "wgw", "wit", + "wiw", "wln", "wol", "xba", "xho", "xia", "xkh", "xpe", "xrq", "xsj", + "xsl", "ybd", "ydd", "yen", "yid", "yiy", "yma", "ymt", "yor", "yos", + "yuu", "zai", "zha", "zho", "zir", "zsm", "zul", "zyb", + }; + static const char* aliases[408] = { + "aas", "aa", "ab", "dz", "af", "apf", "ami", "aeb", "jrb", "ak", + "sq", "sq", "am", "ar", "ar", "an", "hy", "snz", "as", "ktz", + "av", "ae", "ay", "ay", "nun", "az", "az", "ba", "bm", "eu", + "nvo", "bal", "bik", "be", "bn", "bcg", "fbl", "bir", "bho", "bi", + "drl", "bzc", 
"ebk", "iba", "bo", "bs", "br", "beb", "bg", "my", + "luy", "bua", "ca", "rki", "cs", "ch", "ce", "zh", "cu", "cv", + "mom", "cmr", "syr", "xch", "zh", "kw", "co", "pij", "quh", "cr", + "cr", "cy", "cs", "dnj", "da", "njz", "de", "doi", "mwr", "din", + "zza", "dif", "dv", "dze", "aqd", "mn", "kzk", "uth", "dwu", "nl", + "dbt", "dz", "et", "el", "amq", "man", "en", "eo", "ik", "et", + "eu", "ee", "fo", "fa", "ak", "fj", "fi", "fr", "fr", "fy", + "ff", "ff", "dev", "om", "wny", "grb", "ka", "de", "vaj", "gvr", + "esg", "gtu", "aou", "gd", "ga", "gl", "kzk", "gv", "gon", "el", + "gn", "nyc", "gn", "gu", "duz", "gba", "ht", "ha", "hai", "hmn", + "he", "hz", "srx", "hi", "ho", "jal", "hr", "hu", "hy", "opa", + "ig", "is", "io", "ii", "iu", "iu", "ie", "ilm", "gal", "ia", + "id", "ik", "is", "it", "eza", "jgk", "jv", "oyb", "ja", "kl", + "kn", "ks", "ka", "kr", "kk", "zkd", "tdf", "ncq", "kml", "mn", + "km", "ki", "rw", "ky", "ku", "kr", "kg", "kok", "kwv", "kv", + "kg", "ko", "jkm", "kv", "bmf", "dtp", "kj", "ku", "gdj", "yam", + "tvd", "kru", "dgl", "dtp", "dtp", "ksp", "lo", "la", "lv", "bnc", + "enl", "raq", "li", "ln", "lt", "ngt", "rmx", "lb", "lu", "lg", + "lv", "mk", "mh", "ml", "mi", "mr", "ms", "cir", "jbk", "chm", + "mk", "mg", "mt", "man", "wnn", "xnt", "ro", "mn", "mi", "ms", + "mry", "raj", "dmw", "vaj", "my", "aog", "mry", "xny", "na", "nv", + "nru", "nr", "ekc", "kdz", "nd", "ng", "ne", "nl", "azd", "nrk", + "nn", "nbr", "ngv", "nb", "dtd", "no", "ne", "pij", "bpp", "ny", + "oc", "oj", "oj", "or", "om", "or", "os", "vaj", "pa", "kxr", + "ps", "adx", "fa", "fa", "pi", "mg", "huw", "phr", "lah", "pl", + "pt", "bfy", "lcq", "prt", "ps", "pub", "qu", "qu", "emx", "rom", + "rm", "ro", "ro", "rn", "ru", "sg", "sa", "aqt", "hle", "sr", + "hr", "isk", "si", "oyb", "sk", "sk", "sl", "kmb", "se", "sm", + "sn", "iba", "sd", "so", "st", "es", "kln", "sq", "sc", "sc", + "sr", "ss", "sgd", "ulw", "su", "sw", "sv", "sw", "ty", "ta", + "tt", "dtp", "te", "bjp", "tg", 
"fil", "th", "tpo", "ola", "oyb", + "bo", "itd", "ras", "ti", "twm", "weo", "tyj", "kak", "to", "taj", + "tn", "ts", "tmh", "tk", "tr", "ak", "ug", "uk", "del", "wro", + "ema", "ur", "uz", "uz", "ve", "vi", "vo", "cy", "wgb", "nol", + "nwo", "wa", "wo", "cax", "xh", "acn", "waw", "kpe", "dmw", "suj", + "den", "rki", "yi", "ynq", "yi", "yrm", "lrr", "mtm", "yo", "zom", + "yug", "zap", "za", "zh", "scv", "ms", "zu", "za", + }; + + if (const char* replacement = SearchReplacement(languages, aliases, language)) { + language.Set(mozilla::MakeStringSpan(replacement)); + return true; + } + return false; + } + + return false; +} + +// Language subtags with complex mappings. +// Derived from CLDR Supplemental Data, version 43. +// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +bool mozilla::intl::Locale::ComplexLanguageMapping(const LanguageSubtag& language) { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.Span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.Span())); + + if (language.Length() == 2) { + return language.EqualTo("sh"); + } + + if (language.Length() == 3) { + static const char languages[6][4] = { + "cnr", "drw", "hbs", "prs", "swc", "tnf", + }; + + return HasReplacement(languages, language); + } + + return false; +} + +// Mappings from script subtags to preferred values. +// Derived from CLDR Supplemental Data, version 43. +// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +bool mozilla::intl::Locale::ScriptMapping(ScriptSubtag& script) { + MOZ_ASSERT(IsStructurallyValidScriptTag(script.Span())); + MOZ_ASSERT(IsCanonicallyCasedScriptTag(script.Span())); + + { + if (script.EqualTo("Qaai")) { + script.Set(mozilla::MakeStringSpan("Zinh")); + return true; + } + return false; + } +} + +// Mappings from region subtags to preferred values. +// Derived from CLDR Supplemental Data, version 43. 
// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip
//
// Replaces a deprecated region subtag that has exactly one preferred value.
// Two-character subtags are looked up in the alphabetic tables, everything
// else in the three-digit numeric tables. On a hit |region| is overwritten
// in place and true is returned; otherwise |region| is left untouched and
// false is returned.
bool mozilla::intl::Locale::RegionMapping(RegionSubtag& region) {
  MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));

  if (region.Length() == 2) {
    // Deprecated two-letter codes, kept in ascending order.
    static const char regions[23][3] = {
      "BU", "CS", "CT", "DD", "DY", "FQ", "FX", "HV", "JT", "MI",
      "NH", "NQ", "PU", "PZ", "QU", "RH", "TP", "UK", "VD", "WK",
      "YD", "YU", "ZR",
    };
    // Replacement values; same length as |regions|.
    // NOTE(review): presumably index-aligned with |regions| (SearchReplacement
    // is defined elsewhere) - confirm before reordering either table.
    static const char* aliases[23] = {
      "MM", "RS", "KI", "DE", "BJ", "AQ", "FR", "BF", "UM", "UM",
      "VU", "AQ", "UM", "PA", "EU", "ZW", "TL", "GB", "VN", "UM",
      "YE", "RS", "CD",
    };

    if (const char* replacement = SearchReplacement(regions, aliases, region)) {
      region.Set(mozilla::MakeStringSpan(replacement));
      return true;
    }
    return false;
  }

  {
    // Three-digit numeric region codes, kept in ascending order.
    static const char regions[299][4] = {
      "004", "008", "010", "012", "016", "020", "024", "028", "031", "032",
      "036", "040", "044", "048", "050", "051", "052", "056", "060", "064",
      "068", "070", "072", "074", "076", "084", "086", "090", "092", "096",
      "100", "104", "108", "112", "116", "120", "124", "132", "136", "140",
      "144", "148", "152", "156", "158", "162", "166", "170", "174", "175",
      "178", "180", "184", "188", "191", "192", "196", "203", "204", "208",
      "212", "214", "218", "222", "226", "230", "231", "232", "233", "234",
      "238", "239", "242", "246", "248", "249", "250", "254", "258", "260",
      "262", "266", "268", "270", "275", "276", "278", "280", "288", "292",
      "296", "300", "304", "308", "312", "316", "320", "324", "328", "332",
      "334", "336", "340", "344", "348", "352", "356", "360", "364", "368",
      "372", "376", "380", "384", "388", "392", "398", "400", "404", "408",
      "410", "414", "417", "418", "422", "426", "428", "430", "434", "438",
      "440", "442", "446", "450", "454", "458", "462", "466", "470", "474",
      "478", "480", "484", "492", "496", "498", "499", "500", "504", "508",
      "512", "516", "520", "524", "528", "531", "533", "534", "535", "540",
      "548", "554", "558", "562", "566", "570", "574", "578", "580", "581",
      "583", "584", "585", "586", "591", "598", "600", "604", "608", "612",
      "616", "620", "624", "626", "630", "634", "638", "642", "643", "646",
      "652", "654", "659", "660", "662", "663", "666", "670", "674", "678",
      "682", "686", "688", "690", "694", "702", "703", "704", "705", "706",
      "710", "716", "720", "724", "728", "729", "732", "736", "740", "744",
      "748", "752", "756", "760", "762", "764", "768", "772", "776", "780",
      "784", "788", "792", "795", "796", "798", "800", "804", "807", "818",
      "826", "830", "831", "832", "833", "834", "840", "850", "854", "858",
      "860", "862", "876", "882", "886", "887", "891", "894", "958", "959",
      "960", "962", "963", "964", "965", "966", "967", "968", "969", "970",
      "971", "972", "973", "974", "975", "976", "977", "978", "979", "980",
      "981", "982", "983", "984", "985", "986", "987", "988", "989", "990",
      "991", "992", "993", "994", "995", "996", "997", "998", "999",
    };
    // Replacement values; same length as |regions| above.
    static const char* aliases[299] = {
      "AF", "AL", "AQ", "DZ", "AS", "AD", "AO", "AG", "AZ", "AR",
      "AU", "AT", "BS", "BH", "BD", "AM", "BB", "BE", "BM", "BT",
      "BO", "BA", "BW", "BV", "BR", "BZ", "IO", "SB", "VG", "BN",
      "BG", "MM", "BI", "BY", "KH", "CM", "CA", "CV", "KY", "CF",
      "LK", "TD", "CL", "CN", "TW", "CX", "CC", "CO", "KM", "YT",
      "CG", "CD", "CK", "CR", "HR", "CU", "CY", "CZ", "BJ", "DK",
      "DM", "DO", "EC", "SV", "GQ", "ET", "ET", "ER", "EE", "FO",
      "FK", "GS", "FJ", "FI", "AX", "FR", "FR", "GF", "PF", "TF",
      "DJ", "GA", "GE", "GM", "PS", "DE", "DE", "DE", "GH", "GI",
      "KI", "GR", "GL", "GD", "GP", "GU", "GT", "GN", "GY", "HT",
      "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR", "IQ",
      "IE", "IL", "IT", "CI", "JM", "JP", "KZ", "JO", "KE", "KP",
      "KR", "KW", "KG", "LA", "LB", "LS", "LV", "LR", "LY", "LI",
      "LT", "LU", "MO", "MG", "MW", "MY", "MV", "ML", "MT", "MQ",
      "MR", "MU", "MX", "MC", "MN", "MD", "ME", "MS", "MA", "MZ",
      "OM", "NA", "NR", "NP", "NL", "CW", "AW", "SX", "BQ", "NC",
      "VU", "NZ", "NI", "NE", "NG", "NU", "NF", "NO", "MP", "UM",
      "FM", "MH", "PW", "PK", "PA", "PG", "PY", "PE", "PH", "PN",
      "PL", "PT", "GW", "TL", "PR", "QA", "RE", "RO", "RU", "RW",
      "BL", "SH", "KN", "AI", "LC", "MF", "PM", "VC", "SM", "ST",
      "SA", "SN", "RS", "SC", "SL", "SG", "SK", "VN", "SI", "SO",
      "ZA", "ZW", "YE", "ES", "SS", "SD", "EH", "SD", "SR", "SJ",
      "SZ", "SE", "CH", "SY", "TJ", "TH", "TG", "TK", "TO", "TT",
      "AE", "TN", "TR", "TM", "TC", "TV", "UG", "UA", "MK", "EG",
      "GB", "JE", "GG", "JE", "IM", "TZ", "US", "VI", "BF", "UY",
      "UZ", "VE", "WF", "WS", "YE", "YE", "RS", "ZM", "AA", "QM",
      "QN", "QP", "QQ", "QR", "QS", "QT", "EU", "QV", "QW", "QX",
      "QY", "QZ", "XA", "XB", "XC", "XD", "XE", "XF", "XG", "XH",
      "XI", "XJ", "XK", "XL", "XM", "XN", "XO", "XP", "XQ", "XR",
      "XS", "XT", "XU", "XV", "XW", "XX", "XY", "XZ", "ZZ",
    };

    if (const char* replacement = SearchReplacement(regions, aliases, region)) {
      region.Set(mozilla::MakeStringSpan(replacement));
      return true;
    }
    return false;
  }
}

// Region subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 43.
// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip
//
// Pure predicate: reports whether |region| is one of the listed subtags and
// never modifies it. Two-character subtags are compared directly against
// "AN", "NT", "PC" and "SU"; all other subtags are checked against the
// numeric table through HasReplacement().
// NOTE(review): HasReplacement is defined elsewhere; presumably a membership
// test over the sorted table - confirm.
bool mozilla::intl::Locale::ComplexRegionMapping(const RegionSubtag& region) {
  MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));

  if (region.Length() == 2) {
    return region.EqualTo("AN") ||
           region.EqualTo("NT") ||
           region.EqualTo("PC") ||
           region.EqualTo("SU");
  }

  {
    // Numeric region codes with complex mappings, in ascending order.
    static const char regions[9][4] = {
      "062", "172", "200", "530", "532", "536", "582", "810", "890",
    };

    return HasReplacement(regions, region);
  }
}

// Language subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 43.
+// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +void mozilla::intl::Locale::PerformComplexLanguageMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); + + if (Language().EqualTo("cnr")) { + SetLanguage("sr"); + if (Region().Missing()) { + SetRegion("ME"); + } + } + else if (Language().EqualTo("drw") || + Language().EqualTo("prs") || + Language().EqualTo("tnf")) { + SetLanguage("fa"); + if (Region().Missing()) { + SetRegion("AF"); + } + } + else if (Language().EqualTo("hbs") || + Language().EqualTo("sh")) { + SetLanguage("sr"); + if (Script().Missing()) { + SetScript("Latn"); + } + } + else if (Language().EqualTo("swc")) { + SetLanguage("sw"); + if (Region().Missing()) { + SetRegion("CD"); + } + } +} + +// Region subtags with complex mappings. +// Derived from CLDR Supplemental Data, version 43. +// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +void mozilla::intl::Locale::PerformComplexRegionMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); + MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span())); + + if (Region().EqualTo("062")) { + if (Language().EqualTo("oui") || + (Language().EqualTo("und") && Script().EqualTo("Ougr"))) { + SetRegion("143"); + } + else { + SetRegion("034"); + } + } + else if (Region().EqualTo("172")) { + if (Language().EqualTo("axm") || + Language().EqualTo("hy") || + Language().EqualTo("hyw") || + Language().EqualTo("rmi") || + (Language().EqualTo("und") && Script().EqualTo("Armn"))) { + SetRegion("AM"); + } + else if (Language().EqualTo("az") || + (Language().EqualTo("azb") && Script().EqualTo("Cyrl")) || + (Language().EqualTo("azb") && Script().EqualTo("Latn")) || + Language().EqualTo("bdk") || + (Language().EqualTo("jdt") && Script().EqualTo("Latn")) || + 
Language().EqualTo("kjj") || + Language().EqualTo("kry") || + (Language().EqualTo("rut") && Script().EqualTo("Latn")) || + Language().EqualTo("tkr") || + Language().EqualTo("tly") || + Language().EqualTo("ttt")) { + SetRegion("AZ"); + } + else if (Language().EqualTo("be") || + (Language().EqualTo("rml") && Script().EqualTo("Cyrl"))) { + SetRegion("BY"); + } + else if (Language().EqualTo("ab") || + Language().EqualTo("bbl") || + Language().EqualTo("bhn") || + Language().EqualTo("jge") || + Language().EqualTo("ka") || + (Language().EqualTo("ku") && Script().EqualTo("Yezi")) || + Language().EqualTo("oav") || + Language().EqualTo("os") || + Language().EqualTo("sva") || + (Language().EqualTo("und") && Script().EqualTo("Geor")) || + (Language().EqualTo("und") && Script().EqualTo("Yezi")) || + Language().EqualTo("uum") || + Language().EqualTo("xmf")) { + SetRegion("GE"); + } + else if (Language().EqualTo("dng") || + Language().EqualTo("ky")) { + SetRegion("KG"); + } + else if ((Language().EqualTo("ili") && Script().EqualTo("Cyrl")) || + Language().EqualTo("kk") || + (Language().EqualTo("ug") && Script().EqualTo("Cyrl"))) { + SetRegion("KZ"); + } + else if (Language().EqualTo("gag")) { + SetRegion("MD"); + } + else if (Language().EqualTo("abh") || + (Language().EqualTo("isk") && Script().EqualTo("Cyrl")) || + Language().EqualTo("paq") || + Language().EqualTo("sgh") || + Language().EqualTo("tg") || + (Language().EqualTo("wbl") && Script().EqualTo("Cyrl")) || + Language().EqualTo("yai")) { + SetRegion("TJ"); + } + else if (Language().EqualTo("chg") || + Language().EqualTo("tk")) { + SetRegion("TM"); + } + else if (Language().EqualTo("crh") || + Language().EqualTo("got") || + Language().EqualTo("jct") || + Language().EqualTo("ji") || + (Language().EqualTo("kdr") && Script().EqualTo("Cyrl")) || + Language().EqualTo("rue") || + Language().EqualTo("uk") || + (Language().EqualTo("und") && Script().EqualTo("Goth"))) { + SetRegion("UA"); + } + else if (Language().EqualTo("auz") || 
+ Language().EqualTo("kaa") || + Language().EqualTo("sog") || + (Language().EqualTo("und") && Script().EqualTo("Chrs")) || + (Language().EqualTo("und") && Script().EqualTo("Sogd")) || + (Language().EqualTo("und") && Script().EqualTo("Sogo")) || + Language().EqualTo("uz") || + Language().EqualTo("xco")) { + SetRegion("UZ"); + } + else { + SetRegion("RU"); + } + } + else if (Region().EqualTo("200")) { + if (Language().EqualTo("rmc") || + Language().EqualTo("sk")) { + SetRegion("SK"); + } + else { + SetRegion("CZ"); + } + } + else if (Region().EqualTo("530") || + Region().EqualTo("532") || + Region().EqualTo("AN")) { + if (Language().EqualTo("vic")) { + SetRegion("SX"); + } + else { + SetRegion("CW"); + } + } + else if (Region().EqualTo("536") || + Region().EqualTo("NT")) { + if (Language().EqualTo("acm") || + Language().EqualTo("akk") || + Language().EqualTo("ayp") || + Language().EqualTo("bjm") || + Language().EqualTo("ckb") || + Language().EqualTo("kqd") || + (Language().EqualTo("ku") && Script().EqualTo("Arab")) || + Language().EqualTo("mid") || + Language().EqualTo("sdb") || + Language().EqualTo("sdf") || + Language().EqualTo("syr") || + (Language().EqualTo("und") && Script().EqualTo("Syrc")) || + (Language().EqualTo("und") && Script().EqualTo("Xsux"))) { + SetRegion("IQ"); + } + else { + SetRegion("SA"); + } + } + else if (Region().EqualTo("582") || + Region().EqualTo("PC")) { + if (Language().EqualTo("mh")) { + SetRegion("MH"); + } + else if (Language().EqualTo("cal") || + Language().EqualTo("tpv")) { + SetRegion("MP"); + } + else if (Language().EqualTo("pau") || + Language().EqualTo("sov") || + Language().EqualTo("tox")) { + SetRegion("PW"); + } + else { + SetRegion("FM"); + } + } + else if (Region().EqualTo("810") || + Region().EqualTo("SU")) { + if (Language().EqualTo("axm") || + Language().EqualTo("hy") || + Language().EqualTo("hyw") || + Language().EqualTo("rmi") || + (Language().EqualTo("und") && Script().EqualTo("Armn"))) { + SetRegion("AM"); + } + else 
if (Language().EqualTo("az") || + (Language().EqualTo("azb") && Script().EqualTo("Cyrl")) || + (Language().EqualTo("azb") && Script().EqualTo("Latn")) || + Language().EqualTo("bdk") || + (Language().EqualTo("jdt") && Script().EqualTo("Latn")) || + Language().EqualTo("kjj") || + Language().EqualTo("kry") || + (Language().EqualTo("rut") && Script().EqualTo("Latn")) || + Language().EqualTo("tkr") || + Language().EqualTo("tly") || + Language().EqualTo("ttt")) { + SetRegion("AZ"); + } + else if (Language().EqualTo("be") || + (Language().EqualTo("rml") && Script().EqualTo("Cyrl"))) { + SetRegion("BY"); + } + else if (Language().EqualTo("et") || + Language().EqualTo("vro")) { + SetRegion("EE"); + } + else if (Language().EqualTo("ab") || + Language().EqualTo("bbl") || + Language().EqualTo("bhn") || + Language().EqualTo("jge") || + Language().EqualTo("ka") || + (Language().EqualTo("ku") && Script().EqualTo("Yezi")) || + Language().EqualTo("oav") || + Language().EqualTo("os") || + Language().EqualTo("sva") || + (Language().EqualTo("und") && Script().EqualTo("Geor")) || + (Language().EqualTo("und") && Script().EqualTo("Yezi")) || + Language().EqualTo("uum") || + Language().EqualTo("xmf")) { + SetRegion("GE"); + } + else if (Language().EqualTo("dng") || + Language().EqualTo("ky")) { + SetRegion("KG"); + } + else if ((Language().EqualTo("ili") && Script().EqualTo("Cyrl")) || + Language().EqualTo("kk") || + (Language().EqualTo("ug") && Script().EqualTo("Cyrl"))) { + SetRegion("KZ"); + } + else if (Language().EqualTo("kdr") || + Language().EqualTo("lt") || + Language().EqualTo("olt") || + Language().EqualTo("sgs")) { + SetRegion("LT"); + } + else if (Language().EqualTo("liv") || + Language().EqualTo("ltg") || + Language().EqualTo("lv")) { + SetRegion("LV"); + } + else if (Language().EqualTo("gag")) { + SetRegion("MD"); + } + else if (Language().EqualTo("abh") || + (Language().EqualTo("isk") && Script().EqualTo("Cyrl")) || + Language().EqualTo("paq") || + Language().EqualTo("sgh") 
|| + Language().EqualTo("tg") || + (Language().EqualTo("wbl") && Script().EqualTo("Cyrl")) || + Language().EqualTo("yai")) { + SetRegion("TJ"); + } + else if (Language().EqualTo("chg") || + Language().EqualTo("tk")) { + SetRegion("TM"); + } + else if (Language().EqualTo("crh") || + Language().EqualTo("got") || + Language().EqualTo("jct") || + Language().EqualTo("ji") || + (Language().EqualTo("kdr") && Script().EqualTo("Cyrl")) || + Language().EqualTo("rue") || + Language().EqualTo("uk") || + (Language().EqualTo("und") && Script().EqualTo("Goth"))) { + SetRegion("UA"); + } + else if (Language().EqualTo("auz") || + Language().EqualTo("kaa") || + Language().EqualTo("sog") || + (Language().EqualTo("und") && Script().EqualTo("Chrs")) || + (Language().EqualTo("und") && Script().EqualTo("Sogd")) || + (Language().EqualTo("und") && Script().EqualTo("Sogo")) || + Language().EqualTo("uz") || + Language().EqualTo("xco")) { + SetRegion("UZ"); + } + else { + SetRegion("RU"); + } + } + else if (Region().EqualTo("890")) { + if (Language().EqualTo("bs")) { + SetRegion("BA"); + } + else if (Language().EqualTo("ckm") || + Language().EqualTo("dlm") || + Language().EqualTo("hr") || + Language().EqualTo("ist") || + Language().EqualTo("ruo")) { + SetRegion("HR"); + } + else if (Language().EqualTo("mk")) { + SetRegion("MK"); + } + else if (Language().EqualTo("sl")) { + SetRegion("SI"); + } + else { + SetRegion("RS"); + } + } +} + +static const char* ToCharPointer(const char* str) { + return str; +} + +static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) { + return str.get(); +} + +template <typename T, typename U = T> +static bool IsLessThan(const T& a, const U& b) { + return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; +} + +// Mappings from variant subtags to preferred values. +// Derived from CLDR Supplemental Data, version 43. 
+// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +bool mozilla::intl::Locale::PerformVariantMappings() { + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), + IsLessThan<decltype(mVariants)::ElementType>)); + + auto removeVariantAt = [&](size_t index) { + mVariants.erase(mVariants.begin() + index); + }; + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound( + mVariants.begin(), mVariants.end(), variant, + IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>); + + // Don't insert the replacement when already present. + if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. + auto preferred = DuplicateStringToUniqueChars(variant); + return !!mVariants.insert(p, std::move(preferred)); + }; + + for (size_t i = 0; i < mVariants.length();) { + const char* variant = mVariants[i].get(); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); + + if (strcmp(variant, "arevela") == 0 || + strcmp(variant, "arevmda") == 0 || + strcmp(variant, "bokmal") == 0 || + strcmp(variant, "hakka") == 0 || + strcmp(variant, "lojban") == 0 || + strcmp(variant, "nynorsk") == 0 || + strcmp(variant, "saaho") == 0 || + strcmp(variant, "xiang") == 0) { + removeVariantAt(i); + } + else if (strcmp(variant, "aaland") == 0) { + removeVariantAt(i); + SetRegion("AX"); + } + else if (strcmp(variant, "heploc") == 0) { + removeVariantAt(i); + if (!insertVariantSortedIfNotPresent("alalc97")) { + return false; + } + } + else if (strcmp(variant, "polytoni") == 0) { + removeVariantAt(i); + if (!insertVariantSortedIfNotPresent("polyton")) { + return false; + } + } + else { + i++; + } + } + return true; +} + +// Canonicalize legacy locale identifiers. +// Derived from CLDR Supplemental Data, version 43. 
+// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +bool mozilla::intl::Locale::UpdateLegacyMappings() { + // We're mapping legacy tags to non-legacy form here. + // Other tags remain unchanged. + // + // Legacy tags are either sign language tags ("sgn") or have one or multiple + // variant subtags. Therefore we can quickly exclude most tags by checking + // these two subtags. + + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span())); + + if (!Language().EqualTo("sgn") && mVariants.length() == 0) { + return true; + } + +#ifdef DEBUG + for (const auto& variant : Variants()) { + MOZ_ASSERT(IsStructurallyValidVariantTag(variant)); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant)); + } +#endif + + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(), + IsLessThan<decltype(mVariants)::ElementType>)); + + auto findVariant = [this](const char* variant) { + auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, + IsLessThan<decltype(mVariants)::ElementType, + decltype(variant)>); + + if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { + return p; + } + return static_cast<decltype(p)>(nullptr); + }; + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant, + IsLessThan<decltype(mVariants)::ElementType, + decltype(variant)>); + + // Don't insert the replacement when already present. + if (p != mVariants.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. 
+ auto preferred = DuplicateStringToUniqueChars(variant); + return !!mVariants.insert(p, std::move(preferred)); + }; + + auto removeVariant = [&](auto* p) { + size_t index = std::distance(mVariants.begin(), p); + mVariants.erase(mVariants.begin() + index); + }; + + auto removeVariants = [&](auto* p, auto* q) { + size_t pIndex = std::distance(mVariants.begin(), p); + size_t qIndex = std::distance(mVariants.begin(), q); + MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted"); + + mVariants.erase(mVariants.begin() + qIndex); + mVariants.erase(mVariants.begin() + pIndex); + }; + + if (mVariants.length() >= 2) { + if (auto* hepburn = findVariant("hepburn")) { + if (auto* heploc = findVariant("heploc")) { + removeVariants(hepburn, heploc); + + if (!insertVariantSortedIfNotPresent("alalc97")) { + return false; + } + } + } + } + + if (Language().EqualTo("sgn")) { + if (Region().Present() && SignLanguageMapping(mLanguage, Region())) { + mRegion.Set(mozilla::MakeStringSpan("")); + } + } + else if (Language().EqualTo("aa") || + Language().EqualTo("aar")) { + if (auto* saaho = findVariant("saaho")) { + removeVariant(saaho); + SetLanguage("ssy"); + } + } + else if (Language().EqualTo("arm") || + Language().EqualTo("hy") || + Language().EqualTo("hye")) { + if (auto* arevmda = findVariant("arevmda")) { + removeVariant(arevmda); + SetLanguage("hyw"); + } + } + else if (Language().EqualTo("art")) { + if (auto* lojban = findVariant("lojban")) { + removeVariant(lojban); + SetLanguage("jbo"); + } + } + else if (Language().EqualTo("cel")) { + if (auto* gaulish = findVariant("gaulish")) { + removeVariant(gaulish); + SetLanguage("xtg"); + } + } + else if (Language().EqualTo("chi") || + Language().EqualTo("cmn") || + Language().EqualTo("zh") || + Language().EqualTo("zho")) { + if (auto* guoyu = findVariant("guoyu")) { + if (auto* hakka = findVariant("hakka")) { + removeVariants(guoyu, hakka); + SetLanguage("hak"); + return true; + } + } + if (auto* guoyu = findVariant("guoyu")) { + if 
(auto* xiang = findVariant("xiang")) { + removeVariants(guoyu, xiang); + SetLanguage("hsn"); + return true; + } + } + if (auto* guoyu = findVariant("guoyu")) { + removeVariant(guoyu); + SetLanguage("zh"); + } + else if (auto* hakka = findVariant("hakka")) { + removeVariant(hakka); + SetLanguage("hak"); + } + else if (auto* xiang = findVariant("xiang")) { + removeVariant(xiang); + SetLanguage("hsn"); + } + } + else if (Language().EqualTo("no") || + Language().EqualTo("nor")) { + if (auto* bokmal = findVariant("bokmal")) { + removeVariant(bokmal); + SetLanguage("nb"); + } + else if (auto* nynorsk = findVariant("nynorsk")) { + removeVariant(nynorsk); + SetLanguage("nn"); + } + } + + return true; +} + +// Mappings from legacy sign languages. +// Derived from CLDR Supplemental Data, version 43. +// https://unicode.org/Public/cldr/43/cldr-common-43.0.zip +bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language, + const RegionSubtag& region) { + MOZ_ASSERT(language.EqualTo("sgn")); + MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span())); + + if (region.Length() == 2) { + static const char regions[22][3] = { + "BR", "CO", "DD", "DE", "DK", "ES", "FR", "FX", "GB", "GR", + "IE", "IT", "JP", "MX", "NI", "NL", "NO", "PT", "SE", "UK", + "US", "ZA", + }; + static const char* aliases[22] = { + "bzs", "csn", "gsg", "gsg", "dsl", "ssp", "fsl", "fsl", "bfi", "gss", + "isg", "ise", "jsl", "mfs", "ncs", "dse", "nsi", "psr", "swl", "bfi", + "ase", "sfs", + }; + + if (const char* replacement = SearchReplacement(regions, aliases, region)) { + language.Set(mozilla::MakeStringSpan(replacement)); + return true; + } + return false; + } + + { + static const char regions[22][4] = { + "076", "170", "208", "249", "250", "276", "278", "280", "300", "372", + "380", "392", "484", "528", "558", "578", "620", "710", "724", "752", + "826", "840", + }; + static const char* aliases[22] = { + "bzs", "csn", "dsl", "fsl", "fsl", 
"gsg", "gsg", "gsg", "gss", "isg", + "ise", "jsl", "mfs", "dse", "ncs", "nsi", "psr", "sfs", "ssp", "swl", + "bfi", "ase", + }; + + if (const char* replacement = SearchReplacement(regions, aliases, region)) { + language.Set(mozilla::MakeStringSpan(replacement)); + return true; + } + return false; + } +} + +template <size_t Length> +static inline bool IsUnicodeKey(mozilla::Span<const char> key, const char (&str)[Length]) { + static_assert(Length == UnicodeKeyLength + 1, + "Unicode extension key is two characters long"); + return memcmp(key.data(), str, Length - 1) == 0; +} + +template <size_t Length> +static inline bool IsUnicodeType(mozilla::Span<const char> type, const char (&str)[Length]) { + static_assert(Length > UnicodeKeyLength + 1, + "Unicode extension type contains more than two characters"); + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; +} + +static int32_t CompareUnicodeType(const char* a, mozilla::Span<const char> b) { + MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\0'), + "unexpected null-character in string"); + + using UnsignedChar = unsigned char; + for (size_t i = 0; i < b.size(); i++) { + // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if + // we've reached the end of |a|, the below if-statement will always be true. + // That ensures we don't read past the end of |a|. + if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) { + return r; + } + } + + // Return zero if both strings are equal or a positive number if |b| is a + // prefix of |a|. 
+ return int32_t(UnsignedChar(a[b.size()])); +} + +template <size_t Length> +static inline const char* SearchUnicodeReplacement( + const char* (&types)[Length], const char* (&aliases)[Length], + mozilla::Span<const char> type) { + + auto p = std::lower_bound(std::begin(types), std::end(types), type, + [](const auto& a, const auto& b) { + return CompareUnicodeType(a, b) < 0; + }); + if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) { + return aliases[std::distance(std::begin(types), p)]; + } + return nullptr; +} + +/** + * Mapping from deprecated BCP 47 Unicode extension types to their preferred + * values. + * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension + */ +const char* mozilla::intl::Locale::ReplaceUnicodeExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type) { + MOZ_ASSERT(key.size() == UnicodeKeyLength); + MOZ_ASSERT(IsCanonicallyCasedUnicodeKey(key)); + + MOZ_ASSERT(type.size() > UnicodeKeyLength); + MOZ_ASSERT(IsCanonicallyCasedUnicodeType(type)); + + if (IsUnicodeKey(key, "ca")) { + if (IsUnicodeType(type, "ethiopic-amete-alem")) { + return "ethioaa"; + } + if (IsUnicodeType(type, "islamicc")) { + return "islamic-civil"; + } + } + else if (IsUnicodeKey(key, "kb") || + IsUnicodeKey(key, "kc") || + IsUnicodeKey(key, "kh") || + IsUnicodeKey(key, "kk") || + IsUnicodeKey(key, "kn")) { + if (IsUnicodeType(type, "yes")) { + return "true"; + } + } + else if (IsUnicodeKey(key, "ks")) { + if (IsUnicodeType(type, "primary")) { + return "level1"; + } + if (IsUnicodeType(type, "tertiary")) { + return "level3"; + } + } + else if (IsUnicodeKey(key, "ms")) { + if (IsUnicodeType(type, "imperial")) { + return "uksystem"; + } + } + else if (IsUnicodeKey(key, "rg") || + IsUnicodeKey(key, "sd")) { + static const char* types[144] = { + "cn11" , "cn12" , "cn13" , "cn14" , "cn15" , "cn21" , "cn22" , + "cn23" , "cn31" , "cn32" , "cn33" , "cn34" , 
"cn35" , "cn36" , + "cn37" , "cn41" , "cn42" , "cn43" , "cn44" , "cn45" , "cn46" , + "cn50" , "cn51" , "cn52" , "cn53" , "cn54" , "cn61" , "cn62" , + "cn63" , "cn64" , "cn65" , "cn71" , "cn91" , "cn92" , "cz10a" , + "cz10b" , "cz10c" , "cz10d" , "cz10e" , "cz10f" , "cz611" , "cz612" , + "cz613" , "cz614" , "cz615" , "cz621" , "cz622" , "cz623" , "cz624" , + "cz626" , "cz627" , "czjc" , "czjm" , "czka" , "czkr" , "czli" , + "czmo" , "czol" , "czpa" , "czpl" , "czpr" , "czst" , "czus" , + "czvy" , "czzl" , "fi01" , "fra" , "frb" , "frbl" , "frc" , + "frcp" , "frd" , "fre" , "frf" , "frg" , "frgf" , "frgp" , + "frh" , "fri" , "frj" , "frk" , "frl" , "frm" , "frmf" , + "frmq" , "frn" , "frnc" , "fro" , "frp" , "frpf" , "frpm" , + "frq" , "frr" , "frre" , "frs" , "frt" , "frtf" , "fru" , + "frv" , "frwf" , "fryt" , "laxn" , "lud" , "lug" , "lul" , + "mrnkc" , "nlaw" , "nlcw" , "nlsx" , "no23" , "nzn" , "nzs" , + "omba" , "omsh" , "plds" , "plkp" , "pllb" , "plld" , "pllu" , + "plma" , "plmz" , "plop" , "plpd" , "plpk" , "plpm" , "plsk" , + "plsl" , "plwn" , "plwp" , "plzp" , "shta" , "tteto" , "ttrcm" , + "ttwto" , "twkhq" , "twtnq" , "twtpq" , "twtxq" , "usas" , "usgu" , + "usmp" , "uspr" , "usum" , "usvi" , + }; + static const char* aliases[144] = { + "cnbj" , "cntj" , "cnhe" , "cnsx" , "cnmn" , "cnln" , "cnjl" , + "cnhl" , "cnsh" , "cnjs" , "cnzj" , "cnah" , "cnfj" , "cnjx" , + "cnsd" , "cnha" , "cnhb" , "cnhn" , "cngd" , "cngx" , "cnhi" , + "cncq" , "cnsc" , "cngz" , "cnyn" , "cnxz" , "cnsn" , "cngs" , + "cnqh" , "cnnx" , "cnxj" , "twzzzz", "hkzzzz", "mozzzz", "cz110" , + "cz111" , "cz112" , "cz113" , "cz114" , "cz115" , "cz663" , "cz632" , + "cz633" , "cz634" , "cz635" , "cz641" , "cz642" , "cz643" , "cz644" , + "cz646" , "cz647" , "cz31" , "cz64" , "cz41" , "cz52" , "cz51" , + "cz80" , "cz71" , "cz53" , "cz32" , "cz10" , "cz20" , "cz42" , + "cz63" , "cz72" , "axzzzz", "frges" , "frnaq" , "blzzzz", "frara" , + "cpzzzz", "frbfc" , "frbre" , "frcvl" , "frges" , 
"gfzzzz", "gpzzzz", + "frcor" , "frbfc" , "fridf" , "frocc" , "frnaq" , "frges" , "mfzzzz", + "mqzzzz", "frocc" , "nczzzz", "frhdf" , "frnor" , "pfzzzz", "pmzzzz", + "frnor" , "frpdl" , "rezzzz", "frhdf" , "frnaq" , "tfzzzz", "frpac" , + "frara" , "wfzzzz", "ytzzzz", "laxs" , "lucl" , "luec" , "luca" , + "mr13" , "awzzzz", "cwzzzz", "sxzzzz", "no50" , "nzauk" , "nzcan" , + "ombj" , "omsj" , "pl02" , "pl04" , "pl08" , "pl10" , "pl06" , + "pl12" , "pl14" , "pl16" , "pl20" , "pl18" , "pl22" , "pl26" , + "pl24" , "pl28" , "pl30" , "pl32" , "tazzzz", "tttob" , "ttmrc" , + "tttob" , "twkhh" , "twtnn" , "twnwt" , "twtxg" , "aszzzz", "guzzzz", + "mpzzzz", "przzzz", "umzzzz", "vizzzz", + }; + return SearchUnicodeReplacement(types, aliases, type); + } + else if (IsUnicodeKey(key, "tz")) { + static const char* types[30] = { + "aqams" , "camtr" , "cnckg" , "cnhrb" , "cnkhg" , "cuba" , + "egypt" , "eire" , "est" , "gaza" , "gmt0" , "hongkong", + "hst" , "iceland" , "iran" , "israel" , "jamaica" , "japan" , + "libya" , "mst" , "navajo" , "poland" , "portugal", "prc" , + "roc" , "rok" , "turkey" , "uct" , "usnavajo", "zulu" , + }; + static const char* aliases[30] = { + "nzakl" , "cator" , "cnsha" , "cnsha" , "cnurc" , "cuhav" , + "egcai" , "iedub" , "utcw05" , "gazastrp", "gmt" , "hkhkg" , + "utcw10" , "isrey" , "irthr" , "jeruslm" , "jmkin" , "jptyo" , + "lytip" , "utcw07" , "usden" , "plwaw" , "ptlis" , "cnsha" , + "twtpe" , "krsel" , "trist" , "utc" , "usden" , "utc" , + }; + return SearchUnicodeReplacement(types, aliases, type); + } + return nullptr; +} + +template <size_t Length> +static inline bool IsTransformKey(mozilla::Span<const char> key, const char (&str)[Length]) { + static_assert(Length == TransformKeyLength + 1, + "Transform extension key is two characters long"); + return memcmp(key.data(), str, Length - 1) == 0; +} + +template <size_t Length> +static inline bool IsTransformType(mozilla::Span<const char> type, const char (&str)[Length]) { + static_assert(Length > 
TransformKeyLength + 1, + "Transform extension type contains more than two characters"); + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; +} + +/** + * Mapping from deprecated BCP 47 Transform extension types to their preferred + * values. + * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension + */ +const char* mozilla::intl::Locale::ReplaceTransformExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type) { + MOZ_ASSERT(key.size() == TransformKeyLength); + MOZ_ASSERT(IsCanonicallyCasedTransformKey(key)); + + MOZ_ASSERT(type.size() > TransformKeyLength); + MOZ_ASSERT(IsCanonicallyCasedTransformType(type)); + + if (IsTransformKey(key, "d0")) { + if (IsTransformType(type, "name")) { + return "charname"; + } + } + else if (IsTransformKey(key, "m0")) { + if (IsTransformType(type, "beta-metsehaf")) { + return "betamets"; + } + if (IsTransformType(type, "ies-jes")) { + return "iesjes"; + } + if (IsTransformType(type, "names")) { + return "prprname"; + } + if (IsTransformType(type, "tekie-alibekit")) { + return "tekieali"; + } + } + return nullptr; +} diff --git a/intl/components/src/MeasureUnit.cpp b/intl/components/src/MeasureUnit.cpp new file mode 100644 index 0000000000..3b932c9168 --- /dev/null +++ b/intl/components/src/MeasureUnit.cpp @@ -0,0 +1,110 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/MeasureUnit.h" + +#include "unicode/udata.h" +#include "unicode/ures.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +void MeasureUnit::UResourceBundleDeleter::operator()(UResourceBundle* aPtr) { + ures_close(aPtr); +} + +MeasureUnit::Enumeration::Enumeration(UniqueUResourceBundle aRootLocale, + UniqueUResourceBundle aUnits) + : mRootLocale(std::move(aRootLocale)), mUnits(std::move(aUnits)) { + mUnitsSize = ures_getSize(mUnits.get()); +} + +MeasureUnit::Enumeration::Iterator::value_type +MeasureUnit::Enumeration::Iterator::operator*() const { + // Return an error result after an ICU error has occurred. + if (mHasError) { + return Err(InternalError{}); + } + + // Otherwise return the name of the current measurement unit. + const char* unitIdentifier = ures_getKey(mSubtype.get()); + MOZ_ASSERT(unitIdentifier); + return MakeStringSpan(unitIdentifier); +} + +void MeasureUnit::Enumeration::Iterator::advance() { + // Reject any attempts to modify this iterator after an error has occurred. + if (mHasError) { + return; + } + + while (true) { + // Read the next measurement unit in the types table. + if (mTypePos < mTypeSize) { + UErrorCode status = U_ZERO_ERROR; + UResourceBundle* rawSubtype = + ures_getByIndex(mType.get(), mTypePos, nullptr, &status); + if (U_FAILURE(status)) { + mHasError = true; + return; + } + + mTypePos += 1; + mSubtype.reset(rawSubtype); + return; + } + + // Read the next measurement unit type in the "units" table. + if (mUnitsPos < mEnumeration.mUnitsSize) { + UErrorCode status = U_ZERO_ERROR; + UResourceBundle* rawType = ures_getByIndex(mEnumeration.mUnits.get(), + mUnitsPos, nullptr, &status); + if (U_FAILURE(status)) { + mHasError = true; + return; + } + + mUnitsPos += 1; + mType.reset(rawType); + mTypeSize = ures_getSize(rawType); + mTypePos = 0; + continue; + } + + // All measurement units have been processed. Reset the two |mType*| fields + // to zero to match the end-iterator state and then return. 
+ MOZ_ASSERT(mUnitsPos == mEnumeration.mUnitsSize); + mTypePos = 0; + mTypeSize = 0; + return; + } +} + +Result<MeasureUnit::Enumeration, ICUError> +MeasureUnit::Enumeration::TryCreate() { + // Look up the available measurement units in the resource bundle of the root + // locale. + + static const char packageName[] = + U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "unit"; + static const char rootLocale[] = ""; + + UErrorCode status = U_ZERO_ERROR; + UResourceBundle* rawRes = ures_open(packageName, rootLocale, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + UniqueUResourceBundle res(rawRes); + + UResourceBundle* rawUnits = + ures_getByKey(res.get(), "units", nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + UniqueUResourceBundle units(rawUnits); + + return MeasureUnit::Enumeration(std::move(res), std::move(units)); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/MeasureUnit.h b/intl/components/src/MeasureUnit.h new file mode 100644 index 0000000000..8a8cf1629a --- /dev/null +++ b/intl/components/src/MeasureUnit.h @@ -0,0 +1,155 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_MeasureUnit_h_ +#define intl_components_MeasureUnit_h_ + +#include "mozilla/Assertions.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +#include <iterator> +#include <stddef.h> +#include <stdint.h> +#include <utility> + +struct UResourceBundle; + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with measurement units in + * internationalization code. It is used in coordination with other operations + * such as number formatting. 
+ */ +class MeasureUnit final { + class UResourceBundleDeleter { + public: + void operator()(UResourceBundle* aPtr); + }; + + using UniqueUResourceBundle = + UniquePtr<UResourceBundle, UResourceBundleDeleter>; + + public: + MeasureUnit() = delete; + + class Enumeration final { + // Resource bundle for the root locale. + UniqueUResourceBundle mRootLocale = nullptr; + + // Resource bundle for the root locale's "units" resource table. + UniqueUResourceBundle mUnits = nullptr; + + // The overall amount of available units. + int32_t mUnitsSize = 0; + + public: + Enumeration(UniqueUResourceBundle aRootLocale, + UniqueUResourceBundle aUnits); + + class Iterator { + public: + // std::iterator traits. + using iterator_category = std::input_iterator_tag; + using value_type = SpanResult<char>; + using difference_type = ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + private: + const Enumeration& mEnumeration; + + // Resource bundle to a measurement type within the "units" table. + // + // Measurement types describe various categories, like "area", "length", + // or "mass". + UniqueUResourceBundle mType = nullptr; + + // Resource bundle to a specific subtype within the type table. + // + // Measurement subtypes describe concrete measure units, like "acre", + // "meter", or "kilogram". + UniqueUResourceBundle mSubtype = nullptr; + + // The next position within the "units" table. + int32_t mUnitsPos = 0; + + // The overall amount of types within the |mType| table. + int32_t mTypeSize = 0; + + // The next position within the |mType| table. + int32_t mTypePos = 0; + + // Flag set when an ICU error has occurred. All further operations on this + // iterator will return an error result when this flag is set. 
+ bool mHasError = false; + + void advance(); + + public: + Iterator(const Enumeration& aEnumeration, int32_t aUnitsPos) + : mEnumeration(aEnumeration), mUnitsPos(aUnitsPos) { + advance(); + } + + Iterator& operator++() { + advance(); + return *this; + } + + // The post-increment operator would return an invalid iterator, so it's + // not implemented. + Iterator operator++(int) = delete; + + bool operator==(const Iterator& aOther) const { + // It's an error to compare an iterator against an iterator from a + // different enumeration. + MOZ_ASSERT(&mEnumeration == &aOther.mEnumeration); + + return mUnitsPos == aOther.mUnitsPos && mTypeSize == aOther.mTypeSize && + mTypePos == aOther.mTypePos && mHasError == aOther.mHasError; + } + + bool operator!=(const Iterator& aOther) const { + return !(*this == aOther); + } + + value_type operator*() const; + }; + + friend class Iterator; + + // std::iterator begin() and end() methods. + + /** + * Return an iterator pointing to the start of the "units" table. + */ + Iterator begin() { return Iterator(*this, 0); } + + /** + * Return an iterator pointing to the end of the "units" table. + */ + Iterator end() { return Iterator(*this, mUnitsSize); } + + /** + * Create a new measurement unit enumeration. + */ + static Result<Enumeration, ICUError> TryCreate(); + }; + + /** + * Return an enumeration over all available measurement units. + */ + static Result<Enumeration, ICUError> GetAvailable() { + return Enumeration::TryCreate(); + } +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/MeasureUnitGenerated.h b/intl/components/src/MeasureUnitGenerated.h new file mode 100644 index 0000000000..8febc88649 --- /dev/null +++ b/intl/components/src/MeasureUnitGenerated.h @@ -0,0 +1,70 @@ +// Generated by make_intl_data.py. DO NOT EDIT. 
+ +#ifndef intl_components_MeasureUnitGenerated_h +#define intl_components_MeasureUnitGenerated_h + +namespace mozilla::intl { + +struct SimpleMeasureUnit { + const char* const type; + const char* const name; +}; + +/** + * The list of currently supported simple unit identifiers. + * + * The list must be kept in alphabetical order of |name|. + */ +inline constexpr SimpleMeasureUnit simpleMeasureUnits[] = { + // clang-format off + {"area", "acre"}, + {"digital", "bit"}, + {"digital", "byte"}, + {"temperature", "celsius"}, + {"length", "centimeter"}, + {"duration", "day"}, + {"angle", "degree"}, + {"temperature", "fahrenheit"}, + {"volume", "fluid-ounce"}, + {"length", "foot"}, + {"volume", "gallon"}, + {"digital", "gigabit"}, + {"digital", "gigabyte"}, + {"mass", "gram"}, + {"area", "hectare"}, + {"duration", "hour"}, + {"length", "inch"}, + {"digital", "kilobit"}, + {"digital", "kilobyte"}, + {"mass", "kilogram"}, + {"length", "kilometer"}, + {"volume", "liter"}, + {"digital", "megabit"}, + {"digital", "megabyte"}, + {"length", "meter"}, + {"duration", "microsecond"}, + {"length", "mile"}, + {"length", "mile-scandinavian"}, + {"volume", "milliliter"}, + {"length", "millimeter"}, + {"duration", "millisecond"}, + {"duration", "minute"}, + {"duration", "month"}, + {"duration", "nanosecond"}, + {"mass", "ounce"}, + {"concentr", "percent"}, + {"digital", "petabyte"}, + {"mass", "pound"}, + {"duration", "second"}, + {"mass", "stone"}, + {"digital", "terabit"}, + {"digital", "terabyte"}, + {"duration", "week"}, + {"length", "yard"}, + {"duration", "year"}, + // clang-format on +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/NumberFormat.cpp b/intl/components/src/NumberFormat.cpp new file mode 100644 index 0000000000..6575c7a032 --- /dev/null +++ b/intl/components/src/NumberFormat.cpp @@ -0,0 +1,154 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "mozilla/intl/NumberFormat.h" +#include "NumberFormatFields.h" +#include "NumberFormatterSkeleton.h" +#include "ScopedICUObject.h" + +#include "unicode/unumberformatter.h" +#include "unicode/upluralrules.h" + +namespace mozilla::intl { + +/*static*/ Result<UniquePtr<NumberFormat>, ICUError> NumberFormat::TryCreate( + std::string_view aLocale, const NumberFormatOptions& aOptions) { + UniquePtr<NumberFormat> nf = MakeUnique<NumberFormat>(); + Result<Ok, ICUError> result = nf->initialize(aLocale, aOptions); + if (result.isOk()) { + return nf; + } + + return Err(result.unwrapErr()); +} + +NumberFormat::~NumberFormat() { + if (mFormattedNumber) { + unumf_closeResult(mFormattedNumber); + } + if (mNumberFormatter) { + unumf_close(mNumberFormatter); + } +} + +Result<Ok, ICUError> NumberFormat::initialize( + std::string_view aLocale, const NumberFormatOptions& aOptions) { + mFormatForUnit = aOptions.mUnit.isSome(); + NumberFormatterSkeleton skeleton(aOptions); + mNumberFormatter = skeleton.toFormatter(aLocale); + if (mNumberFormatter) { + UErrorCode status = U_ZERO_ERROR; + mFormattedNumber = unumf_openResult(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return Ok(); + } + return Err(ICUError::InternalError); +} + +Result<std::u16string_view, ICUError> NumberFormat::formatToParts( + double number, NumberPartVector& parts) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + bool isNegative = !std::isnan(number) && IsNegative(number); + + return FormatResultToParts(mFormattedNumber, Some(number), isNegative, + mFormatForUnit, parts); +} + +Result<std::u16string_view, ICUError> NumberFormat::formatToParts( + int64_t number, NumberPartVector& parts) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return FormatResultToParts(mFormattedNumber, 
Nothing(), number < 0, + mFormatForUnit, parts); +} + +Result<std::u16string_view, ICUError> NumberFormat::formatToParts( + std::string_view number, NumberPartVector& parts) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + // Non-finite numbers aren't currently supported here. If we ever need to + // support those, the |Maybe<double>| argument must be computed here. + MOZ_ASSERT(number != "Infinity"); + MOZ_ASSERT(number != "+Infinity"); + MOZ_ASSERT(number != "-Infinity"); + MOZ_ASSERT(number != "NaN"); + + bool isNegative = !number.empty() && number[0] == '-'; + + return FormatResultToParts(mFormattedNumber, Nothing(), isNegative, + mFormatForUnit, parts); +} + +Result<int32_t, ICUError> NumberFormat::selectFormatted( + double number, char16_t* keyword, int32_t keywordSize, + UPluralRules* pluralRules) const { + MOZ_ASSERT(keyword && pluralRules); + UErrorCode status = U_ZERO_ERROR; + + MOZ_TRY(format(number)); + + int32_t utf16KeywordLength = uplrules_selectFormatted( + pluralRules, mFormattedNumber, keyword, keywordSize, &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return utf16KeywordLength; +} + +bool NumberFormat::formatInternal(double number) const { + // ICU incorrectly formats NaN values with the sign bit set, as if they + // were negative. Replace all NaNs with a single pattern with sign bit + // unset ("positive", that is) until ICU is fixed. 
+ if (MOZ_UNLIKELY(std::isnan(number))) { + number = SpecificNaN<double>(0, 1); + } + + UErrorCode status = U_ZERO_ERROR; + unumf_formatDouble(mNumberFormatter, number, mFormattedNumber, &status); + return U_SUCCESS(status); +} + +bool NumberFormat::formatInternal(int64_t number) const { + UErrorCode status = U_ZERO_ERROR; + unumf_formatInt(mNumberFormatter, number, mFormattedNumber, &status); + return U_SUCCESS(status); +} + +bool NumberFormat::formatInternal(std::string_view number) const { + UErrorCode status = U_ZERO_ERROR; + unumf_formatDecimal(mNumberFormatter, number.data(), number.size(), + mFormattedNumber, &status); + return U_SUCCESS(status); +} + +Result<std::u16string_view, ICUError> NumberFormat::formatResult() const { + UErrorCode status = U_ZERO_ERROR; + + const UFormattedValue* formattedValue = + unumf_resultAsValue(mFormattedNumber, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t utf16Length; + const char16_t* utf16Str = + ufmtval_getString(formattedValue, &utf16Length, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length)); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/NumberFormat.h b/intl/components/src/NumberFormat.h new file mode 100644 index 0000000000..684b772e30 --- /dev/null +++ b/intl/components/src/NumberFormat.h @@ -0,0 +1,426 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef intl_components_NumberFormat_h_ +#define intl_components_NumberFormat_h_ +#include <string_view> +#include <utility> +#include <vector> + +#include "mozilla/FloatingPoint.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/Maybe.h" +#include "mozilla/PodOperations.h" +#include "mozilla/Result.h" +#include "mozilla/Utf8.h" +#include "mozilla/Vector.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/NumberPart.h" + +#include "unicode/ustring.h" +#include "unicode/unum.h" +#include "unicode/unumberformatter.h" + +struct UPluralRules; + +namespace mozilla::intl { + +struct PluralRulesOptions; + +/** + * Configure NumberFormat options. + * The supported display styles are: + * * Decimal (default) + * * Currency (controlled by mCurrency) + * * Unit (controlled by mUnit) + * * Percent (controlled by mPercent) + * + * Only one of mCurrency, mUnit or mPercent should be set. If none are set, + * the number will formatted as a decimal. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#unit + */ +struct MOZ_STACK_CLASS NumberFormatOptions { + /** + * Display a currency amount. |currency| must be a three-letter currency code. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#unit + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#unit-width + */ + enum class CurrencyDisplay { + Symbol, + Code, + Name, + NarrowSymbol, + }; + Maybe<std::pair<std::string_view, CurrencyDisplay>> mCurrency; + + /** + * Set the fraction digits settings. |min| can be zero, |max| must be + * larger-or-equal to |min|. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#fraction-precision + */ + Maybe<std::pair<uint32_t, uint32_t>> mFractionDigits; + + /** + * Set the minimum number of integer digits. |min| must be a non-zero + * number. 
+ * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#integer-width + */ + Maybe<uint32_t> mMinIntegerDigits; + + /** + * Set the significant digits settings. |min| must be a non-zero number, |max| + * must be larger-or-equal to |min|. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#significant-digits-precision + */ + Maybe<std::pair<uint32_t, uint32_t>> mSignificantDigits; + + /** + * Display a unit amount. |unit| must be a well-formed unit identifier. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#unit + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#per-unit + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#unit-width + */ + enum class UnitDisplay { Short, Narrow, Long }; + Maybe<std::pair<std::string_view, UnitDisplay>> mUnit; + + /** + * Display a percent number. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#unit + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#scale + */ + bool mPercent = false; + + /** + * Set to true to strip trailing zeros after the decimal point for integer + * values. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#trailing-zero-display + */ + bool mStripTrailingZero = false; + + /** + * Enable or disable grouping. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#grouping + */ + enum class Grouping { + Auto, + Always, + Min2, + Never, + } mGrouping = Grouping::Auto; + + /** + * Set the notation style. 
+ * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#notation + */ + enum class Notation { + Standard, + Scientific, + Engineering, + CompactShort, + CompactLong + } mNotation = Notation::Standard; + + /** + * Set the sign-display. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#sign-display + */ + enum class SignDisplay { + Auto, + Never, + Always, + ExceptZero, + Negative, + Accounting, + AccountingAlways, + AccountingExceptZero, + AccountingNegative, + } mSignDisplay = SignDisplay::Auto; + + /** + * Set the rounding increment, which must be a non-zero number. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#precision + */ + uint32_t mRoundingIncrement = 1; + + /** + * Set the rounding mode. + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#rounding-mode + */ + enum class RoundingMode { + Ceil, + Floor, + Expand, + Trunc, + HalfCeil, + HalfFloor, + HalfExpand, + HalfTrunc, + HalfEven, + HalfOdd, + } mRoundingMode = RoundingMode::HalfExpand; + + /** + * Set the rounding priority. |mFractionDigits| and |mSignificantDigits| must + * both be set if the rounding priority isn't equal to "auto". + * + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md#fraction-precision + */ + enum class RoundingPriority { + Auto, + MorePrecision, + LessPrecision, + } mRoundingPriority = RoundingPriority::Auto; +}; + +/** + * According to http://userguide.icu-project.org/design, as long as we constrain + * ourselves to const APIs ICU is const-correct. + */ + +/** + * A NumberFormat implementation that roughly mirrors the API provided by + * the ECMA-402 Intl.NumberFormat object. 
+ * + * https://tc39.es/ecma402/#numberformat-objects + */ +class NumberFormat final { + public: + /** + * Initialize a new NumberFormat for the provided locale and using the + * provided options. + * + * https://tc39.es/ecma402/#sec-initializenumberformat + */ + static Result<UniquePtr<NumberFormat>, ICUError> TryCreate( + std::string_view aLocale, const NumberFormatOptions& aOptions); + + NumberFormat() = default; + NumberFormat(const NumberFormat&) = delete; + NumberFormat& operator=(const NumberFormat&) = delete; + ~NumberFormat(); + + /** + * Formats a double to a utf-16 string. The string view is valid until + * another number is formatted. Accessing the string view after this event + * is undefined behavior. + * + * https://tc39.es/ecma402/#sec-formatnumberstring + */ + Result<std::u16string_view, ICUError> format(double number) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return formatResult(); + } + + /** + * Formats a double to a utf-16 string, and fills the provided parts vector. + * The string view is valid until another number is formatted. Accessing the + * string view after this event is undefined behavior. + * + * This is utf-16 only because the only current use case is in + * SpiderMonkey. Supporting utf-8 would require recalculating the offsets + * in NumberPartVector from fixed width to variable width, which might be + * tricky to get right and is work that won't be necessary if we switch to + * ICU4X (see Bug 1707035). 
+ * + * https://tc39.es/ecma402/#sec-partitionnumberpattern + */ + Result<std::u16string_view, ICUError> formatToParts( + double number, NumberPartVector& parts) const; + + /** + * Formats a double to the provider buffer (either utf-8 or utf-16) + * + * https://tc39.es/ecma402/#sec-formatnumberstring + */ + template <typename B> + Result<Ok, ICUError> format(double number, B& buffer) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return formatResult<typename B::CharType, B>(buffer); + } + + /** + * Formats an int64_t to a utf-16 string. The string view is valid until + * another number is formatted. Accessing the string view after this event is + * undefined behavior. + * + * https://tc39.es/ecma402/#sec-formatnumberstring + */ + Result<std::u16string_view, ICUError> format(int64_t number) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return formatResult(); + } + + /** + * Formats a int64_t to a utf-16 string, and fills the provided parts vector. + * The string view is valid until another number is formatted. Accessing the + * string view after this event is undefined behavior. + * + * This is utf-16 only because the only current use case is in + * SpiderMonkey. Supporting utf-8 would require recalculating the offsets + * in NumberPartVector from fixed width to variable width, which might be + * tricky to get right and is work that won't be necessary if we switch to + * ICU4X (see Bug 1707035). + * + * https://tc39.es/ecma402/#sec-partitionnumberpattern + */ + Result<std::u16string_view, ICUError> formatToParts( + int64_t number, NumberPartVector& parts) const; + + /** + * Formats an int64_t to the provider buffer (either utf-8 or utf-16). 
+ * + * https://tc39.es/ecma402/#sec-formatnumberstring + */ + template <typename B> + Result<Ok, ICUError> format(int64_t number, B& buffer) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return formatResult<typename B::CharType, B>(buffer); + } + + /** + * Formats a string encoded decimal number to a utf-16 string. The string view + * is valid until another number is formatted. Accessing the string view + * after this event is undefined behavior. + * + * https://tc39.es/ecma402/#sec-formatnumberstring + */ + Result<std::u16string_view, ICUError> format(std::string_view number) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return formatResult(); + } + + /** + * Formats a string encoded decimal number to a utf-16 string, and fills the + * provided parts vector. The string view is valid until another number is + * formatted. Accessing the string view after this event is undefined + * behavior. + * + * This is utf-16 only because the only current use case is in + * SpiderMonkey. Supporting utf-8 would require recalculating the offsets + * in NumberPartVector from fixed width to variable width, which might be + * tricky to get right and is work that won't be necessary if we switch to + * ICU4X (see Bug 1707035). + * + * https://tc39.es/ecma402/#sec-partitionnumberpattern + */ + Result<std::u16string_view, ICUError> formatToParts( + std::string_view number, NumberPartVector& parts) const; + + /** + * Formats a string encoded decimal number to the provider buffer + * (either utf-8 or utf-16). + * + * https://tc39.es/ecma402/#sec-formatnumberstring + */ + template <typename B> + Result<Ok, ICUError> format(std::string_view number, B& buffer) const { + if (!formatInternal(number)) { + return Err(ICUError::InternalError); + } + + return formatResult<typename B::CharType, B>(buffer); + } + + /** + * Formats the number and selects the keyword by using a provided + * UPluralRules object. 
+ * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.select + * + * TODO(1713917) This is necessary because both PluralRules and + * NumberFormat have a shared dependency on the raw UFormattedNumber + * type. Once we transition to using ICU4X, the FFI calls should no + * longer require such shared dependencies. At that time, this + * functionality should be removed from NumberFormat and invoked + * solely from PluralRules. + */ + Result<int32_t, ICUError> selectFormatted(double number, char16_t* keyword, + int32_t keywordSize, + UPluralRules* pluralRules) const; + + /** + * Returns an iterator over all supported number formatter locales. + * + * The returned strings are ICU locale identifiers and NOT BCP 47 language + * tags. + * + * Also see <https://unicode-org.github.io/icu/userguide/locale>. + */ + static auto GetAvailableLocales() { + return AvailableLocalesEnumeration<unum_countAvailable, + unum_getAvailable>(); + } + + private: + UNumberFormatter* mNumberFormatter = nullptr; + UFormattedNumber* mFormattedNumber = nullptr; + bool mFormatForUnit = false; + + Result<Ok, ICUError> initialize(std::string_view aLocale, + const NumberFormatOptions& aOptions); + + [[nodiscard]] bool formatInternal(double number) const; + [[nodiscard]] bool formatInternal(int64_t number) const; + [[nodiscard]] bool formatInternal(std::string_view number) const; + + Result<std::u16string_view, ICUError> formatResult() const; + + template <typename C, typename B> + Result<Ok, ICUError> formatResult(B& buffer) const { + // We only support buffers with char or char16_t. 
+ static_assert(std::is_same_v<C, char> || std::is_same_v<C, char16_t>); + + return formatResult().andThen( + [&buffer](std::u16string_view result) -> Result<Ok, ICUError> { + if constexpr (std::is_same_v<C, char>) { + if (!FillBuffer(Span(result.data(), result.size()), buffer)) { + return Err(ICUError::OutOfMemory); + } + return Ok(); + } else { + // ICU provides APIs which accept a buffer, but they just copy from + // an internal buffer behind the scenes anyway. + if (!buffer.reserve(result.size())) { + return Err(ICUError::OutOfMemory); + } + PodCopy(static_cast<char16_t*>(buffer.data()), result.data(), + result.size()); + buffer.written(result.size()); + + return Ok(); + } + }); + } +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/NumberFormatFields.cpp b/intl/components/src/NumberFormatFields.cpp new file mode 100644 index 0000000000..8ab4690d50 --- /dev/null +++ b/intl/components/src/NumberFormatFields.cpp @@ -0,0 +1,396 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "ICU4CGlue.h" +#include "NumberFormatFields.h" +#include "ScopedICUObject.h" + +#include "mozilla/FloatingPoint.h" +#include "unicode/uformattedvalue.h" +#include "unicode/unum.h" +#include "unicode/unumberformatter.h" + +namespace mozilla::intl { + +bool NumberFormatFields::append(NumberPartType type, int32_t begin, + int32_t end) { + MOZ_ASSERT(begin >= 0); + MOZ_ASSERT(end >= 0); + MOZ_ASSERT(begin < end, "erm, aren't fields always non-empty?"); + + return fields_.emplaceBack(uint32_t(begin), uint32_t(end), type); +} + +bool NumberFormatFields::toPartsVector(size_t overallLength, + const NumberPartSourceMap& sourceMap, + NumberPartVector& parts) { + std::sort(fields_.begin(), fields_.end(), + [](const NumberFormatField& left, const NumberFormatField& right) { + // Sort first by begin index, then to place + // enclosing fields before nested fields. + return left.begin < right.begin || + (left.begin == right.begin && left.end > right.end); + }); + + // Then iterate over the sorted field list to generate a sequence of parts + // (what ECMA-402 actually exposes). A part is a maximal character sequence + // entirely within no field or a single most-nested field. + // + // Diagrams may be helpful to illustrate how fields map to parts. Consider + // formatting -19,766,580,028,249.41, the US national surplus (negative + // because it's actually a debt) on October 18, 2016. + // + // var options = + // { style: "currency", currency: "USD", currencyDisplay: "name" }; + // var usdFormatter = new Intl.NumberFormat("en-US", options); + // usdFormatter.format(-19766580028249.41); + // + // The formatted result is "-19,766,580,028,249.41 US dollars". 
ICU + // identifies these fields in the string: + // + // UNUM_GROUPING_SEPARATOR_FIELD + // | + // UNUM_SIGN_FIELD | UNUM_DECIMAL_SEPARATOR_FIELD + // | __________/| | + // | / | | | | + // "-19,766,580,028,249.41 US dollars" + // \________________/ |/ \_______/ + // | | | + // UNUM_INTEGER_FIELD | UNUM_CURRENCY_FIELD + // | + // UNUM_FRACTION_FIELD + // + // These fields map to parts as follows: + // + // integer decimal + // _____|________ | + // / /| |\ |\ |\ | literal + // /| / | | \ | \ | \| | + // "-19,766,580,028,249.41 US dollars" + // | \___|___|___/ |/ \________/ + // | | | | + // | group | currency + // | | + // minusSign fraction + // + // The sign is a part. Each comma is a part, splitting the integer field + // into parts for trillions/billions/&c. digits. The decimal point is a + // part. Cents are a part. The space between cents and currency is a part + // (outside any field). Last, the currency field is a part. + + class PartGenerator { + // The fields in order from start to end, then least to most nested. + const FieldsVector& fields; + + // Index of the current field, in |fields|, being considered to + // determine part boundaries. |lastEnd <= fields[index].begin| is an + // invariant. + size_t index = 0; + + // The end index of the last part produced, always less than or equal + // to |limit|, strictly increasing. + uint32_t lastEnd = 0; + + // The length of the overall formatted string. 
+ const uint32_t limit = 0; + + NumberPartSourceMap sourceMap; + + Vector<size_t, 4> enclosingFields; + + void popEnclosingFieldsEndingAt(uint32_t end) { + MOZ_ASSERT_IF(enclosingFields.length() > 0, + fields[enclosingFields.back()].end >= end); + + while (enclosingFields.length() > 0 && + fields[enclosingFields.back()].end == end) { + enclosingFields.popBack(); + } + } + + bool nextPartInternal(NumberPart* part) { + size_t len = fields.length(); + MOZ_ASSERT(index <= len); + + // If we're out of fields, all that remains are part(s) consisting + // of trailing portions of enclosing fields, and maybe a final + // literal part. + if (index == len) { + if (enclosingFields.length() > 0) { + const auto& enclosing = fields[enclosingFields.popCopy()]; + *part = {enclosing.type, sourceMap.source(enclosing), enclosing.end}; + + // If additional enclosing fields end where this part ends, + // pop them as well. + popEnclosingFieldsEndingAt(part->endIndex); + } else { + *part = {NumberPartType::Literal, sourceMap.source(limit), limit}; + } + + return true; + } + + // Otherwise we still have a field to process. + const NumberFormatField* current = &fields[index]; + MOZ_ASSERT(lastEnd <= current->begin); + MOZ_ASSERT(current->begin < current->end); + + // But first, deal with inter-field space. + if (lastEnd < current->begin) { + if (enclosingFields.length() > 0) { + // Space between fields, within an enclosing field, is part + // of that enclosing field, until the start of the current + // field or the end of the enclosing field, whichever is + // earlier. + const auto& enclosing = fields[enclosingFields.back()]; + *part = {enclosing.type, sourceMap.source(enclosing), + std::min(enclosing.end, current->begin)}; + popEnclosingFieldsEndingAt(part->endIndex); + } else { + // If there's no enclosing field, the space is a literal. 
+ *part = {NumberPartType::Literal, sourceMap.source(current->begin), + current->begin}; + } + + return true; + } + + // Otherwise, the part spans a prefix of the current field. Find + // the most-nested field containing that prefix. + const NumberFormatField* next; + do { + current = &fields[index]; + + // If the current field is last, the part extends to its end. + if (++index == len) { + *part = {current->type, sourceMap.source(*current), current->end}; + return true; + } + + next = &fields[index]; + MOZ_ASSERT(current->begin <= next->begin); + MOZ_ASSERT(current->begin < next->end); + + // If the next field nests within the current field, push an + // enclosing field. (If there are no nested fields, don't + // bother pushing a field that'd be immediately popped.) + if (current->end > next->begin) { + if (!enclosingFields.append(index - 1)) { + return false; + } + } + + // Do so until the next field begins after this one. + } while (current->begin == next->begin); + + if (current->end <= next->begin) { + // The next field begins after the current field ends. Therefore + // the current part ends at the end of the current field. + *part = {current->type, sourceMap.source(*current), current->end}; + popEnclosingFieldsEndingAt(part->endIndex); + } else { + // The current field encloses the next one. The current part + // ends where the next field/part will start. + *part = {current->type, sourceMap.source(*current), next->begin}; + } + + return true; + } + + public: + PartGenerator(const FieldsVector& vec, uint32_t limit, + const NumberPartSourceMap& sourceMap) + : fields(vec), limit(limit), sourceMap(sourceMap), enclosingFields() {} + + bool nextPart(bool* hasPart, NumberPart* part) { + // There are no parts left if we've partitioned the entire string. 
+ if (lastEnd == limit) { + MOZ_ASSERT(enclosingFields.length() == 0); + *hasPart = false; + return true; + } + + if (!nextPartInternal(part)) { + return false; + } + + *hasPart = true; + lastEnd = part->endIndex; + return true; + } + }; + + // Finally, generate the result array. + size_t lastEndIndex = 0; + + PartGenerator gen(fields_, overallLength, sourceMap); + do { + bool hasPart; + NumberPart part; + if (!gen.nextPart(&hasPart, &part)) { + return false; + } + + if (!hasPart) { + break; + } + + MOZ_ASSERT(lastEndIndex < part.endIndex); + + if (!parts.append(part)) { + return false; + } + + lastEndIndex = part.endIndex; + } while (true); + + MOZ_ASSERT(lastEndIndex == overallLength, + "result array must partition the entire string"); + + return lastEndIndex == overallLength; +} + +Result<std::u16string_view, ICUError> FormatResultToParts( + const UFormattedNumber* value, Maybe<double> number, bool isNegative, + bool formatForUnit, NumberPartVector& parts) { + UErrorCode status = U_ZERO_ERROR; + + const UFormattedValue* formattedValue = unumf_resultAsValue(value, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return FormatResultToParts(formattedValue, number, isNegative, formatForUnit, + parts); +} + +Result<std::u16string_view, ICUError> FormatResultToParts( + const UFormattedValue* value, Maybe<double> number, bool isNegative, + bool formatForUnit, NumberPartVector& parts) { + UErrorCode status = U_ZERO_ERROR; + + int32_t utf16Length; + const char16_t* utf16Str = ufmtval_getString(value, &utf16Length, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); + + // We're only interested in UFIELD_CATEGORY_NUMBER fields. 
+ ucfpos_constrainCategory(fpos, UFIELD_CATEGORY_NUMBER, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // Vacuum up fields in the overall formatted string. + NumberFormatFields fields; + + while (true) { + bool hasMore = ufmtval_nextPosition(value, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + if (!hasMore) { + break; + } + + int32_t fieldName = ucfpos_getField(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t beginIndex, endIndex; + ucfpos_getIndexes(fpos, &beginIndex, &endIndex, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + Maybe<NumberPartType> partType = GetPartTypeForNumberField( + UNumberFormatFields(fieldName), number, isNegative, formatForUnit); + if (!partType || !fields.append(*partType, beginIndex, endIndex)) { + return Err(ICUError::InternalError); + } + } + + if (!fields.toPartsVector(utf16Length, parts)) { + return Err(ICUError::InternalError); + } + + return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length)); +} + +// See intl/icu/source/i18n/unicode/unum.h for a detailed field list. This +// list is deliberately exhaustive: cases might have to be added/removed if +// this code is compiled with a different ICU with more UNumberFormatFields +// enum initializers. Please guard such cases with appropriate ICU +// version-testing #ifdefs, should cross-version divergence occur. 
+Maybe<NumberPartType> GetPartTypeForNumberField(UNumberFormatFields fieldName, + Maybe<double> number, + bool isNegative, + bool formatForUnit) { + switch (fieldName) { + case UNUM_INTEGER_FIELD: + if (number.isSome()) { + if (std::isnan(*number)) { + return Some(NumberPartType::Nan); + } + if (!std::isfinite(*number)) { + return Some(NumberPartType::Infinity); + } + } + return Some(NumberPartType::Integer); + case UNUM_FRACTION_FIELD: + return Some(NumberPartType::Fraction); + case UNUM_DECIMAL_SEPARATOR_FIELD: + return Some(NumberPartType::Decimal); + case UNUM_EXPONENT_SYMBOL_FIELD: + return Some(NumberPartType::ExponentSeparator); + case UNUM_EXPONENT_SIGN_FIELD: + return Some(NumberPartType::ExponentMinusSign); + case UNUM_EXPONENT_FIELD: + return Some(NumberPartType::ExponentInteger); + case UNUM_GROUPING_SEPARATOR_FIELD: + return Some(NumberPartType::Group); + case UNUM_CURRENCY_FIELD: + return Some(NumberPartType::Currency); + case UNUM_PERCENT_FIELD: + if (formatForUnit) { + return Some(NumberPartType::Unit); + } + return Some(NumberPartType::Percent); + case UNUM_PERMILL_FIELD: + MOZ_ASSERT_UNREACHABLE( + "unexpected permill field found, even though " + "we don't use any user-defined patterns that " + "would require a permill field"); + break; + case UNUM_SIGN_FIELD: + if (isNegative) { + return Some(NumberPartType::MinusSign); + } + return Some(NumberPartType::PlusSign); + case UNUM_MEASURE_UNIT_FIELD: + return Some(NumberPartType::Unit); + case UNUM_COMPACT_FIELD: + return Some(NumberPartType::Compact); + case UNUM_APPROXIMATELY_SIGN_FIELD: + return Some(NumberPartType::ApproximatelySign); +#ifndef U_HIDE_DEPRECATED_API + case UNUM_FIELD_COUNT: + MOZ_ASSERT_UNREACHABLE( + "format field sentinel value returned by iterator!"); + break; +#endif + } + + MOZ_ASSERT_UNREACHABLE( + "unenumerated, undocumented format field returned by iterator"); + return Nothing(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/NumberFormatFields.h 
b/intl/components/src/NumberFormatFields.h new file mode 100644 index 0000000000..4f05d4e98b --- /dev/null +++ b/intl/components/src/NumberFormatFields.h @@ -0,0 +1,91 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_NumberFormatFields_h_ +#define intl_components_NumberFormatFields_h_ +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/NumberPart.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Vector.h" + +#include "unicode/unum.h" + +struct UFormattedNumber; +struct UFormattedValue; + +namespace mozilla::intl { + +struct NumberFormatField { + uint32_t begin; + uint32_t end; + NumberPartType type; + + // Needed for vector-resizing scratch space. + NumberFormatField() = default; + + NumberFormatField(uint32_t begin, uint32_t end, NumberPartType type) + : begin(begin), end(end), type(type) {} +}; + +struct NumberPartSourceMap { + struct Range { + uint32_t begin = 0; + uint32_t end = 0; + }; + + // Begin and end position of the start range. + Range start; + + // Begin and end position of the end range. 
+ Range end; + + NumberPartSource source(uint32_t endIndex) { + if (start.begin < endIndex && endIndex <= start.end) { + return NumberPartSource::Start; + } + if (end.begin < endIndex && endIndex <= end.end) { + return NumberPartSource::End; + } + return NumberPartSource::Shared; + } + + NumberPartSource source(const NumberFormatField& field) { + return source(field.end); + } +}; + +class NumberFormatFields { + using FieldsVector = Vector<NumberFormatField, 16>; + + FieldsVector fields_; + + public: + [[nodiscard]] bool append(NumberPartType type, int32_t begin, int32_t end); + + [[nodiscard]] bool toPartsVector(size_t overallLength, + NumberPartVector& parts) { + return toPartsVector(overallLength, {}, parts); + } + + [[nodiscard]] bool toPartsVector(size_t overallLength, + const NumberPartSourceMap& sourceMap, + NumberPartVector& parts); +}; + +Result<std::u16string_view, ICUError> FormatResultToParts( + const UFormattedNumber* value, Maybe<double> number, bool isNegative, + bool formatForUnit, NumberPartVector& parts); + +Result<std::u16string_view, ICUError> FormatResultToParts( + const UFormattedValue* value, Maybe<double> number, bool isNegative, + bool formatForUnit, NumberPartVector& parts); + +Maybe<NumberPartType> GetPartTypeForNumberField(UNumberFormatFields fieldName, + Maybe<double> number, + bool isNegative, + bool formatForUnit); + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/NumberFormatterSkeleton.cpp b/intl/components/src/NumberFormatterSkeleton.cpp new file mode 100644 index 0000000000..5f62d77c2b --- /dev/null +++ b/intl/components/src/NumberFormatterSkeleton.cpp @@ -0,0 +1,473 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "NumberFormatterSkeleton.h" +#include "NumberFormat.h" + +#include "MeasureUnitGenerated.h" + +#include "mozilla/RangedPtr.h" + +#include <algorithm> +#include <limits> + +#include "unicode/unumberrangeformatter.h" + +namespace mozilla::intl { + +NumberFormatterSkeleton::NumberFormatterSkeleton( + const NumberFormatOptions& options) { + if (options.mCurrency.isSome()) { + if (!currency(options.mCurrency->first) || + !currencyDisplay(options.mCurrency->second)) { + return; + } + } else if (options.mUnit.isSome()) { + if (!unit(options.mUnit->first) || !unitDisplay(options.mUnit->second)) { + return; + } + } else if (options.mPercent) { + if (!percent()) { + return; + } + } + + if (options.mRoundingIncrement != 1) { + auto fd = options.mFractionDigits.valueOr(std::pair{0, 0}); + if (!roundingIncrement(options.mRoundingIncrement, fd.first, fd.second, + options.mStripTrailingZero)) { + return; + } + } else if (options.mRoundingPriority == + NumberFormatOptions::RoundingPriority::Auto) { + if (options.mFractionDigits.isSome()) { + if (!fractionDigits(options.mFractionDigits->first, + options.mFractionDigits->second, + options.mStripTrailingZero)) { + return; + } + } + + if (options.mSignificantDigits.isSome()) { + if (!significantDigits(options.mSignificantDigits->first, + options.mSignificantDigits->second, + options.mStripTrailingZero)) { + return; + } + } + } else { + MOZ_ASSERT(options.mFractionDigits); + MOZ_ASSERT(options.mSignificantDigits); + + bool relaxed = options.mRoundingPriority == + NumberFormatOptions::RoundingPriority::MorePrecision; + if (!fractionWithSignificantDigits(options.mFractionDigits->first, + options.mFractionDigits->second, + options.mSignificantDigits->first, + options.mSignificantDigits->second, + relaxed, options.mStripTrailingZero)) { + return; + } + } + + if (options.mMinIntegerDigits.isSome()) { + if (!minIntegerDigits(*options.mMinIntegerDigits)) { + return; + } + } + + if (!grouping(options.mGrouping)) { + return; + } + 
+ if (!notation(options.mNotation)) { + return; + } + + if (!signDisplay(options.mSignDisplay)) { + return; + } + + if (!roundingMode(options.mRoundingMode)) { + return; + } + + mValidSkeleton = true; +} + +bool NumberFormatterSkeleton::currency(std::string_view currency) { + MOZ_ASSERT(currency.size() == 3, + "IsWellFormedCurrencyCode permits only length-3 strings"); + + char16_t currencyChars[] = {static_cast<char16_t>(currency[0]), + static_cast<char16_t>(currency[1]), + static_cast<char16_t>(currency[2]), '\0'}; + return append(u"currency/") && append(currencyChars) && append(' '); +} + +bool NumberFormatterSkeleton::currencyDisplay( + NumberFormatOptions::CurrencyDisplay display) { + switch (display) { + case NumberFormatOptions::CurrencyDisplay::Code: + return appendToken(u"unit-width-iso-code"); + case NumberFormatOptions::CurrencyDisplay::Name: + return appendToken(u"unit-width-full-name"); + case NumberFormatOptions::CurrencyDisplay::Symbol: + // Default, no additional tokens needed. 
+ return true; + case NumberFormatOptions::CurrencyDisplay::NarrowSymbol: + return appendToken(u"unit-width-narrow"); + } + MOZ_ASSERT_UNREACHABLE("unexpected currency display type"); + return false; +} + +static const SimpleMeasureUnit& FindSimpleMeasureUnit(std::string_view name) { + const auto* measureUnit = std::lower_bound( + std::begin(simpleMeasureUnits), std::end(simpleMeasureUnits), name, + [](const auto& measureUnit, std::string_view name) { + return name.compare(measureUnit.name) > 0; + }); + MOZ_ASSERT(measureUnit != std::end(simpleMeasureUnits), + "unexpected unit identifier: unit not found"); + MOZ_ASSERT(measureUnit->name == name, + "unexpected unit identifier: wrong unit found"); + return *measureUnit; +} + +static constexpr size_t MaxUnitLength() { + size_t length = 0; + for (const auto& unit : simpleMeasureUnits) { + length = std::max(length, std::char_traits<char>::length(unit.name)); + } + return length * 2 + std::char_traits<char>::length("-per-"); +} + +bool NumberFormatterSkeleton::unit(std::string_view unit) { + MOZ_RELEASE_ASSERT(unit.length() <= MaxUnitLength()); + + auto appendUnit = [this](const SimpleMeasureUnit& unit) { + return append(unit.type, strlen(unit.type)) && append('-') && + append(unit.name, strlen(unit.name)); + }; + + // |unit| can be a compound unit identifier, separated by "-per-". 
+ static constexpr char separator[] = "-per-"; + size_t separator_len = strlen(separator); + size_t offset = unit.find(separator); + if (offset != std::string_view::npos) { + const auto& numerator = FindSimpleMeasureUnit(unit.substr(0, offset)); + const auto& denominator = FindSimpleMeasureUnit( + std::string_view(unit.data() + offset + separator_len, + unit.length() - offset - separator_len)); + return append(u"measure-unit/") && appendUnit(numerator) && append(' ') && + append(u"per-measure-unit/") && appendUnit(denominator) && + append(' '); + } + + const auto& simple = FindSimpleMeasureUnit(unit); + return append(u"measure-unit/") && appendUnit(simple) && append(' '); +} + +bool NumberFormatterSkeleton::unitDisplay( + NumberFormatOptions::UnitDisplay display) { + switch (display) { + case NumberFormatOptions::UnitDisplay::Short: + return appendToken(u"unit-width-short"); + case NumberFormatOptions::UnitDisplay::Narrow: + return appendToken(u"unit-width-narrow"); + case NumberFormatOptions::UnitDisplay::Long: + return appendToken(u"unit-width-full-name"); + } + MOZ_ASSERT_UNREACHABLE("unexpected unit display type"); + return false; +} + +bool NumberFormatterSkeleton::percent() { + return appendToken(u"percent scale/100"); +} + +bool NumberFormatterSkeleton::fractionDigits(uint32_t min, uint32_t max, + bool stripTrailingZero) { + // Note: |min| can be zero here. + MOZ_ASSERT(min <= max); + if (!append('.') || !appendN('0', min) || !appendN('#', max - min)) { + return false; + } + if (stripTrailingZero) { + if (!append(u"/w")) { + return false; + } + } + return append(' '); +} + +bool NumberFormatterSkeleton::fractionWithSignificantDigits( + uint32_t mnfd, uint32_t mxfd, uint32_t mnsd, uint32_t mxsd, bool relaxed, + bool stripTrailingZero) { + // Note: |mnfd| can be zero here. 
+ MOZ_ASSERT(mnfd <= mxfd); + MOZ_ASSERT(mnsd > 0); + MOZ_ASSERT(mnsd <= mxsd); + + if (!append('.') || !appendN('0', mnfd) || !appendN('#', mxfd - mnfd)) { + return false; + } + if (!append('/') || !appendN('@', mnsd) || !appendN('#', mxsd - mnsd)) { + return false; + } + if (!append(relaxed ? 'r' : 's')) { + return false; + } + if (stripTrailingZero) { + if (!append(u"/w")) { + return false; + } + } + return append(' '); +} + +bool NumberFormatterSkeleton::minIntegerDigits(uint32_t min) { + MOZ_ASSERT(min > 0); + return append(u"integer-width/+") && appendN('0', min) && append(' '); +} + +bool NumberFormatterSkeleton::significantDigits(uint32_t min, uint32_t max, + bool stripTrailingZero) { + MOZ_ASSERT(min > 0); + MOZ_ASSERT(min <= max); + if (!appendN('@', min) || !appendN('#', max - min)) { + return false; + } + if (stripTrailingZero) { + if (!append(u"/w")) { + return false; + } + } + return append(' '); +} + +bool NumberFormatterSkeleton::grouping(NumberFormatOptions::Grouping grouping) { + switch (grouping) { + case NumberFormatOptions::Grouping::Auto: + // Default, no additional tokens needed. + return true; + case NumberFormatOptions::Grouping::Always: + return appendToken(u"group-on-aligned"); + case NumberFormatOptions::Grouping::Min2: + return appendToken(u"group-min2"); + case NumberFormatOptions::Grouping::Never: + return appendToken(u"group-off"); + } + MOZ_ASSERT_UNREACHABLE("unexpected grouping mode"); + return false; +} + +bool NumberFormatterSkeleton::notation(NumberFormatOptions::Notation style) { + switch (style) { + case NumberFormatOptions::Notation::Standard: + // Default, no additional tokens needed. 
+ return true; + case NumberFormatOptions::Notation::Scientific: + return appendToken(u"scientific"); + case NumberFormatOptions::Notation::Engineering: + return appendToken(u"engineering"); + case NumberFormatOptions::Notation::CompactShort: + return appendToken(u"compact-short"); + case NumberFormatOptions::Notation::CompactLong: + return appendToken(u"compact-long"); + } + MOZ_ASSERT_UNREACHABLE("unexpected notation style"); + return false; +} + +bool NumberFormatterSkeleton::signDisplay( + NumberFormatOptions::SignDisplay display) { + switch (display) { + case NumberFormatOptions::SignDisplay::Auto: + // Default, no additional tokens needed. + return true; + case NumberFormatOptions::SignDisplay::Always: + return appendToken(u"sign-always"); + case NumberFormatOptions::SignDisplay::Never: + return appendToken(u"sign-never"); + case NumberFormatOptions::SignDisplay::ExceptZero: + return appendToken(u"sign-except-zero"); + case NumberFormatOptions::SignDisplay::Negative: + return appendToken(u"sign-negative"); + case NumberFormatOptions::SignDisplay::Accounting: + return appendToken(u"sign-accounting"); + case NumberFormatOptions::SignDisplay::AccountingAlways: + return appendToken(u"sign-accounting-always"); + case NumberFormatOptions::SignDisplay::AccountingExceptZero: + return appendToken(u"sign-accounting-except-zero"); + case NumberFormatOptions::SignDisplay::AccountingNegative: + return appendToken(u"sign-accounting-negative"); + } + MOZ_ASSERT_UNREACHABLE("unexpected sign display type"); + return false; +} + +bool NumberFormatterSkeleton::roundingIncrement(uint32_t increment, + uint32_t mnfd, uint32_t mxfd, + bool stripTrailingZero) { + // Note: |mnfd| can be zero here. + MOZ_ASSERT(mnfd <= mxfd); + MOZ_ASSERT(increment > 1); + + // Limit |mxfd| to 100. (20 is the current limit for ECMA-402, but there are + // plans to change it to 100.) 
+ constexpr size_t maxFracDigits = 100; + MOZ_RELEASE_ASSERT(mxfd <= maxFracDigits); + + static constexpr char digits[] = "0123456789"; + + // We need enough space to print any uint32_t, which is possibly shifted by + // |mxfd| decimal places. And additionally we need to reserve space for "0.". + static_assert(std::numeric_limits<uint32_t>::digits10 + 1 < maxFracDigits); + constexpr size_t maxLength = maxFracDigits + 2; + + char chars[maxLength]; + RangedPtr<char> ptr(chars + maxLength, chars, maxLength); + const RangedPtr<char> end = ptr; + + // Convert to a signed integer, so we don't have to worry about underflows. + int32_t maxFrac = int32_t(mxfd); + + // Write |increment| from back to front. + while (increment != 0) { + *--ptr = digits[increment % 10]; + increment /= 10; + maxFrac -= 1; + + if (maxFrac == 0) { + *--ptr = '.'; + } + } + + // Write any remaining zeros from |mxfd| and prepend '0' if we last wrote the + // decimal point. + while (maxFrac >= 0) { + MOZ_ASSERT_IF(maxFrac == 0, *ptr == '.'); + + *--ptr = '0'; + maxFrac -= 1; + + if (maxFrac == 0) { + *--ptr = '.'; + } + } + + MOZ_ASSERT(ptr < end, "At least one character is written."); + MOZ_ASSERT(*ptr != '.', "First character is a digit."); + + if (!append(u"precision-increment/") || !append(ptr.get(), end - ptr)) { + return false; + } + if (stripTrailingZero) { + if (!append(u"/w")) { + return false; + } + } + return append(' '); +} + +bool NumberFormatterSkeleton::roundingMode( + NumberFormatOptions::RoundingMode rounding) { + switch (rounding) { + case NumberFormatOptions::RoundingMode::Ceil: + return appendToken(u"rounding-mode-ceiling"); + case NumberFormatOptions::RoundingMode::Floor: + return appendToken(u"rounding-mode-floor"); + case NumberFormatOptions::RoundingMode::Expand: + return appendToken(u"rounding-mode-up"); + case NumberFormatOptions::RoundingMode::Trunc: + return appendToken(u"rounding-mode-down"); + case NumberFormatOptions::RoundingMode::HalfCeil: + return 
appendToken(u"rounding-mode-half-ceiling"); + case NumberFormatOptions::RoundingMode::HalfFloor: + return appendToken(u"rounding-mode-half-floor"); + case NumberFormatOptions::RoundingMode::HalfExpand: + return appendToken(u"rounding-mode-half-up"); + case NumberFormatOptions::RoundingMode::HalfTrunc: + return appendToken(u"rounding-mode-half-down"); + case NumberFormatOptions::RoundingMode::HalfEven: + return appendToken(u"rounding-mode-half-even"); + case NumberFormatOptions::RoundingMode::HalfOdd: + return appendToken(u"rounding-mode-half-odd"); + } + MOZ_ASSERT_UNREACHABLE("unexpected rounding mode"); + return false; +} + +UNumberFormatter* NumberFormatterSkeleton::toFormatter( + std::string_view locale) { + if (!mValidSkeleton) { + return nullptr; + } + + UErrorCode status = U_ZERO_ERROR; + UNumberFormatter* nf = unumf_openForSkeletonAndLocale( + mVector.begin(), mVector.length(), AssertNullTerminatedString(locale), + &status); + if (U_FAILURE(status)) { + return nullptr; + } + return nf; +} + +static UNumberRangeCollapse ToUNumberRangeCollapse( + NumberRangeFormatOptions::RangeCollapse collapse) { + using RangeCollapse = NumberRangeFormatOptions::RangeCollapse; + switch (collapse) { + case RangeCollapse::Auto: + return UNUM_RANGE_COLLAPSE_AUTO; + case RangeCollapse::None: + return UNUM_RANGE_COLLAPSE_NONE; + case RangeCollapse::Unit: + return UNUM_RANGE_COLLAPSE_UNIT; + case RangeCollapse::All: + return UNUM_RANGE_COLLAPSE_ALL; + } + MOZ_ASSERT_UNREACHABLE("unexpected range collapse"); + return UNUM_RANGE_COLLAPSE_NONE; +} + +static UNumberRangeIdentityFallback ToUNumberRangeIdentityFallback( + NumberRangeFormatOptions::RangeIdentityFallback identity) { + using RangeIdentityFallback = NumberRangeFormatOptions::RangeIdentityFallback; + switch (identity) { + case RangeIdentityFallback::SingleValue: + return UNUM_IDENTITY_FALLBACK_SINGLE_VALUE; + case RangeIdentityFallback::ApproximatelyOrSingleValue: + return 
UNUM_IDENTITY_FALLBACK_APPROXIMATELY_OR_SINGLE_VALUE; + case RangeIdentityFallback::Approximately: + return UNUM_IDENTITY_FALLBACK_APPROXIMATELY; + case RangeIdentityFallback::Range: + return UNUM_IDENTITY_FALLBACK_RANGE; + } + MOZ_ASSERT_UNREACHABLE("unexpected range identity fallback"); + return UNUM_IDENTITY_FALLBACK_RANGE; +} + +UNumberRangeFormatter* NumberFormatterSkeleton::toRangeFormatter( + std::string_view locale, NumberRangeFormatOptions::RangeCollapse collapse, + NumberRangeFormatOptions::RangeIdentityFallback identity) { + if (!mValidSkeleton) { + return nullptr; + } + + UParseError* perror = nullptr; + UErrorCode status = U_ZERO_ERROR; + UNumberRangeFormatter* nrf = + unumrf_openForSkeletonWithCollapseAndIdentityFallback( + mVector.begin(), mVector.length(), ToUNumberRangeCollapse(collapse), + ToUNumberRangeIdentityFallback(identity), + AssertNullTerminatedString(locale), perror, &status); + if (U_FAILURE(status)) { + return nullptr; + } + return nrf; +} + +} // namespace mozilla::intl diff --git a/intl/components/src/NumberFormatterSkeleton.h b/intl/components/src/NumberFormatterSkeleton.h new file mode 100644 index 0000000000..134e9e0860 --- /dev/null +++ b/intl/components/src/NumberFormatterSkeleton.h @@ -0,0 +1,110 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_NumberFormatterSkeleton_h_ +#define intl_components_NumberFormatterSkeleton_h_ +#include <string_view> +#include "mozilla/intl/NumberFormat.h" +#include "mozilla/intl/NumberRangeFormat.h" +#include "mozilla/Vector.h" +#include "unicode/unumberformatter.h" +#include "unicode/utypes.h" + +struct UNumberRangeFormatter; + +namespace mozilla::intl { + +/** + * Class to create a number formatter skeleton. 
+ * + * The skeleton syntax is documented at: + * https://github.com/unicode-org/icu/blob/master/docs/userguide/format_parse/numbers/skeletons.md + */ +class MOZ_STACK_CLASS NumberFormatterSkeleton final { + public: + explicit NumberFormatterSkeleton(const NumberFormatOptions& options); + + /** + * Return a new UNumberFormatter based on this skeleton. + */ + UNumberFormatter* toFormatter(std::string_view locale); + + /** + * Return a new UNumberRangeFormatter based on this skeleton. + */ + UNumberRangeFormatter* toRangeFormatter( + std::string_view locale, NumberRangeFormatOptions::RangeCollapse collapse, + NumberRangeFormatOptions::RangeIdentityFallback identity); + + private: + static constexpr size_t DefaultVectorSize = 128; + + mozilla::Vector<char16_t, DefaultVectorSize> mVector; + bool mValidSkeleton = false; + + [[nodiscard]] bool append(char16_t c) { return mVector.append(c); } + + [[nodiscard]] bool appendN(char16_t c, size_t times) { + return mVector.appendN(c, times); + } + + template <size_t N> + [[nodiscard]] bool append(const char16_t (&chars)[N]) { + static_assert(N > 0, + "should only be used with string literals or properly " + "null-terminated arrays"); + MOZ_ASSERT(chars[N - 1] == '\0', + "should only be used with string literals or properly " + "null-terminated arrays"); + // Without trailing \0. 
+ return mVector.append(chars, N - 1); + } + + template <size_t N> + [[nodiscard]] bool appendToken(const char16_t (&token)[N]) { + return append(token) && append(' '); + } + + [[nodiscard]] bool append(const char* chars, size_t length) { + return mVector.append(chars, length); + } + + [[nodiscard]] bool currency(std::string_view currency); + + [[nodiscard]] bool currencyDisplay( + NumberFormatOptions::CurrencyDisplay display); + + [[nodiscard]] bool unit(std::string_view unit); + + [[nodiscard]] bool unitDisplay(NumberFormatOptions::UnitDisplay display); + + [[nodiscard]] bool percent(); + + [[nodiscard]] bool fractionDigits(uint32_t min, uint32_t max, + bool stripTrailingZero); + + [[nodiscard]] bool fractionWithSignificantDigits(uint32_t mnfd, uint32_t mxfd, + uint32_t mnsd, uint32_t mxsd, + bool relaxed, + bool stripTrailingZero); + + [[nodiscard]] bool minIntegerDigits(uint32_t min); + + [[nodiscard]] bool significantDigits(uint32_t min, uint32_t max, + bool stripTrailingZero); + + [[nodiscard]] bool grouping(NumberFormatOptions::Grouping grouping); + + [[nodiscard]] bool notation(NumberFormatOptions::Notation style); + + [[nodiscard]] bool signDisplay(NumberFormatOptions::SignDisplay display); + + [[nodiscard]] bool roundingIncrement(uint32_t increment, uint32_t mnfd, + uint32_t mxfd, bool stripTrailingZero); + + [[nodiscard]] bool roundingMode(NumberFormatOptions::RoundingMode rounding); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/NumberParser.cpp b/intl/components/src/NumberParser.cpp new file mode 100644 index 0000000000..fb97393783 --- /dev/null +++ b/intl/components/src/NumberParser.cpp @@ -0,0 +1,45 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "mozilla/intl/NumberParser.h" + +namespace mozilla::intl { + +/*static*/ Result<UniquePtr<NumberParser>, ICUError> NumberParser::TryCreate( + const char* aLocale, bool aUseGrouping) { + UniquePtr<NumberParser> nf = MakeUnique<NumberParser>(); + + UErrorCode status = U_ZERO_ERROR; + nf->mNumberFormat = + unum_open(UNUM_DECIMAL, nullptr, 0, aLocale, nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + if (!aUseGrouping) { + unum_setAttribute(nf->mNumberFormat.GetMut(), UNUM_GROUPING_USED, UBool(0)); + } + + return nf; +} + +NumberParser::~NumberParser() { + if (mNumberFormat) { + unum_close(mNumberFormat.GetMut()); + } +} + +Result<std::pair<double, int32_t>, ICUError> NumberParser::ParseDouble( + Span<const char16_t> aDouble) const { + UErrorCode status = U_ZERO_ERROR; + int32_t parsePos = 0; + double value = unum_parseDouble(mNumberFormat.GetConst(), aDouble.data(), + static_cast<int32_t>(aDouble.size()), + &parsePos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return std::make_pair(value, parsePos); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/NumberParser.h b/intl/components/src/NumberParser.h new file mode 100644 index 0000000000..97efec0836 --- /dev/null +++ b/intl/components/src/NumberParser.h @@ -0,0 +1,46 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_NumberParser_h_ +#define intl_components_NumberParser_h_ + +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +#include "unicode/unum.h" + +namespace mozilla::intl { + +class NumberParser { + public: + /** + * Initialize a new NumberParser for the provided locale and using the + * provided options. 
+ */ + static Result<UniquePtr<NumberParser>, ICUError> TryCreate( + const char* aLocale, bool aUseGrouping); + + NumberParser() : mNumberFormat(nullptr){}; + NumberParser(const NumberParser&) = delete; + NumberParser& operator=(const NumberParser&) = delete; + ~NumberParser(); + + /** + * Attempts to parse a string representing a double, returning the parsed + * double and the parse position if successful, or an error. + * + * The parse position is the index into the input string where parsing + * stopped because a non-numeric character was encountered. + */ + Result<std::pair<double, int32_t>, ICUError> ParseDouble( + Span<const char16_t> aDouble) const; + + private: + ICUPointer<UNumberFormat> mNumberFormat; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/NumberPart.h b/intl/components/src/NumberPart.h new file mode 100644 index 0000000000..8639db5768 --- /dev/null +++ b/intl/components/src/NumberPart.h @@ -0,0 +1,53 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_NumberPart_h_ +#define intl_components_NumberPart_h_ + +#include <cstddef> +#include <cstdint> + +#include "mozilla/Vector.h" + +namespace mozilla::intl { + +enum class NumberPartType : int16_t { + ApproximatelySign, + Compact, + Currency, + Decimal, + ExponentInteger, + ExponentMinusSign, + ExponentSeparator, + Fraction, + Group, + Infinity, + Integer, + Literal, + MinusSign, + Nan, + Percent, + PlusSign, + Unit, +}; + +enum class NumberPartSource : int16_t { Shared, Start, End }; + +// Because parts fully partition the formatted string, we only track the +// index of the end of each part -- the beginning is implicitly the last +// part's end. 
+struct NumberPart { + NumberPartType type; + NumberPartSource source; + size_t endIndex; + + bool operator==(const NumberPart& rhs) const { + return type == rhs.type && source == rhs.source && endIndex == rhs.endIndex; + } + bool operator!=(const NumberPart& rhs) const { return !(*this == rhs); } +}; + +using NumberPartVector = mozilla::Vector<NumberPart, 8>; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/NumberRangeFormat.cpp b/intl/components/src/NumberRangeFormat.cpp new file mode 100644 index 0000000000..bf82e36c33 --- /dev/null +++ b/intl/components/src/NumberRangeFormat.cpp @@ -0,0 +1,215 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/NumberRangeFormat.h" + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/NumberFormat.h" +#include "NumberFormatFields.h" +#include "NumberFormatterSkeleton.h" +#include "ScopedICUObject.h" + +#include "unicode/uformattedvalue.h" +#include "unicode/unumberrangeformatter.h" +#include "unicode/upluralrules.h" + +namespace mozilla::intl { + +/*static*/ Result<UniquePtr<NumberRangeFormat>, ICUError> +NumberRangeFormat::TryCreate(std::string_view aLocale, + const NumberRangeFormatOptions& aOptions) { + UniquePtr<NumberRangeFormat> nrf = MakeUnique<NumberRangeFormat>(); + MOZ_TRY(nrf->initialize(aLocale, aOptions)); + return nrf; +} + +NumberRangeFormat::~NumberRangeFormat() { + if (mFormattedNumberRange) { + unumrf_closeResult(mFormattedNumberRange); + } + if (mNumberRangeFormatter) { + unumrf_close(mNumberRangeFormatter); + } +} + +Result<Ok, ICUError> NumberRangeFormat::initialize( + std::string_view aLocale, const NumberRangeFormatOptions& aOptions) { + mFormatForUnit = aOptions.mUnit.isSome(); + + NumberFormatterSkeleton skeleton(aOptions); + mNumberRangeFormatter = skeleton.toRangeFormatter( + aLocale, 
aOptions.mRangeCollapse, aOptions.mRangeIdentityFallback); + if (mNumberRangeFormatter) { + UErrorCode status = U_ZERO_ERROR; + mFormattedNumberRange = unumrf_openResult(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return Ok(); + } + return Err(ICUError::InternalError); +} + +Result<int32_t, ICUError> NumberRangeFormat::selectForRange( + double start, double end, char16_t* keyword, int32_t keywordSize, + const UPluralRules* pluralRules) const { + MOZ_ASSERT(keyword); + MOZ_ASSERT(pluralRules); + + MOZ_TRY(format(start, end)); + + UErrorCode status = U_ZERO_ERROR; + int32_t utf16KeywordLength = uplrules_selectForRange( + pluralRules, mFormattedNumberRange, keyword, keywordSize, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return utf16KeywordLength; +} + +bool NumberRangeFormat::formatInternal(double start, double end) const { + // ICU incorrectly formats NaN values with the sign bit set, as if they + // were negative. Replace all NaNs with a single pattern with sign bit + // unset ("positive", that is) until ICU is fixed. 
+ if (MOZ_UNLIKELY(std::isnan(start))) { + start = SpecificNaN<double>(0, 1); + } + if (MOZ_UNLIKELY(std::isnan(end))) { + end = SpecificNaN<double>(0, 1); + } + + UErrorCode status = U_ZERO_ERROR; + unumrf_formatDoubleRange(mNumberRangeFormatter, start, end, + mFormattedNumberRange, &status); + return U_SUCCESS(status); +} + +bool NumberRangeFormat::formatInternal(std::string_view start, + std::string_view end) const { + UErrorCode status = U_ZERO_ERROR; + unumrf_formatDecimalRange(mNumberRangeFormatter, start.data(), start.size(), + end.data(), end.size(), mFormattedNumberRange, + &status); + return U_SUCCESS(status); +} + +Result<std::u16string_view, ICUError> NumberRangeFormat::formatResult() const { + UErrorCode status = U_ZERO_ERROR; + + const UFormattedValue* formattedValue = + unumrf_resultAsValue(mFormattedNumberRange, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t utf16Length; + const char16_t* utf16Str = + ufmtval_getString(formattedValue, &utf16Length, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length)); +} + +Result<std::u16string_view, ICUError> NumberRangeFormat::formatResultToParts( + Maybe<double> start, bool startIsNegative, Maybe<double> end, + bool endIsNegative, NumberPartVector& parts) const { + UErrorCode status = U_ZERO_ERROR; + + const UFormattedValue* formattedValue = + unumrf_resultAsValue(mFormattedNumberRange, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t utf16Length; + const char16_t* utf16Str = + ufmtval_getString(formattedValue, &utf16Length, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); + + Maybe<double> number = start; + bool 
isNegative = startIsNegative; + + NumberPartSourceMap sourceMap; + + // Vacuum up fields in the overall formatted string. + NumberFormatFields fields; + + while (true) { + bool hasMore = ufmtval_nextPosition(formattedValue, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + if (!hasMore) { + break; + } + + int32_t category = ucfpos_getCategory(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t fieldName = ucfpos_getField(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t beginIndex, endIndex; + ucfpos_getIndexes(fpos, &beginIndex, &endIndex, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + if (category == UFIELD_CATEGORY_NUMBER_RANGE_SPAN) { + // The special field category UFIELD_CATEGORY_NUMBER_RANGE_SPAN has only + // two allowed values (0 or 1), indicating the begin of the start resp. + // end number. + MOZ_ASSERT(fieldName == 0 || fieldName == 1, + "span category has unexpected value"); + + if (fieldName == 0) { + number = start; + isNegative = startIsNegative; + + sourceMap.start = {uint32_t(beginIndex), uint32_t(endIndex)}; + } else { + number = end; + isNegative = endIsNegative; + + sourceMap.end = {uint32_t(beginIndex), uint32_t(endIndex)}; + } + + continue; + } + + // Ignore categories other than UFIELD_CATEGORY_NUMBER. 
+ if (category != UFIELD_CATEGORY_NUMBER) { + continue; + } + + Maybe<NumberPartType> partType = GetPartTypeForNumberField( + UNumberFormatFields(fieldName), number, isNegative, mFormatForUnit); + if (!partType || !fields.append(*partType, beginIndex, endIndex)) { + return Err(ToICUError(status)); + } + } + + if (!fields.toPartsVector(utf16Length, sourceMap, parts)) { + return Err(ToICUError(status)); + } + + return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length)); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/NumberRangeFormat.h b/intl/components/src/NumberRangeFormat.h new file mode 100644 index 0000000000..40bb85d6d2 --- /dev/null +++ b/intl/components/src/NumberRangeFormat.h @@ -0,0 +1,237 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_NumberRangeFormat_h_ +#define intl_components_NumberRangeFormat_h_ + +#include "mozilla/FloatingPoint.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/NumberFormat.h" +#include "mozilla/Result.h" +#include "mozilla/UniquePtr.h" + +#include <stdint.h> +#include <string_view> + +#include "unicode/utypes.h" + +struct UFormattedNumberRange; +struct UNumberRangeFormatter; +struct UPluralRules; + +namespace mozilla::intl { + +/** + * NumberRangeFormatOptions supports the same set of options as + * NumberFormatOptions and additionally allows to control how to display ranges. + */ +struct MOZ_STACK_CLASS NumberRangeFormatOptions : public NumberFormatOptions { + /** + * Controls if and how to collapse identical parts in a range. + */ + enum class RangeCollapse { + /** + * Apply locale-specific heuristics. + */ + Auto, + + /** + * Never collapse identical parts. + */ + None, + + /** + * Collapse identical unit parts. + */ + Unit, + + /** + * Collapse all identical parts. 
+ */ + All, + } mRangeCollapse = RangeCollapse::Auto; + + /** + * Controls how to display identical numbers. + */ + enum class RangeIdentityFallback { + /** + * Display the range as a single value. + */ + SingleValue, + + /** + * Display the range as a single value if both numbers were equal before + * rounding. Otherwise display with a locale-sensitive approximation + * pattern. + */ + ApproximatelyOrSingleValue, + + /** + * Display with a locale-sensitive approximation pattern. + */ + Approximately, + + /** + * Display as a range expression. + */ + Range, + } mRangeIdentityFallback = RangeIdentityFallback::SingleValue; +}; + +/** + * A NumberRangeFormat implementation that roughly mirrors the API provided by + * the ECMA-402 Intl.NumberFormat object for formatting number ranges. + * + * https://tc39.es/ecma402/#numberformat-objects + */ +class NumberRangeFormat final { + public: + /** + * Initialize a new NumberRangeFormat for the provided locale and using the + * provided options. + * + * https://tc39.es/ecma402/#sec-initializenumberformat + */ + static Result<UniquePtr<NumberRangeFormat>, ICUError> TryCreate( + std::string_view aLocale, const NumberRangeFormatOptions& aOptions); + + NumberRangeFormat() = default; + NumberRangeFormat(const NumberRangeFormat&) = delete; + NumberRangeFormat& operator=(const NumberRangeFormat&) = delete; + + ~NumberRangeFormat(); + + /** + * Formats a double range to a utf-16 string. The string view is valid until + * another number range is formatted. Accessing the string view after this + * event is undefined behavior. + * + * https://tc39.es/ecma402/#sec-formatnumericrange + */ + Result<std::u16string_view, ICUError> format(double start, double end) const { + if (!formatInternal(start, end)) { + return Err(ICUError::InternalError); + } + + return formatResult(); + } + + /** + * Formats a double range to a utf-16 string, and fills the provided parts + * vector. The string view is valid until another number is formatted. 
+ * Accessing the string view after this event is undefined behavior. + * + * https://tc39.es/ecma402/#sec-partitionnumberrangepattern + */ + Result<std::u16string_view, ICUError> formatToParts( + double start, double end, NumberPartVector& parts) const { + if (!formatInternal(start, end)) { + return Err(ICUError::InternalError); + } + + bool isNegativeStart = !std::isnan(start) && IsNegative(start); + bool isNegativeEnd = !std::isnan(end) && IsNegative(end); + + return formatResultToParts(Some(start), isNegativeStart, Some(end), + isNegativeEnd, parts); + } + + /** + * Formats a decimal number range to a utf-16 string. The string view is valid + * until another number range is formatted. Accessing the string view after + * this event is undefined behavior. + * + * https://tc39.es/ecma402/#sec-formatnumericrange + */ + Result<std::u16string_view, ICUError> format(std::string_view start, + std::string_view end) const { + if (!formatInternal(start, end)) { + return Err(ICUError::InternalError); + } + + return formatResult(); + } + + /** + * Formats a string encoded decimal number range to a utf-16 string, and fills + * the provided parts vector. The string view is valid until another number is + * formatted. Accessing the string view after this event is undefined + * behavior. + * + * https://tc39.es/ecma402/#sec-partitionnumberrangepattern + */ + Result<std::u16string_view, ICUError> formatToParts( + std::string_view start, std::string_view end, + NumberPartVector& parts) const { + if (!formatInternal(start, end)) { + return Err(ICUError::InternalError); + } + + Maybe<double> numStart = Nothing(); + if (start == "Infinity" || start == "+Infinity") { + numStart.emplace(PositiveInfinity<double>()); + } else if (start == "-Infinity") { + numStart.emplace(NegativeInfinity<double>()); + } else { + // Not currently expected, so we assert here. 
+ MOZ_ASSERT(start != "NaN"); + } + + Maybe<double> numEnd = Nothing(); + if (end == "Infinity" || end == "+Infinity") { + numEnd.emplace(PositiveInfinity<double>()); + } else if (end == "-Infinity") { + numEnd.emplace(NegativeInfinity<double>()); + } else { + // Not currently expected, so we assert here. + MOZ_ASSERT(end != "NaN"); + } + + bool isNegativeStart = !start.empty() && start[0] == '-'; + bool isNegativeEnd = !end.empty() && end[0] == '-'; + + return formatResultToParts(numStart, isNegativeStart, numEnd, isNegativeEnd, + parts); + } + + /** + * Formats the number range and selects the keyword by using a provided + * UPluralRules object. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.selectrange + * + * TODO(1713917) This is necessary because both PluralRules and + * NumberRangeFormat have a shared dependency on the raw UFormattedNumberRange + * type. Once we transition to using ICU4X, the FFI calls should no + * longer require such shared dependencies. At that time, this + * functionality should be removed from NumberRangeFormat and invoked + * solely from PluralRules. 
+ */ + Result<int32_t, ICUError> selectForRange( + double start, double end, char16_t* keyword, int32_t keywordSize, + const UPluralRules* pluralRules) const; + + private: + UNumberRangeFormatter* mNumberRangeFormatter = nullptr; + UFormattedNumberRange* mFormattedNumberRange = nullptr; + bool mFormatForUnit = false; + + Result<Ok, ICUError> initialize(std::string_view aLocale, + const NumberRangeFormatOptions& aOptions); + + [[nodiscard]] bool formatInternal(double start, double end) const; + + [[nodiscard]] bool formatInternal(std::string_view start, + std::string_view end) const; + + Result<std::u16string_view, ICUError> formatResult() const; + + Result<std::u16string_view, ICUError> formatResultToParts( + Maybe<double> start, bool startIsNegative, Maybe<double> end, + bool endIsNegative, NumberPartVector& parts) const; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/NumberingSystem.cpp b/intl/components/src/NumberingSystem.cpp new file mode 100644 index 0000000000..b86484a5f7 --- /dev/null +++ b/intl/components/src/NumberingSystem.cpp @@ -0,0 +1,38 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/NumberingSystem.h" +#include "mozilla/intl/ICU4CGlue.h" + +#include "unicode/unumsys.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +NumberingSystem::~NumberingSystem() { + MOZ_ASSERT(mNumberingSystem); + unumsys_close(mNumberingSystem); +} + +Result<UniquePtr<NumberingSystem>, ICUError> NumberingSystem::TryCreate( + const char* aLocale) { + UErrorCode status = U_ZERO_ERROR; + UNumberingSystem* numbers = unumsys_open(IcuLocale(aLocale), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return MakeUnique<NumberingSystem>(numbers); +} + +Result<Span<const char>, ICUError> NumberingSystem::GetName() { + const char* name = unumsys_getName(mNumberingSystem); + if (!name) { + return Err(ICUError::InternalError); + } + + return MakeStringSpan(name); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/NumberingSystem.h b/intl/components/src/NumberingSystem.h new file mode 100644 index 0000000000..a3d1903dd1 --- /dev/null +++ b/intl/components/src/NumberingSystem.h @@ -0,0 +1,56 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_NumberingSystem_h_ +#define intl_components_NumberingSystem_h_ + +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +struct UNumberingSystem; + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with numbering systems in + * internationalization code. It is used in coordination with other operations + * such as number formatting. 
+ */ +class NumberingSystem final { + public: + explicit NumberingSystem(UNumberingSystem* aNumberingSystem) + : mNumberingSystem(aNumberingSystem) { + MOZ_ASSERT(aNumberingSystem); + }; + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. + NumberingSystem(const NumberingSystem&) = delete; + NumberingSystem& operator=(const NumberingSystem&) = delete; + + ~NumberingSystem(); + + /** + * Create a NumberingSystem. + */ + static Result<UniquePtr<NumberingSystem>, ICUError> TryCreate( + const char* aLocale); + + /** + * Returns the name of this numbering system. + * + * The returned string has the same lifetime as this NumberingSystem object. + */ + Result<Span<const char>, ICUError> GetName(); + + private: + UNumberingSystem* mNumberingSystem = nullptr; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/PluralRules.cpp b/intl/components/src/PluralRules.cpp new file mode 100644 index 0000000000..891ca45769 --- /dev/null +++ b/intl/components/src/PluralRules.cpp @@ -0,0 +1,180 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/PluralRules.h" + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/NumberFormat.h" +#include "mozilla/intl/NumberRangeFormat.h" +#include "mozilla/Utf8.h" +#include "mozilla/PodOperations.h" +#include "mozilla/Span.h" +#include "ScopedICUObject.h" + +#include "unicode/unum.h" +#include "unicode/upluralrules.h" +#include "unicode/ustring.h" + +namespace mozilla::intl { + +PluralRules::PluralRules(UPluralRules*& aPluralRules, + UniquePtr<NumberFormat>&& aNumberFormat, + UniquePtr<NumberRangeFormat>&& aNumberRangeFormat) + : mPluralRules(aPluralRules), + mNumberFormat(std::move(aNumberFormat)), + mNumberRangeFormat(std::move(aNumberRangeFormat)) { + MOZ_ASSERT(aPluralRules); + aPluralRules = nullptr; +} + +Result<UniquePtr<PluralRules>, ICUError> PluralRules::TryCreate( + const std::string_view aLocale, const PluralRulesOptions& aOptions) { + auto numberFormat = + NumberFormat::TryCreate(aLocale, aOptions.ToNumberFormatOptions()); + + if (numberFormat.isErr()) { + return Err(numberFormat.unwrapErr()); + } + + auto numberRangeFormat = NumberRangeFormat::TryCreate( + aLocale, aOptions.ToNumberRangeFormatOptions()); + + if (numberRangeFormat.isErr()) { + return Err(numberRangeFormat.unwrapErr()); + } + + UErrorCode status = U_ZERO_ERROR; + auto pluralType = aOptions.mPluralType == PluralRules::Type::Cardinal + ? 
UPLURAL_TYPE_CARDINAL + : UPLURAL_TYPE_ORDINAL; + UPluralRules* pluralRules = uplrules_openForType( + AssertNullTerminatedString(aLocale), pluralType, &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return UniquePtr<PluralRules>(new PluralRules( + pluralRules, numberFormat.unwrap(), numberRangeFormat.unwrap())); +} + +Result<PluralRules::Keyword, ICUError> PluralRules::Select( + const double aNumber) const { + char16_t keyword[MAX_KEYWORD_LENGTH]; + + auto lengthResult = mNumberFormat->selectFormatted( + aNumber, keyword, MAX_KEYWORD_LENGTH, mPluralRules); + + if (lengthResult.isErr()) { + return Err(lengthResult.unwrapErr()); + } + + return KeywordFromUtf16(Span(keyword, lengthResult.unwrap())); +} + +Result<PluralRules::Keyword, ICUError> PluralRules::SelectRange( + double aStart, double aEnd) const { + char16_t keyword[MAX_KEYWORD_LENGTH]; + + auto lengthResult = mNumberRangeFormat->selectForRange( + aStart, aEnd, keyword, MAX_KEYWORD_LENGTH, mPluralRules); + + if (lengthResult.isErr()) { + return Err(lengthResult.unwrapErr()); + } + + return KeywordFromUtf16(Span(keyword, lengthResult.unwrap())); +} + +Result<EnumSet<PluralRules::Keyword>, ICUError> PluralRules::Categories() + const { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = uplrules_getKeywords(mPluralRules, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + ScopedICUObject<UEnumeration, uenum_close> closeEnum(enumeration); + EnumSet<PluralRules::Keyword> set; + + while (true) { + int32_t keywordLength; + const char* keyword = uenum_next(enumeration, &keywordLength, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + if (!keyword) { + break; + } + + set += KeywordFromAscii(Span(keyword, keywordLength)); + } + + return set; +} + +PluralRules::Keyword PluralRules::KeywordFromUtf16( + Span<const char16_t> aKeyword) { + static constexpr auto kZero = MakeStringSpan(u"zero"); + static constexpr auto kOne 
= MakeStringSpan(u"one"); + static constexpr auto kTwo = MakeStringSpan(u"two"); + static constexpr auto kFew = MakeStringSpan(u"few"); + static constexpr auto kMany = MakeStringSpan(u"many"); + + if (aKeyword == kZero) { + return PluralRules::Keyword::Zero; + } + if (aKeyword == kOne) { + return PluralRules::Keyword::One; + } + if (aKeyword == kTwo) { + return PluralRules::Keyword::Two; + } + if (aKeyword == kFew) { + return PluralRules::Keyword::Few; + } + if (aKeyword == kMany) { + return PluralRules::Keyword::Many; + } + + MOZ_ASSERT(aKeyword == MakeStringSpan(u"other")); + return PluralRules::Keyword::Other; +} + +PluralRules::Keyword PluralRules::KeywordFromAscii(Span<const char> aKeyword) { + static constexpr auto kZero = MakeStringSpan("zero"); + static constexpr auto kOne = MakeStringSpan("one"); + static constexpr auto kTwo = MakeStringSpan("two"); + static constexpr auto kFew = MakeStringSpan("few"); + static constexpr auto kMany = MakeStringSpan("many"); + + if (aKeyword == kZero) { + return PluralRules::Keyword::Zero; + } + if (aKeyword == kOne) { + return PluralRules::Keyword::One; + } + if (aKeyword == kTwo) { + return PluralRules::Keyword::Two; + } + if (aKeyword == kFew) { + return PluralRules::Keyword::Few; + } + if (aKeyword == kMany) { + return PluralRules::Keyword::Many; + } + + MOZ_ASSERT(aKeyword == MakeStringSpan("other")); + return PluralRules::Keyword::Other; +} + +PluralRules::~PluralRules() { + if (mPluralRules) { + uplrules_close(mPluralRules); + mPluralRules = nullptr; + } +} + +} // namespace mozilla::intl diff --git a/intl/components/src/PluralRules.h b/intl/components/src/PluralRules.h new file mode 100644 index 0000000000..a413d54279 --- /dev/null +++ b/intl/components/src/PluralRules.h @@ -0,0 +1,221 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef intl_components_PluralRules_h_ +#define intl_components_PluralRules_h_ + +#include <string_view> +#include <type_traits> +#include <utility> + +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/NumberFormat.h" +#include "mozilla/intl/NumberRangeFormat.h" +#include "mozilla/EnumSet.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" + +#include "unicode/utypes.h" + +namespace mozilla::intl { + +class PluralRules final { + public: + /** + * The set of keywords that a PluralRules object uses. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.resolvedoptions + */ + enum class Keyword : uint8_t { + Few, + Many, + One, + Other, + Two, + Zero, + }; + + /** + * The two different types of PluralRules objects that can be created. + * + * https://tc39.es/ecma402/#sec-properties-of-intl-pluralrules-instances + */ + enum class Type : uint8_t { + Cardinal, + Ordinal, + }; + + PluralRules(const PluralRules&) = delete; + PluralRules& operator=(const PluralRules&) = delete; + + /** + * Attempts to construct a PluralRules with the given locale and options. + */ + // TODO(1709880) use mozilla::Span instead of std::string_view. + static Result<UniquePtr<PluralRules>, ICUError> TryCreate( + std::string_view aLocale, const PluralRulesOptions& aOptions); + + /** + * Returns the PluralRules keyword that corresponds to the |aNumber|. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.select + */ + Result<PluralRules::Keyword, ICUError> Select(double aNumber) const; + + /** + * Returns the PluralRules keyword that corresponds to the range from |aStart| + * to |aEnd|. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.selectrange + */ + Result<PluralRules::Keyword, ICUError> SelectRange(double aStart, + double aEnd) const; + + /** + * Returns an EnumSet with the plural-rules categories that are supported by + * the locale that the PluralRules instance was created with. 
+ */ + Result<EnumSet<PluralRules::Keyword>, ICUError> Categories() const; + + ~PluralRules(); + + private: + // The longest keyword is "other" + static const size_t MAX_KEYWORD_LENGTH = 5; + + UPluralRules* mPluralRules = nullptr; + UniquePtr<NumberFormat> mNumberFormat; + UniquePtr<NumberRangeFormat> mNumberRangeFormat; + + PluralRules(UPluralRules*&, UniquePtr<NumberFormat>&&, + UniquePtr<NumberRangeFormat>&&); + + /** + * Returns the PluralRules::Keyword that matches the UTF-16 string. + * Strings must be [u"few", u"many", u"one", u"other", u"two", u"zero"] + */ + static PluralRules::Keyword KeywordFromUtf16(Span<const char16_t> aKeyword); + + /** + * Returns the PluralRules::Keyword that matches the ASCII string. + * Strings must be ["few", "many", "one", "other", "two", "zero"] + */ + static PluralRules::Keyword KeywordFromAscii(Span<const char> aKeyword); +}; + +/** + * Options required for constructing a PluralRules object. + */ +struct MOZ_STACK_CLASS PluralRulesOptions { + /** + * Creates a NumberFormatOptions from the PluralRulesOptions. + */ + NumberFormatOptions ToNumberFormatOptions() const { + NumberFormatOptions options; + options.mRoundingMode = NumberFormatOptions::RoundingMode::HalfExpand; + + if (mFractionDigits.isSome()) { + options.mFractionDigits.emplace(mFractionDigits.ref()); + } + + if (mMinIntegerDigits.isSome()) { + options.mMinIntegerDigits.emplace(mMinIntegerDigits.ref()); + } + + if (mSignificantDigits.isSome()) { + options.mSignificantDigits.emplace(mSignificantDigits.ref()); + } + + options.mRoundingPriority = + NumberFormatOptions::RoundingPriority(mRoundingPriority); + + return options; + } + /** + * Creates a NumberRangeFormatOptions from the PluralRulesOptions. 
+ */ + NumberRangeFormatOptions ToNumberRangeFormatOptions() const { + NumberRangeFormatOptions options; + options.mRoundingMode = NumberRangeFormatOptions::RoundingMode::HalfExpand; + options.mRangeCollapse = NumberRangeFormatOptions::RangeCollapse::None; + options.mRangeIdentityFallback = + NumberRangeFormatOptions::RangeIdentityFallback::Range; + + if (mFractionDigits.isSome()) { + options.mFractionDigits.emplace(mFractionDigits.ref()); + } + + if (mMinIntegerDigits.isSome()) { + options.mMinIntegerDigits.emplace(mMinIntegerDigits.ref()); + } + + if (mSignificantDigits.isSome()) { + options.mSignificantDigits.emplace(mSignificantDigits.ref()); + } + + options.mRoundingPriority = + NumberFormatOptions::RoundingPriority(mRoundingPriority); + + return options; + } + + /** + * Set the plural type between cardinal and ordinal. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.resolvedoptions + */ + PluralRules::Type mPluralType = PluralRules::Type::Cardinal; + + /** + * Set the minimum number of integer digits. |min| must be a non-zero + * number. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.resolvedoptions + */ + Maybe<uint32_t> mMinIntegerDigits; + + /** + * Set the fraction digits settings. |min| can be zero, |max| must be + * larger-or-equal to |min|. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.resolvedoptions + */ + Maybe<std::pair<uint32_t, uint32_t>> mFractionDigits; + + /** + * Set the significant digits settings. |min| must be a non-zero number, |max| + * must be larger-or-equal to |min|. + * + * https://tc39.es/ecma402/#sec-intl.pluralrules.prototype.resolvedoptions + */ + Maybe<std::pair<uint32_t, uint32_t>> mSignificantDigits; + + /** + * Set the rounding priority. |mFractionDigits| and |mSignificantDigits| must + * both be set if the rounding priority isn't equal to "auto". 
+ */ + enum class RoundingPriority { + Auto, + MorePrecision, + LessPrecision, + } mRoundingPriority = RoundingPriority::Auto; + + // Must be compatible with NumberFormatOptions::RoundingPriority. + static_assert(std::is_same_v< + std::underlying_type_t<RoundingPriority>, + std::underlying_type_t<NumberFormatOptions::RoundingPriority>>); + static_assert(RoundingPriority::Auto == + RoundingPriority(NumberFormatOptions::RoundingPriority::Auto)); + static_assert( + RoundingPriority::LessPrecision == + RoundingPriority(NumberFormatOptions::RoundingPriority::LessPrecision)); + static_assert( + RoundingPriority::MorePrecision == + RoundingPriority(NumberFormatOptions::RoundingPriority::MorePrecision)); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/RelativeTimeFormat.cpp b/intl/components/src/RelativeTimeFormat.cpp new file mode 100644 index 0000000000..da67f7587d --- /dev/null +++ b/intl/components/src/RelativeTimeFormat.cpp @@ -0,0 +1,153 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#include "mozilla/intl/RelativeTimeFormat.h" +#include "mozilla/FloatingPoint.h" + +#include "unicode/unum.h" + +#include "NumberFormatFields.h" +#include "ICU4CGlue.h" +#include "ScopedICUObject.h" + +namespace mozilla::intl { + +/*static*/ Result<UniquePtr<RelativeTimeFormat>, ICUError> +RelativeTimeFormat::TryCreate(const char* aLocale, + const RelativeTimeFormatOptions& aOptions) { + UErrorCode status = U_ZERO_ERROR; + + UFormattedRelativeDateTime* formattedRelativeDateTime = + ureldatefmt_openResult(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UFormattedRelativeDateTime, ureldatefmt_closeResult> + closeFormattedRelativeDate(formattedRelativeDateTime); + + UNumberFormat* nf = + unum_open(UNUM_DECIMAL, nullptr, 0, IcuLocale(aLocale), nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UNumberFormat, unum_close> closeNumberFormatter(nf); + + // Use the default values as if a new Intl.NumberFormat had been constructed. + unum_setAttribute(nf, UNUM_MIN_INTEGER_DIGITS, 1); + unum_setAttribute(nf, UNUM_MIN_FRACTION_DIGITS, 0); + unum_setAttribute(nf, UNUM_MAX_FRACTION_DIGITS, 3); + unum_setAttribute(nf, UNUM_GROUPING_USED, true); + unum_setAttribute(nf, UNUM_MINIMUM_GROUPING_DIGITS, + UNUM_MINIMUM_GROUPING_DIGITS_AUTO); + + UDateRelativeDateTimeFormatterStyle relDateTimeStyle; + switch (aOptions.style) { + case RelativeTimeFormatOptions::Style::Short: + relDateTimeStyle = UDAT_STYLE_SHORT; + break; + case RelativeTimeFormatOptions::Style::Narrow: + relDateTimeStyle = UDAT_STYLE_NARROW; + break; + case RelativeTimeFormatOptions::Style::Long: + relDateTimeStyle = UDAT_STYLE_LONG; + break; + } + + URelativeDateTimeFormatter* formatter = + ureldatefmt_open(IcuLocale(aLocale), nf, relDateTimeStyle, + UDISPCTX_CAPITALIZATION_FOR_STANDALONE, &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // Ownership was transferred to mFormatter. 
+ closeNumberFormatter.forget(); + + UniquePtr<RelativeTimeFormat> rtf = MakeUnique<RelativeTimeFormat>( + aOptions.numeric, formatter, formattedRelativeDateTime); + + // Ownership was transferred to rtf. + closeFormattedRelativeDate.forget(); + return rtf; +} + +RelativeTimeFormat::RelativeTimeFormat( + RelativeTimeFormatOptions::Numeric aNumeric, + URelativeDateTimeFormatter* aFormatter, + UFormattedRelativeDateTime* aFormattedRelativeDateTime) + : mNumeric(aNumeric), + mFormatter(aFormatter), + mFormattedRelativeDateTime(aFormattedRelativeDateTime) {} + +RelativeTimeFormat::~RelativeTimeFormat() { + if (mFormattedRelativeDateTime) { + ureldatefmt_closeResult(mFormattedRelativeDateTime); + mFormattedRelativeDateTime = nullptr; + } + + if (mFormatter) { + ureldatefmt_close(mFormatter); + mFormatter = nullptr; + } +} + +URelativeDateTimeUnit RelativeTimeFormat::ToURelativeDateTimeUnit( + FormatUnit unit) const { + switch (unit) { + case FormatUnit::Second: + return UDAT_REL_UNIT_SECOND; + case FormatUnit::Minute: + return UDAT_REL_UNIT_MINUTE; + case FormatUnit::Hour: + return UDAT_REL_UNIT_HOUR; + case FormatUnit::Day: + return UDAT_REL_UNIT_DAY; + case FormatUnit::Week: + return UDAT_REL_UNIT_WEEK; + case FormatUnit::Month: + return UDAT_REL_UNIT_MONTH; + case FormatUnit::Quarter: + return UDAT_REL_UNIT_QUARTER; + case FormatUnit::Year: + return UDAT_REL_UNIT_YEAR; + }; + MOZ_ASSERT_UNREACHABLE(); + return UDAT_REL_UNIT_SECOND; +} + +Result<Span<const char16_t>, ICUError> RelativeTimeFormat::formatToParts( + double aNumber, FormatUnit aUnit, NumberPartVector& aParts) const { + UErrorCode status = U_ZERO_ERROR; + + if (mNumeric == RelativeTimeFormatOptions::Numeric::Auto) { + ureldatefmt_formatToResult(mFormatter, aNumber, + ToURelativeDateTimeUnit(aUnit), + mFormattedRelativeDateTime, &status); + } else { + ureldatefmt_formatNumericToResult(mFormatter, aNumber, + ToURelativeDateTimeUnit(aUnit), + mFormattedRelativeDateTime, &status); + } + if (U_FAILURE(status)) 
{ + return Err(ToICUError(status)); + } + + const UFormattedValue* formattedValue = + ureldatefmt_resultAsValue(mFormattedRelativeDateTime, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + bool isNegative = !std::isnan(aNumber) && IsNegative(aNumber); + + // Necessary until all of intl is using Span (Bug 1709880) + return FormatResultToParts(formattedValue, Nothing(), isNegative, + false /*formatForUnit*/, aParts) + .andThen([](std::u16string_view result) + -> Result<Span<const char16_t>, ICUError> { + return Span<const char16_t>(result.data(), result.length()); + }); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/RelativeTimeFormat.h b/intl/components/src/RelativeTimeFormat.h new file mode 100644 index 0000000000..94c2db6927 --- /dev/null +++ b/intl/components/src/RelativeTimeFormat.h @@ -0,0 +1,146 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_RelativeTimeFormat_h_ +#define intl_components_RelativeTimeFormat_h_ + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/intl/NumberPart.h" + +#include "unicode/ureldatefmt.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +struct RelativeTimeFormatOptions { + enum class Style { Short, Narrow, Long }; + Style style = Style::Long; + + enum class Numeric { + /** + * Only strings with numeric components like `1 day ago`. + */ + Always, + /** + * Natural-language strings like `yesterday` when possible, + * otherwise strings with numeric components as in `7 months ago`. + */ + Auto, + }; + Numeric numeric = Numeric::Always; +}; + +/** + * A RelativeTimeFormat implementation that roughly mirrors the API provided by + * the ECMA-402 Intl.RelativeTimeFormat object. 
+ * + * https://tc39.es/ecma402/#relativetimeformat-objects + */ +class RelativeTimeFormat final { + public: + /** + * + * Initialize a new RelativeTimeFormat for the provided locale and using the + * provided options. + * + * https://tc39.es/ecma402/#sec-InitializeRelativeTimeFormat + */ + static Result<UniquePtr<RelativeTimeFormat>, ICUError> TryCreate( + const char* aLocale, const RelativeTimeFormatOptions& aOptions); + + RelativeTimeFormat() = default; + + RelativeTimeFormat(RelativeTimeFormatOptions::Numeric aNumeric, + URelativeDateTimeFormatter* aFormatter, + UFormattedRelativeDateTime* aFormattedRelativeDateTime); + + RelativeTimeFormat(const RelativeTimeFormat&) = delete; + RelativeTimeFormat& operator=(const RelativeTimeFormat&) = delete; + ~RelativeTimeFormat(); + + enum class FormatUnit { + Second, + Minute, + Hour, + Day, + Week, + Month, + Quarter, + Year + }; + + /** + * Formats a double to the provided buffer (either utf-8 or utf-16) + * + * https://tc39.es/ecma402/#sec-FormatRelativeTime + */ + template <typename B> + Result<Ok, ICUError> format(double aNumber, FormatUnit aUnit, + B& aBuffer) const { + static_assert( + std::is_same_v<typename B::CharType, char> || + std::is_same_v<typename B::CharType, char16_t>, + "The only buffer CharTypes supported by RelativeTimeFormat are char " + "(for UTF-8 support) and char16_t (for UTF-16 support)."); + + auto fmt = mNumeric == RelativeTimeFormatOptions::Numeric::Auto + ? 
ureldatefmt_format + : ureldatefmt_formatNumeric; + + if constexpr (std::is_same_v<typename B::CharType, char>) { + mozilla::Vector<char16_t, StackU16VectorSize> u16Vec; + + MOZ_TRY(FillBufferWithICUCall( + u16Vec, [this, aNumber, aUnit, fmt](UChar* target, int32_t length, + UErrorCode* status) { + return fmt(mFormatter, aNumber, ToURelativeDateTimeUnit(aUnit), + target, length, status); + })); + + if (!FillBuffer(u16Vec, aBuffer)) { + return Err(ICUError::OutOfMemory); + } + return Ok{}; + } else { + static_assert(std::is_same_v<typename B::CharType, char16_t>); + + return FillBufferWithICUCall( + aBuffer, [this, aNumber, aUnit, fmt](UChar* target, int32_t length, + UErrorCode* status) { + return fmt(mFormatter, aNumber, ToURelativeDateTimeUnit(aUnit), + target, length, status); + }); + } + } + + /** + * Formats the relative time to a utf-16 string, and fills the provided parts + * vector. The string view is valid until another time is formatted. + * Accessing the string view after this event is undefined behavior. + * + * This is utf-16 only because the only current use case is in + * SpiderMonkey. Supporting utf-8 would require recalculating the offsets + * in NumberPartVector from fixed width to variable width, which might be + * tricky to get right and is work that won't be necessary if we switch to + * ICU4X (see Bug 1723120). 
+ * + * https://tc39.es/ecma402/#sec-FormatRelativeTimeToParts + */ + Result<Span<const char16_t>, ICUError> formatToParts( + double aNumber, FormatUnit aUnit, NumberPartVector& aParts) const; + + private: + RelativeTimeFormatOptions::Numeric mNumeric = + RelativeTimeFormatOptions::Numeric::Always; + URelativeDateTimeFormatter* mFormatter = nullptr; + UFormattedRelativeDateTime* mFormattedRelativeDateTime = nullptr; + + static constexpr size_t StackU16VectorSize = 128; + + URelativeDateTimeUnit ToURelativeDateTimeUnit(FormatUnit unit) const; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/ScopedICUObject.h b/intl/components/src/ScopedICUObject.h new file mode 100644 index 0000000000..1aa79245dc --- /dev/null +++ b/intl/components/src/ScopedICUObject.h @@ -0,0 +1,40 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_ScopedICUObject_h +#define intl_components_ScopedICUObject_h + +/* + * A simple RAII class to assure ICU objects are automatically deallocated at + * scope end. Unfortunately, ICU's C++ API is uniformly unstable, so we can't + * use its smart pointers for this. + */ + +namespace mozilla::intl { + +template <typename T, void(Delete)(T*)> +class ScopedICUObject { + T* ptr_; + + public: + explicit ScopedICUObject(T* ptr) : ptr_(ptr) {} + + ~ScopedICUObject() { + if (ptr_) { + Delete(ptr_); + } + } + + // In cases where an object should be deleted on abnormal exits, + // but returned to the caller if everything goes well, call forget() + // to transfer the object just before returning. 
+ T* forget() { + T* tmp = ptr_; + ptr_ = nullptr; + return tmp; + } +}; + +} // namespace mozilla::intl + +#endif /* intl_components_ScopedICUObject_h */ diff --git a/intl/components/src/String.cpp b/intl/components/src/String.cpp new file mode 100644 index 0000000000..de24ab804f --- /dev/null +++ b/intl/components/src/String.cpp @@ -0,0 +1,13 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/String.h" + +namespace mozilla::intl { + +Span<const char> String::GetUnicodeVersion() { + return MakeStringSpan(U_UNICODE_VERSION); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/String.h b/intl/components/src/String.h new file mode 100644 index 0000000000..f07acd6578 --- /dev/null +++ b/intl/components/src/String.h @@ -0,0 +1,256 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_String_h_ +#define intl_components_String_h_ + +#include "mozilla/Assertions.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/PodOperations.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" + +#include "unicode/uchar.h" +#include "unicode/unorm2.h" +#include "unicode/ustring.h" +#include "unicode/utext.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with strings in + * internationalization code. + */ +class String final { + public: + String() = delete; + + /** + * Return the locale-sensitive lower case string of the input. 
+ */ + template <typename B> + static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale, + Span<const char16_t> aString, + B& aBuffer) { + if (!aBuffer.reserve(aString.size())) { + return Err(ICUError::OutOfMemory); + } + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return u_strToLower(target, length, aString.data(), aString.size(), + aLocale, status); + }); + } + + /** + * Return the locale-sensitive upper case string of the input. + */ + template <typename B> + static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale, + Span<const char16_t> aString, + B& aBuffer) { + if (!aBuffer.reserve(aString.size())) { + return Err(ICUError::OutOfMemory); + } + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return u_strToUpper(target, length, aString.data(), aString.size(), + aLocale, status); + }); + } + + /** + * Normalization form constants to describe which normalization algorithm + * should be performed. + * + * Also see: + * - Unicode Standard, §2.12 Equivalent Sequences + * - Unicode Standard, §3.11 Normalization Forms + * - https://unicode.org/reports/tr15/ + */ + enum class NormalizationForm { + /** + * Normalization Form C + */ + NFC, + + /** + * Normalization Form D + */ + NFD, + + /** + * Normalization Form KC + */ + NFKC, + + /** + * Normalization Form KD + */ + NFKD, + }; + + enum class AlreadyNormalized : bool { No, Yes }; + + /** + * Normalize the input string according to requested normalization form. + * + * Returns `AlreadyNormalized::Yes` when the string is already in normalized + * form. The output buffer is unchanged in this case. Otherwise returns + * `AlreadyNormalized::No` and places the normalized string into the output + * buffer. 
+ */ + template <typename B> + static Result<AlreadyNormalized, ICUError> Normalize( + NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) { + // The unorm2_getXXXInstance() methods return a shared instance which must + // not be deleted. + UErrorCode status = U_ZERO_ERROR; + const UNormalizer2* normalizer; + switch (aForm) { + case NormalizationForm::NFC: + normalizer = unorm2_getNFCInstance(&status); + break; + case NormalizationForm::NFD: + normalizer = unorm2_getNFDInstance(&status); + break; + case NormalizationForm::NFKC: + normalizer = unorm2_getNFKCInstance(&status); + break; + case NormalizationForm::NFKD: + normalizer = unorm2_getNFKDInstance(&status); + break; + } + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(), + aString.size(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + size_t spanLength = AssertedCast<size_t>(spanLengthInt); + MOZ_ASSERT(spanLength <= aString.size()); + + // Return if the input string is already normalized. + if (spanLength == aString.size()) { + return AlreadyNormalized::Yes; + } + + if (!aBuffer.reserve(aString.size())) { + return Err(ICUError::OutOfMemory); + } + + // Copy the already normalized prefix. + if (spanLength > 0) { + PodCopy(aBuffer.data(), aString.data(), spanLength); + + aBuffer.written(spanLength); + } + + MOZ_TRY(FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + Span<const char16_t> remaining = aString.From(spanLength); + return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength, + length, remaining.data(), + remaining.size(), status); + })); + + return AlreadyNormalized::No; + } + + /** + * Return true if the code point has the binary property "Cased". 
+ */ + static bool IsCased(char32_t codePoint) { + return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED); + } + + /** + * Return true if the code point has the binary property "Case_Ignorable". + */ + static bool IsCaseIgnorable(char32_t codePoint) { + return u_hasBinaryProperty(static_cast<UChar32>(codePoint), + UCHAR_CASE_IGNORABLE); + } + + /** + * Return the NFC pairwise composition of the two input characters, if any; + * returns 0 (which we know is not a composed char!) if none exists. + */ + static char32_t ComposePairNFC(char32_t a, char32_t b) { + // unorm2_getNFCInstance returns a static instance that does not have to be + // released here. If it fails, we just return 0 (no composition) always. + static UErrorCode status = U_ZERO_ERROR; + static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); + if (U_FAILURE(status)) { + return 0; + } + UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a), + static_cast<UChar32>(b)); + return ch < 0 ? 0 : static_cast<char32_t>(ch); + } + + /** + * Put the "raw" (single-level) canonical decomposition of the input char, if + * any, into the provided buffer. Canonical decomps are never more than two + * chars in length (although full normalization may result in longer output + * due to recursion). + * Returns the length of the decomposition (0 if none, else 1 or 2). + */ + static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) { + // unorm2_getNFCInstance returns a static instance that does not have to be + // released here. If it fails, we just return 0 (no decomposition) always. + // Although we are using it to query for a decomposition, the mode of the + // Normalizer2 is irrelevant here, so we may as well use the same singleton + // instance as ComposePairNFC. 
+ static UErrorCode status = U_ZERO_ERROR; + static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); + if (U_FAILURE(status)) { + return 0; + } + + // Canonical decompositions are never more than two Unicode characters, + // or a maximum of 4 utf-16 code units. + const unsigned MAX_DECOMP_LENGTH = 4; + UErrorCode error = U_ZERO_ERROR; + UChar decompUtf16[MAX_DECOMP_LENGTH]; + int32_t len = + unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab), + decompUtf16, MAX_DECOMP_LENGTH, &error); + if (U_FAILURE(error) || len < 0) { + return 0; + } + UText text = UTEXT_INITIALIZER; + utext_openUChars(&text, decompUtf16, len, &error); + MOZ_ASSERT(U_SUCCESS(error)); + UChar32 ch = UTEXT_NEXT32(&text); + len = 0; + if (ch != U_SENTINEL) { + decomp[0] = static_cast<char32_t>(ch); + ++len; + ch = UTEXT_NEXT32(&text); + if (ch != U_SENTINEL) { + decomp[1] = static_cast<char32_t>(ch); + ++len; + } + } + utext_close(&text); + return len; + } + + /** + * Return the Unicode version, for example "13.0". + */ + static Span<const char> GetUnicodeVersion(); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/TimeZone.cpp b/intl/components/src/TimeZone.cpp new file mode 100644 index 0000000000..145dd3f071 --- /dev/null +++ b/intl/components/src/TimeZone.cpp @@ -0,0 +1,344 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/TimeZone.h" + +#include "mozilla/Vector.h" + +#include <algorithm> +#include <string_view> + +#include "unicode/uenum.h" +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE +# include "unicode/basictz.h" +#endif + +namespace mozilla::intl { + +/* static */ +Result<UniquePtr<TimeZone>, ICUError> TimeZone::TryCreate( + Maybe<Span<const char16_t>> aTimeZoneOverride) { + const UChar* zoneID = nullptr; + int32_t zoneIDLen = 0; + if (aTimeZoneOverride) { + zoneIDLen = static_cast<int32_t>(aTimeZoneOverride->Length()); + zoneID = aTimeZoneOverride->Elements(); + } + +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + UniquePtr<icu::TimeZone> tz; + if (zoneID) { + tz.reset( + icu::TimeZone::createTimeZone(icu::UnicodeString(zoneID, zoneIDLen))); + } else { + tz.reset(icu::TimeZone::createDefault()); + } + MOZ_ASSERT(tz); + + if (*tz == icu::TimeZone::getUnknown()) { + return Err(ICUError::InternalError); + } + + return MakeUnique<TimeZone>(std::move(tz)); +#else + // An empty string is used for the root locale. This is regarded as the base + // locale of all locales, and is used as the language/country neutral locale + // for locale sensitive operations. + const char* rootLocale = ""; + + UErrorCode status = U_ZERO_ERROR; + UCalendar* calendar = + ucal_open(zoneID, zoneIDLen, rootLocale, UCAL_DEFAULT, &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // https://tc39.es/ecma262/#sec-time-values-and-time-range + // + // A time value supports a slightly smaller range of -8,640,000,000,000,000 to + // 8,640,000,000,000,000 milliseconds. + constexpr double StartOfTime = -8.64e15; + + // Ensure all computations are performed in the proleptic Gregorian calendar. 
+ ucal_setGregorianChange(calendar, StartOfTime, &status); + + if (U_FAILURE(status)) { + // Nothing owns |calendar| yet, so close it before bailing out. + ucal_close(calendar); + return Err(ToICUError(status)); + } + + return MakeUnique<TimeZone>(calendar); +#endif +} + +Result<int32_t, ICUError> TimeZone::GetRawOffsetMs() { +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + return mTimeZone->getRawOffset(); +#else + // Reset the time in case the calendar has been modified. + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(mCalendar, ucal_getNow(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t offset = ucal_get(mCalendar, UCAL_ZONE_OFFSET, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return offset; +#endif +} + +Result<int32_t, ICUError> TimeZone::GetDSTOffsetMs(int64_t aUTCMilliseconds) { + UDate date = UDate(aUTCMilliseconds); + +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + constexpr bool dateIsLocalTime = false; + int32_t rawOffset, dstOffset; + UErrorCode status = U_ZERO_ERROR; + + mTimeZone->getOffset(date, dateIsLocalTime, rawOffset, dstOffset, status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return dstOffset; +#else + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(mCalendar, date, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t dstOffset = ucal_get(mCalendar, UCAL_DST_OFFSET, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return dstOffset; +#endif +} + +Result<int32_t, ICUError> TimeZone::GetOffsetMs(int64_t aUTCMilliseconds) { + UDate date = UDate(aUTCMilliseconds); + +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + constexpr bool dateIsLocalTime = false; + int32_t rawOffset, dstOffset; + UErrorCode status = U_ZERO_ERROR; + + mTimeZone->getOffset(date, dateIsLocalTime, rawOffset, dstOffset, status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return rawOffset + dstOffset; +#else + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(mCalendar, date, &status); + if 
(U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t rawOffset = ucal_get(mCalendar, UCAL_ZONE_OFFSET, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t dstOffset = ucal_get(mCalendar, UCAL_DST_OFFSET, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return rawOffset + dstOffset; +#endif +} + +Result<int32_t, ICUError> TimeZone::GetUTCOffsetMs(int64_t aLocalMilliseconds) { + // https://tc39.es/ecma262/#sec-local-time-zone-adjustment + // + // LocalTZA ( t, isUTC ) + // + // When t_local represents local time repeating multiple times at a negative + // time zone transition (e.g. when the daylight saving time ends or the time + // zone offset is decreased due to a time zone rule change) or skipped local + // time at a positive time zone transition (e.g. when the daylight saving + // time starts or the time zone offset is increased due to a time zone rule + // change), t_local must be interpreted using the time zone offset before the + // transition. + constexpr UTimeZoneLocalOption skippedTime = UCAL_TZ_LOCAL_FORMER; + constexpr UTimeZoneLocalOption repeatedTime = UCAL_TZ_LOCAL_FORMER; + + UDate date = UDate(aLocalMilliseconds); + +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + int32_t rawOffset, dstOffset; + UErrorCode status = U_ZERO_ERROR; + + // All ICU TimeZone classes derive from BasicTimeZone, so we can safely + // perform the static_cast. + // Once <https://unicode-org.atlassian.net/browse/ICU-13705> is fixed we + // can remove this extra cast. 
+ auto* basicTz = static_cast<icu::BasicTimeZone*>(mTimeZone.get()); + basicTz->getOffsetFromLocal(date, skippedTime, repeatedTime, rawOffset, + dstOffset, status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return rawOffset + dstOffset; +#else + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(mCalendar, date, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t rawOffset, dstOffset; + ucal_getTimeZoneOffsetFromLocal(mCalendar, skippedTime, repeatedTime, + &rawOffset, &dstOffset, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return rawOffset + dstOffset; +#endif +} + +using TimeZoneIdentifierVector = + Vector<char16_t, TimeZone::TimeZoneIdentifierLength>; + +#if !MOZ_INTL_USE_ICU_CPP_TIMEZONE +static bool IsUnknownTimeZone(const TimeZoneIdentifierVector& timeZone) { + constexpr std::string_view unknownTimeZone = UCAL_UNKNOWN_ZONE_ID; + + return timeZone.length() == unknownTimeZone.length() && + std::equal(timeZone.begin(), timeZone.end(), unknownTimeZone.begin(), + unknownTimeZone.end()); +} + +static ICUResult SetDefaultTimeZone(TimeZoneIdentifierVector& timeZone) { + // The string mustn't already be null-terminated. + MOZ_ASSERT_IF(!timeZone.empty(), timeZone.end()[-1] != '\0'); + + // The time zone identifier must be a null-terminated string. 
+ if (!timeZone.append('\0')) { + return Err(ICUError::OutOfMemory); + } + + UErrorCode status = U_ZERO_ERROR; + ucal_setDefaultTimeZone(timeZone.begin(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return Ok{}; +} +#endif + +Result<bool, ICUError> TimeZone::SetDefaultTimeZone( + Span<const char> aTimeZone) { +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + icu::UnicodeString tzid(aTimeZone.data(), aTimeZone.size(), US_INV); + if (tzid.isBogus()) { + return Err(ICUError::OutOfMemory); + } + + UniquePtr<icu::TimeZone> newTimeZone(icu::TimeZone::createTimeZone(tzid)); + MOZ_ASSERT(newTimeZone); + + if (*newTimeZone != icu::TimeZone::getUnknown()) { + // adoptDefault() takes ownership of the time zone. + icu::TimeZone::adoptDefault(newTimeZone.release()); + return true; + } +#else + TimeZoneIdentifierVector tzid; + if (!tzid.append(aTimeZone.data(), aTimeZone.size())) { + return Err(ICUError::OutOfMemory); + } + + // Retrieve the current default time zone in case we need to restore it. + TimeZoneIdentifierVector defaultTimeZone; + MOZ_TRY(FillBufferWithICUCall(defaultTimeZone, ucal_getDefaultTimeZone)); + + // Try to set the new time zone. + MOZ_TRY(mozilla::intl::SetDefaultTimeZone(tzid)); + + // Check if the time zone was actually applied. + TimeZoneIdentifierVector newTimeZone; + MOZ_TRY(FillBufferWithICUCall(newTimeZone, ucal_getDefaultTimeZone)); + + // Return if the new time zone was successfully applied. + if (!IsUnknownTimeZone(newTimeZone)) { + return true; + } + + // Otherwise restore the original time zone. 
+ MOZ_TRY(mozilla::intl::SetDefaultTimeZone(defaultTimeZone)); +#endif + + return false; +} + +ICUResult TimeZone::SetDefaultTimeZoneFromHostTimeZone() { +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + if (icu::TimeZone* defaultZone = icu::TimeZone::detectHostTimeZone()) { + icu::TimeZone::adoptDefault(defaultZone); + } +#else + TimeZoneIdentifierVector hostTimeZone; + MOZ_TRY(FillBufferWithICUCall(hostTimeZone, ucal_getHostTimeZone)); + + MOZ_TRY(mozilla::intl::SetDefaultTimeZone(hostTimeZone)); +#endif + + return Ok{}; +} + +Result<Span<const char>, ICUError> TimeZone::GetTZDataVersion() { + UErrorCode status = U_ZERO_ERROR; + const char* tzdataVersion = ucal_getTZDataVersion(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return MakeStringSpan(tzdataVersion); +} + +Result<SpanEnumeration<char>, ICUError> TimeZone::GetAvailableTimeZones( + const char* aRegion) { + // Get the time zones that are commonly used in the given region. Uses the + // UCAL_ZONE_TYPE_ANY filter so we have more fine-grained control over the + // returned time zones and don't omit time zones which are considered links in + // ICU, but are treated as proper zones in IANA. 
+ UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucal_openTimeZoneIDEnumeration( + UCAL_ZONE_TYPE_ANY, aRegion, nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return SpanEnumeration<char>(enumeration); +} + +Result<SpanEnumeration<char>, ICUError> TimeZone::GetAvailableTimeZones() { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucal_openTimeZones(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return SpanEnumeration<char>(enumeration); +} + +#if !MOZ_INTL_USE_ICU_CPP_TIMEZONE +TimeZone::~TimeZone() { + MOZ_ASSERT(mCalendar); + ucal_close(mCalendar); +} +#endif + +} // namespace mozilla::intl diff --git a/intl/components/src/TimeZone.h b/intl/components/src/TimeZone.h new file mode 100644 index 0000000000..364cb45c2f --- /dev/null +++ b/intl/components/src/TimeZone.h @@ -0,0 +1,257 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_TimeZone_h_ +#define intl_components_TimeZone_h_ + +// ICU doesn't provide a separate C API for time zone functions, but instead +// requires to use UCalendar. This adds a measurable overhead when compared to +// using ICU's C++ TimeZone API, therefore we prefer to use the C++ API when +// possible. Due to the lack of a stable ABI in C++, it's only possible to use +// the C++ API when we use our in-tree ICU copy. 
+#if !MOZ_SYSTEM_ICU +# define MOZ_INTL_USE_ICU_CPP_TIMEZONE 1 +#else +# define MOZ_INTL_USE_ICU_CPP_TIMEZONE 0 +#endif + +#include <stdint.h> +#include <utility> + +#include "unicode/ucal.h" +#include "unicode/utypes.h" +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE +# include "unicode/locid.h" +# include "unicode/timezone.h" +# include "unicode/unistr.h" +#endif + +#include "mozilla/Assertions.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with time zones in + * internationalization code. It is used in coordination with other operations + * such as datetime formatting. + */ +class TimeZone final { + public: +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + explicit TimeZone(UniquePtr<icu::TimeZone> aTimeZone) + : mTimeZone(std::move(aTimeZone)) { + MOZ_ASSERT(mTimeZone); + } +#else + explicit TimeZone(UCalendar* aCalendar) : mCalendar(aCalendar) { + MOZ_ASSERT(mCalendar); + } +#endif + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. + TimeZone(const TimeZone&) = delete; + TimeZone& operator=(const TimeZone&) = delete; + +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + ~TimeZone() = default; +#else + ~TimeZone(); +#endif + + /** + * Create a TimeZone. + */ + static Result<UniquePtr<TimeZone>, ICUError> TryCreate( + Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); + + /** + * A number indicating the raw offset from GMT in milliseconds. + */ + Result<int32_t, ICUError> GetRawOffsetMs(); + + /** + * Return the daylight saving offset in milliseconds at the given UTC time. + */ + Result<int32_t, ICUError> GetDSTOffsetMs(int64_t aUTCMilliseconds); + + /** + * Return the local offset in milliseconds at the given UTC time. 
+ */ + Result<int32_t, ICUError> GetOffsetMs(int64_t aUTCMilliseconds); + + /** + * Return the UTC offset in milliseconds at the given local time. + */ + Result<int32_t, ICUError> GetUTCOffsetMs(int64_t aLocalMilliseconds); + + enum class DaylightSavings : bool { No, Yes }; + + /** + * Return the display name for this time zone. + */ + template <typename B> + ICUResult GetDisplayName(const char* aLocale, + DaylightSavings aDaylightSavings, B& aBuffer) { +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + icu::UnicodeString displayName; + mTimeZone->getDisplayName(static_cast<bool>(aDaylightSavings), + icu::TimeZone::LONG, icu::Locale(aLocale), + displayName); + return FillBuffer(displayName, aBuffer); +#else + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + UCalendarDisplayNameType type = + static_cast<bool>(aDaylightSavings) ? UCAL_DST : UCAL_STANDARD; + return ucal_getTimeZoneDisplayName(mCalendar, type, aLocale, target, + length, status); + }); +#endif + } + + /** + * Return the identifier for this time zone. + */ + template <typename B> + ICUResult GetId(B& aBuffer) { +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + icu::UnicodeString id; + mTimeZone->getID(id); + return FillBuffer(id, aBuffer); +#else + return FillBufferWithICUCall( + aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { + return ucal_getTimeZoneID(mCalendar, target, length, status); + }); +#endif + } + + /** + * Fill the buffer with the system's default IANA time zone identifier, e.g. + * "America/Chicago". + */ + template <typename B> + static ICUResult GetDefaultTimeZone(B& aBuffer) { + return FillBufferWithICUCall(aBuffer, ucal_getDefaultTimeZone); + } + + /** + * Fill the buffer with the host system's default IANA time zone identifier, + * e.g. "America/Chicago". + * + * NOTE: This function is not thread-safe. 
+ */ + template <typename B> + static ICUResult GetHostTimeZone(B& aBuffer) { + return FillBufferWithICUCall(aBuffer, ucal_getHostTimeZone); + } + + /** + * Set the default time zone. + */ + static Result<bool, ICUError> SetDefaultTimeZone(Span<const char> aTimeZone); + + /** + * Set the default time zone using the host system's time zone. + * + * NOTE: This function is not thread-safe. + */ + static ICUResult SetDefaultTimeZoneFromHostTimeZone(); + + /** + * Return the tzdata version. + * + * The tzdata version is a string of the form "<year><release>", e.g. "2021a". + */ + static Result<Span<const char>, ICUError> GetTZDataVersion(); + + /** + * Constant for the typical maximal length of a time zone identifier. + * + * At the time of this writing 32 characters fits every supported time zone: + * + * Intl.supportedValuesOf("timeZone") + * .reduce((acc, v) => Math.max(acc, v.length), 0) + */ + static constexpr size_t TimeZoneIdentifierLength = 32; + + /** + * Returns the canonical system time zone ID or the normalized custom time + * zone ID for the given time zone ID. + */ + template <typename B> + static ICUResult GetCanonicalTimeZoneID(Span<const char16_t> inputTimeZone, + B& aBuffer) { + static_assert(std::is_same_v<typename B::CharType, char16_t>, + "Currently only UTF-16 buffers are supported."); + + if (aBuffer.capacity() == 0) { + // ucal_getCanonicalTimeZoneID differs from other API calls and fails when + // passed a nullptr or 0 length result. Reserve some space initially so + // that a real pointer will be used in the API. 
+ if (!aBuffer.reserve(TimeZoneIdentifierLength)) { + return Err(ICUError::OutOfMemory); + } + } + + return FillBufferWithICUCall( + aBuffer, + [&inputTimeZone](UChar* target, int32_t length, UErrorCode* status) { + return ucal_getCanonicalTimeZoneID( + inputTimeZone.Elements(), + static_cast<int32_t>(inputTimeZone.Length()), target, length, + /* isSystemID */ nullptr, status); + }); + } + + /** + * Return an enumeration over all time zones commonly used in the given + * region. + */ + static Result<SpanEnumeration<char>, ICUError> GetAvailableTimeZones( + const char* aRegion); + + /** + * Return an enumeration over all available time zones. + */ + static Result<SpanEnumeration<char>, ICUError> GetAvailableTimeZones(); + + private: +#if MOZ_INTL_USE_ICU_CPP_TIMEZONE + template <typename B> + static ICUResult FillBuffer(const icu::UnicodeString& aString, B& aBuffer) { + int32_t length = aString.length(); + if (!aBuffer.reserve(AssertedCast<size_t>(length))) { + return Err(ICUError::OutOfMemory); + } + + UErrorCode status = U_ZERO_ERROR; + int32_t written = aString.extract(aBuffer.data(), length, status); + if (!ICUSuccessForStringSpan(status)) { + return Err(ToICUError(status)); + } + MOZ_ASSERT(written == length); + + aBuffer.written(written); + + return Ok{}; + } + + UniquePtr<icu::TimeZone> mTimeZone = nullptr; +#else + UCalendar* mCalendar = nullptr; +#endif +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/UnicodeProperties.h b/intl/components/src/UnicodeProperties.h new file mode 100644 index 0000000000..7fd64e099e --- /dev/null +++ b/intl/components/src/UnicodeProperties.h @@ -0,0 +1,310 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef intl_components_UnicodeProperties_h_ +#define intl_components_UnicodeProperties_h_ + +#include "mozilla/intl/BidiClass.h" +#include "mozilla/intl/GeneralCategory.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/UnicodeScriptCodes.h" +#include "mozilla/Vector.h" + +#include "unicode/uchar.h" +#include "unicode/uscript.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with text properties. + */ +class UnicodeProperties final { + public: + /** + * Return the BidiClass for the character. + */ + static inline BidiClass GetBidiClass(uint32_t aCh) { + return BidiClass(u_charDirection(aCh)); + } + + /** + * Maps the specified character to a "mirror-image" character. + */ + static inline uint32_t CharMirror(uint32_t aCh) { return u_charMirror(aCh); } + + /** + * Return the general category value for the code point. + */ + static inline GeneralCategory CharType(uint32_t aCh) { + return GeneralCategory(u_charType(aCh)); + } + + /** + * Determine whether the code point has the Bidi_Mirrored property. + */ + static inline bool IsMirrored(uint32_t aCh) { return u_isMirrored(aCh); } + + /** + * Returns the combining class of the code point as specified in + * UnicodeData.txt. + */ + static inline uint8_t GetCombiningClass(uint32_t aCh) { + return u_getCombiningClass(aCh); + } + + enum class IntProperty { + BidiPairedBracketType, + EastAsianWidth, + HangulSyllableType, + LineBreak, + NumericType, + }; + + /** + * Get the property value for an enumerated or integer Unicode property for a + * code point. 
+ */ + static inline int32_t GetIntPropertyValue(uint32_t aCh, IntProperty aProp) { + UProperty prop; + switch (aProp) { + case IntProperty::BidiPairedBracketType: + prop = UCHAR_BIDI_PAIRED_BRACKET_TYPE; + break; + case IntProperty::EastAsianWidth: + prop = UCHAR_EAST_ASIAN_WIDTH; + break; + case IntProperty::HangulSyllableType: + prop = UCHAR_HANGUL_SYLLABLE_TYPE; + break; + case IntProperty::LineBreak: + prop = UCHAR_LINE_BREAK; + break; + case IntProperty::NumericType: + prop = UCHAR_NUMERIC_TYPE; + break; + } + return u_getIntPropertyValue(aCh, prop); + } + + /** + * Get the numeric value for a Unicode code point as defined in the + * Unicode Character Database if the input is decimal or a digit, + * otherwise, returns -1. + */ + static inline int8_t GetNumericValue(uint32_t aCh) { + UNumericType type = + UNumericType(GetIntPropertyValue(aCh, IntProperty::NumericType)); + return type == U_NT_DECIMAL || type == U_NT_DIGIT + ? int8_t(u_getNumericValue(aCh)) + : -1; + } + + /** + * Maps the specified character to its paired bracket character. + */ + static inline uint32_t GetBidiPairedBracket(uint32_t aCh) { + return u_getBidiPairedBracket(aCh); + } + + /** + * The given character is mapped to its uppercase equivalent according to + * UnicodeData.txt; if the character has no uppercase equivalent, the + * character itself is returned. + */ + static inline uint32_t ToUpper(uint32_t aCh) { return u_toupper(aCh); } + + /** + * The given character is mapped to its lowercase equivalent according to + * UnicodeData.txt; if the character has no lowercase equivalent, the + * character itself is returned. + */ + static inline uint32_t ToLower(uint32_t aCh) { return u_tolower(aCh); } + + /** + * Check if a code point has the Lowercase Unicode property. 
+ */ + static inline bool IsLowercase(uint32_t aCh) { return u_isULowercase(aCh); } + + /** + * The given character is mapped to its titlecase equivalent according to + * UnicodeData.txt; if the character has no titlecase equivalent, the + * character itself is returned. + */ + static inline uint32_t ToTitle(uint32_t aCh) { return u_totitle(aCh); } + + /** + * The given character is mapped to its case folding equivalent according to + * UnicodeData.txt and CaseFolding.txt; + * if the character has no case folding equivalent, the character + * itself is returned. + */ + static inline uint32_t FoldCase(uint32_t aCh) { + return u_foldCase(aCh, U_FOLD_CASE_DEFAULT); + } + + enum class BinaryProperty { + DefaultIgnorableCodePoint, + Emoji, + EmojiPresentation, + }; + + /** + * Check a binary Unicode property for a code point. + */ + static inline bool HasBinaryProperty(uint32_t aCh, BinaryProperty aProp) { + UProperty prop; + switch (aProp) { + case BinaryProperty::DefaultIgnorableCodePoint: + prop = UCHAR_DEFAULT_IGNORABLE_CODE_POINT; + break; + case BinaryProperty::Emoji: + prop = UCHAR_EMOJI; + break; + case BinaryProperty::EmojiPresentation: + prop = UCHAR_EMOJI_PRESENTATION; + break; + } + return u_hasBinaryProperty(aCh, prop); + } + + /** + * Check if the width of aCh is full width, half width or wide + * excluding emoji. + */ + static inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) { + switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { + case U_EA_FULLWIDTH: + case U_EA_HALFWIDTH: + return true; + case U_EA_WIDE: + return HasBinaryProperty(aCh, BinaryProperty::Emoji) ? false : true; + case U_EA_AMBIGUOUS: + case U_EA_NARROW: + case U_EA_NEUTRAL: + return false; + } + return false; + } + + /** + * Check if the width of aCh is ambiguous, full width, or wide. 
+ */ + static inline bool IsEastAsianWidthAFW(uint32_t aCh) { + switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { + case U_EA_AMBIGUOUS: + case U_EA_FULLWIDTH: + case U_EA_WIDE: + return true; + case U_EA_HALFWIDTH: + case U_EA_NARROW: + case U_EA_NEUTRAL: + return false; + } + return false; + } + + /** + * Check if the width of aCh is full width, or wide. + */ + static inline bool IsEastAsianWidthFW(uint32_t aCh) { + switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { + case U_EA_FULLWIDTH: + case U_EA_WIDE: + return true; + case U_EA_AMBIGUOUS: + case U_EA_HALFWIDTH: + case U_EA_NARROW: + case U_EA_NEUTRAL: + return false; + } + return false; + } + + /** + * Check if the CharType of aCh is math or other symbol. + */ + static inline bool IsMathOrMusicSymbol(uint32_t aCh) { + // Keep this function in sync with is_math_symbol in base_chars.py. + return CharType(aCh) == GeneralCategory::Math_Symbol || + CharType(aCh) == GeneralCategory::Other_Symbol; + } + + static inline Script GetScriptCode(uint32_t aCh) { + // We can safely ignore the error code here because uscript_getScript + // returns USCRIPT_INVALID_CODE in the event of an error. + UErrorCode err = U_ZERO_ERROR; + return Script(uscript_getScript(aCh, &err)); + } + + static inline bool HasScript(uint32_t aCh, Script aScript) { + return uscript_hasScript(aCh, UScriptCode(aScript)); + } + + static inline const char* GetScriptShortName(Script aScript) { + return uscript_getShortName(UScriptCode(aScript)); + } + + static inline int32_t GetMaxNumberOfScripts() { + return u_getIntPropertyMaxValue(UCHAR_SCRIPT); + } + + // The code point which has the most script extensions is 0x0965, which has 21 + // script extensions, so choose the vector size as 32 to prevent heap + // allocation. 
+ static constexpr size_t kMaxScripts = 32; + + using ScriptExtensionVector = Vector<Script, kMaxScripts>; + + /** + * Get the script extensions for the given code point, and write the script + * extensions to aExtensions vector. If the code point has script extensions, + * the script code (Script::COMMON or Script::INHERITED) will be excluded. + * + * If the code point doesn't have any script extension, then its script code + * will be written to aExtensions vector. + * + * If the code point is invalid, Script::UNKNOWN will be written to + * aExtensions vector. + * + * Note: aExtensions will be cleared after calling this method regardless of + * failure. + * + * See [1] for the script code of the code point, [2] for the script + * extensions. + * + * https://www.unicode.org/Public/UNIDATA/Scripts.txt + * https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt + */ + static ICUResult GetExtensions(char32_t aCodePoint, + ScriptExtensionVector& aExtensions) { + // Clear the vector first. + aExtensions.clear(); + + // We cannot pass aExtensions to uscript_getScriptExtension as USCriptCode + // takes 4 bytes, so create a local UScriptCode array to get the extensions. + UScriptCode ext[kMaxScripts]; + UErrorCode status = U_ZERO_ERROR; + int32_t len = uscript_getScriptExtensions(static_cast<UChar32>(aCodePoint), + ext, kMaxScripts, &status); + if (U_FAILURE(status)) { + // kMaxScripts should be large enough to hold the maximun number of script + // extensions. 
+ MOZ_DIAGNOSTIC_ASSERT(status != U_BUFFER_OVERFLOW_ERROR); + return Err(ToICUError(status)); + } + + if (!aExtensions.reserve(len)) { + return Err(ICUError::OutOfMemory); + } + + for (int32_t i = 0; i < len; i++) { + aExtensions.infallibleAppend(Script(ext[i])); + } + + return Ok(); + } +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/UnicodeScriptCodes.h b/intl/components/src/UnicodeScriptCodes.h new file mode 100644 index 0000000000..b5d6f490ee --- /dev/null +++ b/intl/components/src/UnicodeScriptCodes.h @@ -0,0 +1,261 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * Derived from the Unicode Character Database by genUnicodePropertyData.pl + * + * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html + */ + +/* + * Created on Tue Oct 25 06:53:25 2022 from UCD data files with version info: + * + +# Unicode Character Database +# Date: 2022-09-02 +# © 2022 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see https://www.unicode.org/terms_of_use.html +# +# For documentation, see the following: +# NamesList.html +# UAX #38, "Unicode Han Database (Unihan)" +# UAX #44, "Unicode Character Database" +# UTS #51, "Unicode Emoji" +# +# The UAXes and UTS #51 can be accessed at https://www.unicode.org/versions/Unicode15.0.0/ + +This directory contains the final data files +for the Unicode Character Database, for Version 15.0.0 of the Unicode Standard. + +# IdentifierStatus.txt +# Date: 2022-08-26, 16:49:09 GMT + +# +# Unihan_Variants.txt +# Date: 2022-08-01 16:36:07 GMT [JHJ] + +# VerticalOrientation-17.txt +# Date: 2016-10-20, 07:00:00 GMT [EM, KI, LI] + + * + * * * * * This file contains MACHINE-GENERATED DATA, do not edit! 
* * * * *
 */

#ifndef intl_components_UnicodeScriptCodes_h_
#define intl_components_UnicodeScriptCodes_h_

namespace mozilla::intl {
// Script codes, stored as a 16-bit enum. The enumerators run contiguously from
// 0 (COMMON) through 199 (NAG_MUNDARI); NUM_SCRIPT_CODES is the count of those
// values and INVALID (-1) is an out-of-band marker.
//
// NOTE(review): presumably the numeric values mirror ICU's UScriptCode, since
// UnicodeProperties.h casts between Script and UScriptCode directly -- confirm
// before changing any value.
// NOTE(review): this header is marked machine-generated; lasting changes
// (these comments included) presumably belong in genUnicodePropertyData.pl.
// NOTE(review): int16_t is used below but no <stdint.h>/<cstdint> include is
// visible in this header; it appears to rely on the including file.
enum class Script : int16_t {
  COMMON = 0,
  INHERITED = 1,
  ARABIC = 2,
  ARMENIAN = 3,
  BENGALI = 4,
  BOPOMOFO = 5,
  CHEROKEE = 6,
  COPTIC = 7,
  CYRILLIC = 8,
  DESERET = 9,
  DEVANAGARI = 10,
  ETHIOPIC = 11,
  GEORGIAN = 12,
  GOTHIC = 13,
  GREEK = 14,
  GUJARATI = 15,
  GURMUKHI = 16,
  HAN = 17,
  HANGUL = 18,
  HEBREW = 19,
  HIRAGANA = 20,
  KANNADA = 21,
  KATAKANA = 22,
  KHMER = 23,
  LAO = 24,
  LATIN = 25,
  MALAYALAM = 26,
  MONGOLIAN = 27,
  MYANMAR = 28,
  OGHAM = 29,
  OLD_ITALIC = 30,
  ORIYA = 31,
  RUNIC = 32,
  SINHALA = 33,
  SYRIAC = 34,
  TAMIL = 35,
  TELUGU = 36,
  THAANA = 37,
  THAI = 38,
  TIBETAN = 39,
  CANADIAN_ABORIGINAL = 40,
  YI = 41,
  TAGALOG = 42,
  HANUNOO = 43,
  BUHID = 44,
  TAGBANWA = 45,
  BRAILLE = 46,
  CYPRIOT = 47,
  LIMBU = 48,
  LINEAR_B = 49,
  OSMANYA = 50,
  SHAVIAN = 51,
  TAI_LE = 52,
  UGARITIC = 53,
  KATAKANA_OR_HIRAGANA = 54,
  BUGINESE = 55,
  GLAGOLITIC = 56,
  KHAROSHTHI = 57,
  SYLOTI_NAGRI = 58,
  NEW_TAI_LUE = 59,
  TIFINAGH = 60,
  OLD_PERSIAN = 61,
  BALINESE = 62,
  BATAK = 63,
  BLISSYMBOLS = 64,
  BRAHMI = 65,
  CHAM = 66,
  CIRTH = 67,
  OLD_CHURCH_SLAVONIC_CYRILLIC = 68,
  DEMOTIC_EGYPTIAN = 69,
  HIERATIC_EGYPTIAN = 70,
  EGYPTIAN_HIEROGLYPHS = 71,
  KHUTSURI = 72,
  SIMPLIFIED_HAN = 73,
  TRADITIONAL_HAN = 74,
  PAHAWH_HMONG = 75,
  OLD_HUNGARIAN = 76,
  HARAPPAN_INDUS = 77,
  JAVANESE = 78,
  KAYAH_LI = 79,
  LATIN_FRAKTUR = 80,
  LATIN_GAELIC = 81,
  LEPCHA = 82,
  LINEAR_A = 83,
  MANDAIC = 84,
  MAYAN_HIEROGLYPHS = 85,
  MEROITIC_HIEROGLYPHS = 86,
  NKO = 87,
  OLD_TURKIC = 88,
  OLD_PERMIC = 89,
  PHAGS_PA = 90,
  PHOENICIAN = 91,
  MIAO = 92,
  RONGORONGO = 93,
  SARATI = 94,
  ESTRANGELO_SYRIAC = 95,
  WESTERN_SYRIAC = 96,
  EASTERN_SYRIAC = 97,
  TENGWAR = 98,
  VAI = 99,
  VISIBLE_SPEECH = 100,
  CUNEIFORM = 101,
  UNWRITTEN_LANGUAGES = 102,
  UNKNOWN = 103,
  CARIAN = 104,
  JAPANESE = 105,
  TAI_THAM = 106,
  LYCIAN = 107,
  LYDIAN = 108,
  OL_CHIKI = 109,
  REJANG = 110,
  SAURASHTRA = 111,
  SIGNWRITING = 112,
  SUNDANESE = 113,
  MOON = 114,
  MEETEI_MAYEK = 115,
  IMPERIAL_ARAMAIC = 116,
  AVESTAN = 117,
  CHAKMA = 118,
  KOREAN = 119,
  KAITHI = 120,
  MANICHAEAN = 121,
  INSCRIPTIONAL_PAHLAVI = 122,
  PSALTER_PAHLAVI = 123,
  BOOK_PAHLAVI = 124,
  INSCRIPTIONAL_PARTHIAN = 125,
  SAMARITAN = 126,
  TAI_VIET = 127,
  MATHEMATICAL_NOTATION = 128,
  SYMBOLS = 129,
  BAMUM = 130,
  LISU = 131,
  NAKHI_GEBA = 132,
  OLD_SOUTH_ARABIAN = 133,
  BASSA_VAH = 134,
  DUPLOYAN = 135,
  ELBASAN = 136,
  GRANTHA = 137,
  KPELLE = 138,
  LOMA = 139,
  MENDE_KIKAKUI = 140,
  MEROITIC_CURSIVE = 141,
  OLD_NORTH_ARABIAN = 142,
  NABATAEAN = 143,
  PALMYRENE = 144,
  KHUDAWADI = 145,
  WARANG_CITI = 146,
  AFAKA = 147,
  JURCHEN = 148,
  MRO = 149,
  NUSHU = 150,
  SHARADA = 151,
  SORA_SOMPENG = 152,
  TAKRI = 153,
  TANGUT = 154,
  WOLEAI = 155,
  ANATOLIAN_HIEROGLYPHS = 156,
  KHOJKI = 157,
  TIRHUTA = 158,
  CAUCASIAN_ALBANIAN = 159,
  MAHAJANI = 160,
  AHOM = 161,
  HATRAN = 162,
  MODI = 163,
  MULTANI = 164,
  PAU_CIN_HAU = 165,
  SIDDHAM = 166,
  ADLAM = 167,
  BHAIKSUKI = 168,
  MARCHEN = 169,
  NEWA = 170,
  OSAGE = 171,
  HAN_WITH_BOPOMOFO = 172,
  JAMO = 173,
  SYMBOLS_EMOJI = 174,
  MASARAM_GONDI = 175,
  SOYOMBO = 176,
  ZANABAZAR_SQUARE = 177,
  DOGRA = 178,
  GUNJALA_GONDI = 179,
  MAKASAR = 180,
  MEDEFAIDRIN = 181,
  HANIFI_ROHINGYA = 182,
  SOGDIAN = 183,
  OLD_SOGDIAN = 184,
  ELYMAIC = 185,
  NYIAKENG_PUACHUE_HMONG = 186,
  NANDINAGARI = 187,
  WANCHO = 188,
  CHORASMIAN = 189,
  DIVES_AKURU = 190,
  KHITAN_SMALL_SCRIPT = 191,
  YEZIDI = 192,
  CYPRO_MINOAN = 193,
  OLD_UYGHUR = 194,
  TANGSA = 195,
  TOTO = 196,
  VITHKUQI = 197,
  KAWI = 198,
  NAG_MUNDARI = 199,

  NUM_SCRIPT_CODES = 200,

  INVALID = -1
};
}  // namespace mozilla::intl

#endif
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */ |