From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- intl/components/src/Bidi.cpp | 212 ++++ intl/components/src/Bidi.h | 183 +++ intl/components/src/BidiClass.h | 49 + intl/components/src/BidiEmbeddingLevel.cpp | 53 + intl/components/src/BidiEmbeddingLevel.h | 113 ++ intl/components/src/Calendar.cpp | 172 +++ intl/components/src/Calendar.h | 133 ++ intl/components/src/Collator.cpp | 295 +++++ intl/components/src/Collator.h | 348 +++++ intl/components/src/Currency.cpp | 22 + intl/components/src/Currency.h | 30 + intl/components/src/DateIntervalFormat.cpp | 266 ++++ intl/components/src/DateIntervalFormat.h | 107 ++ intl/components/src/DateTimeFormat.cpp | 1148 +++++++++++++++++ intl/components/src/DateTimeFormat.h | 593 +++++++++ intl/components/src/DateTimeFormatUtils.cpp | 104 ++ intl/components/src/DateTimeFormatUtils.h | 14 + intl/components/src/DateTimePart.h | 84 ++ intl/components/src/DateTimePatternGenerator.cpp | 49 + intl/components/src/DateTimePatternGenerator.h | 161 +++ intl/components/src/DisplayNames.cpp | 234 ++++ intl/components/src/DisplayNames.h | 971 ++++++++++++++ intl/components/src/FormatBuffer.h | 77 ++ intl/components/src/GeneralCategory.h | 52 + intl/components/src/ICU4CGlue.cpp | 44 + intl/components/src/ICU4CGlue.h | 723 +++++++++++ intl/components/src/ICU4CLibrary.cpp | 41 + intl/components/src/ICU4CLibrary.h | 74 ++ intl/components/src/ICU4XGeckoDataProvider.cpp | 35 + intl/components/src/ICU4XGeckoDataProvider.h | 21 + intl/components/src/ICUError.h | 118 ++ intl/components/src/IDNA.cpp | 26 + intl/components/src/IDNA.h | 138 ++ intl/components/src/ListFormat.cpp | 132 ++ intl/components/src/ListFormat.h | 223 ++++ intl/components/src/Locale.cpp | 1471 ++++++++++++++++++++++ intl/components/src/Locale.h | 773 ++++++++++++ intl/components/src/LocaleCanonicalizer.cpp | 36 + 
intl/components/src/LocaleCanonicalizer.h | 43 + intl/components/src/LocaleGenerated.cpp | 1208 ++++++++++++++++++ intl/components/src/MeasureUnit.cpp | 110 ++ intl/components/src/MeasureUnit.h | 155 +++ intl/components/src/MeasureUnitGenerated.h | 70 + intl/components/src/NumberFormat.cpp | 155 +++ intl/components/src/NumberFormat.h | 426 +++++++ intl/components/src/NumberFormatFields.cpp | 396 ++++++ intl/components/src/NumberFormatFields.h | 91 ++ intl/components/src/NumberFormatterSkeleton.cpp | 472 +++++++ intl/components/src/NumberFormatterSkeleton.h | 110 ++ intl/components/src/NumberParser.cpp | 45 + intl/components/src/NumberParser.h | 46 + intl/components/src/NumberPart.h | 53 + intl/components/src/NumberRangeFormat.cpp | 216 ++++ intl/components/src/NumberRangeFormat.h | 237 ++++ intl/components/src/NumberingSystem.cpp | 38 + intl/components/src/NumberingSystem.h | 56 + intl/components/src/PluralRules.cpp | 180 +++ intl/components/src/PluralRules.h | 233 ++++ intl/components/src/RelativeTimeFormat.cpp | 153 +++ intl/components/src/RelativeTimeFormat.h | 147 +++ intl/components/src/ScopedICUObject.h | 40 + intl/components/src/String.cpp | 13 + intl/components/src/String.h | 256 ++++ intl/components/src/TimeZone.cpp | 423 +++++++ intl/components/src/TimeZone.h | 292 +++++ intl/components/src/UnicodeProperties.h | 310 +++++ intl/components/src/UnicodeScriptCodes.h | 261 ++++ 67 files changed, 15560 insertions(+) create mode 100644 intl/components/src/Bidi.cpp create mode 100644 intl/components/src/Bidi.h create mode 100644 intl/components/src/BidiClass.h create mode 100644 intl/components/src/BidiEmbeddingLevel.cpp create mode 100644 intl/components/src/BidiEmbeddingLevel.h create mode 100644 intl/components/src/Calendar.cpp create mode 100644 intl/components/src/Calendar.h create mode 100644 intl/components/src/Collator.cpp create mode 100644 intl/components/src/Collator.h create mode 100644 intl/components/src/Currency.cpp create mode 100644 
intl/components/src/Currency.h create mode 100644 intl/components/src/DateIntervalFormat.cpp create mode 100644 intl/components/src/DateIntervalFormat.h create mode 100644 intl/components/src/DateTimeFormat.cpp create mode 100644 intl/components/src/DateTimeFormat.h create mode 100644 intl/components/src/DateTimeFormatUtils.cpp create mode 100644 intl/components/src/DateTimeFormatUtils.h create mode 100644 intl/components/src/DateTimePart.h create mode 100644 intl/components/src/DateTimePatternGenerator.cpp create mode 100644 intl/components/src/DateTimePatternGenerator.h create mode 100644 intl/components/src/DisplayNames.cpp create mode 100644 intl/components/src/DisplayNames.h create mode 100644 intl/components/src/FormatBuffer.h create mode 100644 intl/components/src/GeneralCategory.h create mode 100644 intl/components/src/ICU4CGlue.cpp create mode 100644 intl/components/src/ICU4CGlue.h create mode 100644 intl/components/src/ICU4CLibrary.cpp create mode 100644 intl/components/src/ICU4CLibrary.h create mode 100644 intl/components/src/ICU4XGeckoDataProvider.cpp create mode 100644 intl/components/src/ICU4XGeckoDataProvider.h create mode 100644 intl/components/src/ICUError.h create mode 100644 intl/components/src/IDNA.cpp create mode 100644 intl/components/src/IDNA.h create mode 100644 intl/components/src/ListFormat.cpp create mode 100644 intl/components/src/ListFormat.h create mode 100644 intl/components/src/Locale.cpp create mode 100644 intl/components/src/Locale.h create mode 100644 intl/components/src/LocaleCanonicalizer.cpp create mode 100644 intl/components/src/LocaleCanonicalizer.h create mode 100644 intl/components/src/LocaleGenerated.cpp create mode 100644 intl/components/src/MeasureUnit.cpp create mode 100644 intl/components/src/MeasureUnit.h create mode 100644 intl/components/src/MeasureUnitGenerated.h create mode 100644 intl/components/src/NumberFormat.cpp create mode 100644 intl/components/src/NumberFormat.h create mode 100644 
intl/components/src/NumberFormatFields.cpp create mode 100644 intl/components/src/NumberFormatFields.h create mode 100644 intl/components/src/NumberFormatterSkeleton.cpp create mode 100644 intl/components/src/NumberFormatterSkeleton.h create mode 100644 intl/components/src/NumberParser.cpp create mode 100644 intl/components/src/NumberParser.h create mode 100644 intl/components/src/NumberPart.h create mode 100644 intl/components/src/NumberRangeFormat.cpp create mode 100644 intl/components/src/NumberRangeFormat.h create mode 100644 intl/components/src/NumberingSystem.cpp create mode 100644 intl/components/src/NumberingSystem.h create mode 100644 intl/components/src/PluralRules.cpp create mode 100644 intl/components/src/PluralRules.h create mode 100644 intl/components/src/RelativeTimeFormat.cpp create mode 100644 intl/components/src/RelativeTimeFormat.h create mode 100644 intl/components/src/ScopedICUObject.h create mode 100644 intl/components/src/String.cpp create mode 100644 intl/components/src/String.h create mode 100644 intl/components/src/TimeZone.cpp create mode 100644 intl/components/src/TimeZone.h create mode 100644 intl/components/src/UnicodeProperties.h create mode 100644 intl/components/src/UnicodeScriptCodes.h (limited to 'intl/components/src') diff --git a/intl/components/src/Bidi.cpp b/intl/components/src/Bidi.cpp new file mode 100644 index 0000000000..5e18e0075d --- /dev/null +++ b/intl/components/src/Bidi.cpp @@ -0,0 +1,212 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/Bidi.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" + +#if !USE_RUST_UNICODE_BIDI +# include "unicode/ubidi.h" +#endif + +namespace mozilla::intl { + +#if USE_RUST_UNICODE_BIDI +using namespace ffi; + +Bidi::Bidi() = default; +Bidi::~Bidi() = default; +#else +Bidi::Bidi() { mBidi = ubidi_open(); } +Bidi::~Bidi() { ubidi_close(mBidi.GetMut()); } +#endif + +ICUResult Bidi::SetParagraph(Span aParagraph, + BidiEmbeddingLevel aLevel) { +#if USE_RUST_UNICODE_BIDI + const auto* text = reinterpret_cast(aParagraph.Elements()); + mBidi.reset(bidi_new(text, aParagraph.Length(), aLevel)); + + return ToICUResult(U_ZERO_ERROR); +#else + // Do not allow any reordering of the runs, as this can change the + // performance characteristics of working with runs. In the default mode, + // the levels can be iterated over directly, rather than relying on computing + // logical runs on the fly. This can have negative performance characteristics + // compared to iterating over the levels. + // + // In the UBIDI_REORDER_RUNS_ONLY the levels are encoded with additional + // information which can be safely ignored in this Bidi implementation. + // Note that this check is here since setting the mode must be done before + // calls to setting the paragraph. 
+ MOZ_ASSERT(ubidi_getReorderingMode(mBidi.GetMut()) == UBIDI_REORDER_DEFAULT); + + UErrorCode status = U_ZERO_ERROR; + ubidi_setPara(mBidi.GetMut(), aParagraph.Elements(), + AssertedCast(aParagraph.Length()), aLevel, nullptr, + &status); + + mLevels = nullptr; + + return ToICUResult(status); +#endif +} + +Bidi::ParagraphDirection Bidi::GetParagraphDirection() const { +#if USE_RUST_UNICODE_BIDI + auto dir = bidi_get_direction(mBidi.get()); + switch (dir) { + case -1: + return Bidi::ParagraphDirection::RTL; + case 0: + return Bidi::ParagraphDirection::Mixed; + case 1: + return Bidi::ParagraphDirection::LTR; + default: + MOZ_ASSERT_UNREACHABLE("Bad direction value"); + return Bidi::ParagraphDirection::Mixed; + } +#else + switch (ubidi_getDirection(mBidi.GetConst())) { + case UBIDI_LTR: + return Bidi::ParagraphDirection::LTR; + case UBIDI_RTL: + return Bidi::ParagraphDirection::RTL; + case UBIDI_MIXED: + return Bidi::ParagraphDirection::Mixed; + case UBIDI_NEUTRAL: + // This is only used in `ubidi_getBaseDirection` which is unused in this + // API. 
+ MOZ_ASSERT_UNREACHABLE("Unexpected UBiDiDirection value."); + }; + return Bidi::ParagraphDirection::Mixed; +#endif +} + +/* static */ +void Bidi::ReorderVisual(const BidiEmbeddingLevel* aLevels, int32_t aLength, + int32_t* aIndexMap) { +#if USE_RUST_UNICODE_BIDI + bidi_reorder_visual(reinterpret_cast(aLevels), aLength, + aIndexMap); +#else + ubidi_reorderVisual(reinterpret_cast(aLevels), aLength, + aIndexMap); +#endif +} + +/* static */ +Bidi::BaseDirection Bidi::GetBaseDirection(Span aText) { +#if USE_RUST_UNICODE_BIDI + const auto* text = reinterpret_cast(aText.Elements()); + switch (bidi_get_base_direction(text, aText.Length(), false)) { + case -1: + return Bidi::BaseDirection::RTL; + case 0: + return Bidi::BaseDirection::Neutral; + case 1: + return Bidi::BaseDirection::LTR; + default: + MOZ_ASSERT_UNREACHABLE("Bad base direction value"); + return Bidi::BaseDirection::Neutral; + } +#else + UBiDiDirection direction = ubidi_getBaseDirection( + aText.Elements(), AssertedCast(aText.Length())); + switch (direction) { + case UBIDI_LTR: + return Bidi::BaseDirection::LTR; + case UBIDI_RTL: + return Bidi::BaseDirection::RTL; + case UBIDI_NEUTRAL: + return Bidi::BaseDirection::Neutral; + case UBIDI_MIXED: + MOZ_ASSERT_UNREACHABLE("Unexpected UBiDiDirection value."); + } + return Bidi::BaseDirection::Neutral; +#endif +} + +#if !USE_RUST_UNICODE_BIDI +static BidiDirection ToBidiDirection(UBiDiDirection aDirection) { + switch (aDirection) { + case UBIDI_LTR: + return BidiDirection::LTR; + case UBIDI_RTL: + return BidiDirection::RTL; + case UBIDI_MIXED: + case UBIDI_NEUTRAL: + MOZ_ASSERT_UNREACHABLE("Unexpected UBiDiDirection value."); + } + return BidiDirection::LTR; +} +#endif + +Result Bidi::CountRuns() { +#if USE_RUST_UNICODE_BIDI + return bidi_count_runs(mBidi.get()); +#else + UErrorCode status = U_ZERO_ERROR; + int32_t runCount = ubidi_countRuns(mBidi.GetMut(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + mLength = 
ubidi_getProcessedLength(mBidi.GetConst()); + mLevels = mLength > 0 ? reinterpret_cast( + ubidi_getLevels(mBidi.GetMut(), &status)) + : nullptr; + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return runCount; +#endif +} + +void Bidi::GetLogicalRun(int32_t aLogicalStart, int32_t* aLogicalLimitOut, + BidiEmbeddingLevel* aLevelOut) { +#if USE_RUST_UNICODE_BIDI + const int32_t length = bidi_get_length(mBidi.get()); + MOZ_DIAGNOSTIC_ASSERT(aLogicalStart < length); + const auto* levels = bidi_get_levels(mBidi.get()); +#else + MOZ_ASSERT(mLevels, "CountRuns hasn't been run?"); + MOZ_RELEASE_ASSERT(aLogicalStart < mLength, "Out of bound"); + const int32_t length = mLength; + const auto* levels = mLevels; +#endif + const uint8_t level = levels[aLogicalStart]; + int32_t limit; + for (limit = aLogicalStart + 1; limit < length; limit++) { + if (levels[limit] != level) { + break; + } + } + *aLogicalLimitOut = limit; + *aLevelOut = BidiEmbeddingLevel(level); +} + +BidiEmbeddingLevel Bidi::GetParagraphEmbeddingLevel() const { +#if USE_RUST_UNICODE_BIDI + return BidiEmbeddingLevel(bidi_get_paragraph_level(mBidi.get())); +#else + return BidiEmbeddingLevel(ubidi_getParaLevel(mBidi.GetConst())); +#endif +} + +BidiDirection Bidi::GetVisualRun(int32_t aRunIndex, int32_t* aLogicalStart, + int32_t* aLength) { +#if USE_RUST_UNICODE_BIDI + auto run = bidi_get_visual_run(mBidi.get(), aRunIndex); + *aLogicalStart = run.start; + *aLength = run.length; + return BidiEmbeddingLevel(run.level).Direction(); +#else + return ToBidiDirection( + ubidi_getVisualRun(mBidi.GetMut(), aRunIndex, aLogicalStart, aLength)); +#endif +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Bidi.h b/intl/components/src/Bidi.h new file mode 100644 index 0000000000..7b901e6bfd --- /dev/null +++ b/intl/components/src/Bidi.h @@ -0,0 +1,183 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Bidi_h_ +#define intl_components_Bidi_h_ + +#include "mozilla/intl/BidiEmbeddingLevel.h" +#include "mozilla/intl/ICU4CGlue.h" + +// Restrict use of the Rust unicode-bidi implementation to Nightly builds, +// pending investigation of perf regressions; Beta/Release builds will +// continue to use the ICU4C implementation for now. +#ifdef NIGHTLY_BUILD +# define USE_RUST_UNICODE_BIDI 1 +#else +# define USE_RUST_UNICODE_BIDI 0 +#endif + +#if USE_RUST_UNICODE_BIDI +# include "mozilla/intl/unicode_bidi_ffi_generated.h" +#else +struct UBiDi; +#endif + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with bidirectional (bidi) + * text. Text is commonly displayed left to right (LTR), especially for + * Latin-based alphabets. However, languages like Arabic and Hebrew display + * text right to left (RTL). When displaying text, LTR and RTL text can be + * combined together in the same paragraph. This class gives tools for working + * with unidirectional, and mixed direction paragraphs. + * + * See the Unicode Bidirectional Algorithm document for implementation details: + * https://unicode.org/reports/tr9/ + */ +class Bidi final { + public: + Bidi(); + ~Bidi(); + + // Not copyable or movable + Bidi(const Bidi&) = delete; + Bidi& operator=(const Bidi&) = delete; + + /** + * This enum indicates the text direction for the set paragraph. Some + * paragraphs are unidirectional, where they only have one direction, or a + * paragraph could use both LTR and RTL. In this case the paragraph's + * direction would be mixed. + */ + enum class ParagraphDirection { LTR, RTL, Mixed }; + + /** + * Set the current paragraph of text to analyze for its bidi properties.
This + * performs the Unicode bidi algorithm as specified by: + * https://unicode.org/reports/tr9/ + * + * After setting the text, the other getter methods can be used to find out + * the directionality of the paragraph text. + */ + ICUResult SetParagraph(Span aParagraph, + BidiEmbeddingLevel aLevel); + + /** + * Get the embedding level for the paragraph that was set by SetParagraph. + */ + BidiEmbeddingLevel GetParagraphEmbeddingLevel() const; + + /** + * Get the directionality of the paragraph text that was set by SetParagraph. + */ + ParagraphDirection GetParagraphDirection() const; + + /** + * Get the number of runs. This function may invoke the actual reordering on + * the Bidi object, after SetParagraph may have resolved only the levels of + * the text. Therefore, `CountRuns` may have to allocate memory, and may fail + * doing so. + */ + Result CountRuns(); + + /** + * Get the next logical run. A logical run is a run of text that has the + * same directionality and embedding level. These runs are in memory order, + * and not in display order. + * + * Important! `Bidi::CountRuns` must be called before calling this method. + * + * @param aLogicalStart is the offset into the paragraph text that marks the + * logical start of the text. + * @param aLogicalLimitOut is an out param that receives the offset just + * past the end of the logical run (an exclusive limit, not a length). + * @param aLevelOut is an out parameter that returns the embedding level for + * the run + */ + void GetLogicalRun(int32_t aLogicalStart, int32_t* aLogicalLimitOut, + BidiEmbeddingLevel* aLevelOut); + + /** + * This is a convenience function that does not use the ICU Bidi object. + * It is intended to be used for when an application has determined the + * embedding levels of objects (character sequences) and just needs to have + * them reordered (L2). + * + * @param aLevels is an array with `aLength` levels that have been + * determined by the application.
+ * + * @param aLength is the number of levels in the array, or, semantically, + * the number of objects to be reordered. It must be greater than 0. + * + * @param aIndexMap is a pointer to an array of `aLength` + * indexes which will reflect the reordering of the characters. + * The array does not need to be initialized. + * The index map will result in + * `aIndexMap[aVisualIndex]==aLogicalIndex`. + */ + static void ReorderVisual(const BidiEmbeddingLevel* aLevels, int32_t aLength, + int32_t* aIndexMap); + + /** + * This enum indicates the bidi character type of the first strong character + * for the set paragraph. + * LTR: bidi character type 'L'. + * RTL: bidi character type 'R' or 'AL'. + * Neutral: The rest of bidi character types. + */ + enum class BaseDirection { LTR, RTL, Neutral }; + + /** + * Get the base direction of the text. + */ + static BaseDirection GetBaseDirection(Span aText); + + /** + * Get one run's logical start, length, and directionality. In an RTL run, the + * character at the logical start is visually on the right of the displayed + * run. The length is the number of characters in the run. + * `Bidi::CountRuns` should be called before the runs are retrieved. + * + * @param aRunIndex is the number of the run in visual order, in the + * range `[0..CountRuns-1]`. + * + * @param aLogicalStart is the first logical character index in the text. + * The pointer may be `nullptr` if this index is not needed. + * + * @param aLength is the number of characters (at least one) in the run. + * The pointer may be `nullptr` if this is not needed. + * + * Note that in right-to-left runs, the code places modifier letters before + * base characters and second surrogates before first ones. 
+ */ + BidiDirection GetVisualRun(int32_t aRunIndex, int32_t* aLogicalStart, + int32_t* aLength); + + private: +#if USE_RUST_UNICODE_BIDI + using UnicodeBidi = mozilla::intl::ffi::UnicodeBidi; + struct BidiFreePolicy { + void operator()(void* aPtr) { + bidi_destroy(static_cast(aPtr)); + } + }; + mozilla::UniquePtr mBidi; +#else + ICUPointer mBidi = ICUPointer(nullptr); + + /** + * An array of levels that is the same length as the paragraph from + * `Bidi::SetParagraph`. + */ + const BidiEmbeddingLevel* mLevels = nullptr; + + /** + * The length of the paragraph from `Bidi::SetParagraph`. + */ + int32_t mLength = 0; +#endif +}; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/BidiClass.h b/intl/components/src/BidiClass.h new file mode 100644 index 0000000000..04a861a382 --- /dev/null +++ b/intl/components/src/BidiClass.h @@ -0,0 +1,49 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_BidiClass_h_ +#define intl_components_BidiClass_h_ + +#include + +namespace mozilla::intl { + +/** + * Read ftp://ftp.unicode.org/Public/UNIDATA/ReadMe-Latest.txt + * section BIDIRECTIONAL PROPERTIES + * for the detailed definition of the following categories + * + * The values here must match the equivalents in %bidicategorycode in + * mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl, + * and must also match the values used by ICU's UCharDirection. 
+ */ +enum class BidiClass : uint8_t { + LeftToRight = 0, + RightToLeft = 1, + EuropeanNumber = 2, + EuropeanNumberSeparator = 3, + EuropeanNumberTerminator = 4, + ArabicNumber = 5, + CommonNumberSeparator = 6, + BlockSeparator = 7, + SegmentSeparator = 8, + WhiteSpaceNeutral = 9, + OtherNeutral = 10, + LeftToRightEmbedding = 11, + LeftToRightOverride = 12, + RightToLeftArabic = 13, + RightToLeftEmbedding = 14, + RightToLeftOverride = 15, + PopDirectionalFormat = 16, + DirNonSpacingMark = 17, + BoundaryNeutral = 18, + FirstStrongIsolate = 19, + LeftToRightIsolate = 20, + RightToLeftIsolate = 21, + PopDirectionalIsolate = 22, + BidiClassCount +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/BidiEmbeddingLevel.cpp b/intl/components/src/BidiEmbeddingLevel.cpp new file mode 100644 index 0000000000..d3ef5da937 --- /dev/null +++ b/intl/components/src/BidiEmbeddingLevel.cpp @@ -0,0 +1,53 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/BidiEmbeddingLevel.h" +#include "mozilla/Casting.h" +#include "mozilla/intl/ICU4CGlue.h" + +#include "unicode/ubidi.h" + +namespace mozilla::intl { + +bool BidiEmbeddingLevel::IsDefaultLTR() const { + return mValue == UBIDI_DEFAULT_LTR; +}; + +bool BidiEmbeddingLevel::IsDefaultRTL() const { + return mValue == UBIDI_DEFAULT_RTL; +}; + +bool BidiEmbeddingLevel::IsRTL() const { + // If the least significant bit is 1, then the embedding level + // is right-to-left. + // If the least significant bit is 0, then the embedding level + // is left-to-right. 
+ return (mValue & 0x1) == 1; +}; + +bool BidiEmbeddingLevel::IsLTR() const { return !IsRTL(); }; + +bool BidiEmbeddingLevel::IsSameDirection(BidiEmbeddingLevel aOther) const { + return (((mValue ^ aOther) & 1) == 0); +} + +BidiEmbeddingLevel BidiEmbeddingLevel::LTR() { return BidiEmbeddingLevel(0); }; + +BidiEmbeddingLevel BidiEmbeddingLevel::RTL() { return BidiEmbeddingLevel(1); }; + +BidiEmbeddingLevel BidiEmbeddingLevel::DefaultLTR() { + return BidiEmbeddingLevel(UBIDI_DEFAULT_LTR); +}; + +BidiEmbeddingLevel BidiEmbeddingLevel::DefaultRTL() { + return BidiEmbeddingLevel(UBIDI_DEFAULT_RTL); +}; + +BidiDirection BidiEmbeddingLevel::Direction() { + return IsRTL() ? BidiDirection::RTL : BidiDirection::LTR; +}; + +uint8_t BidiEmbeddingLevel::Value() const { return mValue; } + +} // namespace mozilla::intl diff --git a/intl/components/src/BidiEmbeddingLevel.h b/intl/components/src/BidiEmbeddingLevel.h new file mode 100644 index 0000000000..fe0045e4a5 --- /dev/null +++ b/intl/components/src/BidiEmbeddingLevel.h @@ -0,0 +1,113 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_BidiEmbeddingLevel_h_ +#define intl_components_BidiEmbeddingLevel_h_ + +#include + +/** + * This file has the BidiEmbeddingLevel and BidiDirection enum broken out from + * the main Bidi class for faster includes. This code is used in Layout which + * could trigger long build times when changing core mozilla::intl files. + */ +namespace mozilla::intl { + +/** + * This enum unambiguously classifies text runs as either being left to right, + * or right to left. + */ +enum class BidiDirection : uint8_t { + // Left to right text. + LTR = 0, + // Right to left text. 
+ RTL = 1, +}; + +/** + * Embedding levels are numbers that indicate how deeply the bidi text is + * embedded, and the direction of text on that embedding level. When switching + * between strongly LTR code points and strongly RTL code points the embedding + * level normally switches between an embedding level of 0 (LTR) and 1 (RTL). + * The only time the embedding level increases is if the embedding code points + * are used. This is the Left-to-Right Embedding (LRE) code point (U+202A), or + * the Right-to-Left Embedding (RLE) code point (U+202B). The minimum + * embedding level of text is zero, and the maximum explicit depth is 125. + * + * The most significant bit is reserved for additional meaning. It can be used + * to signify in certain APIs that the text should by default be LTR or RTL if + * no strongly directional code points are found. + * + * Bug 1736595: At the time of this writing, some places in Gecko code use a 1 + * in the most significant bit to indicate that an embedding level has not + * been set. This leads to an ambiguous understanding of what the most + * significant bit actually means. + */ +class BidiEmbeddingLevel { + public: + constexpr explicit BidiEmbeddingLevel(uint8_t aValue) : mValue(aValue) {} + constexpr explicit BidiEmbeddingLevel(int aValue) + : mValue(static_cast(aValue)) {} + + BidiEmbeddingLevel() = default; + + // Enable the copy operators, but disable move as this is only a uint8_t. + BidiEmbeddingLevel(const BidiEmbeddingLevel& other) = default; + BidiEmbeddingLevel& operator=(const BidiEmbeddingLevel& other) = default; + + /** + * Determine the direction of the embedding level by looking at the least + * significant bit. If it is 0, then it is LTR. If it is 1, then it is RTL. + */ + BidiDirection Direction(); + + /** + * Create a left-to-right embedding level. + */ + static BidiEmbeddingLevel LTR(); + + /** + * Create a right-to-left embedding level.
+ */ + static BidiEmbeddingLevel RTL(); + + /** + * When passed into `SetParagraph`, the direction is determined by the first + * strongly directional character, with the default set to left-to-right if + * none is found. + * + * This is encoded with the highest bit set to 1. + */ + static BidiEmbeddingLevel DefaultLTR(); + + /** + * When passed into `SetParagraph`, the direction is determined by the first + * strongly directional character, with the default set to right-to-left if + * none is found. + * + * This is encoded with the highest and lowest bits set to 1. + */ + static BidiEmbeddingLevel DefaultRTL(); + + bool IsDefaultLTR() const; + bool IsDefaultRTL() const; + bool IsLTR() const; + bool IsRTL() const; + bool IsSameDirection(BidiEmbeddingLevel aOther) const; + + /** + * Get the underlying value as a uint8_t. + */ + uint8_t Value() const; + + /** + * Implicitly convert to the underlying value. + */ + operator uint8_t() const { return mValue; } + + private: + uint8_t mValue = 0; +}; + +} // namespace mozilla::intl +#endif diff --git a/intl/components/src/Calendar.cpp b/intl/components/src/Calendar.cpp new file mode 100644 index 0000000000..d44dedaaae --- /dev/null +++ b/intl/components/src/Calendar.cpp @@ -0,0 +1,172 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "mozilla/intl/Calendar.h" + +#include "unicode/ucal.h" +#include "unicode/uloc.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +/* static */ +Result, ICUError> Calendar::TryCreate( + const char* aLocale, Maybe> aTimeZoneOverride) { + UErrorCode status = U_ZERO_ERROR; + const UChar* zoneID = nullptr; + int32_t zoneIDLen = 0; + if (aTimeZoneOverride) { + zoneIDLen = static_cast(aTimeZoneOverride->Length()); + zoneID = aTimeZoneOverride->Elements(); + } + + UCalendar* calendar = + ucal_open(zoneID, zoneIDLen, aLocale, UCAL_DEFAULT, &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return MakeUnique(calendar); +} + +Result, ICUError> Calendar::GetBcp47Type() { + UErrorCode status = U_ZERO_ERROR; + const char* oldType = ucal_getType(mCalendar, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + const char* bcp47Type = uloc_toUnicodeLocaleType("calendar", oldType); + + if (!bcp47Type) { + return Err(ICUError::InternalError); + } + + return MakeStringSpan(bcp47Type); +} + +static Weekday WeekdayFromDaysOfWeek(UCalendarDaysOfWeek weekday) { + switch (weekday) { + case UCAL_MONDAY: + return Weekday::Monday; + case UCAL_TUESDAY: + return Weekday::Tuesday; + case UCAL_WEDNESDAY: + return Weekday::Wednesday; + case UCAL_THURSDAY: + return Weekday::Thursday; + case UCAL_FRIDAY: + return Weekday::Friday; + case UCAL_SATURDAY: + return Weekday::Saturday; + case UCAL_SUNDAY: + return Weekday::Sunday; + } + MOZ_CRASH("unexpected weekday value"); +} + +Result, ICUError> Calendar::GetWeekend() { + static_assert(static_cast(UCAL_SUNDAY) == 1); + static_assert(static_cast(UCAL_SATURDAY) == 7); + + UErrorCode status = U_ZERO_ERROR; + + EnumSet weekend; + for (int32_t i = UCAL_SUNDAY; i <= UCAL_SATURDAY; i++) { + auto dayOfWeek = static_cast(i); + auto type = ucal_getDayOfWeekType(mCalendar, dayOfWeek, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + switch (type) { + case 
UCAL_WEEKEND_ONSET: + // Treat days which start as a weekday as weekdays. + [[fallthrough]]; + case UCAL_WEEKDAY: + break; + + case UCAL_WEEKEND_CEASE: + // Treat days which start as a weekend day as weekend days. + [[fallthrough]]; + case UCAL_WEEKEND: + weekend += WeekdayFromDaysOfWeek(dayOfWeek); + break; + } + } + + return weekend; +} + +Weekday Calendar::GetFirstDayOfWeek() { + int32_t firstDayOfWeek = ucal_getAttribute(mCalendar, UCAL_FIRST_DAY_OF_WEEK); + MOZ_ASSERT(UCAL_SUNDAY <= firstDayOfWeek && firstDayOfWeek <= UCAL_SATURDAY); + + return WeekdayFromDaysOfWeek( + static_cast(firstDayOfWeek)); +} + +int32_t Calendar::GetMinimalDaysInFirstWeek() { + int32_t minimalDays = + ucal_getAttribute(mCalendar, UCAL_MINIMAL_DAYS_IN_FIRST_WEEK); + MOZ_ASSERT(1 <= minimalDays && minimalDays <= 7); + + return minimalDays; +} + +Result Calendar::SetTimeInMs(double aUnixEpoch) { + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(mCalendar, aUnixEpoch, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return Ok{}; +} + +/* static */ +Result, ICUError> +Calendar::GetLegacyKeywordValuesForLocale(const char* aLocale) { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucal_getKeywordValuesForLocale( + "calendar", aLocale, /* commonlyUsed */ false, &status); + + if (U_SUCCESS(status)) { + return SpanEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +/* static */ +SpanResult Calendar::LegacyIdentifierToBcp47(const char* aIdentifier, + int32_t aLength) { + if (aIdentifier == nullptr) { + return Err(InternalError{}); + } + // aLength is not needed here, as the ICU call uses the null terminated + // string. 
+ return MakeStringSpan(uloc_toUnicodeLocaleType("ca", aIdentifier)); +} + +/* static */ +Result +Calendar::GetBcp47KeywordValuesForLocale(const char* aLocale, + CommonlyUsed aCommonlyUsed) { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucal_getKeywordValuesForLocale( + "calendar", aLocale, static_cast(aCommonlyUsed), &status); + + if (U_SUCCESS(status)) { + return Bcp47IdentifierEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +Calendar::~Calendar() { + MOZ_ASSERT(mCalendar); + ucal_close(mCalendar); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Calendar.h b/intl/components/src/Calendar.h new file mode 100644 index 0000000000..32975bc376 --- /dev/null +++ b/intl/components/src/Calendar.h @@ -0,0 +1,133 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Calendar_h_ +#define intl_components_Calendar_h_ + +#include "mozilla/Assertions.h" +#include "mozilla/EnumSet.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +using UCalendar = void*; + +namespace mozilla::intl { + +/** + * Weekdays in the ISO-8601 calendar. + */ +enum class Weekday : uint8_t { + Monday = 1, + Tuesday, + Wednesday, + Thursday, + Friday, + Saturday, + Sunday, +}; + +/** + * This component is a Mozilla-focused API for working with calendar systems in + * internationalization code. It is used in coordination with other operations + * such as datetime formatting. + */ +class Calendar final { + public: + explicit Calendar(UCalendar* aCalendar) : mCalendar(aCalendar) { + MOZ_ASSERT(aCalendar); + }; + + // Do not allow copy as this class owns the ICU resource. 
Move is not + // currently implemented, but a custom move operator could be created if + // needed. + Calendar(const Calendar&) = delete; + Calendar& operator=(const Calendar&) = delete; + + /** + * Create a Calendar. + */ + static Result, ICUError> TryCreate( + const char* aLocale, + Maybe> aTimeZoneOverride = Nothing{}); + + /** + * Get the BCP 47 keyword value string designating the calendar type. For + * instance "gregory", "chinese", "islamic-civil", etc. + */ + Result, ICUError> GetBcp47Type(); + + /** + * Return the set of weekdays which are considered as part of the weekend. + */ + Result, ICUError> GetWeekend(); + + /** + * Return the weekday which is considered the first day of the week. + */ + Weekday GetFirstDayOfWeek(); + + /** + * Return the minimal number of days in the first week of a year. + */ + int32_t GetMinimalDaysInFirstWeek(); + + /** + * Set the time for the calendar relative to the number of milliseconds since + * 1 January 1970, UTC. + */ + Result SetTimeInMs(double aUnixEpoch); + + /** + * Return ICU legacy keywords, such as "gregorian", "islamic", + * "islamic-civil", "hebrew", etc. + */ + static Result, ICUError> + GetLegacyKeywordValuesForLocale(const char* aLocale); + + private: + /** + * Internal function to convert a legacy calendar identifier to the newer + * BCP 47 identifier. + */ + static SpanResult LegacyIdentifierToBcp47(const char* aIdentifier, + int32_t aLength); + + public: + enum class CommonlyUsed : bool { + /** + * Select all possible values, even when not commonly used by a locale. + */ + No, + + /** + * Only select the values which are commonly used by a locale. + */ + Yes, + }; + + using Bcp47IdentifierEnumeration = + Enumeration, Calendar::LegacyIdentifierToBcp47>; + + /** + * Return BCP 47 Unicode locale extension type keywords. 
+ */ + static Result + GetBcp47KeywordValuesForLocale(const char* aLocale, + CommonlyUsed aCommonlyUsed = CommonlyUsed::No); + + ~Calendar(); + + private: + friend class DateIntervalFormat; + UCalendar* GetUCalendar() const { return mCalendar; } + + UCalendar* mCalendar = nullptr; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/Collator.cpp b/intl/components/src/Collator.cpp new file mode 100644 index 0000000000..8835b29bde --- /dev/null +++ b/intl/components/src/Collator.cpp @@ -0,0 +1,295 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include +#include +#include "mozilla/intl/Collator.h" + +namespace mozilla::intl { + +Collator::Collator(UCollator* aCollator) : mCollator(aCollator) { + MOZ_ASSERT(aCollator); +} + +Collator::~Collator() { + if (mCollator.GetMut()) { + ucol_close(mCollator.GetMut()); + } +} + +Result, ICUError> Collator::TryCreate(const char* aLocale) { + UErrorCode status = U_ZERO_ERROR; + UCollator* collator = ucol_open(IcuLocale(aLocale), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return MakeUnique(collator); +}; + +int32_t Collator::CompareStrings(Span aSource, + Span aTarget) const { + switch (ucol_strcoll(mCollator.GetConst(), aSource.data(), + static_cast(aSource.size()), aTarget.data(), + static_cast(aTarget.size()))) { + case UCOL_LESS: + return -1; + case UCOL_EQUAL: + return 0; + case UCOL_GREATER: + return 1; + } + MOZ_ASSERT_UNREACHABLE("ucol_strcoll returned bad UCollationResult"); + return 0; +} + +int32_t Collator::CompareSortKeys(Span aKey1, + Span aKey2) const { + size_t minLength = std::min(aKey1.Length(), aKey2.Length()); + int32_t tmpResult = strncmp((const char*)aKey1.Elements(), + (const char*)aKey2.Elements(), minLength); + if (tmpResult < 0) { + return -1; + } + if (tmpResult > 0) { + return 1; 
+ } + if (aKey1.Length() > minLength) { + // First string contains second one, so comes later, hence return > 0. + return 1; + } + if (aKey2.Length() > minLength) { + // First string is a substring of second one, so comes earlier, + // hence return < 0. + return -1; + } + return 0; +} + +static UColAttributeValue CaseFirstToICU(Collator::CaseFirst caseFirst) { + switch (caseFirst) { + case Collator::CaseFirst::False: + return UCOL_OFF; + case Collator::CaseFirst::Upper: + return UCOL_UPPER_FIRST; + case Collator::CaseFirst::Lower: + return UCOL_LOWER_FIRST; + } + + MOZ_ASSERT_UNREACHABLE(); + return UCOL_DEFAULT; +} + +void Collator::SetStrength(Collator::Strength aStrength) { + UColAttributeValue strength; + switch (aStrength) { + case Collator::Strength::Default: + strength = UCOL_DEFAULT_STRENGTH; + break; + case Collator::Strength::Primary: + strength = UCOL_PRIMARY; + break; + case Collator::Strength::Secondary: + strength = UCOL_SECONDARY; + break; + case Collator::Strength::Tertiary: + strength = UCOL_TERTIARY; + break; + case Collator::Strength::Quaternary: + strength = UCOL_QUATERNARY; + break; + case Collator::Strength::Identical: + strength = UCOL_IDENTICAL; + break; + } + + ucol_setStrength(mCollator.GetMut(), strength); +} + +ICUResult Collator::SetCaseLevel(Collator::Feature aFeature) { + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(mCollator.GetMut(), UCOL_CASE_LEVEL, + ToUColAttributeValue(aFeature), &status); + return ToICUResult(status); +} + +ICUResult Collator::SetAlternateHandling( + Collator::AlternateHandling aAlternateHandling) { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue handling; + switch (aAlternateHandling) { + case Collator::AlternateHandling::NonIgnorable: + handling = UCOL_NON_IGNORABLE; + break; + case Collator::AlternateHandling::Shifted: + handling = UCOL_SHIFTED; + break; + case Collator::AlternateHandling::Default: + handling = UCOL_DEFAULT; + break; + } + + ucol_setAttribute(mCollator.GetMut(), 
UCOL_ALTERNATE_HANDLING, handling, + &status); + return ToICUResult(status); +} + +ICUResult Collator::SetNumericCollation(Collator::Feature aFeature) { + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(mCollator.GetMut(), UCOL_NUMERIC_COLLATION, + ToUColAttributeValue(aFeature), &status); + return ToICUResult(status); +} + +ICUResult Collator::SetNormalizationMode(Collator::Feature aFeature) { + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(mCollator.GetMut(), UCOL_NORMALIZATION_MODE, + ToUColAttributeValue(aFeature), &status); + return ToICUResult(status); +} + +ICUResult Collator::SetCaseFirst(Collator::CaseFirst aCaseFirst) { + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(mCollator.GetMut(), UCOL_CASE_FIRST, + CaseFirstToICU(aCaseFirst), &status); + return ToICUResult(status); +} + +ICUResult Collator::SetOptions(const Options& aOptions, + const Maybe aPrevOptions) { + if (aPrevOptions && + // Check the equality of the previous options. + aPrevOptions->sensitivity == aOptions.sensitivity && + aPrevOptions->caseFirst == aOptions.caseFirst && + aPrevOptions->ignorePunctuation == aOptions.ignorePunctuation && + aPrevOptions->numeric == aOptions.numeric) { + return Ok(); + } + + Collator::Strength strength = Collator::Strength::Default; + Collator::Feature caseLevel = Collator::Feature::Off; + switch (aOptions.sensitivity) { + case Collator::Sensitivity::Base: + strength = Collator::Strength::Primary; + break; + case Collator::Sensitivity::Accent: + strength = Collator::Strength::Secondary; + break; + case Collator::Sensitivity::Case: + caseLevel = Collator::Feature::On; + strength = Collator::Strength::Primary; + break; + case Collator::Sensitivity::Variant: + strength = Collator::Strength::Tertiary; + break; + } + + SetStrength(strength); + + ICUResult result = Ok(); + + // According to the ICU team, UCOL_SHIFTED causes punctuation to be + // ignored. 
Looking at Unicode Technical Report 35, Unicode Locale Data + // Markup Language, "shifted" causes whitespace and punctuation to be + // ignored - that's a bit more than asked for, but there's no way to get + // less. + result = this->SetAlternateHandling( + aOptions.ignorePunctuation ? Collator::AlternateHandling::Shifted + : Collator::AlternateHandling::NonIgnorable); + if (result.isErr()) { + return result; + } + + result = SetCaseLevel(caseLevel); + if (result.isErr()) { + return result; + } + + result = SetNumericCollation(aOptions.numeric ? Collator::Feature::On + : Collator::Feature::Off); + if (result.isErr()) { + return result; + } + + // Normalization is always on to meet the canonical equivalence requirement. + result = SetNormalizationMode(Collator::Feature::On); + if (result.isErr()) { + return result; + } + + result = SetCaseFirst(aOptions.caseFirst); + if (result.isErr()) { + return result; + } + return Ok(); +} + +Result Collator::GetCaseFirst() const { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue caseFirst = + ucol_getAttribute(mCollator.GetConst(), UCOL_CASE_FIRST, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + if (caseFirst == UCOL_OFF) { + return CaseFirst::False; + } + if (caseFirst == UCOL_UPPER_FIRST) { + return CaseFirst::Upper; + } + MOZ_ASSERT(caseFirst == UCOL_LOWER_FIRST); + return CaseFirst::Lower; +} + +Result Collator::GetIgnorePunctuation() const { + UErrorCode status = U_ZERO_ERROR; + UColAttributeValue alternateHandling = + ucol_getAttribute(mCollator.GetConst(), UCOL_ALTERNATE_HANDLING, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + MOZ_ASSERT(alternateHandling == UCOL_SHIFTED || + alternateHandling == UCOL_NON_IGNORABLE); + return alternateHandling == UCOL_SHIFTED; +} + +/* static */ +Result +Collator::GetBcp47KeywordValuesForLocale(const char* aLocale, + CommonlyUsed aCommonlyUsed) { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = 
ucol_getKeywordValuesForLocale( + "collation", aLocale, static_cast(aCommonlyUsed), &status); + + if (U_SUCCESS(status)) { + return Bcp47ExtEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +/* static */ +Result +Collator::GetBcp47KeywordValues() { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucol_getKeywordValues("collation", &status); + + if (U_SUCCESS(status)) { + return Bcp47ExtEnumeration(enumeration); + } + + return Err(ToICUError(status)); +} + +/* static */ +SpanResult Collator::KeywordValueToBcp47Extension(const char* aKeyword, + int32_t aLength) { + if (aKeyword == nullptr) { + return Err(InternalError{}); + } + return MakeStringSpan(uloc_toUnicodeLocaleType("co", aKeyword)); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Collator.h b/intl/components/src/Collator.h new file mode 100644 index 0000000000..655cb7b0fd --- /dev/null +++ b/intl/components/src/Collator.h @@ -0,0 +1,348 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_Collator_h_ +#define intl_components_Collator_h_ + +#ifndef JS_STANDALONE +# include "gtest/MozGtestFriend.h" +#endif + +#include "unicode/ucol.h" + +#include "mozilla/Compiler.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" + +namespace mozilla::intl { + +class Collator final { + public: + /** + * Construct from a raw UCollator. This is public so that the UniquePtr can + * access it. + */ + explicit Collator(UCollator* aCollator); + + // Do not allow copy as this class owns the ICU resource. Move is not + // currently implemented, but a custom move operator could be created if + // needed. 
+ Collator(const Collator&) = delete; + Collator& operator=(const Collator&) = delete; + + /** + * Attempt to initialize a new collator. + */ + static Result, ICUError> TryCreate(const char* aLocale); + + ~Collator(); + + /** + * Get a sort key with the provided UTF-16 string, and store the sort key into + * the provided buffer of byte array. + * Every sort key ends with 0x00, and the terminating 0x00 byte is counted + * when calculating the length of buffer. For the purpose of other byte + * values, check the "Special Byte Values" document from ICU. + * + * https://icu.unicode.org/design/collation/bytes + */ + template + ICUResult GetSortKey(Span aString, B& aBuffer) const { + return FillBufferWithICUCall( + aBuffer, + [this, aString](uint8_t* target, int32_t length, UErrorCode* status) { + // ucol_getSortKey doesn't use the error code to report + // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to + // indicate the desired length to store the key. So we update the + // UErrorCode accordingly to let FillBufferWithICUCall resize the + // buffer. + int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(), + static_cast(aString.size()), + target, length); + if (len == 0) { + // Returns 0 means there's an internal error. + *status = U_INTERNAL_PROGRAM_ERROR; + } else if (len > length) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + *status = U_ZERO_ERROR; + } + return len; + }); + } + + int32_t CompareStrings(Span aSource, + Span aTarget) const; + + int32_t CompareSortKeys(Span aKey1, + Span aKey2) const; + + /** + * Determine how casing affects sorting. These options map to ECMA 402 + * collator options. + * + * https://tc39.es/ecma402/#sec-initializecollator + */ + enum class CaseFirst { + // Sort upper case first. + Upper, + // Sort lower case first. + Lower, + // Orders upper and lower case letters in accordance to their tertiary + // weights. 
+ False, + }; + + /** + * Which differences in the strings should lead to differences in collation + * comparisons. + * + * This setting needs to be ECMA 402 compliant. + * https://tc39.es/ecma402/#sec-collator-comparestrings + */ + enum class Sensitivity { + // Only strings that differ in base letters compare as unequal. + // Examples: a ≠ b, a = á, a = A. + Base, + // Only strings that differ in base letters or accents and other diacritic + // marks compare as unequal. + // Examples: a ≠ b, a ≠ á, a = A. + Accent, + // Only strings that differ in base letters or case compare as unequal. + // Examples: a ≠ b, a = á, a ≠ A. + Case, + // Strings that differ in base letters, accents and other diacritic marks, + // or case compare as unequal. Other differences may also be taken into + // consideration. + // Examples: a ≠ b, a ≠ á, a ≠ A. + Variant, + }; + + /** + * These options map to ECMA 402 collator options. Make sure the defaults map + * to the default initialized values of ECMA 402. + * + * https://tc39.es/ecma402/#sec-initializecollator + */ + struct Options { + Sensitivity sensitivity = Sensitivity::Variant; + CaseFirst caseFirst = CaseFirst::False; + bool ignorePunctuation = false; + bool numeric = false; + }; + + /** + * Change the configuration of the options. + */ + ICUResult SetOptions(const Options& aOptions, + const Maybe aPrevOptions = Nothing()); + + /** + * Return the case first option of this collator. + */ + Result GetCaseFirst() const; + + /** + * Return the "ignores punctuation" option of this collator. + */ + Result GetIgnorePunctuation() const; + + /** + * Map keywords to their BCP 47 equivalents. + */ + static SpanResult KeywordValueToBcp47Extension(const char* aKeyword, + int32_t aLength); + + enum class CommonlyUsed : bool { + /** + * Select all possible values, even when not commonly used by a locale. + */ + No, + + /** + * Only select the values which are commonly used by a locale.
+ */ + Yes, + }; + + using Bcp47ExtEnumeration = + Enumeration, + Collator::KeywordValueToBcp47Extension>; + + /** + * Returns an iterator of collator locale extensions in the preferred order. + * These extensions can be used in BCP 47 locales. For instance this + * iterator could return "phonebk" and could be applied to the German locale + * "de" as "de-co-phonebk" for a phonebook-style collation. + * + * The collation extensions can be found here: + * http://cldr.unicode.org/core-spec/#Key_Type_Definitions + */ + static Result GetBcp47KeywordValuesForLocale( + const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No); + + /** + * Returns an iterator over all possible collator locale extensions. + * These extensions can be used in BCP 47 locales. For instance this + * iterator could return "phonebk" and could be applied to the German locale + * "de" as "de-co-phonebk" for a phonebook-style collation. + * + * The collation extensions can be found here: + * http://cldr.unicode.org/core-spec/#Key_Type_Definitions + */ + static Result GetBcp47KeywordValues(); + + /** + * Returns an iterator over all supported collator locales. + * + * The returned strings are ICU locale identifiers and NOT BCP 47 language + * tags. + * + * Also see . + */ + static auto GetAvailableLocales() { + return AvailableLocalesEnumeration(); + } + + private: + /** + * Toggle features, or use the default setting. + */ + enum class Feature { + // Turn the feature on. + On, + // Turn the feature off. + Off, + // Use the default setting for the feature.
+ Default, + }; + + static constexpr auto ToUColAttributeValue(Feature aFeature) { + switch (aFeature) { + case Collator::Feature::On: + return UCOL_ON; + case Collator::Feature::Off: + return UCOL_OFF; + case Collator::Feature::Default: + return UCOL_DEFAULT; + } +#if MOZ_IS_GCC +# if !MOZ_GCC_VERSION_AT_LEAST(9, 1, 0) + return UCOL_DEFAULT; +# else + MOZ_CRASH("invalid collator feature"); +# endif +#else + MOZ_CRASH("invalid collator feature"); +#endif + } + + /** + * Attribute for handling variable elements. + */ + enum class AlternateHandling { + // Treats all the codepoints with non-ignorable primary weights in the + // same way (default) + NonIgnorable, + // Causes codepoints with primary weights that are equal or below the + // variable top value to be ignored on primary level and moved to the + // quaternary level. + Shifted, + Default, + }; + + /** + * The strength attribute. + * + * The usual strength for most locales (except Japanese) is tertiary. + * + * Quaternary strength is useful when combined with shifted setting for + * alternate handling attribute and for JIS X 4061 collation, when it is used + * to distinguish between Katakana and Hiragana. Otherwise, quaternary level + * is affected only by the number of non-ignorable code points in the string. + * + * Identical strength is rarely useful, as it amounts to codepoints of the NFD + * form of the string. + */ + enum class Strength { + // Primary collation strength. + Primary, + // Secondary collation strength. + Secondary, + // Tertiary collation strength. + Tertiary, + // Quaternary collation strength. + Quaternary, + // Identical collation strength. + Identical, + Default, + }; + + /** + * Configure the Collation::Strength + */ + void SetStrength(Strength strength); + + /** + * Configure Collation::AlternateHandling. 
+ */ + ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling); + + /** + * Controls whether an extra case level (positioned before the third level) is + * generated or not. + * + * Contents of the case level are affected by the value of CaseFirst + * attribute. A simple way to ignore accent differences in a string is to set + * the strength to Primary and enable case level. + */ + ICUResult SetCaseLevel(Feature aFeature); + + /** + * When turned on, this attribute makes substrings of digits sort according to + * their numeric values. + * + * This is a way to get '100' to sort AFTER '2'. Note that the longest digit + * substring that can be treated as a single unit is 254 digits (not counting + * leading zeros). If a digit substring is longer than that, the digits beyond + * the limit will be treated as a separate digit substring. + * + * A "digit" in this sense is a code point with General_Category=Nd, which + * does not include circled numbers, roman numerals, etc. Only a contiguous + * digit substring is considered, that is, non-negative integers without + * separators. There is no support for plus/minus signs, decimals, exponents, + * etc. + */ + ICUResult SetNumericCollation(Feature aFeature); + + /** + * Controls whether the normalization check and necessary normalizations are + * performed. + * + * When off (default), no normalization check is performed. The correctness of + * the result is guaranteed only if the input data is in so-called FCD form + * When set to on, an incremental check is performed to see whether the input + * data is in the FCD form. If the data is not in the FCD form, incremental + * NFD normalization is performed. + */ + ICUResult SetNormalizationMode(Feature aFeature); + + /** + * Configure Collation::CaseFirst. 
+ */ + ICUResult SetCaseFirst(CaseFirst aCaseFirst); + +#ifndef JS_STANDALONE + FRIEND_TEST(IntlCollator, SetAttributesInternal); +#endif + + ICUPointer mCollator = ICUPointer(nullptr); + Maybe mLastStrategy = Nothing(); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/Currency.cpp b/intl/components/src/Currency.cpp new file mode 100644 index 0000000000..4db8e0919c --- /dev/null +++ b/intl/components/src/Currency.cpp @@ -0,0 +1,22 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/Currency.h" + +#include "unicode/ucurr.h" +#include "unicode/uenum.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { + +Result, ICUError> Currency::GetISOCurrencies() { + UErrorCode status = U_ZERO_ERROR; + UEnumeration* enumeration = ucurr_openISOCurrencies(UCURR_ALL, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + return SpanEnumeration(enumeration); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/Currency.h b/intl/components/src/Currency.h new file mode 100644 index 0000000000..d0f8eb6ee8 --- /dev/null +++ b/intl/components/src/Currency.h @@ -0,0 +1,30 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_Currency_h_ +#define intl_components_Currency_h_ + +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" + +namespace mozilla::intl { + +/** + * This component is a Mozilla-focused API for working with currencies in + * internationalization code. + */ +class Currency final { + public: + Currency() = delete; + + /** + * Returns an enumeration of all supported ISO currency codes. 
+ */ + static Result, ICUError> GetISOCurrencies(); +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/DateIntervalFormat.cpp b/intl/components/src/DateIntervalFormat.cpp new file mode 100644 index 0000000000..0097668f8b --- /dev/null +++ b/intl/components/src/DateIntervalFormat.cpp @@ -0,0 +1,266 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "DateTimeFormat.h" // for DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES +#include "DateTimeFormatUtils.h" +#include "ScopedICUObject.h" + +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateIntervalFormat.h" + +namespace mozilla::intl { + +/** + * PartitionDateTimeRangePattern ( dateTimeFormat, x, y ), steps 9-11. + * + * Examine the formatted value to see if any interval span field is present. + * + * https://tc39.es/ecma402/#sec-partitiondatetimerangepattern + */ +static ICUResult DateFieldsPracticallyEqual( + const UFormattedValue* aFormattedValue, bool* aEqual) { + if (!aFormattedValue) { + return Err(ICUError::InternalError); + } + + MOZ_ASSERT(aEqual); + *aEqual = false; + UErrorCode status = U_ZERO_ERROR; + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject toCloseFpos(fpos); + + // We're only interested in UFIELD_CATEGORY_DATE_INTERVAL_SPAN fields. + ucfpos_constrainCategory(fpos, UFIELD_CATEGORY_DATE_INTERVAL_SPAN, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + bool hasSpan = ufmtval_nextPosition(aFormattedValue, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // When no date interval span field was found, both dates are "practically + // equal" per PartitionDateTimeRangePattern. 
+ *aEqual = !hasSpan; + return Ok(); +} + +/* static */ +Result, ICUError> DateIntervalFormat::TryCreate( + Span aLocale, Span aSkeleton, + Span aTimeZone) { + UErrorCode status = U_ZERO_ERROR; + UDateIntervalFormat* dif = + udtitvfmt_open(IcuLocale(aLocale), aSkeleton.data(), + AssertedCast(aSkeleton.size()), aTimeZone.data(), + AssertedCast(aTimeZone.size()), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return UniquePtr(new DateIntervalFormat(dif)); +} + +DateIntervalFormat::~DateIntervalFormat() { + MOZ_ASSERT(mDateIntervalFormat); + udtitvfmt_close(mDateIntervalFormat.GetMut()); +} + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES +// We reach inside the UFormattedValue and modify its internal string. (It's +// crucial that this is just an in-place replacement that doesn't alter any +// field positions, etc., ) +static void ReplaceSpecialSpaces(const UFormattedValue* aValue) { + UErrorCode status = U_ZERO_ERROR; + int32_t len; + const UChar* str = ufmtval_getString(aValue, &len, &status); + if (U_FAILURE(status)) { + return; + } + + for (const auto& c : Span(str, len)) { + if (IsSpecialSpace(c)) { + const_cast(c) = ' '; + } + } +} +#endif + +ICUResult DateIntervalFormat::TryFormatCalendar( + const Calendar& aStart, const Calendar& aEnd, + AutoFormattedDateInterval& aFormatted, bool* aPracticallyEqual) const { + MOZ_ASSERT(aFormatted.IsValid()); + + UErrorCode status = U_ZERO_ERROR; + udtitvfmt_formatCalendarToResult(mDateIntervalFormat.GetConst(), + aStart.GetUCalendar(), aEnd.GetUCalendar(), + aFormatted.GetFormatted(), &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + ReplaceSpecialSpaces(aFormatted.Value()); +#endif + + MOZ_TRY(DateFieldsPracticallyEqual(aFormatted.Value(), aPracticallyEqual)); + return Ok(); +} + +ICUResult DateIntervalFormat::TryFormatDateTime( + double aStart, double aEnd, AutoFormattedDateInterval& aFormatted, + bool* 
aPracticallyEqual) const { + MOZ_ASSERT(aFormatted.IsValid()); + + UErrorCode status = U_ZERO_ERROR; + udtitvfmt_formatToResult(mDateIntervalFormat.GetConst(), aStart, aEnd, + aFormatted.GetFormatted(), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES + ReplaceSpecialSpaces(aFormatted.Value()); +#endif + + MOZ_TRY(DateFieldsPracticallyEqual(aFormatted.Value(), aPracticallyEqual)); + return Ok(); +} + +ICUResult DateIntervalFormat::TryFormattedToParts( + const AutoFormattedDateInterval& aFormatted, + DateTimePartVector& aParts) const { + MOZ_ASSERT(aFormatted.IsValid()); + const UFormattedValue* value = aFormatted.Value(); + if (!value) { + return Err(ICUError::InternalError); + } + + size_t lastEndIndex = 0; + auto AppendPart = [&](DateTimePartType type, size_t endIndex, + DateTimePartSource source) { + if (!aParts.emplaceBack(type, endIndex, source)) { + return false; + } + + lastEndIndex = endIndex; + return true; + }; + + UErrorCode status = U_ZERO_ERROR; + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject toCloseFpos(fpos); + + size_t categoryEndIndex = 0; + DateTimePartSource source = DateTimePartSource::Shared; + + while (true) { + bool hasMore = ufmtval_nextPosition(value, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + if (!hasMore) { + break; + } + + int32_t category = ucfpos_getCategory(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t field = ucfpos_getField(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t beginIndexInt, endIndexInt; + ucfpos_getIndexes(fpos, &beginIndexInt, &endIndexInt, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + MOZ_ASSERT(beginIndexInt <= endIndexInt, + "field iterator returning invalid range"); + + size_t beginIndex = 
AssertedCast(beginIndexInt); + size_t endIndex = AssertedCast(endIndexInt); + + // Indices are guaranteed to be returned in order (from left to right). + MOZ_ASSERT(lastEndIndex <= beginIndex, + "field iteration didn't return fields in order start to " + "finish as expected"); + + if (category == UFIELD_CATEGORY_DATE_INTERVAL_SPAN) { + // Append any remaining literal parts before changing the source kind. + if (lastEndIndex < beginIndex) { + if (!AppendPart(DateTimePartType::Literal, beginIndex, source)) { + return Err(ICUError::InternalError); + } + } + + // The special field category UFIELD_CATEGORY_DATE_INTERVAL_SPAN has only + // two allowed values (0 or 1), indicating the begin of the start- resp. + // end-date. + MOZ_ASSERT(field == 0 || field == 1, + "span category has unexpected value"); + + source = field == 0 ? DateTimePartSource::StartRange + : DateTimePartSource::EndRange; + categoryEndIndex = endIndex; + continue; + } + + // Ignore categories other than UFIELD_CATEGORY_DATE. + if (category != UFIELD_CATEGORY_DATE) { + continue; + } + + DateTimePartType type = + ConvertUFormatFieldToPartType(static_cast(field)); + if (lastEndIndex < beginIndex) { + if (!AppendPart(DateTimePartType::Literal, beginIndex, source)) { + return Err(ICUError::InternalError); + } + } + + if (!AppendPart(type, endIndex, source)) { + return Err(ICUError::InternalError); + } + + if (endIndex == categoryEndIndex) { + // Append any remaining literal parts before changing the source kind. + if (lastEndIndex < endIndex) { + if (!AppendPart(DateTimePartType::Literal, endIndex, source)) { + return Err(ICUError::InternalError); + } + } + + source = DateTimePartSource::Shared; + } + } + + // Append any final literal. 
+ auto spanResult = aFormatted.ToSpan(); + if (spanResult.isErr()) { + return spanResult.propagateErr(); + } + size_t formattedSize = spanResult.unwrap().size(); + if (lastEndIndex < formattedSize) { + if (!AppendPart(DateTimePartType::Literal, formattedSize, source)) { + return Err(ICUError::InternalError); + } + } + + return Ok(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/DateIntervalFormat.h b/intl/components/src/DateIntervalFormat.h new file mode 100644 index 0000000000..c4dbce807a --- /dev/null +++ b/intl/components/src/DateIntervalFormat.h @@ -0,0 +1,107 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef intl_components_DateIntervalFormat_h_ +#define intl_components_DateIntervalFormat_h_ + +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateTimePart.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +#include "unicode/udateintervalformat.h" +#include "unicode/utypes.h" + +namespace mozilla::intl { +class Calendar; + +using AutoFormattedDateInterval = + AutoFormattedResult; + +/** + * This component is a Mozilla-focused API for the date range formatting + * provided by ICU. This DateIntervalFormat class helps to format the range + * between two date-time values. + * + * https://tc39.es/ecma402/#sec-formatdatetimerange + * https://tc39.es/ecma402/#sec-formatdatetimerangetoparts + */ +class DateIntervalFormat final { + public: + /** + * Create a DateIntervalFormat object from locale, skeleton and time zone. + * The format of skeleton can be found in [1]. + * + * Note: Skeleton will be removed in the future. 
+ * + * [1]: https://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns + */ + static Result, ICUError> TryCreate( + Span aLocale, Span aSkeleton, + Span aTimeZone); + + ~DateIntervalFormat(); + + /** + * Format a date-time range between two Calendar objects. + * + * DateIntervalFormat cannot be changed to use a proleptic Gregorian + * calendar, so use this method if the start date is before the Gregorian + * calendar was introduced (October 15, 1582), otherwise use TryFormatDateTime + * instead. + * + * The result will be stored in aFormatted; the caller can use + * AutoFormattedDateInterval::ToSpan() to get the formatted string, or pass + * the aFormatted to TryFormattedToParts to get the parts vector. + * + * aPracticallyEqual will be set to true if the date times of the two + * calendars are equal. + */ + ICUResult TryFormatCalendar(const Calendar& aStart, const Calendar& aEnd, + AutoFormattedDateInterval& aFormatted, + bool* aPracticallyEqual) const; + + /** + * Format a date-time range between two Unix epoch times in milliseconds. + * + * The result will be stored in aFormatted; the caller can use + * AutoFormattedDateInterval::ToSpan() to get the formatted string, or pass + * the aFormatted to TryFormattedToParts to get the parts vector. + * + * aPracticallyEqual will be set to true if the date times of the two + * Unix epoch times are equal. + */ + ICUResult TryFormatDateTime(double aStart, double aEnd, + AutoFormattedDateInterval& aFormatted, + bool* aPracticallyEqual) const; + + /** + * Convert the formatted DateIntervalFormat into several parts. + * + * The caller gets the formatted result from either TryFormatCalendar, or + * TryFormatDateTime methods, and instantiates the DateTimePartVector. This + * method will generate the parts and insert them into the vector.
+ * + * See: + * https://tc39.es/ecma402/#sec-formatdatetimerangetoparts + */ + ICUResult TryFormattedToParts(const AutoFormattedDateInterval& aFormatted, + DateTimePartVector& aParts) const; + + private: + DateIntervalFormat() = delete; + explicit DateIntervalFormat(UDateIntervalFormat* aDif) + : mDateIntervalFormat(aDif) {} + DateIntervalFormat(const DateIntervalFormat&) = delete; + DateIntervalFormat& operator=(const DateIntervalFormat&) = delete; + + ICUPointer mDateIntervalFormat = + ICUPointer(nullptr); +}; +} // namespace mozilla::intl + +#endif diff --git a/intl/components/src/DateTimeFormat.cpp b/intl/components/src/DateTimeFormat.cpp new file mode 100644 index 0000000000..2c09bb2adf --- /dev/null +++ b/intl/components/src/DateTimeFormat.cpp @@ -0,0 +1,1148 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include + +#include "unicode/ucal.h" +#include "unicode/udat.h" +#include "unicode/udatpg.h" +#include "unicode/ures.h" + +#include "DateTimeFormatUtils.h" +#include "ScopedICUObject.h" + +#include "mozilla/EnumSet.h" +#include "mozilla/intl/Calendar.h" +#include "mozilla/intl/DateTimeFormat.h" +#include "mozilla/intl/DateTimePatternGenerator.h" + +namespace mozilla::intl { + +DateTimeFormat::~DateTimeFormat() { + MOZ_ASSERT(mDateFormat); + udat_close(mDateFormat); +} + +static UDateFormatStyle ToUDateFormatStyle( + Maybe aLength) { + if (!aLength) { + return UDAT_NONE; + } + switch (*aLength) { + case DateTimeFormat::Style::Full: + return UDAT_FULL; + case DateTimeFormat::Style::Long: + return UDAT_LONG; + case DateTimeFormat::Style::Medium: + return UDAT_MEDIUM; + case DateTimeFormat::Style::Short: + return UDAT_SHORT; + } + MOZ_ASSERT_UNREACHABLE(); + // Do not use the default: branch so that the enum is exhaustively checked. 
return UDAT_NONE; +} + +/** + * Parse a pattern according to the format specified in + * https://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns. + */ +template +class PatternIterator { + CharT* iter; + const CharT* const end; + + public: + explicit PatternIterator(mozilla::Span aPattern) + : iter(aPattern.data()), end(aPattern.data() + aPattern.size()) {} + + CharT* next() { + MOZ_ASSERT(iter != nullptr); + + bool inQuote = false; + while (iter < end) { + CharT* cur = iter++; + if (*cur == '\'') { + inQuote = !inQuote; + } else if (!inQuote) { + return cur; + } + } + + iter = nullptr; + return nullptr; + } +}; + +Maybe DateTimeFormat::HourCycleFromPattern( + Span aPattern) { + PatternIterator iter(aPattern); + while (const auto* ptr = iter.next()) { + switch (*ptr) { + case 'K': + return Some(DateTimeFormat::HourCycle::H11); + case 'h': + return Some(DateTimeFormat::HourCycle::H12); + case 'H': + return Some(DateTimeFormat::HourCycle::H23); + case 'k': + return Some(DateTimeFormat::HourCycle::H24); + } + } + return Nothing(); +} + +static bool IsHour12(DateTimeFormat::HourCycle aHourCycle) { + return aHourCycle == DateTimeFormat::HourCycle::H11 || + aHourCycle == DateTimeFormat::HourCycle::H12; +} + +static char16_t HourSymbol(DateTimeFormat::HourCycle aHourCycle) { + switch (aHourCycle) { + case DateTimeFormat::HourCycle::H11: + return 'K'; + case DateTimeFormat::HourCycle::H12: + return 'h'; + case DateTimeFormat::HourCycle::H23: + return 'H'; + case DateTimeFormat::HourCycle::H24: + return 'k'; + } + MOZ_CRASH("unexpected hour cycle"); +} + +enum class PatternField { Hour, Minute, Second, Other }; + +template +static PatternField ToPatternField(CharT aCh) { + if (aCh == 'K' || aCh == 'h' || aCh == 'H' || aCh == 'k' || aCh == 'j') { + return PatternField::Hour; + } + if (aCh == 'm') { + return PatternField::Minute; + } + if (aCh == 's') { + return PatternField::Second; + } + return PatternField::Other; +} + +/** + * Replaces all hour pattern characters in |patternOrSkeleton| to use the + * matching hour
representation for |hourCycle|. + */ +/* static */ +void DateTimeFormat::ReplaceHourSymbol( + mozilla::Span aPatternOrSkeleton, + DateTimeFormat::HourCycle aHourCycle) { + char16_t replacement = HourSymbol(aHourCycle); + PatternIterator iter(aPatternOrSkeleton); + while (auto* ptr = iter.next()) { + auto field = ToPatternField(*ptr); + if (field == PatternField::Hour) { + *ptr = replacement; + } + } +} + +/** + * Find a matching pattern using the requested hour-12 options. + * + * This function is needed to work around the following two issues. + * - https://unicode-org.atlassian.net/browse/ICU-21023 + * - https://unicode-org.atlassian.net/browse/CLDR-13425 + * + * We're currently using a relatively simple workaround, which doesn't give the + * most accurate results. For example: + * + * ``` + * var dtf = new Intl.DateTimeFormat("en", { + * timeZone: "UTC", + * dateStyle: "long", + * timeStyle: "long", + * hourCycle: "h12", + * }); + * print(dtf.format(new Date("2020-01-01T00:00Z"))); + * ``` + * + * Returns the pattern "MMMM d, y 'at' h:mm:ss a z", but when going through + * |DateTimePatternGenerator::GetSkeleton| and then + * |DateTimePatternGenerator::GetBestPattern| to find an equivalent pattern for + * "h23", we'll end up with the pattern "MMMM d, y, HH:mm:ss z", so the + * combinator element " 'at' " was lost in the process. + */ +/* static */ +ICUResult DateTimeFormat::FindPatternWithHourCycle( + DateTimePatternGenerator& aDateTimePatternGenerator, + DateTimeFormat::PatternVector& aPattern, bool aHour12, + DateTimeFormat::SkeletonVector& aSkeleton) { + MOZ_TRY(mozilla::intl::DateTimePatternGenerator::GetSkeleton(aPattern, + aSkeleton)); + + // Input skeletons don't differentiate between "K" and "h" resp. "k" and "H". + DateTimeFormat::ReplaceHourSymbol(aSkeleton, + aHour12 ? 
DateTimeFormat::HourCycle::H12 + : DateTimeFormat::HourCycle::H23); + + MOZ_TRY(aDateTimePatternGenerator.GetBestPattern(aSkeleton, aPattern)); + + return Ok(); +} + +static auto PatternMatchOptions(mozilla::Span aSkeleton) { + // Values for hour, minute, and second are: + // - absent: 0 + // - numeric: 1 + // - 2-digit: 2 + int32_t hour = 0; + int32_t minute = 0; + int32_t second = 0; + + PatternIterator iter(aSkeleton); + while (const auto* ptr = iter.next()) { + switch (ToPatternField(*ptr)) { + case PatternField::Hour: + MOZ_ASSERT(hour < 2); + hour += 1; + break; + case PatternField::Minute: + MOZ_ASSERT(minute < 2); + minute += 1; + break; + case PatternField::Second: + MOZ_ASSERT(second < 2); + second += 1; + break; + case PatternField::Other: + break; + } + } + + // Adjust the field length when the user requested '2-digit' representation. + // + // We can't just always adjust the field length, because + // 1. The default value for hour, minute, and second fields is 'numeric'. If + // the length is always adjusted, |date.toLocaleTime()| will start to + // return strings like "1:5:9 AM" instead of "1:05:09 AM". + // 2. ICU doesn't support to adjust the field length to 'numeric' in certain + // cases. For example when the locale is "de" (German): + // a. hour='numeric' and minute='2-digit' will return "1:05". + // b. whereas hour='numeric' and minute='numeric' will return "01:05". + // + // Therefore we only support adjusting the field length when the user + // explicitly requested the '2-digit' representation. 
+ + using PatternMatchOption = + mozilla::intl::DateTimePatternGenerator::PatternMatchOption; + mozilla::EnumSet options; + if (hour == 2) { + options += PatternMatchOption::HourField; + } + if (minute == 2) { + options += PatternMatchOption::MinuteField; + } + if (second == 2) { + options += PatternMatchOption::SecondField; + } + return options; +} + +/* static */ +Result, ICUError> DateTimeFormat::TryCreateFromStyle( + Span aLocale, const StyleBag& aStyleBag, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe> aTimeZoneOverride) { + auto dateStyle = ToUDateFormatStyle(aStyleBag.date); + auto timeStyle = ToUDateFormatStyle(aStyleBag.time); + + if (dateStyle == UDAT_NONE && timeStyle == UDAT_NONE) { + dateStyle = UDAT_DEFAULT; + timeStyle = UDAT_DEFAULT; + } + + // The time zone is optional. + int32_t tzIDLength = -1; + const UChar* tzID = nullptr; + if (aTimeZoneOverride) { + tzIDLength = static_cast(aTimeZoneOverride->size()); + tzID = aTimeZoneOverride->Elements(); + } + + UErrorCode status = U_ZERO_ERROR; + UDateFormat* dateFormat = + udat_open(timeStyle, dateStyle, IcuLocale(aLocale), tzID, tzIDLength, + /* pattern */ nullptr, /* pattern length */ -1, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + auto df = UniquePtr(new DateTimeFormat(dateFormat)); + + if (aStyleBag.time && (aStyleBag.hour12 || aStyleBag.hourCycle)) { + // Only adjust the style pattern for time if there is an override. + // Extract the pattern and adjust it for the preferred hour cycle. + DateTimeFormat::PatternVector pattern{}; + + VectorToBufferAdaptor buffer(pattern); + MOZ_TRY(df->GetPattern(buffer)); + + Maybe hcPattern = HourCycleFromPattern(pattern); + DateTimeFormat::SkeletonVector skeleton{}; + + if (hcPattern) { + bool wantHour12 = + aStyleBag.hour12 ? 
*aStyleBag.hour12 : IsHour12(*aStyleBag.hourCycle); + if (wantHour12 == IsHour12(*hcPattern)) { + // Return the date-time format when its hour-cycle settings match the + // requested options. + if (aStyleBag.hour12 || *hcPattern == *aStyleBag.hourCycle) { + return df; + } + } else { + MOZ_ASSERT(aDateTimePatternGenerator); + MOZ_TRY(DateTimeFormat::FindPatternWithHourCycle( + *aDateTimePatternGenerator, pattern, wantHour12, skeleton)); + } + // Replace the hourCycle, if present, in the pattern string. But only do + // this if no hour12 option is present, because the latter takes + // precedence over hourCycle. + if (!aStyleBag.hour12) { + DateTimeFormat::ReplaceHourSymbol(pattern, *aStyleBag.hourCycle); + } + + auto result = DateTimeFormat::TryCreateFromPattern(aLocale, pattern, + aTimeZoneOverride); + if (result.isErr()) { + return Err(result.unwrapErr()); + } + auto dateTimeFormat = result.unwrap(); + MOZ_TRY(dateTimeFormat->CacheSkeleton(skeleton)); + return dateTimeFormat; + } + } + + return df; +} + +DateTimeFormat::DateTimeFormat(UDateFormat* aDateFormat) { + MOZ_RELEASE_ASSERT(aDateFormat, "Expected aDateFormat to not be a nullptr."); + mDateFormat = aDateFormat; +} + +// A helper to ergonomically push a string onto a string vector. +template +static ICUResult PushString(V& aVec, const char16_t (&aString)[N]) { + if (!aVec.append(aString, N - 1)) { + return Err(ICUError::OutOfMemory); + } + return Ok(); +} + +// A helper to ergonomically push a char onto a string vector. +template +static ICUResult PushChar(V& aVec, char16_t aCh) { + if (!aVec.append(aCh)) { + return Err(ICUError::OutOfMemory); + } + return Ok(); +} + +/** + * Returns an ICU skeleton string representing the specified options. + * http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + */ +ICUResult ToICUSkeleton(const DateTimeFormat::ComponentsBag& aBag, + DateTimeFormat::SkeletonVector& aSkeleton) { + // Create an ICU skeleton representing the specified aBag. 
See + if (aBag.weekday) { + switch (*aBag.weekday) { + case DateTimeFormat::Text::Narrow: + MOZ_TRY(PushString(aSkeleton, u"EEEEE")); + break; + case DateTimeFormat::Text::Short: + MOZ_TRY(PushString(aSkeleton, u"E")); + break; + case DateTimeFormat::Text::Long: + MOZ_TRY(PushString(aSkeleton, u"EEEE")); + } + } + if (aBag.era) { + switch (*aBag.era) { + case DateTimeFormat::Text::Narrow: + MOZ_TRY(PushString(aSkeleton, u"GGGGG")); + break; + case DateTimeFormat::Text::Short: + // Use "GGG" instead of "G" to return the same results as other + // browsers. This is exploiting the following ICU bug + // . As soon as that + // bug has been fixed, we can change this back to "G". + // + // In practice the bug only affects "G", so we only apply it for "G" + // and not for other symbols like "B" or "z". + MOZ_TRY(PushString(aSkeleton, u"GGG")); + break; + case DateTimeFormat::Text::Long: + MOZ_TRY(PushString(aSkeleton, u"GGGG")); + break; + } + } + if (aBag.year) { + switch (*aBag.year) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"yy")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"y")); + break; + } + } + if (aBag.month) { + switch (*aBag.month) { + case DateTimeFormat::Month::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"MM")); + break; + case DateTimeFormat::Month::Numeric: + MOZ_TRY(PushString(aSkeleton, u"M")); + break; + case DateTimeFormat::Month::Narrow: + MOZ_TRY(PushString(aSkeleton, u"MMMMM")); + break; + case DateTimeFormat::Month::Short: + MOZ_TRY(PushString(aSkeleton, u"MMM")); + break; + case DateTimeFormat::Month::Long: + MOZ_TRY(PushString(aSkeleton, u"MMMM")); + break; + } + } + if (aBag.day) { + switch (*aBag.day) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"dd")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"d")); + break; + } + } + + // If hour12 and hourCycle are both present, hour12 takes precedence. 
+ char16_t hourSkeletonChar = 'j'; + if (aBag.hour12) { + if (*aBag.hour12) { + hourSkeletonChar = 'h'; + } else { + hourSkeletonChar = 'H'; + } + } else if (aBag.hourCycle) { + switch (*aBag.hourCycle) { + case DateTimeFormat::HourCycle::H11: + case DateTimeFormat::HourCycle::H12: + hourSkeletonChar = 'h'; + break; + case DateTimeFormat::HourCycle::H23: + case DateTimeFormat::HourCycle::H24: + hourSkeletonChar = 'H'; + break; + } + } + if (aBag.hour) { + switch (*aBag.hour) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushChar(aSkeleton, hourSkeletonChar)); + MOZ_TRY(PushChar(aSkeleton, hourSkeletonChar)); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushChar(aSkeleton, hourSkeletonChar)); + break; + } + } + // ICU requires that "B" is set after the "j" hour skeleton symbol. + // https://unicode-org.atlassian.net/browse/ICU-20731 + if (aBag.dayPeriod) { + switch (*aBag.dayPeriod) { + case DateTimeFormat::Text::Narrow: + MOZ_TRY(PushString(aSkeleton, u"BBBBB")); + break; + case DateTimeFormat::Text::Short: + MOZ_TRY(PushString(aSkeleton, u"B")); + break; + case DateTimeFormat::Text::Long: + MOZ_TRY(PushString(aSkeleton, u"BBBB")); + break; + } + } + if (aBag.minute) { + switch (*aBag.minute) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"mm")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"m")); + break; + } + } + if (aBag.second) { + switch (*aBag.second) { + case DateTimeFormat::Numeric::TwoDigit: + MOZ_TRY(PushString(aSkeleton, u"ss")); + break; + case DateTimeFormat::Numeric::Numeric: + MOZ_TRY(PushString(aSkeleton, u"s")); + break; + } + } + if (aBag.fractionalSecondDigits) { + switch (*aBag.fractionalSecondDigits) { + case 1: + MOZ_TRY(PushString(aSkeleton, u"S")); + break; + case 2: + MOZ_TRY(PushString(aSkeleton, u"SS")); + break; + default: + MOZ_TRY(PushString(aSkeleton, u"SSS")); + break; + } + } + if (aBag.timeZoneName) { + switch (*aBag.timeZoneName) { + case 
DateTimeFormat::TimeZoneName::Short: + MOZ_TRY(PushString(aSkeleton, u"z")); + break; + case DateTimeFormat::TimeZoneName::Long: + MOZ_TRY(PushString(aSkeleton, u"zzzz")); + break; + case DateTimeFormat::TimeZoneName::ShortOffset: + MOZ_TRY(PushString(aSkeleton, u"O")); + break; + case DateTimeFormat::TimeZoneName::LongOffset: + MOZ_TRY(PushString(aSkeleton, u"OOOO")); + break; + case DateTimeFormat::TimeZoneName::ShortGeneric: + MOZ_TRY(PushString(aSkeleton, u"v")); + break; + case DateTimeFormat::TimeZoneName::LongGeneric: + MOZ_TRY(PushString(aSkeleton, u"vvvv")); + break; + } + } + return Ok(); +} + +/* static */ +Result, ICUError> +DateTimeFormat::TryCreateFromComponents( + Span aLocale, const DateTimeFormat::ComponentsBag& aBag, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe> aTimeZoneOverride) { + DateTimeFormat::SkeletonVector skeleton; + MOZ_TRY(ToICUSkeleton(aBag, skeleton)); + return TryCreateFromSkeleton(aLocale, skeleton, aDateTimePatternGenerator, + aBag.hourCycle, aTimeZoneOverride); +} + +/* static */ +Result, ICUError> +DateTimeFormat::TryCreateFromPattern( + Span aLocale, Span aPattern, + Maybe> aTimeZoneOverride) { + UErrorCode status = U_ZERO_ERROR; + + // The time zone is optional. + int32_t tzIDLength = -1; + const UChar* tzID = nullptr; + if (aTimeZoneOverride) { + tzIDLength = static_cast(aTimeZoneOverride->size()); + tzID = aTimeZoneOverride->data(); + } + + // Create the date formatter. + UDateFormat* dateFormat = udat_open( + UDAT_PATTERN, UDAT_PATTERN, IcuLocale(aLocale), tzID, tzIDLength, + aPattern.data(), static_cast(aPattern.size()), &status); + + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // The DateTimeFormat wrapper will control the life cycle of the ICU + // dateFormat object. 
+ return UniquePtr(new DateTimeFormat(dateFormat)); +} + +/* static */ +Result, ICUError> +DateTimeFormat::TryCreateFromSkeleton( + Span aLocale, Span aSkeleton, + DateTimePatternGenerator* aDateTimePatternGenerator, + Maybe aHourCycle, + Maybe> aTimeZoneOverride) { + if (!aDateTimePatternGenerator) { + return Err(ICUError::InternalError); + } + + // Compute the best pattern for the skeleton. + DateTimeFormat::PatternVector pattern; + auto options = PatternMatchOptions(aSkeleton); + MOZ_TRY( + aDateTimePatternGenerator->GetBestPattern(aSkeleton, pattern, options)); + + if (aHourCycle) { + DateTimeFormat::ReplaceHourSymbol(pattern, *aHourCycle); + } + + auto result = + DateTimeFormat::TryCreateFromPattern(aLocale, pattern, aTimeZoneOverride); + if (result.isErr()) { + return Err(result.unwrapErr()); + } + auto dateTimeFormat = result.unwrap(); + MOZ_TRY(dateTimeFormat->CacheSkeleton(aSkeleton)); + return dateTimeFormat; +} + +ICUResult DateTimeFormat::CacheSkeleton(Span aSkeleton) { + if (mOriginalSkeleton.append(aSkeleton.Elements(), aSkeleton.Length())) { + return Ok(); + } + return Err(ICUError::OutOfMemory); +} + +void DateTimeFormat::SetStartTimeIfGregorian(double aTime) { + UErrorCode status = U_ZERO_ERROR; + UCalendar* cal = const_cast(udat_getCalendar(mDateFormat)); + ucal_setGregorianChange(cal, aTime, &status); + // An error here means the calendar is not Gregorian, and can be ignored. +} + +/* static */ +Result, ICUError> DateTimeFormat::CloneCalendar( + double aUnixEpoch) const { + UErrorCode status = U_ZERO_ERROR; + UCalendar* calendarRaw = ucal_clone(udat_getCalendar(mDateFormat), &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + auto calendar = MakeUnique(calendarRaw); + + MOZ_TRY(calendar->SetTimeInMs(aUnixEpoch)); + + return calendar; +} + +/** + * ICU locale identifier consisting of a language and a region subtag. 
+ */ +class LanguageRegionLocaleId { + // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + static constexpr size_t LanguageLength = 8; + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + static constexpr size_t RegionLength = 3; + + // Add +1 to account for the separator. + static constexpr size_t LRLength = LanguageLength + RegionLength + 1; + + // Add +1 to zero terminate the string. + char mLocale[LRLength + 1] = {}; + + // Pointer to the start of the region subtag within |mLocale|. + char* mRegion = nullptr; + + public: + LanguageRegionLocaleId(Span aLanguage, + Maybe> aRegion); + + const char* languageRegion() const { return mLocale; } + const char* region() const { return mRegion; } +}; + +LanguageRegionLocaleId::LanguageRegionLocaleId( + Span aLanguage, Maybe> aRegion) { + MOZ_RELEASE_ASSERT(aLanguage.Length() <= LanguageLength); + MOZ_RELEASE_ASSERT(!aRegion || aRegion->Length() <= RegionLength); + + size_t languageLength = aLanguage.Length(); + + std::memcpy(mLocale, aLanguage.Elements(), languageLength); + + // ICU locale identifiers are separated by underscores. + mLocale[languageLength] = '_'; + + mRegion = mLocale + languageLength + 1; + if (aRegion) { + std::memcpy(mRegion, aRegion->Elements(), aRegion->Length()); + } else { + // Use "001" (UN M.49 code for the World) as the fallback to match ICU. + std::strcpy(mRegion, "001"); + } +} + +/* static */ +Result +DateTimeFormat::GetAllowedHourCycles(Span aLanguage, + Maybe> aRegion) { + // ICU doesn't expose a public API to retrieve the hour cycles for a locale, so + // we have to reconstruct |DateTimePatternGenerator::getAllowedHourFormats()| + // using the public UResourceBundle API. + // + // The time data format is specified in UTS 35 at [1] and the data itself is + // located at [2].
+ // + // [1] https://unicode.org/reports/tr35/tr35-dates.html#Time_Data + // [2] + // https://github.com/unicode-org/cldr/blob/master/common/supplemental/supplementalData.xml + + HourCyclesVector result; + + // Reserve space for the maximum number of hour cycles. This call always + // succeeds because it matches the inline capacity. We can now infallibly + // append all hour cycles to the vector. + MOZ_ALWAYS_TRUE(result.reserve(HourCyclesVector::InlineLength)); + + LanguageRegionLocaleId localeId(aLanguage, aRegion); + + // First open the "supplementalData" resource bundle. + UErrorCode status = U_ZERO_ERROR; + UResourceBundle* res = ures_openDirect(nullptr, "supplementalData", &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject closeRes(res); + MOZ_ASSERT(ures_getType(res) == URES_TABLE); + + // Locate "timeData" within the "supplementalData" resource bundle. + UResourceBundle* timeData = ures_getByKey(res, "timeData", nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject closeTimeData(timeData); + MOZ_ASSERT(ures_getType(timeData) == URES_TABLE); + + // Try to find a matching resource within "timeData". The two possible keys + // into the "timeData" resource bundle are `language_region` and `region`. + // Prefer `language_region` and otherwise fall back to `region`. + UResourceBundle* hclocale = + ures_getByKey(timeData, localeId.languageRegion(), nullptr, &status); + if (status == U_MISSING_RESOURCE_ERROR) { + status = U_ZERO_ERROR; + hclocale = ures_getByKey(timeData, localeId.region(), nullptr, &status); + } + if (status == U_MISSING_RESOURCE_ERROR) { + // Default to "h23" if no resource was found at all. This matches ICU.
+ result.infallibleAppend(HourCycle::H23); + return result; + } + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject closeHcLocale(hclocale); + MOZ_ASSERT(ures_getType(hclocale) == URES_TABLE); + + EnumSet added{}; + + auto addToResult = [&](const UChar* str, int32_t len) { + // An hour cycle string is one of "K", "h", "H", or "k"; optionally + // followed by the suffix "b" or "B". We ignore the suffix because day + // periods can't be expressed in the "hc" Unicode extension. + MOZ_ASSERT(len == 1 || len == 2); + + // Default to "h23" for unsupported hour cycle strings. + HourCycle hc = HourCycle::H23; + switch (str[0]) { + case 'K': + hc = HourCycle::H11; + break; + case 'h': + hc = HourCycle::H12; + break; + case 'H': + hc = HourCycle::H23; + break; + case 'k': + hc = HourCycle::H24; + break; + } + + // Add each unique hour cycle to the result array. + if (!added.contains(hc)) { + added += hc; + + result.infallibleAppend(hc); + } + }; + + // Determine the preferred hour cycle for the locale. + int32_t len = 0; + const UChar* hc = ures_getStringByKey(hclocale, "preferred", &len, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + addToResult(hc, len); + + // Find any additionally allowed hour cycles of the locale.
+ UResourceBundle* allowed = + ures_getByKey(hclocale, "allowed", nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject closeAllowed(allowed); + MOZ_ASSERT(ures_getType(allowed) == URES_ARRAY || + ures_getType(allowed) == URES_STRING); + + while (ures_hasNext(allowed)) { + int32_t len = 0; + const UChar* hc = ures_getNextString(allowed, &len, nullptr, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + addToResult(hc, len); + } + + return result; +} + +Result +DateTimeFormat::ResolveComponents() { + // Maps an ICU pattern string to a corresponding set of date-time components + // and their values, and adds properties for these components to the result + // object, which will be returned by the resolvedOptions method. For the + // interpretation of ICU pattern characters, see + // http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + + DateTimeFormat::PatternVector pattern{}; + VectorToBufferAdaptor buffer(pattern); + MOZ_TRY(GetPattern(buffer)); + + DateTimeFormat::ComponentsBag bag{}; + + using Text = DateTimeFormat::Text; + using HourCycle = DateTimeFormat::HourCycle; + using Numeric = DateTimeFormat::Numeric; + using Month = DateTimeFormat::Month; + + auto text = Text::Long; + auto numeric = Numeric::Numeric; + auto month = Month::Long; + uint8_t fractionalSecondDigits = 0; + + for (size_t i = 0, len = pattern.length(); i < len;) { + char16_t c = pattern[i++]; + if (c == u'\'') { + // Skip past string literals. + while (i < len && pattern[i] != u'\'') { + i++; + } + i++; + continue; + } + + // Count how many times the character is repeated. + size_t count = 1; + while (i < len && pattern[i] == c) { + i++; + count++; + } + + // Determine the enum case of the field. 
+ switch (c) { + // "text" cases + case u'G': + case u'E': + case u'c': + case u'B': + case u'z': + case u'O': + case u'v': + case u'V': + if (count <= 3) { + text = Text::Short; + } else if (count == 4) { + text = Text::Long; + } else { + text = Text::Narrow; + } + break; + // "number" cases + case u'y': + case u'd': + case u'h': + case u'H': + case u'm': + case u's': + case u'k': + case u'K': + if (count == 2) { + numeric = Numeric::TwoDigit; + } else { + numeric = Numeric::Numeric; + } + break; + // "numeric" cases + case u'r': + case u'U': + // Both are mapped to numeric years. + numeric = Numeric::Numeric; + break; + // "text & number" cases + case u'M': + case u'L': + if (count == 1) { + month = Month::Numeric; + } else if (count == 2) { + month = Month::TwoDigit; + } else if (count == 3) { + month = Month::Short; + } else if (count == 4) { + month = Month::Long; + } else { + month = Month::Narrow; + } + break; + case u'S': + fractionalSecondDigits = count; + break; + default: { + // skip other pattern characters and literal text + } + } + + // Map ICU pattern characters back to the corresponding date-time + // components of DateTimeFormat. 
See + // http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table + switch (c) { + case u'E': + case u'c': + bag.weekday = Some(text); + break; + case u'G': + bag.era = Some(text); + break; + case u'y': + case u'r': + case u'U': + bag.year = Some(numeric); + break; + case u'M': + case u'L': + bag.month = Some(month); + break; + case u'd': + bag.day = Some(numeric); + break; + case u'B': + bag.dayPeriod = Some(text); + break; + case u'K': + bag.hourCycle = Some(HourCycle::H11); + bag.hour = Some(numeric); + bag.hour12 = Some(true); + break; + case u'h': + bag.hourCycle = Some(HourCycle::H12); + bag.hour = Some(numeric); + bag.hour12 = Some(true); + break; + case u'H': + bag.hourCycle = Some(HourCycle::H23); + bag.hour = Some(numeric); + bag.hour12 = Some(false); + break; + case u'k': + bag.hourCycle = Some(HourCycle::H24); + bag.hour = Some(numeric); + bag.hour12 = Some(false); + break; + case u'm': + bag.minute = Some(numeric); + break; + case u's': + bag.second = Some(numeric); + break; + case u'S': + bag.fractionalSecondDigits = Some(fractionalSecondDigits); + break; + case u'z': + switch (text) { + case Text::Long: + bag.timeZoneName = Some(TimeZoneName::Long); + break; + case Text::Short: + case Text::Narrow: + bag.timeZoneName = Some(TimeZoneName::Short); + break; + } + break; + case u'O': + switch (text) { + case Text::Long: + bag.timeZoneName = Some(TimeZoneName::LongOffset); + break; + case Text::Short: + case Text::Narrow: + bag.timeZoneName = Some(TimeZoneName::ShortOffset); + break; + } + break; + case u'v': + case u'V': + switch (text) { + case Text::Long: + bag.timeZoneName = Some(TimeZoneName::LongGeneric); + break; + case Text::Short: + case Text::Narrow: + bag.timeZoneName = Some(TimeZoneName::ShortGeneric); + break; + } + break; + } + } + return bag; +} + +const char* DateTimeFormat::ToString( + DateTimeFormat::TimeZoneName aTimeZoneName) { + switch (aTimeZoneName) { + case TimeZoneName::Long: + return "long"; + case 
TimeZoneName::Short: + return "short"; + case TimeZoneName::ShortOffset: + return "shortOffset"; + case TimeZoneName::LongOffset: + return "longOffset"; + case TimeZoneName::ShortGeneric: + return "shortGeneric"; + case TimeZoneName::LongGeneric: + return "longGeneric"; + } + MOZ_CRASH("Unexpected DateTimeFormat::TimeZoneName"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Month aMonth) { + switch (aMonth) { + case Month::Numeric: + return "numeric"; + case Month::TwoDigit: + return "2-digit"; + case Month::Long: + return "long"; + case Month::Short: + return "short"; + case Month::Narrow: + return "narrow"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Month"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Text aText) { + switch (aText) { + case Text::Long: + return "long"; + case Text::Short: + return "short"; + case Text::Narrow: + return "narrow"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Text"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Numeric aNumeric) { + switch (aNumeric) { + case Numeric::Numeric: + return "numeric"; + case Numeric::TwoDigit: + return "2-digit"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Numeric"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::Style aStyle) { + switch (aStyle) { + case Style::Full: + return "full"; + case Style::Long: + return "long"; + case Style::Medium: + return "medium"; + case Style::Short: + return "short"; + } + MOZ_CRASH("Unexpected DateTimeFormat::Style"); +} + +const char* DateTimeFormat::ToString(DateTimeFormat::HourCycle aHourCycle) { + switch (aHourCycle) { + case HourCycle::H11: + return "h11"; + case HourCycle::H12: + return "h12"; + case HourCycle::H23: + return "h23"; + case HourCycle::H24: + return "h24"; + } + MOZ_CRASH("Unexpected DateTimeFormat::HourCycle"); +} + +ICUResult DateTimeFormat::TryFormatToParts( + UFieldPositionIterator* aFieldPositionIterator, size_t aSpanSize, + DateTimePartVector& aParts) const { + ScopedICUObject toClose( + 
aFieldPositionIterator); + + size_t lastEndIndex = 0; + auto AppendPart = [&](DateTimePartType type, size_t endIndex) { + // A part defined in FormatDateTimeToParts doesn't have a ||Source|| + // property; we store Shared for simplicity. + if (!aParts.emplaceBack(type, endIndex, DateTimePartSource::Shared)) { + return false; + } + + lastEndIndex = endIndex; + return true; + }; + + int32_t fieldInt, beginIndexInt, endIndexInt; + while ((fieldInt = ufieldpositer_next(aFieldPositionIterator, &beginIndexInt, + &endIndexInt)) >= 0) { + MOZ_ASSERT(beginIndexInt <= endIndexInt, + "field iterator returning invalid range"); + + size_t beginIndex = AssertedCast(beginIndexInt); + size_t endIndex = AssertedCast(endIndexInt); + + // Technically this isn't guaranteed. But it appears true in practice, + // and http://bugs.icu-project.org/trac/ticket/12024 is expected to + // correct the documentation lapse. + MOZ_ASSERT(lastEndIndex <= beginIndex, + "field iteration didn't return fields in order start to " + "finish as expected"); + + DateTimePartType type = + ConvertUFormatFieldToPartType(static_cast(fieldInt)); + if (lastEndIndex < beginIndex) { + if (!AppendPart(DateTimePartType::Literal, beginIndex)) { + return Err(ICUError::InternalError); + } + } + + if (!AppendPart(type, endIndex)) { + return Err(ICUError::InternalError); + } + } + + // Append any final literal. + if (lastEndIndex < aSpanSize) { + if (!AppendPart(DateTimePartType::Literal, aSpanSize)) { + return Err(ICUError::InternalError); + } + } + + return Ok(); +} + +} // namespace mozilla::intl diff --git a/intl/components/src/DateTimeFormat.h b/intl/components/src/DateTimeFormat.h new file mode 100644 index 0000000000..4853d9e3b2 --- /dev/null +++ b/intl/components/src/DateTimeFormat.h @@ -0,0 +1,593 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ +#ifndef intl_components_DateTimeFormat_h_ +#define intl_components_DateTimeFormat_h_ +#include +#include "unicode/udat.h" + +#include "mozilla/Assertions.h" +#include "mozilla/intl/ICU4CGlue.h" +#include "mozilla/intl/ICUError.h" + +#include "mozilla/intl/DateTimePart.h" +#include "mozilla/intl/DateTimePatternGenerator.h" +#include "mozilla/Maybe.h" +#include "mozilla/Span.h" +#include "mozilla/Try.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Utf8.h" +#include "mozilla/Variant.h" +#include "mozilla/Vector.h" + +/* + * To work around webcompat problems caused by Narrow No-Break Space in + * formatted date/time output, where existing code on the web naively + * assumes there will be a normal Space, we replace any occurrences of + * U+202F in the formatted results with U+0020. + * + * The intention is to undo this hack once other major browsers are also + * ready to ship with the updated (ICU72) i18n data that uses NNBSP. + * + * See https://bugzilla.mozilla.org/show_bug.cgi?id=1806042 for details, + * and see DateIntervalFormat.cpp for the other piece of this hack. + */ +#define DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 1 + +namespace mozilla::intl { + +#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES +static inline bool IsSpecialSpace(char16_t c) { + // NARROW NO-BREAK SPACE and THIN SPACE + return c == 0x202F || c == 0x2009; +} +#endif + +class Calendar; + +/** + * Intro to mozilla::intl::DateTimeFormat + * ====================================== + * + * This component is a Mozilla-focused API for the date formatting provided by + * ICU. The methods internally call out to ICU4C. This is responsible for and + * owns any resources opened through ICU, through RAII. + * + * The construction of a DateTimeFormat contains the majority of the cost + * of the DateTimeFormat operation. DateTimeFormat::TryFormat should be + * relatively inexpensive after the initial construction. 
+ * + * This class supports creating from Styles (a fixed set of options) and from a + * components bag (a list of components and their lengths). + * + * This API serves to back the ECMA-402 Intl.DateTimeFormat API. + * https://tc39.es/ecma402/#datetimeformat-objects + * + * + * ECMA-402 Intl.DateTimeFormat API and implementation details with ICU + * skeletons and patterns. + * ==================================================================== + * + * Different locales have different ways to display dates using the same + * basic components. For example, en-US might use "Sept. 24, 2012" while + * fr-FR might use "24 Sept. 2012". The intent of Intl.DateTimeFormat is to + * permit production of a format for the locale that best matches the + * set of date-time components and their desired representation as specified + * by the API client. + * + * ICU4C supports specification of date and time formats in three ways: + * + * 1) A style is just one of the identifiers FULL, LONG, MEDIUM, or SHORT. + * The date-time components included in each style and their representation + * are defined by ICU using CLDR locale data (CLDR is the Unicode + * Consortium's Common Locale Data Repository). + * + * 2) A skeleton is a string specifying which date-time components to include, + * and which representations to use for them. For example, "yyyyMMMMdd" + * specifies a year with at least four digits, a full month name, and a + * two-digit day. It does not specify in which order the components appear, + * how they are separated, the localized strings for textual components + * (such as weekday or month), whether the month is in format or + * stand-alone form¹, or the numbering system used for numeric components. + * All that information is filled in by ICU using CLDR locale data. + * ¹ The format form is the one used in formatted strings that include a + * day; the stand-alone form is used when not including days, e.g., in + * calendar headers. 
The two forms differ at least in some Slavic languages, + * e.g. Russian: "22 марта 2013 г." vs. "Март 2013". + * + * 3) A pattern is a string specifying which date-time components to include, + * in which order, with which separators, in which grammatical case. For + * example, "EEEE, d MMMM y" specifies the full localized weekday name, + * followed by comma and space, followed by the day, followed by space, + * followed by the full month name in format form, followed by space, + * followed by the full year. It + * still does not specify localized strings for textual components and the + * numbering system - these are determined by ICU using CLDR locale data or + * possibly API parameters. + * + * All actual formatting in ICU4C is done with patterns; styles and skeletons + * have to be mapped to patterns before processing. + * + * The options of Intl.DateTimeFormat most closely correspond to ICU skeletons. + * This implementation therefore converts DateTimeFormat options to ICU + * skeletons, and then lets ICU map skeletons to actual ICU patterns. The + * pattern may not directly correspond to what the skeleton requests, as the + * mapper (UDateTimePatternGenerator) is constrained by the available locale + * data for the locale. + * + * An ICU pattern represents the information of the following DateTimeFormat + * internal properties described in the specification, which therefore don't + * exist separately in the implementation: + * - [[weekday]], [[era]], [[year]], [[month]], [[day]], [[hour]], [[minute]], + * [[second]], [[timeZoneName]] + * - [[hour12]] + * - [[hourCycle]] + * - [[hourNo0]] + * When needed for the resolvedOptions method, the resolveICUPattern function + * queries the UDateFormat's internal pattern and then maps it back to the + * specified properties of the object returned by resolvedOptions. + * + * ICU date-time skeletons and patterns aren't fully documented in the ICU + * documentation (see http://bugs.icu-project.org/trac/ticket/9627).
The best + * documentation at this point is in UTR 35: + * http://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns + * + * Future support for ICU4X + * ======================== + * This implementation exposes a components bag, and internally handles the + * complexity of working with skeletons and patterns to generate the correct + * results. In the future, if and when we switch to ICU4X, the complexities of + * manipulating patterns will be able to be removed, as ICU4X will directly know + * how to apply the components bag. + */ +class DateTimeFormat final { + public: + /** + * The hour cycle for components. + */ + enum class HourCycle { + H11, + H12, + H23, + H24, + }; + + /** + * The style for dates or times. + */ + enum class Style { + Full, + Long, + Medium, + Short, + }; + + /** + * A bag of options to determine the length of the time and date styles. The + * hour cycle can be overridden. + */ + struct StyleBag { + Maybe