summaryrefslogtreecommitdiffstats
path: root/intl/components/src/DateTimeFormat.h
blob: 4853d9e3b282e71bb1bd195b3127ee273c986b29 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef intl_components_DateTimeFormat_h_
#define intl_components_DateTimeFormat_h_
#include <functional>
#include "unicode/udat.h"

#include "mozilla/Assertions.h"
#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/intl/ICUError.h"

#include "mozilla/intl/DateTimePart.h"
#include "mozilla/intl/DateTimePatternGenerator.h"
#include "mozilla/Maybe.h"
#include "mozilla/Span.h"
#include "mozilla/Try.h"
#include "mozilla/UniquePtr.h"
#include "mozilla/Utf8.h"
#include "mozilla/Variant.h"
#include "mozilla/Vector.h"

/*
 * To work around webcompat problems caused by Narrow No-Break Space in
 * formatted date/time output, where existing code on the web naively
 * assumes there will be a normal Space, we replace any occurrences of
 * U+202F (NARROW NO-BREAK SPACE) and U+2009 (THIN SPACE) in the formatted
 * results with U+0020.
 *
 * The intention is to undo this hack once other major browsers are also
 * ready to ship with the updated (ICU72) i18n data that uses NNBSP.
 *
 * See https://bugzilla.mozilla.org/show_bug.cgi?id=1806042 for details,
 * and see DateIntervalFormat.cpp for the other piece of this hack.
 */
#define DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 1

namespace mozilla::intl {

#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
static inline bool IsSpecialSpace(char16_t c) {
  // Code points that get rewritten to a plain U+0020 SPACE in formatted
  // output while DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES is enabled.
  switch (c) {
    case 0x202F:  // NARROW NO-BREAK SPACE
    case 0x2009:  // THIN SPACE
      return true;
    default:
      return false;
  }
}
#endif

class Calendar;

/**
 * Intro to mozilla::intl::DateTimeFormat
 * ======================================
 *
 * This component is a Mozilla-focused API for the date formatting provided by
 * ICU. The methods internally call out to ICU4C. This is responsible for and
 * owns any resources opened through ICU, through RAII.
 *
 * The construction of a DateTimeFormat contains the majority of the cost
 * of the DateTimeFormat operation. DateTimeFormat::TryFormat should be
 * relatively inexpensive after the initial construction.
 *
 * This class supports creating from Styles (a fixed set of options) and from a
 * components bag (a list of components and their lengths).
 *
 * This API serves to back the ECMA-402 Intl.DateTimeFormat API.
 * https://tc39.es/ecma402/#datetimeformat-objects
 *
 *
 * ECMA-402 Intl.DateTimeFormat API and implementation details with ICU
 * skeletons and patterns.
 * ====================================================================
 *
 * Different locales have different ways to display dates using the same
 * basic components. For example, en-US might use "Sept. 24, 2012" while
 * fr-FR might use "24 Sept. 2012". The intent of Intl.DateTimeFormat is to
 * permit production of a format for the locale that best matches the
 * set of date-time components and their desired representation as specified
 * by the API client.
 *
 * ICU4C supports specification of date and time formats in three ways:
 *
 * 1) A style is just one of the identifiers FULL, LONG, MEDIUM, or SHORT.
 *    The date-time components included in each style and their representation
 *    are defined by ICU using CLDR locale data (CLDR is the Unicode
 *    Consortium's Common Locale Data Repository).
 *
 * 2) A skeleton is a string specifying which date-time components to include,
 *    and which representations to use for them. For example, "yyyyMMMMdd"
 *    specifies a year with at least four digits, a full month name, and a
 *    two-digit day. It does not specify in which order the components appear,
 *    how they are separated, the localized strings for textual components
 *    (such as weekday or month), whether the month is in format or
 *    stand-alone form¹, or the numbering system used for numeric components.
 *    All that information is filled in by ICU using CLDR locale data.
 *    ¹ The format form is the one used in formatted strings that include a
 *    day; the stand-alone form is used when not including days, e.g., in
 *    calendar headers. The two forms differ at least in some Slavic languages,
 *    e.g. Russian: "22 марта 2013 г." vs. "Март 2013".
 *
 * 3) A pattern is a string specifying which date-time components to include,
 *    in which order, with which separators, in which grammatical case. For
 *    example, "EEEE, d MMMM y" specifies the full localized weekday name,
 *    followed by comma and space, followed by the day, followed by space,
 *    followed by the full month name in format form, followed by space,
 *    followed by the full year. It
 *    still does not specify localized strings for textual components and the
 *    numbering system - these are determined by ICU using CLDR locale data or
 *    possibly API parameters.
 *
 * All actual formatting in ICU4C is done with patterns; styles and skeletons
 * have to be mapped to patterns before processing.
 *
 * The options of Intl.DateTimeFormat most closely correspond to ICU skeletons.
 * This implementation therefore converts DateTimeFormat options to ICU
 * skeletons, and then lets ICU map skeletons to actual ICU patterns. The
 * pattern may not directly correspond to what the skeleton requests, as the
 * mapper (UDateTimePatternGenerator) is constrained by the available locale
 * data for the locale.
 *
 * An ICU pattern represents the information of the following DateTimeFormat
 * internal properties described in the specification, which therefore don't
 * exist separately in the implementation:
 * - [[weekday]], [[era]], [[year]], [[month]], [[day]], [[hour]], [[minute]],
 *   [[second]], [[timeZoneName]]
 * - [[hour12]]
 * - [[hourCycle]]
 * - [[hourNo0]]
 * When needed for the resolvedOptions method, the resolveICUPattern function
 * queries the UDateFormat's internal pattern and then maps it back to the
 * specified properties of the object returned by resolvedOptions.
 *
 * ICU date-time skeletons and patterns aren't fully documented in the ICU
 * documentation (see http://bugs.icu-project.org/trac/ticket/9627). The best
 * documentation at this point is in UTR 35:
 * http://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns
 *
 * Future support for ICU4X
 * ========================
 * This implementation exposes a components bag, and internally handles the
 * complexity of working with skeletons and patterns to generate the correct
 * results. In the future, if and when we switch to ICU4X, the complexities of
 * manipulating patterns will be able to be removed, as ICU4X will directly know
 * how to apply the components bag.
 */
class DateTimeFormat final {
 public:
  /**
   * The hour cycle for components.
   */
  enum class HourCycle {
    // NOTE(review): the enumerator names presumably mirror the ECMA-402
    // hourCycle values "h11", "h12", "h23" and "h24"; ToString(HourCycle)
    // holds the authoritative strings — confirm there.
    H11,
    H12,
    H23,
    H24,
  };

  /**
   * The style for dates or times.
   */
  enum class Style {
    // These correspond to the ICU style identifiers FULL, LONG, MEDIUM and
    // SHORT described in the class comment; which components each style
    // includes is defined by ICU using CLDR locale data.
    Full,
    Long,
    Medium,
    Short,
  };

  /**
   * A bag of options to determine the length of the time and date styles. The
   * hour cycle can be overridden.
   */
  struct StyleBag {
    // Style used for the date portion. Defaults to Nothing() (not set).
    Maybe<Style> date = Nothing();
    // Style used for the time portion. Defaults to Nothing() (not set).
    Maybe<Style> time = Nothing();
    // Optional hour cycle override. Defaults to Nothing() (no override).
    Maybe<HourCycle> hourCycle = Nothing();
    // NOTE(review): presumably the ECMA-402 hour12 option; how it interacts
    // with hourCycle is decided in the implementation, which is not visible
    // in this header — confirm there.
    Maybe<bool> hour12 = Nothing();
  };

  /**
   * How to display numeric components such as the year and the day.
   */
  enum class Numeric {
    // NOTE(review): presumably the ECMA-402 "numeric" and "2-digit" option
    // values; ToString(Numeric) holds the authoritative strings.
    Numeric,
    TwoDigit,
  };

  /**
   * How to display the text components, such as the weekday or day period.
   */
  enum class Text {
    // NOTE(review): presumably the ECMA-402 "long", "short" and "narrow"
    // option values; ToString(Text) holds the authoritative strings.
    Long,
    Short,
    Narrow,
  };

  /**
   * How to display the month.
   */
  enum class Month {
    // The month can be displayed either like a numeric component (the first
    // two enumerators, named as in Numeric) or like a text component (the
    // last three, named as in Text).
    Numeric,
    TwoDigit,
    Long,
    Short,
    Narrow,
  };

  /**
   * How to display the time zone name.
   */
  enum class TimeZoneName {
    // NOTE(review): the enumerator names presumably mirror the ECMA-402
    // timeZoneName option values ("long", "short", "shortOffset",
    // "longOffset", "shortGeneric", "longGeneric"); ToString(TimeZoneName)
    // holds the authoritative strings — confirm there.
    Long,
    Short,
    ShortOffset,
    LongOffset,
    ShortGeneric,
    LongGeneric,
  };

  /**
   * Get static strings representing the enums. These match ECMA-402's resolved
   * options.
   * https://tc39.es/ecma402/#sec-intl.datetimeformat.prototype.resolvedoptions
   */
  static const char* ToString(DateTimeFormat::HourCycle aHourCycle);
  static const char* ToString(DateTimeFormat::Style aStyle);
  static const char* ToString(DateTimeFormat::Numeric aNumeric);
  static const char* ToString(DateTimeFormat::Text aText);
  static const char* ToString(DateTimeFormat::Month aMonth);
  static const char* ToString(DateTimeFormat::TimeZoneName aTimeZoneName);

  /**
   * A components bag specifies the components used to display a DateTime. Each
   * component can be styled individually, and ICU will attempt to create a best
   * match for a given locale.
   */
  struct ComponentsBag {
    // Every member defaults to Nothing(). NOTE(review): an unset member
    // presumably means the component is not requested — confirm in the
    // implementation.

    // Date components.
    Maybe<Text> era = Nothing();
    Maybe<Numeric> year = Nothing();
    Maybe<Month> month = Nothing();
    Maybe<Numeric> day = Nothing();
    Maybe<Text> weekday = Nothing();

    // Time components.
    Maybe<Numeric> hour = Nothing();
    Maybe<Numeric> minute = Nothing();
    Maybe<Numeric> second = Nothing();
    Maybe<TimeZoneName> timeZoneName = Nothing();

    // Hour representation options.
    Maybe<bool> hour12 = Nothing();
    Maybe<HourCycle> hourCycle = Nothing();
    Maybe<Text> dayPeriod = Nothing();

    // NOTE(review): presumably the number of fractional-second digits to
    // display; the valid range is not visible in this header.
    Maybe<uint8_t> fractionalSecondDigits = Nothing();
  };

  // Do not allow copy as this class owns the ICU resource. Move is not
  // currently implemented, but a custom move operator could be created if
  // needed.
  DateTimeFormat(const DateTimeFormat&) = delete;
  DateTimeFormat& operator=(const DateTimeFormat&) = delete;

  // mozilla::Vector can avoid heap allocations for small transient buffers.
  using PatternVector = Vector<char16_t, 128>;
  using SkeletonVector = Vector<char16_t, 16>;

  /**
   * Create a DateTimeFormat from styles.
   *
   * The "style" model uses different options for formatting a date or time
   * based on how the result will be styled, rather than picking specific
   * fields or lengths.
   *
   * Takes an optional time zone which will override the user's default
   * time zone. This is a UTF-16 string that takes the form "GMT±hh:mm", or
   * an IANA time zone identifier, e.g. "America/Chicago".
   */
  static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromStyle(
      Span<const char> aLocale, const StyleBag& aStyleBag,
      DateTimePatternGenerator* aDateTimePatternGenerator,
      Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{});

 private:
  /**
   * Create a DateTimeFormat from a UTF-16 skeleton.
   *
   * A skeleton is an unordered list of fields that are used to find an
   * appropriate date time format pattern. Example skeletons would be "yMd",
   * "yMMMd", "EBhm". If the skeleton includes string literals or other
   * information, it will be discarded when matching against skeletons.
   *
   * Takes an optional time zone which will override the user's default
   * time zone. This is a string that takes the form "GMT±hh:mm", or
   * an IANA time zone identifier, e.g. "America/Chicago".
   */
  static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromSkeleton(
      Span<const char> aLocale, Span<const char16_t> aSkeleton,
      DateTimePatternGenerator* aDateTimePatternGenerator,
      Maybe<DateTimeFormat::HourCycle> aHourCycle,
      Maybe<Span<const char16_t>> aTimeZoneOverride);

 public:
  /**
   * Create a DateTimeFormat from a ComponentsBag.
   *
   * See the ComponentsBag for additional documentation.
   *
   * Takes an optional time zone which will override the user's default
   * time zone. This is a string that takes the form "GMT±hh:mm", or
   * an IANA time zone identifier, e.g. "America/Chicago".
   */
  static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromComponents(
      Span<const char> aLocale, const ComponentsBag& bag,
      DateTimePatternGenerator* aDateTimePatternGenerator,
      Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{});

  /**
   * Create a DateTimeFormat from a raw pattern.
   *
   * Warning: This method should not be added to new code. In the near future we
   * plan to remove it.
   */
  static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromPattern(
      Span<const char> aLocale, Span<const char16_t> aPattern,
      Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{});

  /**
   * Use the format settings to format a date time into a string. The
   * resulting string is not null-terminated and is placed into the provided
   * buffer. The idea behind this API is that the constructor is expensive, and
   * then the format operation is cheap.
   *
   * aUnixEpoch is the number of milliseconds since 1 January 1970, UTC.
   */
  template <typename B>
  ICUResult TryFormat(double aUnixEpoch, B& aBuffer) const {
    using CharType = typename B::CharType;
    constexpr bool kIsUtf16Buffer = std::is_same_v<CharType, char16_t>;
    constexpr bool kIsUtf8Buffer = std::is_same_v<CharType, char> ||
                                   std::is_same_v<CharType, unsigned char>;
    static_assert(
        kIsUtf8Buffer || kIsUtf16Buffer,
        "The only buffer CharTypes supported by DateTimeFormat are char "
        "(for UTF-8 support) and char16_t (for UTF-16 support).");

    // The same ICU call serves both output encodings; only the destination
    // that receives ICU's UTF-16 output differs between the branches below.
    const auto formatDate = [this, aUnixEpoch](UChar* target, int32_t length,
                                               UErrorCode* status) {
      return udat_format(mDateFormat, aUnixEpoch, target, length,
                         /* UFieldPosition* */ nullptr, status);
    };

    if constexpr (kIsUtf16Buffer) {
      // The output buffer is UTF-16, so ICU can write directly into it.
      auto formatted = FillBufferWithICUCall(aBuffer, formatDate);
      if (formatted.isErr()) {
        return formatted;
      }

#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
      // Webcompat workaround: rewrite special spaces to a plain space in
      // place.
      for (auto& unit : Span(aBuffer.data(), aBuffer.length())) {
        if (IsSpecialSpace(unit)) {
          unit = ' ';
        }
      }
#endif

      return Ok{};
    } else {
      // The output buffer is UTF-8, but ICU uses UTF-16 internally: format
      // into a temporary UTF-16 vector first, then hand that vector to
      // FillBuffer() to populate the caller's buffer.
      PatternVector utf16;
      auto formatted = FillBufferWithICUCall(utf16, formatDate);
      if (formatted.isErr()) {
        return formatted;
      }

#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
      // Webcompat workaround: rewrite special spaces before the text is
      // copied out of the temporary vector.
      for (auto& unit : utf16) {
        if (IsSpecialSpace(unit)) {
          unit = ' ';
        }
      }
#endif

      if (FillBuffer(utf16, aBuffer)) {
        return Ok{};
      }
      return Err(ICUError::OutOfMemory);
    }
  }

  /**
   * Format the Unix epoch time into a DateTimePartVector.
   *
   * The caller has to create the buffer and the vector and pass to this method.
   * The formatted string will be stored in the buffer and formatted parts in
   * the vector.
   *
   * aUnixEpoch is the number of milliseconds since 1 January 1970, UTC.
   *
   * See:
   * https://tc39.es/ecma402/#sec-formatdatetimetoparts
   */
  template <typename B>
  ICUResult TryFormatToParts(double aUnixEpoch, B& aBuffer,
                             DateTimePartVector& aParts) const {
    static_assert(std::is_same_v<typename B::CharType, char16_t>,
                  "Only char16_t is supported (for UTF-16 support) now.");

    UErrorCode openStatus = U_ZERO_ERROR;
    UFieldPositionIterator* fieldPositions = ufieldpositer_open(&openStatus);
    if (U_FAILURE(openStatus)) {
      return Err(ToICUError(openStatus));
    }

    // Format into the caller's buffer and let ICU record the field positions
    // in the iterator at the same time.
    const auto formatFields = [this, aUnixEpoch, fieldPositions](
                                  UChar* chars, int32_t size,
                                  UErrorCode* status) {
      return udat_formatForFields(mDateFormat, aUnixEpoch, chars, size,
                                  fieldPositions, status);
    };

    auto formatted = FillBufferWithICUCall(aBuffer, formatFields);
    if (formatted.isErr()) {
      // The iterator is not passed on when formatting fails, so it has to be
      // released here.
      ufieldpositer_close(fieldPositions);
      return formatted.propagateErr();
    }

#if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES
    // Webcompat workaround: rewrite special spaces to a plain space in place.
    for (auto& unit : Span(aBuffer.data(), aBuffer.length())) {
      if (IsSpecialSpace(unit)) {
        unit = ' ';
      }
    }
#endif

    // The iterator is not closed in this function on the success path.
    // NOTE(review): the private overload is expected to take ownership of it
    // and close it — confirm in the .cpp.
    return TryFormatToParts(fieldPositions, aBuffer.length(), aParts);
  }

  /**
   * Copies the pattern for the current DateTimeFormat to a buffer.
   *
   * Warning: This method should not be added to new code. In the near future we
   * plan to remove it.
   */
  template <typename B>
  ICUResult GetPattern(B& aBuffer) const {
    // Ask ICU for the non-localized form of the formatter's pattern.
    const auto copyPattern = [this](UChar* target, int32_t length,
                                    UErrorCode* status) {
      return udat_toPattern(mDateFormat, /* localized*/ false, target, length,
                            status);
    };
    return FillBufferWithICUCall(aBuffer, copyPattern);
  }

  /**
   * Copies the skeleton that was used to generate the current DateTimeFormat to
   * the given buffer. If no skeleton was used, then a skeleton is generated
   * from the resolved pattern. Note that going from skeleton -> resolved
   * pattern -> skeleton is not a 1:1 mapping, as the resolved pattern can
   * contain different symbols than the requested skeleton.
   *
   * Warning: This method should not be added to new code. In the near future we
   * plan to remove it.
   */
  template <typename B>
  ICUResult GetOriginalSkeleton(B& aBuffer) {
    static_assert(std::is_same_v<typename B::CharType, char16_t>);

    const bool hasCachedSkeleton = mOriginalSkeleton.length() != 0;
    if (!hasCachedSkeleton) {
      // Nothing was cached: read back the resolved pattern and derive a
      // skeleton from it. The skeleton is written into mOriginalSkeleton, so
      // this branch is skipped once that vector is non-empty.
      PatternVector resolvedPattern{};
      VectorToBufferAdaptor patternBuffer(resolvedPattern);
      MOZ_TRY(GetPattern(patternBuffer));

      VectorToBufferAdaptor skeletonBuffer(mOriginalSkeleton);
      MOZ_TRY(DateTimePatternGenerator::GetSkeleton(resolvedPattern,
                                                    skeletonBuffer));
    }

    // Copy the (possibly just generated) skeleton out to the caller.
    if (FillBuffer(mOriginalSkeleton, aBuffer)) {
      return Ok();
    }
    return Err(ICUError::OutOfMemory);
  }
  /**
   * Set the start time of the Gregorian calendar. This is useful for
   * ensuring the consistent use of a proleptic Gregorian calendar for ECMA-402.
   * https://en.wikipedia.org/wiki/Proleptic_Gregorian_calendar
   */
  void SetStartTimeIfGregorian(double aTime);

  /**
   * Determines the resolved components for the current DateTimeFormat.
   *
   * When a DateTimeFormat is created, even from a components bag, the resolved
   * formatter may tweak the resolved components depending on the configuration
   * and the locale.
   *
   * For the implementation, with ICU4C, this takes a string pattern and maps it
   * back to a ComponentsBag.
   */
  Result<ComponentsBag, ICUError> ResolveComponents();

  ~DateTimeFormat();

  /**
   * Clones the Calendar from a DateTimeFormat, and sets its time with the
   * relative milliseconds since 1 January 1970, UTC.
   */
  Result<UniquePtr<Calendar>, ICUError> CloneCalendar(double aUnixEpoch) const;

  /**
   * Return the hour cycle used in the input pattern or Nothing if none was
   * found.
   */
  static Maybe<DateTimeFormat::HourCycle> HourCycleFromPattern(
      Span<const char16_t> aPattern);

  using HourCyclesVector = Vector<HourCycle, 4>;

  /**
   * Returns the allowed hour cycles for the input locale.
   *
   * NOTE: This function currently takes a language subtag and an optional
   * region subtag. This is a restriction until bug 1719746 has migrated
   * language tag processing into the unified Intl component. After bug 1719746,
   * this function should be changed to accept a single locale tag.
   */
  static Result<HourCyclesVector, ICUError> GetAllowedHourCycles(
      Span<const char> aLanguage, Maybe<Span<const char>> aRegion);

  /**
   * Returns an iterator over all supported date-time formatter locales.
   *
   * The returned strings are ICU locale identifiers and NOT BCP 47 language
   * tags.
   *
   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
   */
  static auto GetAvailableLocales() {
    return AvailableLocalesEnumeration<udat_countAvailable,
                                       udat_getAvailable>();
  }

 private:
  // Wraps an existing UDateFormat. Private, so instances are only created
  // through the static TryCreateFrom*() factories.
  // NOTE(review): presumably takes ownership of aDateFormat (the class is
  // documented as owning its ICU resources) — confirm in the .cpp.
  explicit DateTimeFormat(UDateFormat* aDateFormat);

  // NOTE(review): presumably stores aSkeleton in mOriginalSkeleton so that
  // GetOriginalSkeleton() can return it later — confirm in the .cpp.
  ICUResult CacheSkeleton(Span<const char16_t> aSkeleton);

  // Non-template counterpart of the public TryFormatToParts(). The public
  // template calls it with the field position iterator it opened and the
  // length of the formatted buffer; the resulting parts go into aParts.
  // NOTE(review): the public template does not close the iterator after a
  // successful format, so this overload is expected to close it — confirm in
  // the .cpp.
  ICUResult TryFormatToParts(UFieldPositionIterator* aFieldPositionIterator,
                             size_t aSpanSize,
                             DateTimePartVector& aParts) const;
  /**
   * Replaces all hour pattern characters in |aPatternOrSkeleton| to use the
   * matching hour representation for |aHourCycle|.
   */
  static void ReplaceHourSymbol(Span<char16_t> aPatternOrSkeleton,
                                DateTimeFormat::HourCycle aHourCycle);

  /**
   * Find a matching pattern using the requested hour-12 options.
   *
   * This function is needed to work around the following two issues.
   * - https://unicode-org.atlassian.net/browse/ICU-21023
   * - https://unicode-org.atlassian.net/browse/CLDR-13425
   *
   * We're currently using a relatively simple workaround, which doesn't give
   * the most accurate results. For example:
   *
   * ```
   * var dtf = new Intl.DateTimeFormat("en", {
   *   timeZone: "UTC",
   *   dateStyle: "long",
   *   timeStyle: "long",
   *   hourCycle: "h12",
   * });
   * print(dtf.format(new Date("2020-01-01T00:00Z")));
   * ```
   *
   * Returns the pattern "MMMM d, y 'at' h:mm:ss a z", but when going through
   * |DateTimePatternGenerator::GetSkeleton| and then
   * |DateTimePatternGenerator::GetBestPattern| to find an equivalent pattern
   * for "h23", we'll end up with the pattern "MMMM d, y, HH:mm:ss z", so the
   * combinator element " 'at' " was lost in the process.
   */
  static ICUResult FindPatternWithHourCycle(
      DateTimePatternGenerator& aDateTimePatternGenerator,
      DateTimeFormat::PatternVector& aPattern, bool aHour12,
      DateTimeFormat::SkeletonVector& aSkeleton);

  // The underlying ICU date formatter that the udat_* calls in this class
  // operate on. Copying is disabled above because the class owns this ICU
  // resource.
  UDateFormat* mDateFormat = nullptr;

  // Skeleton cache read by GetOriginalSkeleton(). While it is empty,
  // GetOriginalSkeleton() generates a skeleton from the resolved pattern and
  // writes it here.
  SkeletonVector mOriginalSkeleton;
};

}  // namespace mozilla::intl

#endif