diff options
Diffstat (limited to 'intl/components/src/NumberFormatFields.cpp')
-rw-r--r-- | intl/components/src/NumberFormatFields.cpp | 396 |
1 files changed, 396 insertions, 0 deletions
diff --git a/intl/components/src/NumberFormatFields.cpp b/intl/components/src/NumberFormatFields.cpp new file mode 100644 index 0000000000..8ab4690d50 --- /dev/null +++ b/intl/components/src/NumberFormatFields.cpp @@ -0,0 +1,396 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "ICU4CGlue.h" +#include "NumberFormatFields.h" +#include "ScopedICUObject.h" + +#include "mozilla/FloatingPoint.h" +#include "unicode/uformattedvalue.h" +#include "unicode/unum.h" +#include "unicode/unumberformatter.h" + +namespace mozilla::intl { + +bool NumberFormatFields::append(NumberPartType type, int32_t begin, + int32_t end) { + MOZ_ASSERT(begin >= 0); + MOZ_ASSERT(end >= 0); + MOZ_ASSERT(begin < end, "erm, aren't fields always non-empty?"); + + return fields_.emplaceBack(uint32_t(begin), uint32_t(end), type); +} + +bool NumberFormatFields::toPartsVector(size_t overallLength, + const NumberPartSourceMap& sourceMap, + NumberPartVector& parts) { + std::sort(fields_.begin(), fields_.end(), + [](const NumberFormatField& left, const NumberFormatField& right) { + // Sort first by begin index, then to place + // enclosing fields before nested fields. + return left.begin < right.begin || + (left.begin == right.begin && left.end > right.end); + }); + + // Then iterate over the sorted field list to generate a sequence of parts + // (what ECMA-402 actually exposes). A part is a maximal character sequence + // entirely within no field or a single most-nested field. + // + // Diagrams may be helpful to illustrate how fields map to parts. Consider + // formatting -19,766,580,028,249.41, the US national surplus (negative + // because it's actually a debt) on October 18, 2016. + // + // var options = + // { style: "currency", currency: "USD", currencyDisplay: "name" }; + // var usdFormatter = new Intl.NumberFormat("en-US", options); + // usdFormatter.format(-19766580028249.41); + // + // The formatted result is "-19,766,580,028,249.41 US dollars". ICU + // identifies these fields in the string: + // + // UNUM_GROUPING_SEPARATOR_FIELD + // | + // UNUM_SIGN_FIELD | UNUM_DECIMAL_SEPARATOR_FIELD + // | __________/| | + // | / | | | | + // "-19,766,580,028,249.41 US dollars" + // \________________/ |/ \_______/ + // | | | + // UNUM_INTEGER_FIELD | UNUM_CURRENCY_FIELD + // | + // UNUM_FRACTION_FIELD + // + // These fields map to parts as follows: + // + // integer decimal + // _____|________ | + // / /| |\ |\ |\ | literal + // /| / | | \ | \ | \| | + // "-19,766,580,028,249.41 US dollars" + // | \___|___|___/ |/ \________/ + // | | | | + // | group | currency + // | | + // minusSign fraction + // + // The sign is a part. Each comma is a part, splitting the integer field + // into parts for trillions/billions/&c. digits. The decimal point is a + // part. Cents are a part. The space between cents and currency is a part + // (outside any field). Last, the currency field is a part. + + class PartGenerator { + // The fields in order from start to end, then least to most nested. + const FieldsVector& fields; + + // Index of the current field, in |fields|, being considered to + // determine part boundaries. |lastEnd <= fields[index].begin| is an + // invariant. + size_t index = 0; + + // The end index of the last part produced, always less than or equal + // to |limit|, strictly increasing. + uint32_t lastEnd = 0; + + // The length of the overall formatted string. + const uint32_t limit = 0; + + NumberPartSourceMap sourceMap; + + Vector<size_t, 4> enclosingFields; + + void popEnclosingFieldsEndingAt(uint32_t end) { + MOZ_ASSERT_IF(enclosingFields.length() > 0, + fields[enclosingFields.back()].end >= end); + + while (enclosingFields.length() > 0 && + fields[enclosingFields.back()].end == end) { + enclosingFields.popBack(); + } + } + + bool nextPartInternal(NumberPart* part) { + size_t len = fields.length(); + MOZ_ASSERT(index <= len); + + // If we're out of fields, all that remains are part(s) consisting + // of trailing portions of enclosing fields, and maybe a final + // literal part. + if (index == len) { + if (enclosingFields.length() > 0) { + const auto& enclosing = fields[enclosingFields.popCopy()]; + *part = {enclosing.type, sourceMap.source(enclosing), enclosing.end}; + + // If additional enclosing fields end where this part ends, + // pop them as well. + popEnclosingFieldsEndingAt(part->endIndex); + } else { + *part = {NumberPartType::Literal, sourceMap.source(limit), limit}; + } + + return true; + } + + // Otherwise we still have a field to process. + const NumberFormatField* current = &fields[index]; + MOZ_ASSERT(lastEnd <= current->begin); + MOZ_ASSERT(current->begin < current->end); + + // But first, deal with inter-field space. + if (lastEnd < current->begin) { + if (enclosingFields.length() > 0) { + // Space between fields, within an enclosing field, is part + // of that enclosing field, until the start of the current + // field or the end of the enclosing field, whichever is + // earlier. + const auto& enclosing = fields[enclosingFields.back()]; + *part = {enclosing.type, sourceMap.source(enclosing), + std::min(enclosing.end, current->begin)}; + popEnclosingFieldsEndingAt(part->endIndex); + } else { + // If there's no enclosing field, the space is a literal. + *part = {NumberPartType::Literal, sourceMap.source(current->begin), + current->begin}; + } + + return true; + } + + // Otherwise, the part spans a prefix of the current field. Find + // the most-nested field containing that prefix. + const NumberFormatField* next; + do { + current = &fields[index]; + + // If the current field is last, the part extends to its end. + if (++index == len) { + *part = {current->type, sourceMap.source(*current), current->end}; + return true; + } + + next = &fields[index]; + MOZ_ASSERT(current->begin <= next->begin); + MOZ_ASSERT(current->begin < next->end); + + // If the next field nests within the current field, push an + // enclosing field. (If there are no nested fields, don't + // bother pushing a field that'd be immediately popped.) + if (current->end > next->begin) { + if (!enclosingFields.append(index - 1)) { + return false; + } + } + + // Do so until the next field begins after this one. + } while (current->begin == next->begin); + + if (current->end <= next->begin) { + // The next field begins after the current field ends. Therefore + // the current part ends at the end of the current field. + *part = {current->type, sourceMap.source(*current), current->end}; + popEnclosingFieldsEndingAt(part->endIndex); + } else { + // The current field encloses the next one. The current part + // ends where the next field/part will start. + *part = {current->type, sourceMap.source(*current), next->begin}; + } + + return true; + } + + public: + PartGenerator(const FieldsVector& vec, uint32_t limit, + const NumberPartSourceMap& sourceMap) + : fields(vec), limit(limit), sourceMap(sourceMap), enclosingFields() {} + + bool nextPart(bool* hasPart, NumberPart* part) { + // There are no parts left if we've partitioned the entire string. + if (lastEnd == limit) { + MOZ_ASSERT(enclosingFields.length() == 0); + *hasPart = false; + return true; + } + + if (!nextPartInternal(part)) { + return false; + } + + *hasPart = true; + lastEnd = part->endIndex; + return true; + } + }; + + // Finally, generate the result array. + size_t lastEndIndex = 0; + + PartGenerator gen(fields_, overallLength, sourceMap); + do { + bool hasPart; + NumberPart part; + if (!gen.nextPart(&hasPart, &part)) { + return false; + } + + if (!hasPart) { + break; + } + + MOZ_ASSERT(lastEndIndex < part.endIndex); + + if (!parts.append(part)) { + return false; + } + + lastEndIndex = part.endIndex; + } while (true); + + MOZ_ASSERT(lastEndIndex == overallLength, + "result array must partition the entire string"); + + return lastEndIndex == overallLength; +} + +Result<std::u16string_view, ICUError> FormatResultToParts( + const UFormattedNumber* value, Maybe<double> number, bool isNegative, + bool formatForUnit, NumberPartVector& parts) { + UErrorCode status = U_ZERO_ERROR; + + const UFormattedValue* formattedValue = unumf_resultAsValue(value, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + return FormatResultToParts(formattedValue, number, isNegative, formatForUnit, + parts); +} + +Result<std::u16string_view, ICUError> FormatResultToParts( + const UFormattedValue* value, Maybe<double> number, bool isNegative, + bool formatForUnit, NumberPartVector& parts) { + UErrorCode status = U_ZERO_ERROR; + + int32_t utf16Length; + const char16_t* utf16Str = ufmtval_getString(value, &utf16Length, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + UConstrainedFieldPosition* fpos = ucfpos_open(&status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); + + // We're only interested in UFIELD_CATEGORY_NUMBER fields. + ucfpos_constrainCategory(fpos, UFIELD_CATEGORY_NUMBER, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + // Vacuum up fields in the overall formatted string. + NumberFormatFields fields; + + while (true) { + bool hasMore = ufmtval_nextPosition(value, fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + if (!hasMore) { + break; + } + + int32_t fieldName = ucfpos_getField(fpos, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + int32_t beginIndex, endIndex; + ucfpos_getIndexes(fpos, &beginIndex, &endIndex, &status); + if (U_FAILURE(status)) { + return Err(ToICUError(status)); + } + + Maybe<NumberPartType> partType = GetPartTypeForNumberField( + UNumberFormatFields(fieldName), number, isNegative, formatForUnit); + if (!partType || !fields.append(*partType, beginIndex, endIndex)) { + return Err(ICUError::InternalError); + } + } + + if (!fields.toPartsVector(utf16Length, parts)) { + return Err(ICUError::InternalError); + } + + return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length)); +} + +// See intl/icu/source/i18n/unicode/unum.h for a detailed field list. This +// list is deliberately exhaustive: cases might have to be added/removed if +// this code is compiled with a different ICU with more UNumberFormatFields +// enum initializers. Please guard such cases with appropriate ICU +// version-testing #ifdefs, should cross-version divergence occur. +Maybe<NumberPartType> GetPartTypeForNumberField(UNumberFormatFields fieldName, + Maybe<double> number, + bool isNegative, + bool formatForUnit) { + switch (fieldName) { + case UNUM_INTEGER_FIELD: + if (number.isSome()) { + if (std::isnan(*number)) { + return Some(NumberPartType::Nan); + } + if (!std::isfinite(*number)) { + return Some(NumberPartType::Infinity); + } + } + return Some(NumberPartType::Integer); + case UNUM_FRACTION_FIELD: + return Some(NumberPartType::Fraction); + case UNUM_DECIMAL_SEPARATOR_FIELD: + return Some(NumberPartType::Decimal); + case UNUM_EXPONENT_SYMBOL_FIELD: + return Some(NumberPartType::ExponentSeparator); + case UNUM_EXPONENT_SIGN_FIELD: + return Some(NumberPartType::ExponentMinusSign); + case UNUM_EXPONENT_FIELD: + return Some(NumberPartType::ExponentInteger); + case UNUM_GROUPING_SEPARATOR_FIELD: + return Some(NumberPartType::Group); + case UNUM_CURRENCY_FIELD: + return Some(NumberPartType::Currency); + case UNUM_PERCENT_FIELD: + if (formatForUnit) { + return Some(NumberPartType::Unit); + } + return Some(NumberPartType::Percent); + case UNUM_PERMILL_FIELD: + MOZ_ASSERT_UNREACHABLE( + "unexpected permill field found, even though " + "we don't use any user-defined patterns that " + "would require a permill field"); + break; + case UNUM_SIGN_FIELD: + if (isNegative) { + return Some(NumberPartType::MinusSign); + } + return Some(NumberPartType::PlusSign); + case UNUM_MEASURE_UNIT_FIELD: + return Some(NumberPartType::Unit); + case UNUM_COMPACT_FIELD: + return Some(NumberPartType::Compact); + case UNUM_APPROXIMATELY_SIGN_FIELD: + return Some(NumberPartType::ApproximatelySign); +#ifndef U_HIDE_DEPRECATED_API + case UNUM_FIELD_COUNT: + MOZ_ASSERT_UNREACHABLE( + "format field sentinel value returned by iterator!"); + break; +#endif + } + + MOZ_ASSERT_UNREACHABLE( + "unenumerated, undocumented format field returned by iterator"); + return Nothing(); +} + +} // namespace mozilla::intl |