diff options
Diffstat (limited to 'intl/icu/source/i18n/number_skeletons.h')
-rw-r--r-- | intl/icu/source/i18n/number_skeletons.h | 393 |
1 files changed, 393 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/number_skeletons.h b/intl/icu/source/i18n/number_skeletons.h
new file mode 100644
index 0000000000..27f69cd48c
--- /dev/null
+++ b/intl/icu/source/i18n/number_skeletons.h
@@ -0,0 +1,393 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING
+#ifndef __SOURCE_NUMBER_SKELETONS_H__
+#define __SOURCE_NUMBER_SKELETONS_H__
+
+#include "number_types.h"
+#include "numparse_types.h"
+#include "unicode/ucharstrie.h"
+#include "string_segment.h"
+
+U_NAMESPACE_BEGIN
+namespace number {
+namespace impl {
+
+// Forward-declaration (the full definition appears near the end of this header; it is needed
+// here because skeleton::parseStem() takes a SeenMacroProps& parameter).
+struct SeenMacroProps;
+
+// namespace for enums and entrypoint functions
+namespace skeleton {
+
+////////////////////////////////////////////////////////////////////////////////////////
+// NOTE: For examples of how to add a new stem to the number skeleton parser, see:    //
+// https://github.com/unicode-org/icu/commit/a2a7982216b2348070dc71093775ac7195793d73 //
+// and                                                                                //
+// https://github.com/unicode-org/icu/commit/6fe86f3934a8a5701034f648a8f7c5087e84aa28 //
+////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * While parsing a skeleton, this enum records what type of option we expect to find next.
+ *
+ * The enumerators are grouped into three sections (see the section comments below) according to
+ * whether an option is forbidden, optional, or required after the stem that was just parsed.
+ */
+enum ParseState {
+
+    // Section 0: We expect whitespace or a stem, but not an option:
+
+    STATE_NULL,
+
+    // Section 1: We might accept an option, but it is not required:
+
+    STATE_SCIENTIFIC,
+    STATE_FRACTION_PRECISION,
+    STATE_PRECISION,
+
+    // Section 2: An option is required:
+
+    STATE_INCREMENT_PRECISION,
+    STATE_MEASURE_UNIT,
+    STATE_PER_MEASURE_UNIT,
+    STATE_IDENTIFIER_UNIT,
+    STATE_UNIT_USAGE,
+    STATE_CURRENCY_UNIT,
+    STATE_INTEGER_WIDTH,
+    STATE_NUMBERING_SYSTEM,
+    STATE_SCALE,
+};
+
+/**
+ * All possible stem literals have an entry in the StemEnum. The enum name is the kebab case stem
+ * string literal written in upper snake case.
+ *
+ * The enumerators are split into stems that are complete on their own (Section 1) and stems that
+ * must be followed by an option (Section 2).
+ *
+ * @see StemToObject
+ * @see #SERIALIZED_STEM_TRIE
+ */
+enum StemEnum {
+
+    // Section 1: Stems that do not require an option:
+
+    STEM_COMPACT_SHORT,
+    STEM_COMPACT_LONG,
+    STEM_SCIENTIFIC,
+    STEM_ENGINEERING,
+    STEM_NOTATION_SIMPLE,
+    STEM_BASE_UNIT,
+    STEM_PERCENT,
+    STEM_PERMILLE,
+    STEM_PERCENT_100, // concise-only
+    STEM_PRECISION_INTEGER,
+    STEM_PRECISION_UNLIMITED,
+    STEM_PRECISION_CURRENCY_STANDARD,
+    STEM_PRECISION_CURRENCY_CASH,
+    STEM_ROUNDING_MODE_CEILING,
+    STEM_ROUNDING_MODE_FLOOR,
+    STEM_ROUNDING_MODE_DOWN,
+    STEM_ROUNDING_MODE_UP,
+    STEM_ROUNDING_MODE_HALF_EVEN,
+    STEM_ROUNDING_MODE_HALF_ODD,
+    STEM_ROUNDING_MODE_HALF_CEILING,
+    STEM_ROUNDING_MODE_HALF_FLOOR,
+    STEM_ROUNDING_MODE_HALF_DOWN,
+    STEM_ROUNDING_MODE_HALF_UP,
+    STEM_ROUNDING_MODE_UNNECESSARY,
+    STEM_INTEGER_WIDTH_TRUNC,
+    STEM_GROUP_OFF,
+    STEM_GROUP_MIN2,
+    STEM_GROUP_AUTO,
+    STEM_GROUP_ON_ALIGNED,
+    STEM_GROUP_THOUSANDS,
+    STEM_LATIN,
+    STEM_UNIT_WIDTH_NARROW,
+    STEM_UNIT_WIDTH_SHORT,
+    STEM_UNIT_WIDTH_FULL_NAME,
+    STEM_UNIT_WIDTH_ISO_CODE,
+    STEM_UNIT_WIDTH_FORMAL,
+    STEM_UNIT_WIDTH_VARIANT,
+    STEM_UNIT_WIDTH_HIDDEN,
+    STEM_SIGN_AUTO,
+    STEM_SIGN_ALWAYS,
+    STEM_SIGN_NEVER,
+    STEM_SIGN_ACCOUNTING,
+    STEM_SIGN_ACCOUNTING_ALWAYS,
+    STEM_SIGN_EXCEPT_ZERO,
+    STEM_SIGN_ACCOUNTING_EXCEPT_ZERO,
+    STEM_SIGN_NEGATIVE,
+    STEM_SIGN_ACCOUNTING_NEGATIVE,
+    STEM_DECIMAL_AUTO,
+    STEM_DECIMAL_ALWAYS,
+
+    // Section 2: Stems that DO require an option:
+
+    STEM_PRECISION_INCREMENT,
+    STEM_MEASURE_UNIT,
+    STEM_PER_MEASURE_UNIT,
+    STEM_UNIT,
+    STEM_UNIT_USAGE,
+    STEM_CURRENCY,
+    STEM_INTEGER_WIDTH,
+    STEM_NUMBERING_SYSTEM,
+    STEM_SCALE,
+};
+
+/** Default wildcard char, accepted on input and printed in output */
+constexpr char16_t kWildcardChar = u'*';
+
+/** Alternative wildcard char, accepted on input but not printed in output */
+constexpr char16_t kAltWildcardChar = u'+';
+
+/**
+ * Checks whether the char is a wildcard on input.
+ *
+ * @return true if c equals kWildcardChar or kAltWildcardChar, false otherwise.
+ */
+inline bool isWildcardChar(char16_t c) {
+    return c == kWildcardChar || c == kAltWildcardChar;
+}
+
+/**
+ * Creates a NumberFormatter corresponding to the given skeleton string.
+ *
+ * @param skeletonString
+ *            A number skeleton string, possibly not in its shortest form.
+ * @param perror
+ *            Passed by pointer; presumably receives parse-error details.
+ *            NOTE(review): confirm nullability and contents against the definition.
+ * @param status
+ *            Error code, passed by reference.
+ * @return An UnlocalizedNumberFormatter with behavior defined by the given skeleton string.
+ */
+UnlocalizedNumberFormatter create(
+        const UnicodeString& skeletonString, UParseError* perror, UErrorCode& status);
+
+/**
+ * Create a skeleton string corresponding to the given NumberFormatter.
+ *
+ * @param macros
+ *            The NumberFormatter options object.
+ * @param status
+ *            Error code, passed by reference.
+ * @return A skeleton string in normalized form.
+ */
+UnicodeString generate(const MacroProps& macros, UErrorCode& status);
+
+/**
+ * Converts from a skeleton string to a MacroProps. This method contains the primary parse loop.
+ *
+ * Internal: use the create() endpoint instead of this function.
+ *
+ * NOTE(review): errOffset is passed by non-const reference; presumably it receives the offset of
+ * a syntax error within skeletonString -- confirm against the definition.
+ */
+MacroProps parseSkeleton(const UnicodeString& skeletonString, int32_t& errOffset, UErrorCode& status);
+
+/**
+ * Given that the current segment represents a stem, parse it and save the result.
+ *
+ * @return The next state after parsing this stem, corresponding to what subset of options to expect.
+ */
+ParseState parseStem(const StringSegment& segment, const UCharsTrie& stemTrie, SeenMacroProps& seen,
+                     MacroProps& macros, UErrorCode& status);
+
+/**
+ * Given that the current segment represents an option, parse it and save the result.
+ *
+ * @return The next state after parsing this option, corresponding to what subset of options to
+ * expect next.
+ */
+ParseState
+parseOption(ParseState stem, const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+} // namespace skeleton
+
+
+/**
+ * Namespace for utility methods that convert from StemEnum to corresponding objects or enums. This
+ * applies to only the "Section 1" stems, those that are well-defined without an option.
+ */
+namespace stem_to_object {
+
+Notation notation(skeleton::StemEnum stem);
+
+MeasureUnit unit(skeleton::StemEnum stem);
+
+Precision precision(skeleton::StemEnum stem);
+
+UNumberFormatRoundingMode roundingMode(skeleton::StemEnum stem);
+
+UNumberGroupingStrategy groupingStrategy(skeleton::StemEnum stem);
+
+UNumberUnitWidth unitWidth(skeleton::StemEnum stem);
+
+UNumberSignDisplay signDisplay(skeleton::StemEnum stem);
+
+UNumberDecimalSeparatorDisplay decimalSeparatorDisplay(skeleton::StemEnum stem);
+
+} // namespace stem_to_object
+
+/**
+ * Namespace for utility methods that convert from enums to stem strings. More complex object conversions
+ * take place in the object_to_stem_string namespace.
+ *
+ * Each function takes the enum value and a UnicodeString& named sb, and returns void.
+ */
+namespace enum_to_stem_string {
+
+void roundingMode(UNumberFormatRoundingMode value, UnicodeString& sb);
+
+void groupingStrategy(UNumberGroupingStrategy value, UnicodeString& sb);
+
+void unitWidth(UNumberUnitWidth value, UnicodeString& sb);
+
+void signDisplay(UNumberSignDisplay value, UnicodeString& sb);
+
+void decimalSeparatorDisplay(UNumberDecimalSeparatorDisplay value, UnicodeString& sb);
+
+} // namespace enum_to_stem_string
+
+/**
+ * Namespace for utility methods for processing stems and options that cannot be interpreted literally.
+ *
+ * The parse* functions take the current StringSegment plus the MacroProps being filled in; the
+ * generate* functions take already-decoded values plus the UnicodeString& (sb) being built.
+ */
+namespace blueprint_helpers {
+
+/** @return Whether we successfully found and parsed an exponent width option. */
+bool parseExponentWidthOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateExponentWidthOption(int32_t minExponentDigits, UnicodeString& sb, UErrorCode& status);
+
+/** @return Whether we successfully found and parsed an exponent sign option. */
+bool parseExponentSignOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void parseCurrencyOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateCurrencyOption(const CurrencyUnit& currency, UnicodeString& sb, UErrorCode& status);
+
+// "measure-unit/" is deprecated in favour of "unit/".
+void parseMeasureUnitOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+// "per-measure-unit/" is deprecated in favour of "unit/".
+void parseMeasurePerUnitOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+/**
+ * Parses unit identifiers like "meter-per-second" and "foot-and-inch", as
+ * specified via a "unit/" concise skeleton.
+ */
+void parseIdentifierUnitOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void parseUnitUsageOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void parseFractionStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateFractionStem(int32_t minFrac, int32_t maxFrac, UnicodeString& sb, UErrorCode& status);
+
+void parseDigitsStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateDigitsStem(int32_t minSig, int32_t maxSig, UnicodeString& sb, UErrorCode& status);
+
+void parseScientificStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+// Note: no generateScientificStem since this syntax was added later in ICU 67
+
+void parseIntegerStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+// Note: no generateIntegerStem since this syntax was added later in ICU 67
+
+/** @return Whether we successfully found and parsed a frac-sig option. */
+bool parseFracSigOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+/** @return Whether we successfully found and parsed a trailing zero option. */
+bool parseTrailingZeroOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void parseIncrementOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void
+generateIncrementOption(uint32_t increment, digits_t incrementMagnitude, int32_t minFrac, UnicodeString& sb, UErrorCode& status);
+
+void parseIntegerWidthOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateIntegerWidthOption(int32_t minInt, int32_t maxInt, UnicodeString& sb, UErrorCode& status);
+
+void parseNumberingSystemOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateNumberingSystemOption(const NumberingSystem& ns, UnicodeString& sb, UErrorCode& status);
+
+void parseScaleOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status);
+
+void generateScaleOption(int32_t magnitude, const DecNum* arbitrary, UnicodeString& sb,
+                         UErrorCode& status);
+
+} // namespace blueprint_helpers
+
+/**
+ * Class for utility methods for generating a token corresponding to each macro-prop. Each method
+ * returns whether or not a token was written to the string builder.
+ *
+ * This needs to be a class, not a namespace, so it can be friended.
+ */
+class GeneratorHelpers {
+  public:
+    /**
+     * Main skeleton generator function. Appends the normalized skeleton for the MacroProps to the given
+     * UnicodeString (sb).
+     *
+     * Internal: use the generate() endpoint instead of this function.
+     *
+     * NOTE(review): this comment previously referred to create(), but create() as declared above
+     * goes from a skeleton string to a formatter; generate() is the declared MacroProps-to-skeleton
+     * entry point. Confirm that generate() is the intended reference.
+     */
+    static void generateSkeleton(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+  private:
+    // One helper per macro-prop; all share the same (macros, sb, status) signature and return
+    // whether a token was written (see the class comment).
+
+    static bool notation(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool unit(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool usage(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool precision(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool roundingMode(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool grouping(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool integerWidth(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool symbols(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool unitWidth(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool sign(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool decimal(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+    static bool scale(const MacroProps& macros, UnicodeString& sb, UErrorCode& status);
+
+};
+
+/**
+ * Struct for null-checking.
+ * In Java, we can just check the object reference. In C++, we need a different method.
+ *
+ * A set of boolean flags, all defaulting to false. An instance is passed by reference to
+ * skeleton::parseStem() as `seen`; presumably each flag is set once the corresponding macro-prop
+ * has been encountered while parsing -- NOTE(review): confirm against the parser implementation.
+ */
+struct SeenMacroProps {
+    bool notation = false;
+    bool unit = false;
+    bool perUnit = false;
+    bool usage = false;
+    bool precision = false;
+    bool roundingMode = false;
+    bool grouper = false;
+    bool padder = false;
+    bool integerWidth = false;
+    bool symbols = false;
+    bool unitWidth = false;
+    bool sign = false;
+    bool decimal = false;
+    bool scale = false;
+};
+
+// NOTE(review): this unnamed namespace contains only a preprocessor macro. Macros are not scoped
+// by namespaces, so the namespace itself holds no declarations.
+namespace {
+
+/*
+ * SKELETON_UCHAR_TO_CHAR(dest, src, start, end, status)
+ *
+ * Appends the code units of src in the range [start, end) to dest by calling
+ * (dest).appendInvariantChars(...) with a local UErrorCode.
+ *
+ *  - If that call reports U_INVARIANT_CONVERSION_ERROR, status is set to
+ *    U_NUMBER_SKELETON_SYNTAX_ERROR and the ENCLOSING function returns.
+ *  - If it reports any other failure, status is set to that error and the enclosing function
+ *    returns.
+ *  - Otherwise status is left untouched and execution continues after the macro.
+ *
+ * Usage caveats visible from the expansion:
+ *  - It contains a bare `return;`, so it can only be used inside functions returning void.
+ *  - It expands to TWO statements: `(void)(dest);` followed by the block. Do not use it as the
+ *    sole body of an unbraced if/else/loop; wrap the call in braces.
+ *  - src and start are each evaluated more than once in the expansion, so avoid arguments with
+ *    side effects.
+ */
+#define SKELETON_UCHAR_TO_CHAR(dest, src, start, end, status) (void)(dest); \
+UPRV_BLOCK_MACRO_BEGIN { \
+    UErrorCode conversionStatus = U_ZERO_ERROR; \
+    (dest).appendInvariantChars({false, (src).getBuffer() + (start), (end) - (start)}, conversionStatus); \
+    if (conversionStatus == U_INVARIANT_CONVERSION_ERROR) { \
+        /* Don't propagate the invariant conversion error; it is a skeleton syntax error */ \
+        (status) = U_NUMBER_SKELETON_SYNTAX_ERROR; \
+        return; \
+    } else if (U_FAILURE(conversionStatus)) { \
+        (status) = conversionStatus; \
+        return; \
+    } \
+} UPRV_BLOCK_MACRO_END
+
+} // namespace
+
+} // namespace impl
+} // namespace number
+U_NAMESPACE_END
+
+#endif //__SOURCE_NUMBER_SKELETONS_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
|