diff options
Diffstat (limited to 'intl/icu/source/i18n/number_affixutils.cpp')
-rw-r--r-- | intl/icu/source/i18n/number_affixutils.cpp | 444 |
1 files changed, 444 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/number_affixutils.cpp b/intl/icu/source/i18n/number_affixutils.cpp new file mode 100644 index 0000000000..5f5ff4c3a6 --- /dev/null +++ b/intl/icu/source/i18n/number_affixutils.cpp @@ -0,0 +1,444 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING + +#include "number_affixutils.h" +#include "unicode/utf16.h" +#include "unicode/uniset.h" + +using namespace icu; +using namespace icu::number; +using namespace icu::number::impl; + +TokenConsumer::~TokenConsumer() = default; +SymbolProvider::~SymbolProvider() = default; + +int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) { + AffixPatternState state = STATE_BASE; + int32_t offset = 0; + int32_t length = 0; + for (; offset < patternString.length();) { + UChar32 cp = patternString.char32At(offset); + + switch (state) { + case STATE_BASE: + if (cp == u'\'') { + // First quote + state = STATE_FIRST_QUOTE; + } else { + // Unquoted symbol + length++; + } + break; + case STATE_FIRST_QUOTE: + if (cp == u'\'') { + // Repeated quote + length++; + state = STATE_BASE; + } else { + // Quoted code point + length++; + state = STATE_INSIDE_QUOTE; + } + break; + case STATE_INSIDE_QUOTE: + if (cp == u'\'') { + // End of quoted sequence + state = STATE_AFTER_QUOTE; + } else { + // Quoted code point + length++; + } + break; + case STATE_AFTER_QUOTE: + if (cp == u'\'') { + // Double quote inside of quoted sequence + length++; + state = STATE_INSIDE_QUOTE; + } else { + // Unquoted symbol + length++; + } + break; + default: + UPRV_UNREACHABLE_EXIT; + } + + offset += U16_LENGTH(cp); + } + + switch (state) { + case STATE_FIRST_QUOTE: + case STATE_INSIDE_QUOTE: + status = U_ILLEGAL_ARGUMENT_ERROR; + break; + default: + break; + } + + return length; +} + +UnicodeString AffixUtils::escape(const UnicodeString &input) { + AffixPatternState state = STATE_BASE; + int32_t offset = 0; + UnicodeString output; + for (; offset < input.length();) { + UChar32 cp = input.char32At(offset); + + switch (cp) { + case u'\'': + output.append(u"''", -1); + break; + + case u'-': + case u'+': + case u'%': + case u'‰': + case u'¤': + if (state == STATE_BASE) { + output.append(u'\''); + output.append(cp); + state = STATE_INSIDE_QUOTE; + } else { + output.append(cp); + } + break; + + default: + if (state == STATE_INSIDE_QUOTE) { + output.append(u'\''); + output.append(cp); + state = STATE_BASE; + } else { + output.append(cp); + } + break; + } + offset += U16_LENGTH(cp); + } + + if (state == STATE_INSIDE_QUOTE) { + output.append(u'\''); + } + + return output; +} + +Field AffixUtils::getFieldForType(AffixPatternType type) { + switch (type) { + case TYPE_MINUS_SIGN: + return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; + case TYPE_PLUS_SIGN: + return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; + case TYPE_APPROXIMATELY_SIGN: + return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD}; + case TYPE_PERCENT: + return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD}; + case TYPE_PERMILLE: + return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD}; + case TYPE_CURRENCY_SINGLE: + return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; + case TYPE_CURRENCY_DOUBLE: + return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; + case TYPE_CURRENCY_TRIPLE: + return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; + case TYPE_CURRENCY_QUAD: + return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; + case TYPE_CURRENCY_QUINT: + return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; + case TYPE_CURRENCY_OVERFLOW: + return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; + default: + UPRV_UNREACHABLE_EXIT; + } +} + +int32_t +AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position, + const SymbolProvider &provider, Field field, UErrorCode &status) { + int32_t length = 0; + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return length; } + if (tag.type == TYPE_CURRENCY_OVERFLOW) { + // Don't go to the provider for this special case + length += output.insertCodePoint( + position + length, + 0xFFFD, + {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, + status); + } else if (tag.type < 0) { + length += output.insert( + position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status); + } else { + length += output.insertCodePoint(position + length, tag.codePoint, field, status); + } + } + return length; +} + +int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern, + const SymbolProvider &provider, UErrorCode &status) { + int32_t length = 0; + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return length; } + if (tag.type == TYPE_CURRENCY_OVERFLOW) { + length += 1; + } else if (tag.type < 0) { + length += provider.getSymbol(tag.type).length(); + } else { + length += U16_LENGTH(tag.codePoint); + } + } + return length; +} + +bool +AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) { + if (affixPattern.length() == 0) { + return false; + } + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return false; } + if (tag.type == type) { + return true; + } + } + return false; +} + +bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) { + if (affixPattern.length() == 0) { + return false; + } + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return false; } + if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { + return true; + } + } + return false; +} + +UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type, + char16_t replacementChar, UErrorCode &status) { + UnicodeString output(affixPattern); // copy + if (affixPattern.length() == 0) { + return output; + } + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return output; } + if (tag.type == type) { + output.replace(tag.offset - 1, 1, replacementChar); + } + } + return output; +} + +bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern, + const UnicodeSet& ignorables, UErrorCode& status) { + if (affixPattern.length() == 0) { + return true; + } + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return false; } + if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) { + return false; + } + } + return true; +} + +void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer, + UErrorCode& status) { + if (affixPattern.length() == 0) { + return; + } + AffixTag tag; + while (hasNext(tag, affixPattern)) { + tag = nextToken(tag, affixPattern, status); + if (U_FAILURE(status)) { return; } + consumer.consumeToken(tag.type, tag.codePoint, status); + if (U_FAILURE(status)) { return; } + } +} + +AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) { + int32_t offset = tag.offset; + int32_t state = tag.state; + for (; offset < patternString.length();) { + UChar32 cp = patternString.char32At(offset); + int32_t count = U16_LENGTH(cp); + + switch (state) { + case STATE_BASE: + switch (cp) { + case u'\'': + state = STATE_FIRST_QUOTE; + offset += count; + // continue to the next code point + break; + case u'-': + return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); + case u'+': + return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); + case u'~': + return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0); + case u'%': + return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); + case u'‰': + return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); + case u'¤': + state = STATE_FIRST_CURR; + offset += count; + // continue to the next code point + break; + default: + return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); + } + break; + case STATE_FIRST_QUOTE: + if (cp == u'\'') { + return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); + } else { + return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); + } + case STATE_INSIDE_QUOTE: + if (cp == u'\'') { + state = STATE_AFTER_QUOTE; + offset += count; + // continue to the next code point + break; + } else { + return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); + } + case STATE_AFTER_QUOTE: + if (cp == u'\'') { + return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); + } else { + state = STATE_BASE; + // re-evaluate this code point + break; + } + case STATE_FIRST_CURR: + if (cp == u'¤') { + state = STATE_SECOND_CURR; + offset += count; + // continue to the next code point + break; + } else { + return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); + } + case STATE_SECOND_CURR: + if (cp == u'¤') { + state = STATE_THIRD_CURR; + offset += count; + // continue to the next code point + break; + } else { + return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); + } + case STATE_THIRD_CURR: + if (cp == u'¤') { + state = STATE_FOURTH_CURR; + offset += count; + // continue to the next code point + break; + } else { + return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); + } + case STATE_FOURTH_CURR: + if (cp == u'¤') { + state = STATE_FIFTH_CURR; + offset += count; + // continue to the next code point + break; + } else { + return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); + } + case STATE_FIFTH_CURR: + if (cp == u'¤') { + state = STATE_OVERFLOW_CURR; + offset += count; + // continue to the next code point + break; + } else { + return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); + } + case STATE_OVERFLOW_CURR: + if (cp == u'¤') { + offset += count; + // continue to the next code point and loop back to this state + break; + } else { + return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); + } + default: + UPRV_UNREACHABLE_EXIT; + } + } + // End of string + switch (state) { + case STATE_BASE: + // No more tokens in string. + return {-1}; + case STATE_FIRST_QUOTE: + case STATE_INSIDE_QUOTE: + // For consistent behavior with the JDK and ICU 58, set an error here. + status = U_ILLEGAL_ARGUMENT_ERROR; + return {-1}; + case STATE_AFTER_QUOTE: + // No more tokens in string. + return {-1}; + case STATE_FIRST_CURR: + return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); + case STATE_SECOND_CURR: + return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); + case STATE_THIRD_CURR: + return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); + case STATE_FOURTH_CURR: + return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); + case STATE_FIFTH_CURR: + return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); + case STATE_OVERFLOW_CURR: + return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); + default: + UPRV_UNREACHABLE_EXIT; + } +} + +bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) { + // First check for the {-1} and default initializer syntax. + if (tag.offset < 0) { + return false; + } else if (tag.offset == 0) { + return string.length() > 0; + } + // The rest of the fields are safe to use now. + // Special case: the last character in string is an end quote. + if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 && + string.charAt(tag.offset) == u'\'') { + return false; + } else if (tag.state != STATE_BASE) { + return true; + } else { + return tag.offset < string.length(); + } +} + +#endif /* #if !UCONFIG_NO_FORMATTING */ |