From 0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:47:29 +0200 Subject: Adding upstream version 115.8.0esr. Signed-off-by: Daniel Baumann --- intl/icu/source/i18n/measunit_extra.cpp | 1260 +++++++++++++++++++++++++++++++ 1 file changed, 1260 insertions(+) create mode 100644 intl/icu/source/i18n/measunit_extra.cpp (limited to 'intl/icu/source/i18n/measunit_extra.cpp') diff --git a/intl/icu/source/i18n/measunit_extra.cpp b/intl/icu/source/i18n/measunit_extra.cpp new file mode 100644 index 0000000000..295d6a8ce8 --- /dev/null +++ b/intl/icu/source/i18n/measunit_extra.cpp @@ -0,0 +1,1260 @@ +// © 2020 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// Extra functions for MeasureUnit not needed for all clients. +// Separate .o file so that it can be removed for modularity. + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING + +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + +#include "charstr.h" +#include "cmemory.h" +#include "cstring.h" +#include "measunit_impl.h" +#include "resource.h" +#include "uarrsort.h" +#include "uassert.h" +#include "ucln_in.h" +#include "umutex.h" +#include "unicode/bytestrie.h" +#include "unicode/bytestriebuilder.h" +#include "unicode/localpointer.h" +#include "unicode/stringpiece.h" +#include "unicode/stringtriebuilder.h" +#include "unicode/ures.h" +#include "unicode/ustringtrie.h" +#include "uresimp.h" +#include "util.h" +#include + +U_NAMESPACE_BEGIN + + +namespace { + +// TODO: Propose a new error code for this? +constexpr UErrorCode kUnitIdentifierSyntaxError = U_ILLEGAL_ARGUMENT_ERROR; + +// Trie value offset for SI or binary prefixes. This is big enough to ensure we only +// insert positive integers into the trie. +constexpr int32_t kPrefixOffset = 64; +static_assert(kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MIN_BIN > 0, + "kPrefixOffset is too small for minimum UMeasurePrefix value"); +static_assert(kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MIN_SI > 0, + "kPrefixOffset is too small for minimum UMeasurePrefix value"); + +// Trie value offset for compound parts, e.g. "-per-", "-", "-and-". +constexpr int32_t kCompoundPartOffset = 128; +static_assert(kCompoundPartOffset > kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MAX_BIN, + "Ambiguous token values: prefix tokens are overlapping with CompoundPart tokens"); +static_assert(kCompoundPartOffset > kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MAX_SI, + "Ambiguous token values: prefix tokens are overlapping with CompoundPart tokens"); + +enum CompoundPart { + // Represents "-per-" + COMPOUND_PART_PER = kCompoundPartOffset, + // Represents "-" + COMPOUND_PART_TIMES, + // Represents "-and-" + COMPOUND_PART_AND, +}; + +// Trie value offset for "per-". +constexpr int32_t kInitialCompoundPartOffset = 192; + +enum InitialCompoundPart { + // Represents "per-", the only compound part that can appear at the start of + // an identifier. + INITIAL_COMPOUND_PART_PER = kInitialCompoundPartOffset, +}; + +// Trie value offset for powers like "square-", "cubic-", "pow2-" etc. +constexpr int32_t kPowerPartOffset = 256; + +enum PowerPart { + POWER_PART_P2 = kPowerPartOffset + 2, + POWER_PART_P3, + POWER_PART_P4, + POWER_PART_P5, + POWER_PART_P6, + POWER_PART_P7, + POWER_PART_P8, + POWER_PART_P9, + POWER_PART_P10, + POWER_PART_P11, + POWER_PART_P12, + POWER_PART_P13, + POWER_PART_P14, + POWER_PART_P15, +}; + +// Trie value offset for simple units, e.g. "gram", "nautical-mile", +// "fluid-ounce-imperial". +constexpr int32_t kSimpleUnitOffset = 512; + +const struct UnitPrefixStrings { + const char* const string; + UMeasurePrefix value; +} gUnitPrefixStrings[] = { + // SI prefixes + { "yotta", UMEASURE_PREFIX_YOTTA }, + { "zetta", UMEASURE_PREFIX_ZETTA }, + { "exa", UMEASURE_PREFIX_EXA }, + { "peta", UMEASURE_PREFIX_PETA }, + { "tera", UMEASURE_PREFIX_TERA }, + { "giga", UMEASURE_PREFIX_GIGA }, + { "mega", UMEASURE_PREFIX_MEGA }, + { "kilo", UMEASURE_PREFIX_KILO }, + { "hecto", UMEASURE_PREFIX_HECTO }, + { "deka", UMEASURE_PREFIX_DEKA }, + { "deci", UMEASURE_PREFIX_DECI }, + { "centi", UMEASURE_PREFIX_CENTI }, + { "milli", UMEASURE_PREFIX_MILLI }, + { "micro", UMEASURE_PREFIX_MICRO }, + { "nano", UMEASURE_PREFIX_NANO }, + { "pico", UMEASURE_PREFIX_PICO }, + { "femto", UMEASURE_PREFIX_FEMTO }, + { "atto", UMEASURE_PREFIX_ATTO }, + { "zepto", UMEASURE_PREFIX_ZEPTO }, + { "yocto", UMEASURE_PREFIX_YOCTO }, + // Binary prefixes + { "yobi", UMEASURE_PREFIX_YOBI }, + { "zebi", UMEASURE_PREFIX_ZEBI }, + { "exbi", UMEASURE_PREFIX_EXBI }, + { "pebi", UMEASURE_PREFIX_PEBI }, + { "tebi", UMEASURE_PREFIX_TEBI }, + { "gibi", UMEASURE_PREFIX_GIBI }, + { "mebi", UMEASURE_PREFIX_MEBI }, + { "kibi", UMEASURE_PREFIX_KIBI }, +}; + +/** + * A ResourceSink that collects simple unit identifiers from the keys of the + * convertUnits table into an array, and adds these values to a TrieBuilder, + * with associated values being their index into this array plus a specified + * offset. + * + * Example code: + * + * UErrorCode status = U_ZERO_ERROR; + * BytesTrieBuilder b(status); + * int32_t ARR_SIZE = 200; + * const char *unitIdentifiers[ARR_SIZE]; + * int32_t *unitCategories[ARR_SIZE]; + * SimpleUnitIdentifiersSink identifierSink(gSerializedUnitCategoriesTrie, unitIdentifiers, + * unitCategories, ARR_SIZE, b, kTrieValueOffset); + * LocalUResourceBundlePointer unitsBundle(ures_openDirect(nullptr, "units", &status)); + * ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status); + */ +class SimpleUnitIdentifiersSink : public icu::ResourceSink { + public: + /** + * Constructor. + * @param quantitiesTrieData The data for constructing a quantitiesTrie, + * which maps from a simple unit identifier to an index into the + * gCategories array. + * @param out Array of char* to which pointers to the simple unit + * identifiers will be saved. (Does not take ownership.) + * @param outCategories Array of int32_t to which category indexes will be + * saved: this corresponds to simple unit IDs saved to `out`, mapping + * from the ID to the value produced by the quantitiesTrie (which is an + * index into the gCategories array). + * @param outSize The size of `out` and `outCategories`. + * @param trieBuilder The trie builder to which the simple unit identifier + * should be added. The trie builder must outlive this resource sink. + * @param trieValueOffset This is added to the index of the identifier in + * the `out` array, before adding to `trieBuilder` as the value + * associated with the identifier. + */ + explicit SimpleUnitIdentifiersSink(StringPiece quantitiesTrieData, const char **out, + int32_t *outCategories, int32_t outSize, + BytesTrieBuilder &trieBuilder, int32_t trieValueOffset) + : outArray(out), outCategories(outCategories), outSize(outSize), trieBuilder(trieBuilder), + trieValueOffset(trieValueOffset), quantitiesTrieData(quantitiesTrieData), outIndex(0) {} + + /** + * Adds the table keys found in value to the output vector. + * @param key The key of the resource passed to `value`: the second + * parameter of the ures_getAllItemsWithFallback() call. + * @param value Should be a ResourceTable value, if + * ures_getAllItemsWithFallback() was called correctly for this sink. + * @param noFallback Ignored. + * @param status The standard ICU error code output parameter. + */ + void put(const char * /*key*/, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) override { + ResourceTable table = value.getTable(status); + if (U_FAILURE(status)) return; + + if (outIndex + table.getSize() > outSize) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + + BytesTrie quantitiesTrie(quantitiesTrieData.data()); + + // Collect keys from the table resource. + const char *simpleUnitID; + for (int32_t i = 0; table.getKeyAndValue(i, simpleUnitID, value); ++i) { + U_ASSERT(i < table.getSize()); + U_ASSERT(outIndex < outSize); + if (uprv_strcmp(simpleUnitID, "kilogram") == 0) { + // For parsing, we use "gram", the prefixless metric mass unit. We + // thus ignore the SI Base Unit of Mass: it exists due to being the + // mass conversion target unit, but not needed for MeasureUnit + // parsing. + continue; + } + outArray[outIndex] = simpleUnitID; + trieBuilder.add(simpleUnitID, trieValueOffset + outIndex, status); + + // Find the base target unit for this simple unit + ResourceTable table = value.getTable(status); + if (U_FAILURE(status)) { return; } + if (!table.findValue("target", value)) { + status = U_INVALID_FORMAT_ERROR; + break; + } + int32_t len; + const char16_t* uTarget = value.getString(len, status); + CharString target; + target.appendInvariantChars(uTarget, len, status); + if (U_FAILURE(status)) { return; } + quantitiesTrie.reset(); + UStringTrieResult result = quantitiesTrie.next(target.data(), target.length()); + if (!USTRINGTRIE_HAS_VALUE(result)) { + status = U_INVALID_FORMAT_ERROR; + break; + } + outCategories[outIndex] = quantitiesTrie.getValue(); + + outIndex++; + } + } + + private: + const char **outArray; + int32_t *outCategories; + int32_t outSize; + BytesTrieBuilder &trieBuilder; + int32_t trieValueOffset; + + StringPiece quantitiesTrieData; + + int32_t outIndex; +}; + +/** + * A ResourceSink that collects information from `unitQuantities` in the `units` + * resource to provide key->value lookups from base unit to category, as well as + * preserving ordering information for these categories. See `units.txt`. + * + * For example: "kilogram" -> "mass", "meter-per-second" -> "speed". + * + * In C++ unitQuantity values are collected in order into a char16_t* array, while + * unitQuantity keys are added added to a TrieBuilder, with associated values + * being the index into the aforementioned char16_t* array. + */ +class CategoriesSink : public icu::ResourceSink { + public: + /** + * Constructor. + * @param out Array of char16_t* to which unitQuantity values will be saved. + * The pointers returned not owned: they point directly at the resource + * strings in static memory. + * @param outSize The size of the `out` array. + * @param trieBuilder The trie builder to which the keys (base units) of + * each unitQuantity will be added, each with value being the offset + * into `out`. + */ + explicit CategoriesSink(const char16_t **out, int32_t &outSize, BytesTrieBuilder &trieBuilder) + : outQuantitiesArray(out), outSize(outSize), trieBuilder(trieBuilder), outIndex(0) {} + + void put(const char * /*key*/, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) override { + ResourceArray array = value.getArray(status); + if (U_FAILURE(status)) { + return; + } + + if (outIndex + array.getSize() > outSize) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + + for (int32_t i = 0; array.getValue(i, value); ++i) { + U_ASSERT(outIndex < outSize); + ResourceTable table = value.getTable(status); + if (U_FAILURE(status)) { + return; + } + if (table.getSize() != 1) { + status = U_INVALID_FORMAT_ERROR; + return; + } + const char *key; + table.getKeyAndValue(0, key, value); + int32_t uTmpLen; + outQuantitiesArray[outIndex] = value.getString(uTmpLen, status); + trieBuilder.add(key, outIndex, status); + outIndex++; + } + } + + private: + const char16_t **outQuantitiesArray; + int32_t &outSize; + BytesTrieBuilder &trieBuilder; + + int32_t outIndex; +}; + +icu::UInitOnce gUnitExtrasInitOnce {}; + +// Array of simple unit IDs. +// +// The array memory itself is owned by this pointer, but the individual char* in +// that array point at static memory. (Note that these char* are also returned +// by SingleUnitImpl::getSimpleUnitID().) +const char **gSimpleUnits = nullptr; + +// Maps from the value associated with each simple unit ID to an index into the +// gCategories array. +int32_t *gSimpleUnitCategories = nullptr; + +char *gSerializedUnitExtrasStemTrie = nullptr; + +// Array of char16_t* pointing at the unit categories (aka "quantities", aka +// "types"), as found in the `unitQuantities` resource. The array memory itself +// is owned by this pointer, but the individual char16_t* in that array point at +// static memory. +const char16_t **gCategories = nullptr; +// Number of items in `gCategories`. +int32_t gCategoriesCount = 0; +// Serialized BytesTrie for mapping from base units to indices into gCategories. +char *gSerializedUnitCategoriesTrie = nullptr; + +UBool U_CALLCONV cleanupUnitExtras() { + uprv_free(gSerializedUnitCategoriesTrie); + gSerializedUnitCategoriesTrie = nullptr; + uprv_free(gCategories); + gCategories = nullptr; + uprv_free(gSerializedUnitExtrasStemTrie); + gSerializedUnitExtrasStemTrie = nullptr; + uprv_free(gSimpleUnitCategories); + gSimpleUnitCategories = nullptr; + uprv_free(gSimpleUnits); + gSimpleUnits = nullptr; + gUnitExtrasInitOnce.reset(); + return true; +} + +void U_CALLCONV initUnitExtras(UErrorCode& status) { + ucln_i18n_registerCleanup(UCLN_I18N_UNIT_EXTRAS, cleanupUnitExtras); + LocalUResourceBundlePointer unitsBundle(ures_openDirect(nullptr, "units", &status)); + + // Collect unitQuantities information into gSerializedUnitCategoriesTrie and gCategories. + const char *CATEGORY_TABLE_NAME = "unitQuantities"; + LocalUResourceBundlePointer unitQuantities( + ures_getByKey(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, nullptr, &status)); + if (U_FAILURE(status)) { return; } + gCategoriesCount = unitQuantities.getAlias()->fSize; + size_t quantitiesMallocSize = sizeof(char16_t *) * gCategoriesCount; + gCategories = static_cast(uprv_malloc(quantitiesMallocSize)); + if (gCategories == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memset(gCategories, 0, quantitiesMallocSize); + BytesTrieBuilder quantitiesBuilder(status); + CategoriesSink categoriesSink(gCategories, gCategoriesCount, quantitiesBuilder); + ures_getAllItemsWithFallback(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, categoriesSink, status); + StringPiece resultQuantities = quantitiesBuilder.buildStringPiece(USTRINGTRIE_BUILD_FAST, status); + if (U_FAILURE(status)) { return; } + // Copy the result into the global constant pointer + size_t numBytesQuantities = resultQuantities.length(); + gSerializedUnitCategoriesTrie = static_cast(uprv_malloc(numBytesQuantities)); + if (gSerializedUnitCategoriesTrie == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memcpy(gSerializedUnitCategoriesTrie, resultQuantities.data(), numBytesQuantities); + + // Build the BytesTrie that Parser needs for parsing unit identifiers. + + BytesTrieBuilder b(status); + if (U_FAILURE(status)) { return; } + + // Add SI and binary prefixes + for (const auto& unitPrefixInfo : gUnitPrefixStrings) { + b.add(unitPrefixInfo.string, unitPrefixInfo.value + kPrefixOffset, status); + } + if (U_FAILURE(status)) { return; } + + // Add syntax parts (compound, power prefixes) + b.add("-per-", COMPOUND_PART_PER, status); + b.add("-", COMPOUND_PART_TIMES, status); + b.add("-and-", COMPOUND_PART_AND, status); + b.add("per-", INITIAL_COMPOUND_PART_PER, status); + b.add("square-", POWER_PART_P2, status); + b.add("cubic-", POWER_PART_P3, status); + b.add("pow2-", POWER_PART_P2, status); + b.add("pow3-", POWER_PART_P3, status); + b.add("pow4-", POWER_PART_P4, status); + b.add("pow5-", POWER_PART_P5, status); + b.add("pow6-", POWER_PART_P6, status); + b.add("pow7-", POWER_PART_P7, status); + b.add("pow8-", POWER_PART_P8, status); + b.add("pow9-", POWER_PART_P9, status); + b.add("pow10-", POWER_PART_P10, status); + b.add("pow11-", POWER_PART_P11, status); + b.add("pow12-", POWER_PART_P12, status); + b.add("pow13-", POWER_PART_P13, status); + b.add("pow14-", POWER_PART_P14, status); + b.add("pow15-", POWER_PART_P15, status); + if (U_FAILURE(status)) { return; } + + // Add sanctioned simple units by offset: simple units all have entries in + // units/convertUnits resources. + LocalUResourceBundlePointer convertUnits( + ures_getByKey(unitsBundle.getAlias(), "convertUnits", nullptr, &status)); + if (U_FAILURE(status)) { return; } + + // Allocate enough space: with identifierSink below skipping kilogram, we're + // probably allocating one more than needed. + int32_t simpleUnitsCount = convertUnits.getAlias()->fSize; + int32_t arrayMallocSize = sizeof(char *) * simpleUnitsCount; + gSimpleUnits = static_cast(uprv_malloc(arrayMallocSize)); + if (gSimpleUnits == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memset(gSimpleUnits, 0, arrayMallocSize); + arrayMallocSize = sizeof(int32_t) * simpleUnitsCount; + gSimpleUnitCategories = static_cast(uprv_malloc(arrayMallocSize)); + if (gSimpleUnitCategories == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memset(gSimpleUnitCategories, 0, arrayMallocSize); + + // Populate gSimpleUnits and build the associated trie. + SimpleUnitIdentifiersSink identifierSink(resultQuantities, gSimpleUnits, gSimpleUnitCategories, + simpleUnitsCount, b, kSimpleUnitOffset); + ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status); + + // Build the CharsTrie + // TODO: Use SLOW or FAST here? + StringPiece result = b.buildStringPiece(USTRINGTRIE_BUILD_FAST, status); + if (U_FAILURE(status)) { return; } + + // Copy the result into the global constant pointer + size_t numBytes = result.length(); + gSerializedUnitExtrasStemTrie = static_cast(uprv_malloc(numBytes)); + if (gSerializedUnitExtrasStemTrie == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memcpy(gSerializedUnitExtrasStemTrie, result.data(), numBytes); +} + +class Token { +public: + Token(int32_t match) : fMatch(match) {} + + enum Type { + TYPE_UNDEFINED, + TYPE_PREFIX, + // Token type for "-per-", "-", and "-and-". + TYPE_COMPOUND_PART, + // Token type for "per-". + TYPE_INITIAL_COMPOUND_PART, + TYPE_POWER_PART, + TYPE_SIMPLE_UNIT, + }; + + // Calling getType() is invalid, resulting in an assertion failure, if Token + // value isn't positive. + Type getType() const { + U_ASSERT(fMatch > 0); + if (fMatch < kCompoundPartOffset) { + return TYPE_PREFIX; + } + if (fMatch < kInitialCompoundPartOffset) { + return TYPE_COMPOUND_PART; + } + if (fMatch < kPowerPartOffset) { + return TYPE_INITIAL_COMPOUND_PART; + } + if (fMatch < kSimpleUnitOffset) { + return TYPE_POWER_PART; + } + return TYPE_SIMPLE_UNIT; + } + + UMeasurePrefix getUnitPrefix() const { + U_ASSERT(getType() == TYPE_PREFIX); + return static_cast(fMatch - kPrefixOffset); + } + + // Valid only for tokens with type TYPE_COMPOUND_PART. + int32_t getMatch() const { + U_ASSERT(getType() == TYPE_COMPOUND_PART); + return fMatch; + } + + int32_t getInitialCompoundPart() const { + // Even if there is only one InitialCompoundPart value, we have this + // function for the simplicity of code consistency. + U_ASSERT(getType() == TYPE_INITIAL_COMPOUND_PART); + // Defensive: if this assert fails, code using this function also needs + // to change. + U_ASSERT(fMatch == INITIAL_COMPOUND_PART_PER); + return fMatch; + } + + int8_t getPower() const { + U_ASSERT(getType() == TYPE_POWER_PART); + return static_cast(fMatch - kPowerPartOffset); + } + + int32_t getSimpleUnitIndex() const { + U_ASSERT(getType() == TYPE_SIMPLE_UNIT); + return fMatch - kSimpleUnitOffset; + } + +private: + int32_t fMatch; +}; + +class Parser { +public: + /** + * Factory function for parsing the given identifier. + * + * @param source The identifier to parse. This function does not make a copy + * of source: the underlying string that source points at, must outlive the + * parser. + * @param status ICU error code. + */ + static Parser from(StringPiece source, UErrorCode& status) { + if (U_FAILURE(status)) { + return Parser(); + } + umtx_initOnce(gUnitExtrasInitOnce, &initUnitExtras, status); + if (U_FAILURE(status)) { + return Parser(); + } + return Parser(source); + } + + MeasureUnitImpl parse(UErrorCode& status) { + MeasureUnitImpl result; + + if (U_FAILURE(status)) { + return result; + } + if (fSource.empty()) { + // The dimenionless unit: nothing to parse. leave result as is. + return result; + } + + while (hasNext()) { + bool sawAnd = false; + + SingleUnitImpl singleUnit = nextSingleUnit(sawAnd, status); + if (U_FAILURE(status)) { + return result; + } + + bool added = result.appendSingleUnit(singleUnit, status); + if (U_FAILURE(status)) { + return result; + } + + if (sawAnd && !added) { + // Two similar units are not allowed in a mixed unit. + status = kUnitIdentifierSyntaxError; + return result; + } + + if (result.singleUnits.length() >= 2) { + // nextSingleUnit fails appropriately for "per" and "and" in the + // same identifier. It doesn't fail for other compound units + // (COMPOUND_PART_TIMES). Consequently we take care of that + // here. + UMeasureUnitComplexity complexity = + sawAnd ? UMEASURE_UNIT_MIXED : UMEASURE_UNIT_COMPOUND; + if (result.singleUnits.length() == 2) { + // After appending two singleUnits, the complexity will be `UMEASURE_UNIT_COMPOUND` + U_ASSERT(result.complexity == UMEASURE_UNIT_COMPOUND); + result.complexity = complexity; + } else if (result.complexity != complexity) { + // Can't have mixed compound units + status = kUnitIdentifierSyntaxError; + return result; + } + } + } + + return result; + } + +private: + // Tracks parser progress: the offset into fSource. + int32_t fIndex = 0; + + // Since we're not owning this memory, whatever is passed to the constructor + // should live longer than this Parser - and the parser shouldn't return any + // references to that string. + StringPiece fSource; + BytesTrie fTrie; + + // Set to true when we've seen a "-per-" or a "per-", after which all units + // are in the denominator. Until we find an "-and-", at which point the + // identifier is invalid pending TODO(CLDR-13701). + bool fAfterPer = false; + + Parser() : fSource(""), fTrie(u"") {} + + Parser(StringPiece source) + : fSource(source), fTrie(gSerializedUnitExtrasStemTrie) {} + + inline bool hasNext() const { + return fIndex < fSource.length(); + } + + // Returns the next Token parsed from fSource, advancing fIndex to the end + // of that token in fSource. In case of U_FAILURE(status), the token + // returned will cause an abort if getType() is called on it. + Token nextToken(UErrorCode& status) { + fTrie.reset(); + int32_t match = -1; + // Saves the position in the fSource string for the end of the most + // recent matching token. + int32_t previ = -1; + // Find the longest token that matches a value in the trie: + while (fIndex < fSource.length()) { + auto result = fTrie.next(fSource.data()[fIndex++]); + if (result == USTRINGTRIE_NO_MATCH) { + break; + } else if (result == USTRINGTRIE_NO_VALUE) { + continue; + } + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); + match = fTrie.getValue(); + previ = fIndex; + if (result == USTRINGTRIE_FINAL_VALUE) { + break; + } + U_ASSERT(result == USTRINGTRIE_INTERMEDIATE_VALUE); + // continue; + } + + if (match < 0) { + status = kUnitIdentifierSyntaxError; + } else { + fIndex = previ; + } + return Token(match); + } + + /** + * Returns the next "single unit" via result. + * + * If a "-per-" was parsed, the result will have appropriate negative + * dimensionality. + * + * Returns an error if we parse both compound units and "-and-", since mixed + * compound units are not yet supported - TODO(CLDR-13701). + * + * @param result Will be overwritten by the result, if status shows success. + * @param sawAnd If an "-and-" was parsed prior to finding the "single + * unit", sawAnd is set to true. If not, it is left as is. + * @param status ICU error code. + */ + SingleUnitImpl nextSingleUnit(bool &sawAnd, UErrorCode &status) { + SingleUnitImpl result; + if (U_FAILURE(status)) { + return result; + } + + // state: + // 0 = no tokens seen yet (will accept power, SI or binary prefix, or simple unit) + // 1 = power token seen (will not accept another power token) + // 2 = SI or binary prefix token seen (will not accept a power, or SI or binary prefix token) + int32_t state = 0; + + bool atStart = fIndex == 0; + Token token = nextToken(status); + if (U_FAILURE(status)) { + return result; + } + + if (atStart) { + // Identifiers optionally start with "per-". + if (token.getType() == Token::TYPE_INITIAL_COMPOUND_PART) { + U_ASSERT(token.getInitialCompoundPart() == INITIAL_COMPOUND_PART_PER); + fAfterPer = true; + result.dimensionality = -1; + + token = nextToken(status); + if (U_FAILURE(status)) { + return result; + } + } + } else { + // All other SingleUnit's are separated from previous SingleUnit's + // via a compound part: + if (token.getType() != Token::TYPE_COMPOUND_PART) { + status = kUnitIdentifierSyntaxError; + return result; + } + + switch (token.getMatch()) { + case COMPOUND_PART_PER: + if (sawAnd) { + // Mixed compound units not yet supported, + // TODO(CLDR-13701). + status = kUnitIdentifierSyntaxError; + return result; + } + fAfterPer = true; + result.dimensionality = -1; + break; + + case COMPOUND_PART_TIMES: + if (fAfterPer) { + result.dimensionality = -1; + } + break; + + case COMPOUND_PART_AND: + if (fAfterPer) { + // Can't start with "-and-", and mixed compound units + // not yet supported, TODO(CLDR-13701). + status = kUnitIdentifierSyntaxError; + return result; + } + sawAnd = true; + break; + } + + token = nextToken(status); + if (U_FAILURE(status)) { + return result; + } + } + + // Read tokens until we have a complete SingleUnit or we reach the end. + while (true) { + switch (token.getType()) { + case Token::TYPE_POWER_PART: + if (state > 0) { + status = kUnitIdentifierSyntaxError; + return result; + } + result.dimensionality *= token.getPower(); + state = 1; + break; + + case Token::TYPE_PREFIX: + if (state > 1) { + status = kUnitIdentifierSyntaxError; + return result; + } + result.unitPrefix = token.getUnitPrefix(); + state = 2; + break; + + case Token::TYPE_SIMPLE_UNIT: + result.index = token.getSimpleUnitIndex(); + return result; + + default: + status = kUnitIdentifierSyntaxError; + return result; + } + + if (!hasNext()) { + // We ran out of tokens before finding a complete single unit. + status = kUnitIdentifierSyntaxError; + return result; + } + token = nextToken(status); + if (U_FAILURE(status)) { + return result; + } + } + + return result; + } +}; + +// Sorting function wrapping SingleUnitImpl::compareTo for use with uprv_sortArray. +int32_t U_CALLCONV +compareSingleUnits(const void* /*context*/, const void* left, const void* right) { + auto realLeft = static_cast(left); + auto realRight = static_cast(right); + return (*realLeft)->compareTo(**realRight); +} + +// Returns an index into the gCategories array, for the "unitQuantity" (aka +// "type" or "category") associated with the given base unit identifier. Returns +// -1 on failure, together with U_UNSUPPORTED_ERROR. +int32_t getUnitCategoryIndex(BytesTrie &trie, StringPiece baseUnitIdentifier, UErrorCode &status) { + UStringTrieResult result = trie.reset().next(baseUnitIdentifier.data(), baseUnitIdentifier.length()); + if (!USTRINGTRIE_HAS_VALUE(result)) { + status = U_UNSUPPORTED_ERROR; + return -1; + } + + return trie.getValue(); +} + +} // namespace + +U_CAPI int32_t U_EXPORT2 +umeas_getPrefixPower(UMeasurePrefix unitPrefix) { + if (unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_BIN && + unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_BIN) { + return unitPrefix - UMEASURE_PREFIX_INTERNAL_ONE_BIN; + } + U_ASSERT(unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_SI && + unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_SI); + return unitPrefix - UMEASURE_PREFIX_ONE; +} + +U_CAPI int32_t U_EXPORT2 +umeas_getPrefixBase(UMeasurePrefix unitPrefix) { + if (unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_BIN && + unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_BIN) { + return 1024; + } + U_ASSERT(unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_SI && + unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_SI); + return 10; +} + +CharString U_I18N_API getUnitQuantity(const MeasureUnitImpl &baseMeasureUnitImpl, UErrorCode &status) { + CharString result; + MeasureUnitImpl baseUnitImpl = baseMeasureUnitImpl.copy(status); + UErrorCode localStatus = U_ZERO_ERROR; + umtx_initOnce(gUnitExtrasInitOnce, &initUnitExtras, status); + if (U_FAILURE(status)) { + return result; + } + BytesTrie trie(gSerializedUnitCategoriesTrie); + + baseUnitImpl.serialize(status); + StringPiece identifier = baseUnitImpl.identifier.data(); + int32_t idx = getUnitCategoryIndex(trie, identifier, localStatus); + if (U_FAILURE(status)) { + return result; + } + + // In case the base unit identifier did not match any entry. + if (U_FAILURE(localStatus)) { + localStatus = U_ZERO_ERROR; + baseUnitImpl.takeReciprocal(status); + baseUnitImpl.serialize(status); + identifier.set(baseUnitImpl.identifier.data()); + idx = getUnitCategoryIndex(trie, identifier, localStatus); + + if (U_FAILURE(status)) { + return result; + } + } + + // In case the reciprocal of the base unit identifier did not match any entry. + MeasureUnitImpl simplifiedUnit = baseMeasureUnitImpl.copyAndSimplify(status); + if (U_FAILURE(status)) { + return result; + } + if (U_FAILURE(localStatus)) { + localStatus = U_ZERO_ERROR; + simplifiedUnit.serialize(status); + identifier.set(simplifiedUnit.identifier.data()); + idx = getUnitCategoryIndex(trie, identifier, localStatus); + + if (U_FAILURE(status)) { + return result; + } + } + + // In case the simplified base unit identifier did not match any entry. + if (U_FAILURE(localStatus)) { + localStatus = U_ZERO_ERROR; + simplifiedUnit.takeReciprocal(status); + simplifiedUnit.serialize(status); + identifier.set(simplifiedUnit.identifier.data()); + idx = getUnitCategoryIndex(trie, identifier, localStatus); + + if (U_FAILURE(status)) { + return result; + } + } + + // If there is no match at all, throw an exception. + if (U_FAILURE(localStatus)) { + status = U_INVALID_FORMAT_ERROR; + return result; + } + + if (idx < 0 || idx >= gCategoriesCount) { + status = U_INVALID_FORMAT_ERROR; + return result; + } + + result.appendInvariantChars(gCategories[idx], u_strlen(gCategories[idx]), status); + return result; +} + +// In ICU4J, this is MeasureUnit.getSingleUnitImpl(). +SingleUnitImpl SingleUnitImpl::forMeasureUnit(const MeasureUnit& measureUnit, UErrorCode& status) { + MeasureUnitImpl temp; + const MeasureUnitImpl& impl = MeasureUnitImpl::forMeasureUnit(measureUnit, temp, status); + if (U_FAILURE(status)) { + return {}; + } + if (impl.singleUnits.length() == 0) { + return {}; + } + if (impl.singleUnits.length() == 1) { + return *impl.singleUnits[0]; + } + status = U_ILLEGAL_ARGUMENT_ERROR; + return {}; +} + +MeasureUnit SingleUnitImpl::build(UErrorCode& status) const { + MeasureUnitImpl temp; + temp.appendSingleUnit(*this, status); + // TODO(icu-units#28): the MeasureUnitImpl::build() method uses + // findBySubtype, which is relatively slow. + // - At the time of loading the simple unit IDs, we could also save a + // mapping to the builtin MeasureUnit type and subtype they correspond to. + // - This method could then check dimensionality and index, and if both are + // 1, directly return MeasureUnit instances very quickly. + return std::move(temp).build(status); +} + +const char *SingleUnitImpl::getSimpleUnitID() const { + return gSimpleUnits[index]; +} + +void SingleUnitImpl::appendNeutralIdentifier(CharString &result, UErrorCode &status) const UPRV_NO_SANITIZE_UNDEFINED { + int32_t absPower = std::abs(this->dimensionality); + + U_ASSERT(absPower > 0); // "this function does not support the dimensionless single units"; + + if (absPower == 1) { + // no-op + } else if (absPower == 2) { + result.append(StringPiece("square-"), status); + } else if (absPower == 3) { + result.append(StringPiece("cubic-"), status); + } else if (absPower <= 15) { + result.append(StringPiece("pow"), status); + result.appendNumber(absPower, status); + result.append(StringPiece("-"), status); + } else { + status = U_ILLEGAL_ARGUMENT_ERROR; // Unit Identifier Syntax Error + return; + } + + if (U_FAILURE(status)) { + return; + } + + if (this->unitPrefix != UMEASURE_PREFIX_ONE) { + bool found = false; + for (const auto &unitPrefixInfo : gUnitPrefixStrings) { + // TODO: consider using binary search? If we do this, add a unit + // test to ensure gUnitPrefixStrings is sorted? + if (unitPrefixInfo.value == this->unitPrefix) { + result.append(unitPrefixInfo.string, status); + found = true; + break; + } + } + if (!found) { + status = U_UNSUPPORTED_ERROR; + return; + } + } + + result.append(StringPiece(this->getSimpleUnitID()), status); +} + +int32_t SingleUnitImpl::getUnitCategoryIndex() const { + return gSimpleUnitCategories[index]; +} + +MeasureUnitImpl::MeasureUnitImpl(const SingleUnitImpl &singleUnit, UErrorCode &status) { + this->appendSingleUnit(singleUnit, status); +} + +MeasureUnitImpl MeasureUnitImpl::forIdentifier(StringPiece identifier, UErrorCode& status) { + return Parser::from(identifier, status).parse(status); +} + +const MeasureUnitImpl& MeasureUnitImpl::forMeasureUnit( + const MeasureUnit& measureUnit, MeasureUnitImpl& memory, UErrorCode& status) { + if (measureUnit.fImpl) { + return *measureUnit.fImpl; + } else { + memory = Parser::from(measureUnit.getIdentifier(), status).parse(status); + return memory; + } +} + +MeasureUnitImpl MeasureUnitImpl::forMeasureUnitMaybeCopy( + const MeasureUnit& measureUnit, UErrorCode& status) { + if (measureUnit.fImpl) { + return measureUnit.fImpl->copy(status); + } else { + return Parser::from(measureUnit.getIdentifier(), status).parse(status); + } +} + +void MeasureUnitImpl::takeReciprocal(UErrorCode& /*status*/) { + identifier.clear(); + for (int32_t i = 0; i < singleUnits.length(); i++) { + singleUnits[i]->dimensionality *= -1; + } +} + +MeasureUnitImpl MeasureUnitImpl::copyAndSimplify(UErrorCode &status) const { + MeasureUnitImpl result; + for (int32_t i = 0; i < singleUnits.length(); i++) { + const SingleUnitImpl &singleUnit = *this->singleUnits[i]; + + // The following `for` loop will cause time complexity to be O(n^2). + // However, n is very small (number of units, generally, at maximum equal to 10) + bool unitExist = false; + for (int32_t j = 0; j < result.singleUnits.length(); j++) { + if (uprv_strcmp(result.singleUnits[j]->getSimpleUnitID(), singleUnit.getSimpleUnitID()) == + 0 && + result.singleUnits[j]->unitPrefix == singleUnit.unitPrefix) { + unitExist = true; + result.singleUnits[j]->dimensionality = + result.singleUnits[j]->dimensionality + singleUnit.dimensionality; + break; + } + } + + if (!unitExist) { + result.appendSingleUnit(singleUnit, status); + } + } + + return result; +} + +bool MeasureUnitImpl::appendSingleUnit(const SingleUnitImpl &singleUnit, UErrorCode &status) { + identifier.clear(); + + if (singleUnit.isDimensionless()) { + // Do not append dimensionless units. + return false; + } + + // Find a similar unit that already exists, to attempt to coalesce + SingleUnitImpl *oldUnit = nullptr; + for (int32_t i = 0; i < this->singleUnits.length(); i++) { + auto *candidate = this->singleUnits[i]; + if (candidate->isCompatibleWith(singleUnit)) { + oldUnit = candidate; + } + } + + if (oldUnit) { + // Both dimensionalities will be positive, or both will be negative, by + // virtue of isCompatibleWith(). + oldUnit->dimensionality += singleUnit.dimensionality; + + return false; + } + + // Add a copy of singleUnit + // NOTE: MaybeStackVector::emplaceBackAndCheckErrorCode creates new copy of singleUnit. + this->singleUnits.emplaceBackAndCheckErrorCode(status, singleUnit); + if (U_FAILURE(status)) { + return false; + } + + // If the MeasureUnitImpl is `UMEASURE_UNIT_SINGLE` and after the appending a unit, the `singleUnits` + // contains more than one. thus means the complexity should be `UMEASURE_UNIT_COMPOUND` + if (this->singleUnits.length() > 1 && + this->complexity == UMeasureUnitComplexity::UMEASURE_UNIT_SINGLE) { + this->complexity = UMeasureUnitComplexity::UMEASURE_UNIT_COMPOUND; + } + + return true; +} + +MaybeStackVector +MeasureUnitImpl::extractIndividualUnitsWithIndices(UErrorCode &status) const { + MaybeStackVector result; + + if (this->complexity != UMeasureUnitComplexity::UMEASURE_UNIT_MIXED) { + result.emplaceBackAndCheckErrorCode(status, 0, *this, status); + return result; + } + + for (int32_t i = 0; i < singleUnits.length(); ++i) { + result.emplaceBackAndCheckErrorCode(status, i, *singleUnits[i], status); + if (U_FAILURE(status)) { + return result; + } + } + + return result; +} + +/** + * Normalize a MeasureUnitImpl and generate the identifier string in place. + */ +void MeasureUnitImpl::serialize(UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + + if (this->singleUnits.length() == 0) { + // Dimensionless, constructed by the default constructor. + return; + } + + if (this->complexity == UMEASURE_UNIT_COMPOUND) { + // Note: don't sort a MIXED unit + uprv_sortArray(this->singleUnits.getAlias(), this->singleUnits.length(), + sizeof(this->singleUnits[0]), compareSingleUnits, nullptr, false, &status); + if (U_FAILURE(status)) { + return; + } + } + + CharString result; + bool beforePer = true; + bool firstTimeNegativeDimension = false; + for (int32_t i = 0; i < this->singleUnits.length(); i++) { + if (beforePer && (*this->singleUnits[i]).dimensionality < 0) { + beforePer = false; + firstTimeNegativeDimension = true; + } else if ((*this->singleUnits[i]).dimensionality < 0) { + firstTimeNegativeDimension = false; + } + + if (U_FAILURE(status)) { + return; + } + + if (this->complexity == UMeasureUnitComplexity::UMEASURE_UNIT_MIXED) { + if (result.length() != 0) { + result.append(StringPiece("-and-"), status); + } + } else { + if (firstTimeNegativeDimension) { + if (result.length() == 0) { + result.append(StringPiece("per-"), status); + } else { + result.append(StringPiece("-per-"), status); + } + } else { + if (result.length() != 0) { + result.append(StringPiece("-"), status); + } + } + } + + this->singleUnits[i]->appendNeutralIdentifier(result, status); + } + + this->identifier = CharString(result, status); +} + +MeasureUnit MeasureUnitImpl::build(UErrorCode& status) && { + this->serialize(status); + return MeasureUnit(std::move(*this)); +} + +MeasureUnit MeasureUnit::forIdentifier(StringPiece identifier, UErrorCode& status) { + return Parser::from(identifier, status).parse(status).build(status); +} + +UMeasureUnitComplexity MeasureUnit::getComplexity(UErrorCode& status) const { + MeasureUnitImpl temp; + return MeasureUnitImpl::forMeasureUnit(*this, temp, status).complexity; +} + +UMeasurePrefix MeasureUnit::getPrefix(UErrorCode& status) const { + return SingleUnitImpl::forMeasureUnit(*this, status).unitPrefix; +} + +MeasureUnit MeasureUnit::withPrefix(UMeasurePrefix prefix, UErrorCode& status) const UPRV_NO_SANITIZE_UNDEFINED { + SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status); + singleUnit.unitPrefix = prefix; + return singleUnit.build(status); +} + +int32_t MeasureUnit::getDimensionality(UErrorCode& status) const { + SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status); + if (U_FAILURE(status)) { return 0; } + if (singleUnit.isDimensionless()) { + return 0; + } + return singleUnit.dimensionality; +} + +MeasureUnit MeasureUnit::withDimensionality(int32_t dimensionality, UErrorCode& status) const { + SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status); + singleUnit.dimensionality = dimensionality; + return singleUnit.build(status); +} + +MeasureUnit MeasureUnit::reciprocal(UErrorCode& status) const { + MeasureUnitImpl impl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status); + impl.takeReciprocal(status); + return std::move(impl).build(status); +} + +MeasureUnit MeasureUnit::product(const MeasureUnit& other, UErrorCode& status) const { + MeasureUnitImpl impl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status); + MeasureUnitImpl temp; + const MeasureUnitImpl& otherImpl = MeasureUnitImpl::forMeasureUnit(other, temp, status); + if (impl.complexity == UMEASURE_UNIT_MIXED || otherImpl.complexity == UMEASURE_UNIT_MIXED) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return {}; + } + for (int32_t i = 0; i < otherImpl.singleUnits.length(); i++) { + impl.appendSingleUnit(*otherImpl.singleUnits[i], status); + } + if (impl.singleUnits.length() > 1) { + impl.complexity = UMEASURE_UNIT_COMPOUND; + } + return std::move(impl).build(status); +} + +LocalArray MeasureUnit::splitToSingleUnitsImpl(int32_t& outCount, UErrorCode& status) const { + MeasureUnitImpl temp; + const MeasureUnitImpl& impl = MeasureUnitImpl::forMeasureUnit(*this, temp, status); + outCount = impl.singleUnits.length(); + MeasureUnit* arr = new MeasureUnit[outCount]; + if (arr == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return LocalArray(); + } + for (int32_t i = 0; i < outCount; i++) { + arr[i] = impl.singleUnits[i]->build(status); + } + return LocalArray(arr, status); +} + + +U_NAMESPACE_END + +#endif /* !UNCONFIG_NO_FORMATTING */ -- cgit v1.2.3