summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/rulebasedcollator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/i18n/rulebasedcollator.cpp')
-rw-r--r--intl/icu/source/i18n/rulebasedcollator.cpp1656
1 files changed, 1656 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/rulebasedcollator.cpp b/intl/icu/source/i18n/rulebasedcollator.cpp
new file mode 100644
index 0000000000..e9482628d9
--- /dev/null
+++ b/intl/icu/source/i18n/rulebasedcollator.cpp
@@ -0,0 +1,1656 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 1996-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* rulebasedcollator.cpp
+*
+* (replaced the former tblcoll.cpp)
+*
+* created on: 2012feb14 with new and old collation code
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/coll.h"
+#include "unicode/coleitr.h"
+#include "unicode/localpointer.h"
+#include "unicode/locid.h"
+#include "unicode/sortkey.h"
+#include "unicode/tblcoll.h"
+#include "unicode/ucol.h"
+#include "unicode/uiter.h"
+#include "unicode/uloc.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/usetiter.h"
+#include "unicode/utf8.h"
+#include "unicode/uversion.h"
+#include "bocsu.h"
+#include "charstr.h"
+#include "cmemory.h"
+#include "collation.h"
+#include "collationcompare.h"
+#include "collationdata.h"
+#include "collationdatareader.h"
+#include "collationfastlatin.h"
+#include "collationiterator.h"
+#include "collationkeys.h"
+#include "collationroot.h"
+#include "collationsets.h"
+#include "collationsettings.h"
+#include "collationtailoring.h"
+#include "cstring.h"
+#include "uassert.h"
+#include "ucol_imp.h"
+#include "uhash.h"
+#include "uitercollationiterator.h"
+#include "ustr_imp.h"
+#include "utf16collationiterator.h"
+#include "utf8collationiterator.h"
+#include "uvectr64.h"
+
+U_NAMESPACE_BEGIN
+
+namespace {
+
+class FixedSortKeyByteSink : public SortKeyByteSink {
+public:
+ FixedSortKeyByteSink(char *dest, int32_t destCapacity)
+ : SortKeyByteSink(dest, destCapacity) {}
+ virtual ~FixedSortKeyByteSink();
+
+private:
+ virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
+ virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
+};
+
+FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
+
+void
+FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
+ // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
+ // Fill the buffer completely.
+ int32_t available = capacity_ - length;
+ if (available > 0) {
+ uprv_memcpy(buffer_ + length, bytes, available);
+ }
+}
+
+UBool
+FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
+ return false;
+}
+
+} // namespace
+
+// Not in an anonymous namespace, so that it can be a friend of CollationKey.
+class CollationKeyByteSink : public SortKeyByteSink {
+public:
+ CollationKeyByteSink(CollationKey &key)
+ : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
+ key_(key) {}
+ virtual ~CollationKeyByteSink();
+
+private:
+ virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
+ virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
+
+ CollationKey &key_;
+};
+
+CollationKeyByteSink::~CollationKeyByteSink() {}
+
+void
+CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
+ // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
+ if (Resize(n, length)) {
+ uprv_memcpy(buffer_ + length, bytes, n);
+ }
+}
+
+UBool
+CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
+ if (buffer_ == nullptr) {
+ return false; // allocation failed before already
+ }
+ int32_t newCapacity = 2 * capacity_;
+ int32_t altCapacity = length + 2 * appendCapacity;
+ if (newCapacity < altCapacity) {
+ newCapacity = altCapacity;
+ }
+ if (newCapacity < 200) {
+ newCapacity = 200;
+ }
+ uint8_t *newBuffer = key_.reallocate(newCapacity, length);
+ if (newBuffer == nullptr) {
+ SetNotOk();
+ return false;
+ }
+ buffer_ = reinterpret_cast<char *>(newBuffer);
+ capacity_ = newCapacity;
+ return true;
+}
+
+RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
+ : Collator(other),
+ data(other.data),
+ settings(other.settings),
+ tailoring(other.tailoring),
+ cacheEntry(other.cacheEntry),
+ validLocale(other.validLocale),
+ explicitlySetAttributes(other.explicitlySetAttributes),
+ actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
+ settings->addRef();
+ cacheEntry->addRef();
+}
+
+RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
+ const RuleBasedCollator *base, UErrorCode &errorCode)
+ : data(nullptr),
+ settings(nullptr),
+ tailoring(nullptr),
+ cacheEntry(nullptr),
+ validLocale(""),
+ explicitlySetAttributes(0),
+ actualLocaleIsSameAsValid(false) {
+ if(U_FAILURE(errorCode)) { return; }
+ if(bin == nullptr || length == 0 || base == nullptr) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ const CollationTailoring *root = CollationRoot::getRoot(errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ if(base->tailoring != root) {
+ errorCode = U_UNSUPPORTED_ERROR;
+ return;
+ }
+ LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
+ if(t.isNull() || t->isBogus()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ t->actualLocale.setToBogus();
+ adoptTailoring(t.orphan(), errorCode);
+}
+
+RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
+ : data(entry->tailoring->data),
+ settings(entry->tailoring->settings),
+ tailoring(entry->tailoring),
+ cacheEntry(entry),
+ validLocale(entry->validLocale),
+ explicitlySetAttributes(0),
+ actualLocaleIsSameAsValid(false) {
+ settings->addRef();
+ cacheEntry->addRef();
+}
+
+RuleBasedCollator::~RuleBasedCollator() {
+ SharedObject::clearPtr(settings);
+ SharedObject::clearPtr(cacheEntry);
+}
+
+void
+RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) {
+ t->deleteIfZeroRefCount();
+ return;
+ }
+ U_ASSERT(settings == nullptr && data == nullptr && tailoring == nullptr && cacheEntry == nullptr);
+ cacheEntry = new CollationCacheEntry(t->actualLocale, t);
+ if(cacheEntry == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ t->deleteIfZeroRefCount();
+ return;
+ }
+ data = t->data;
+ settings = t->settings;
+ settings->addRef();
+ tailoring = t;
+ cacheEntry->addRef();
+ validLocale = t->actualLocale;
+ actualLocaleIsSameAsValid = false;
+}
+
+RuleBasedCollator *
+RuleBasedCollator::clone() const {
+ return new RuleBasedCollator(*this);
+}
+
+RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
+ if(this == &other) { return *this; }
+ SharedObject::copyPtr(other.settings, settings);
+ tailoring = other.tailoring;
+ SharedObject::copyPtr(other.cacheEntry, cacheEntry);
+ data = tailoring->data;
+ validLocale = other.validLocale;
+ explicitlySetAttributes = other.explicitlySetAttributes;
+ actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
+ return *this;
+}
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
+
+bool
+RuleBasedCollator::operator==(const Collator& other) const {
+ if(this == &other) { return true; }
+ if(!Collator::operator==(other)) { return false; }
+ const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
+ if(*settings != *o.settings) { return false; }
+ if(data == o.data) { return true; }
+ UBool thisIsRoot = data->base == nullptr;
+ UBool otherIsRoot = o.data->base == nullptr;
+ U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
+ if(thisIsRoot != otherIsRoot) { return false; }
+ if((thisIsRoot || !tailoring->rules.isEmpty()) &&
+ (otherIsRoot || !o.tailoring->rules.isEmpty())) {
+ // Shortcut: If both collators have valid rule strings, then compare those.
+ if(tailoring->rules == o.tailoring->rules) { return true; }
+ }
+ // Different rule strings can result in the same or equivalent tailoring.
+ // The rule strings are optional in ICU resource bundles, although included by default.
+ // cloneBinary() drops the rule string.
+ UErrorCode errorCode = U_ZERO_ERROR;
+ LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
+ LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
+ if(U_FAILURE(errorCode)) { return false; }
+ if(*thisTailored != *otherTailored) { return false; }
+ // For completeness, we should compare all of the mappings;
+ // or we should create a list of strings, sort it with one collator,
+ // and check if both collators compare adjacent strings the same
+ // (order & strength, down to quaternary); or similar.
+ // Testing equality of collators seems unusual.
+ return true;
+}
+
+int32_t
+RuleBasedCollator::hashCode() const {
+ int32_t h = settings->hashCode();
+ if(data->base == nullptr) { return h; } // root collator
+ // Do not rely on the rule string, see comments in operator==().
+ UErrorCode errorCode = U_ZERO_ERROR;
+ LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
+ if(U_FAILURE(errorCode)) { return 0; }
+ UnicodeSetIterator iter(*set);
+ while(iter.next() && !iter.isString()) {
+ h ^= data->getCE32(iter.getCodepoint());
+ }
+ return h;
+}
+
+void
+RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
+ const Locale &actual) {
+ if(actual == tailoring->actualLocale) {
+ actualLocaleIsSameAsValid = false;
+ } else {
+ U_ASSERT(actual == valid);
+ actualLocaleIsSameAsValid = true;
+ }
+ // Do not modify tailoring.actualLocale:
+ // We cannot be sure that that would be thread-safe.
+ validLocale = valid;
+ (void)requested; // Ignore, see also ticket #10477.
+}
+
+Locale
+RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
+ if(U_FAILURE(errorCode)) {
+ return Locale::getRoot();
+ }
+ switch(type) {
+ case ULOC_ACTUAL_LOCALE:
+ return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
+ case ULOC_VALID_LOCALE:
+ return validLocale;
+ case ULOC_REQUESTED_LOCALE:
+ default:
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return Locale::getRoot();
+ }
+}
+
+const char *
+RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ const Locale *result;
+ switch(type) {
+ case ULOC_ACTUAL_LOCALE:
+ result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
+ break;
+ case ULOC_VALID_LOCALE:
+ result = &validLocale;
+ break;
+ case ULOC_REQUESTED_LOCALE:
+ default:
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ if(result->isBogus()) { return nullptr; }
+ const char *id = result->getName();
+ return id[0] == 0 ? "root" : id;
+}
+
+const UnicodeString&
+RuleBasedCollator::getRules() const {
+ return tailoring->rules;
+}
+
+void
+RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
+ if(delta == UCOL_TAILORING_ONLY) {
+ buffer = tailoring->rules;
+ return;
+ }
+ // UCOL_FULL_RULES
+ buffer.remove();
+ CollationLoader::appendRootRules(buffer);
+ buffer.append(tailoring->rules).getTerminatedBuffer();
+}
+
+void
+RuleBasedCollator::getVersion(UVersionInfo version) const {
+ uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
+ version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
+}
+
+UnicodeSet *
+RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return nullptr; }
+ UnicodeSet *tailored = new UnicodeSet();
+ if(tailored == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return nullptr;
+ }
+ if(data->base != nullptr) {
+ TailoredSet(tailored).forData(data, errorCode);
+ if(U_FAILURE(errorCode)) {
+ delete tailored;
+ return nullptr;
+ }
+ }
+ return tailored;
+}
+
+void
+RuleBasedCollator::internalGetContractionsAndExpansions(
+ UnicodeSet *contractions, UnicodeSet *expansions,
+ UBool addPrefixes, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return; }
+ if(contractions != nullptr) {
+ contractions->clear();
+ }
+ if(expansions != nullptr) {
+ expansions->clear();
+ }
+ ContractionsAndExpansions(contractions, expansions, nullptr, addPrefixes).forData(data, errorCode);
+}
+
+void
+RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return; }
+ ContractionsAndExpansions(&set, nullptr, nullptr, false).forCodePoint(data, c, errorCode);
+}
+
+const CollationSettings &
+RuleBasedCollator::getDefaultSettings() const {
+ return *tailoring->settings;
+}
+
+UColAttributeValue
+RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
+ int32_t option;
+ switch(attr) {
+ case UCOL_FRENCH_COLLATION:
+ option = CollationSettings::BACKWARD_SECONDARY;
+ break;
+ case UCOL_ALTERNATE_HANDLING:
+ return settings->getAlternateHandling();
+ case UCOL_CASE_FIRST:
+ return settings->getCaseFirst();
+ case UCOL_CASE_LEVEL:
+ option = CollationSettings::CASE_LEVEL;
+ break;
+ case UCOL_NORMALIZATION_MODE:
+ option = CollationSettings::CHECK_FCD;
+ break;
+ case UCOL_STRENGTH:
+ return (UColAttributeValue)settings->getStrength();
+ case UCOL_HIRAGANA_QUATERNARY_MODE:
+ // Deprecated attribute, unsettable.
+ return UCOL_OFF;
+ case UCOL_NUMERIC_COLLATION:
+ option = CollationSettings::NUMERIC;
+ break;
+ default:
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return UCOL_DEFAULT;
+ }
+ return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
+}
+
+void
+RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
+ UErrorCode &errorCode) {
+ UColAttributeValue oldValue = getAttribute(attr, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ if(value == oldValue) {
+ setAttributeExplicitly(attr);
+ return;
+ }
+ const CollationSettings &defaultSettings = getDefaultSettings();
+ if(settings == &defaultSettings) {
+ if(value == UCOL_DEFAULT) {
+ setAttributeDefault(attr);
+ return;
+ }
+ }
+ CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
+ if(ownedSettings == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ switch(attr) {
+ case UCOL_FRENCH_COLLATION:
+ ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
+ defaultSettings.options, errorCode);
+ break;
+ case UCOL_ALTERNATE_HANDLING:
+ ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
+ break;
+ case UCOL_CASE_FIRST:
+ ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
+ break;
+ case UCOL_CASE_LEVEL:
+ ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
+ defaultSettings.options, errorCode);
+ break;
+ case UCOL_NORMALIZATION_MODE:
+ ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
+ defaultSettings.options, errorCode);
+ break;
+ case UCOL_STRENGTH:
+ ownedSettings->setStrength(value, defaultSettings.options, errorCode);
+ break;
+ case UCOL_HIRAGANA_QUATERNARY_MODE:
+ // Deprecated attribute. Check for valid values but do not change anything.
+ if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ }
+ break;
+ case UCOL_NUMERIC_COLLATION:
+ ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
+ break;
+ default:
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ break;
+ }
+ if(U_FAILURE(errorCode)) { return; }
+ setFastLatinOptions(*ownedSettings);
+ if(value == UCOL_DEFAULT) {
+ setAttributeDefault(attr);
+ } else {
+ setAttributeExplicitly(attr);
+ }
+}
+
+Collator &
+RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return *this; }
+ // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
+ int32_t value;
+ if(group == UCOL_REORDER_CODE_DEFAULT) {
+ value = UCOL_DEFAULT;
+ } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
+ value = group - UCOL_REORDER_CODE_FIRST;
+ } else {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return *this;
+ }
+ CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
+ if(value == oldValue) {
+ setAttributeExplicitly(ATTR_VARIABLE_TOP);
+ return *this;
+ }
+ const CollationSettings &defaultSettings = getDefaultSettings();
+ if(settings == &defaultSettings) {
+ if(value == UCOL_DEFAULT) {
+ setAttributeDefault(ATTR_VARIABLE_TOP);
+ return *this;
+ }
+ }
+ CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
+ if(ownedSettings == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return *this;
+ }
+
+ if(group == UCOL_REORDER_CODE_DEFAULT) {
+ group = (UColReorderCode)(
+ UCOL_REORDER_CODE_FIRST + int32_t{defaultSettings.getMaxVariable()});
+ }
+ uint32_t varTop = data->getLastPrimaryForGroup(group);
+ U_ASSERT(varTop != 0);
+ ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
+ if(U_FAILURE(errorCode)) { return *this; }
+ ownedSettings->variableTop = varTop;
+ setFastLatinOptions(*ownedSettings);
+ if(value == UCOL_DEFAULT) {
+ setAttributeDefault(ATTR_VARIABLE_TOP);
+ } else {
+ setAttributeExplicitly(ATTR_VARIABLE_TOP);
+ }
+ return *this;
+}
+
+UColReorderCode
+RuleBasedCollator::getMaxVariable() const {
+ return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
+}
+
+uint32_t
+RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
+ return settings->variableTop;
+}
+
+uint32_t
+RuleBasedCollator::setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(varTop == nullptr && len !=0) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+ if(len < 0) { len = u_strlen(varTop); }
+ if(len == 0) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+ UBool numeric = settings->isNumeric();
+ int64_t ce1, ce2;
+ if(settings->dontCheckFCD()) {
+ UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
+ ce1 = ci.nextCE(errorCode);
+ ce2 = ci.nextCE(errorCode);
+ } else {
+ FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
+ ce1 = ci.nextCE(errorCode);
+ ce2 = ci.nextCE(errorCode);
+ }
+ if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
+ errorCode = U_CE_NOT_FOUND_ERROR;
+ return 0;
+ }
+ setVariableTop((uint32_t)(ce1 >> 32), errorCode);
+ return settings->variableTop;
+}
+
+uint32_t
+RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
+ return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
+}
+
+void
+RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ if(varTop != settings->variableTop) {
+ // Pin the variable top to the end of the reordering group which contains it.
+ // Only a few special groups are supported.
+ int32_t group = data->getGroupForPrimary(varTop);
+ if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ uint32_t v = data->getLastPrimaryForGroup(group);
+ U_ASSERT(v != 0 && v >= varTop);
+ varTop = v;
+ if(varTop != settings->variableTop) {
+ CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
+ if(ownedSettings == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
+ getDefaultSettings().options, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ ownedSettings->variableTop = varTop;
+ setFastLatinOptions(*ownedSettings);
+ }
+ }
+ if(varTop == getDefaultSettings().variableTop) {
+ setAttributeDefault(ATTR_VARIABLE_TOP);
+ } else {
+ setAttributeExplicitly(ATTR_VARIABLE_TOP);
+ }
+}
+
+int32_t
+RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(capacity < 0 || (dest == nullptr && capacity > 0)) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+ int32_t length = settings->reorderCodesLength;
+ if(length == 0) { return 0; }
+ if(length > capacity) {
+ errorCode = U_BUFFER_OVERFLOW_ERROR;
+ return length;
+ }
+ uprv_memcpy(dest, settings->reorderCodes, length * 4);
+ return length;
+}
+
+void
+RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
+ UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ if(length < 0 || (reorderCodes == nullptr && length > 0)) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
+ length = 0;
+ }
+ if(length == settings->reorderCodesLength &&
+ uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
+ return;
+ }
+ const CollationSettings &defaultSettings = getDefaultSettings();
+ if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
+ if(settings != &defaultSettings) {
+ CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
+ if(ownedSettings == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
+ setFastLatinOptions(*ownedSettings);
+ }
+ return;
+ }
+ CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
+ if(ownedSettings == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
+ setFastLatinOptions(*ownedSettings);
+}
+
+void
+RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
+ ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
+ data, ownedSettings,
+ ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
+}
+
+UCollationResult
+RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
+ return doCompare(left.getBuffer(), left.length(),
+ right.getBuffer(), right.length(), errorCode);
+}
+
+UCollationResult
+RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
+ int32_t length, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
+ if(length < 0) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return UCOL_EQUAL;
+ }
+ int32_t leftLength = left.length();
+ int32_t rightLength = right.length();
+ if(leftLength > length) { leftLength = length; }
+ if(rightLength > length) { rightLength = length; }
+ return doCompare(left.getBuffer(), leftLength,
+ right.getBuffer(), rightLength, errorCode);
+}
+
+UCollationResult
+RuleBasedCollator::compare(const char16_t *left, int32_t leftLength,
+ const char16_t *right, int32_t rightLength,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
+ if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return UCOL_EQUAL;
+ }
+ // Make sure both or neither strings have a known length.
+ // We do not optimize for mixed length/termination.
+ if(leftLength >= 0) {
+ if(rightLength < 0) { rightLength = u_strlen(right); }
+ } else {
+ if(rightLength >= 0) { leftLength = u_strlen(left); }
+ }
+ return doCompare(left, leftLength, right, rightLength, errorCode);
+}
+
+UCollationResult
+RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
+ const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
+ const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
+ if((leftBytes == nullptr && !left.empty()) || (rightBytes == nullptr && !right.empty())) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return UCOL_EQUAL;
+ }
+ return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
+}
+
+UCollationResult
+RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
+ const char *right, int32_t rightLength,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
+ if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return UCOL_EQUAL;
+ }
+ // Make sure both or neither strings have a known length.
+ // We do not optimize for mixed length/termination.
+ if(leftLength >= 0) {
+ if(rightLength < 0) { rightLength = static_cast<int32_t>(uprv_strlen(right)); }
+ } else {
+ if(rightLength >= 0) { leftLength = static_cast<int32_t>(uprv_strlen(left)); }
+ }
+ return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
+ reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
+}
+
+namespace {
+
+/**
+ * Abstract iterator for identical-level string comparisons.
+ * Returns FCD code points and handles temporary switching to NFD.
+ */
+class NFDIterator : public UObject {
+public:
+ NFDIterator() : index(-1), length(0) {}
+ virtual ~NFDIterator() {}
+ /**
+ * Returns the next code point from the internal normalization buffer,
+ * or else the next text code point.
+ * Returns -1 at the end of the text.
+ */
+ UChar32 nextCodePoint() {
+ if(index >= 0) {
+ if(index == length) {
+ index = -1;
+ } else {
+ UChar32 c;
+ U16_NEXT_UNSAFE(decomp, index, c);
+ return c;
+ }
+ }
+ return nextRawCodePoint();
+ }
+ /**
+ * @param nfcImpl
+ * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
+ * @return the first code point in c's decomposition,
+ * or c itself if it was decomposed already or if it does not decompose
+ */
+ UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
+ if(index >= 0) { return c; }
+ decomp = nfcImpl.getDecomposition(c, buffer, length);
+ if(decomp == nullptr) { return c; }
+ index = 0;
+ U16_NEXT_UNSAFE(decomp, index, c);
+ return c;
+ }
+protected:
+ /**
+ * Returns the next text code point in FCD order.
+ * Returns -1 at the end of the text.
+ */
+ virtual UChar32 nextRawCodePoint() = 0;
+private:
+ const char16_t *decomp;
+ char16_t buffer[4];
+ int32_t index;
+ int32_t length;
+};
+
+class UTF16NFDIterator : public NFDIterator {
+public:
+ UTF16NFDIterator(const char16_t *text, const char16_t *textLimit) : s(text), limit(textLimit) {}
+protected:
+ virtual UChar32 nextRawCodePoint() override {
+ if(s == limit) { return U_SENTINEL; }
+ UChar32 c = *s++;
+ if(limit == nullptr && c == 0) {
+ s = nullptr;
+ return U_SENTINEL;
+ }
+ char16_t trail;
+ if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
+ ++s;
+ c = U16_GET_SUPPLEMENTARY(c, trail);
+ }
+ return c;
+ }
+
+ const char16_t *s;
+ const char16_t *limit;
+};
+
+class FCDUTF16NFDIterator : public UTF16NFDIterator {
+public:
+ FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const char16_t *text, const char16_t *textLimit)
+ : UTF16NFDIterator(nullptr, nullptr) {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ const char16_t *spanLimit = nfcImpl.makeFCD(text, textLimit, nullptr, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ if(spanLimit == textLimit || (textLimit == nullptr && *spanLimit == 0)) {
+ s = text;
+ limit = spanLimit;
+ } else {
+ str.setTo(text, (int32_t)(spanLimit - text));
+ {
+ ReorderingBuffer r_buffer(nfcImpl, str);
+ if(r_buffer.init(str.length(), errorCode)) {
+ nfcImpl.makeFCD(spanLimit, textLimit, &r_buffer, errorCode);
+ }
+ }
+ if(U_SUCCESS(errorCode)) {
+ s = str.getBuffer();
+ limit = s + str.length();
+ }
+ }
+ }
+private:
+ UnicodeString str;
+};
+
+class UTF8NFDIterator : public NFDIterator {
+public:
+ UTF8NFDIterator(const uint8_t *text, int32_t textLength)
+ : s(text), pos(0), length(textLength) {}
+protected:
+ virtual UChar32 nextRawCodePoint() override {
+ if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
+ UChar32 c;
+ U8_NEXT_OR_FFFD(s, pos, length, c);
+ return c;
+ }
+
+ const uint8_t *s;
+ int32_t pos;
+ int32_t length;
+};
+
+class FCDUTF8NFDIterator : public NFDIterator {
+public:
+ FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
+ : u8ci(data, false, text, 0, textLength) {}
+protected:
+ virtual UChar32 nextRawCodePoint() override {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ return u8ci.nextCodePoint(errorCode);
+ }
+private:
+ FCDUTF8CollationIterator u8ci;
+};
+
+class UIterNFDIterator : public NFDIterator {
+public:
+ UIterNFDIterator(UCharIterator &it) : iter(it) {}
+protected:
+ virtual UChar32 nextRawCodePoint() override {
+ return uiter_next32(&iter);
+ }
+private:
+ UCharIterator &iter;
+};
+
+class FCDUIterNFDIterator : public NFDIterator {
+public:
+ FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
+ : uici(data, false, it, startIndex) {}
+protected:
+ virtual UChar32 nextRawCodePoint() override {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ return uici.nextCodePoint(errorCode);
+ }
+private:
+ FCDUIterCollationIterator uici;
+};
+
+UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
+ NFDIterator &left, NFDIterator &right) {
+ for(;;) {
+ // Fetch the next FCD code point from each string.
+ UChar32 leftCp = left.nextCodePoint();
+ UChar32 rightCp = right.nextCodePoint();
+ if(leftCp == rightCp) {
+ if(leftCp < 0) { break; }
+ continue;
+ }
+ // If they are different, then decompose each and compare again.
+ if(leftCp < 0) {
+ leftCp = -2; // end of string
+ } else if(leftCp == 0xfffe) {
+ leftCp = -1; // U+FFFE: merge separator
+ } else {
+ leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
+ }
+ if(rightCp < 0) {
+ rightCp = -2; // end of string
+ } else if(rightCp == 0xfffe) {
+ rightCp = -1; // U+FFFE: merge separator
+ } else {
+ rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
+ }
+ if(leftCp < rightCp) { return UCOL_LESS; }
+ if(leftCp > rightCp) { return UCOL_GREATER; }
+ }
+ return UCOL_EQUAL;
+}
+
+} // namespace
+
+UCollationResult
+RuleBasedCollator::doCompare(const char16_t *left, int32_t leftLength,
+ const char16_t *right, int32_t rightLength,
+ UErrorCode &errorCode) const {
+ // U_FAILURE(errorCode) checked by caller.
+ if(left == right && leftLength == rightLength) {
+ return UCOL_EQUAL;
+ }
+
+ // Identical-prefix test.
+ const char16_t *leftLimit;
+ const char16_t *rightLimit;
+ int32_t equalPrefixLength = 0;
+ if(leftLength < 0) {
+ leftLimit = nullptr;
+ rightLimit = nullptr;
+ char16_t c;
+ while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
+ if(c == 0) { return UCOL_EQUAL; }
+ ++equalPrefixLength;
+ }
+ } else {
+ leftLimit = left + leftLength;
+ rightLimit = right + rightLength;
+ for(;;) {
+ if(equalPrefixLength == leftLength) {
+ if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
+ break;
+ } else if(equalPrefixLength == rightLength ||
+ left[equalPrefixLength] != right[equalPrefixLength]) {
+ break;
+ }
+ ++equalPrefixLength;
+ }
+ }
+
+ UBool numeric = settings->isNumeric();
+ if(equalPrefixLength > 0) {
+ if((equalPrefixLength != leftLength &&
+ data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
+ (equalPrefixLength != rightLength &&
+ data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
+ // Identical prefix: Back up to the start of a contraction or reordering sequence.
+ while(--equalPrefixLength > 0 &&
+ data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
+ }
+ // Notes:
+ // - A longer string can compare equal to a prefix of it if only ignorables follow.
+ // - With a backward level, a longer string can compare less-than a prefix of it.
+
+ // Pass the actual start of each string into the CollationIterators,
+ // plus the equalPrefixLength position,
+ // so that prefix matches back into the equal prefix work.
+ }
+
+ int32_t result;
+ int32_t fastLatinOptions = settings->fastLatinOptions;
+ if(fastLatinOptions >= 0 &&
+ (equalPrefixLength == leftLength ||
+ left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
+ (equalPrefixLength == rightLength ||
+ right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
+ if(leftLength >= 0) {
+ result = CollationFastLatin::compareUTF16(data->fastLatinTable,
+ settings->fastLatinPrimaries,
+ fastLatinOptions,
+ left + equalPrefixLength,
+ leftLength - equalPrefixLength,
+ right + equalPrefixLength,
+ rightLength - equalPrefixLength);
+ } else {
+ result = CollationFastLatin::compareUTF16(data->fastLatinTable,
+ settings->fastLatinPrimaries,
+ fastLatinOptions,
+ left + equalPrefixLength, -1,
+ right + equalPrefixLength, -1);
+ }
+ } else {
+ result = CollationFastLatin::BAIL_OUT_RESULT;
+ }
+
+ if(result == CollationFastLatin::BAIL_OUT_RESULT) {
+ if(settings->dontCheckFCD()) {
+ UTF16CollationIterator leftIter(data, numeric,
+ left, left + equalPrefixLength, leftLimit);
+ UTF16CollationIterator rightIter(data, numeric,
+ right, right + equalPrefixLength, rightLimit);
+ result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
+ } else {
+ FCDUTF16CollationIterator leftIter(data, numeric,
+ left, left + equalPrefixLength, leftLimit);
+ FCDUTF16CollationIterator rightIter(data, numeric,
+ right, right + equalPrefixLength, rightLimit);
+ result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
+ }
+ }
+ if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
+ return (UCollationResult)result;
+ }
+
+ // Note: If NUL-terminated, we could get the actual limits from the iterators now.
+ // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
+ // and the benefit seems unlikely to be measurable.
+
+ // Compare identical level.
+ const Normalizer2Impl &nfcImpl = data->nfcImpl;
+ left += equalPrefixLength;
+ right += equalPrefixLength;
+ if(settings->dontCheckFCD()) {
+ UTF16NFDIterator leftIter(left, leftLimit);
+ UTF16NFDIterator rightIter(right, rightLimit);
+ return compareNFDIter(nfcImpl, leftIter, rightIter);
+ } else {
+ FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
+ FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
+ return compareNFDIter(nfcImpl, leftIter, rightIter);
+ }
+}
+
+UCollationResult
+RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
+ const uint8_t *right, int32_t rightLength,
+ UErrorCode &errorCode) const {
+ // U_FAILURE(errorCode) checked by caller.
+ if(left == right && leftLength == rightLength) {
+ return UCOL_EQUAL;
+ }
+
+ // Identical-prefix test.
+ int32_t equalPrefixLength = 0;
+ if(leftLength < 0) {
+ uint8_t c;
+ while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
+ if(c == 0) { return UCOL_EQUAL; }
+ ++equalPrefixLength;
+ }
+ } else {
+ for(;;) {
+ if(equalPrefixLength == leftLength) {
+ if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
+ break;
+ } else if(equalPrefixLength == rightLength ||
+ left[equalPrefixLength] != right[equalPrefixLength]) {
+ break;
+ }
+ ++equalPrefixLength;
+ }
+ }
+ // Back up to the start of a partially-equal code point.
+ if(equalPrefixLength > 0 &&
+ ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
+ (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
+ while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
+ }
+
+ UBool numeric = settings->isNumeric();
+ if(equalPrefixLength > 0) {
+ UBool unsafe = false;
+ if(equalPrefixLength != leftLength) {
+ int32_t i = equalPrefixLength;
+ UChar32 c;
+ U8_NEXT_OR_FFFD(left, i, leftLength, c);
+ unsafe = data->isUnsafeBackward(c, numeric);
+ }
+ if(!unsafe && equalPrefixLength != rightLength) {
+ int32_t i = equalPrefixLength;
+ UChar32 c;
+ U8_NEXT_OR_FFFD(right, i, rightLength, c);
+ unsafe = data->isUnsafeBackward(c, numeric);
+ }
+ if(unsafe) {
+ // Identical prefix: Back up to the start of a contraction or reordering sequence.
+ UChar32 c;
+ do {
+ U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
+ } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
+ }
+ // See the notes in the UTF-16 version.
+
+ // Pass the actual start of each string into the CollationIterators,
+ // plus the equalPrefixLength position,
+ // so that prefix matches back into the equal prefix work.
+ }
+
+ int32_t result;
+ int32_t fastLatinOptions = settings->fastLatinOptions;
+ if(fastLatinOptions >= 0 &&
+ (equalPrefixLength == leftLength ||
+ left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
+ (equalPrefixLength == rightLength ||
+ right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
+ if(leftLength >= 0) {
+ result = CollationFastLatin::compareUTF8(data->fastLatinTable,
+ settings->fastLatinPrimaries,
+ fastLatinOptions,
+ left + equalPrefixLength,
+ leftLength - equalPrefixLength,
+ right + equalPrefixLength,
+ rightLength - equalPrefixLength);
+ } else {
+ result = CollationFastLatin::compareUTF8(data->fastLatinTable,
+ settings->fastLatinPrimaries,
+ fastLatinOptions,
+ left + equalPrefixLength, -1,
+ right + equalPrefixLength, -1);
+ }
+ } else {
+ result = CollationFastLatin::BAIL_OUT_RESULT;
+ }
+
+ if(result == CollationFastLatin::BAIL_OUT_RESULT) {
+ if(settings->dontCheckFCD()) {
+ UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
+ UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
+ result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
+ } else {
+ FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
+ FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
+ result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
+ }
+ }
+ if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
+ return (UCollationResult)result;
+ }
+
+ // Note: If NUL-terminated, we could get the actual limits from the iterators now.
+ // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
+ // and the benefit seems unlikely to be measurable.
+
+ // Compare identical level.
+ const Normalizer2Impl &nfcImpl = data->nfcImpl;
+ left += equalPrefixLength;
+ right += equalPrefixLength;
+ if(leftLength > 0) {
+ leftLength -= equalPrefixLength;
+ rightLength -= equalPrefixLength;
+ }
+ if(settings->dontCheckFCD()) {
+ UTF8NFDIterator leftIter(left, leftLength);
+ UTF8NFDIterator rightIter(right, rightLength);
+ return compareNFDIter(nfcImpl, leftIter, rightIter);
+ } else {
+ FCDUTF8NFDIterator leftIter(data, left, leftLength);
+ FCDUTF8NFDIterator rightIter(data, right, rightLength);
+ return compareNFDIter(nfcImpl, leftIter, rightIter);
+ }
+}
+
+UCollationResult
+RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
+ UBool numeric = settings->isNumeric();
+
+ // Identical-prefix test.
+ int32_t equalPrefixLength = 0;
+ {
+ UChar32 leftUnit;
+ UChar32 rightUnit;
+ while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
+ if(leftUnit < 0) { return UCOL_EQUAL; }
+ ++equalPrefixLength;
+ }
+
+ // Back out the code units that differed, for the real collation comparison.
+ if(leftUnit >= 0) { left.previous(&left); }
+ if(rightUnit >= 0) { right.previous(&right); }
+
+ if(equalPrefixLength > 0) {
+ if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
+ (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
+ // Identical prefix: Back up to the start of a contraction or reordering sequence.
+ do {
+ --equalPrefixLength;
+ leftUnit = left.previous(&left);
+ right.previous(&right);
+ } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
+ }
+ // See the notes in the UTF-16 version.
+ }
+ }
+
+ UCollationResult result;
+ if(settings->dontCheckFCD()) {
+ UIterCollationIterator leftIter(data, numeric, left);
+ UIterCollationIterator rightIter(data, numeric, right);
+ result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
+ } else {
+ FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
+ FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
+ result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
+ }
+ if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
+ return result;
+ }
+
+ // Compare identical level.
+ left.move(&left, equalPrefixLength, UITER_ZERO);
+ right.move(&right, equalPrefixLength, UITER_ZERO);
+ const Normalizer2Impl &nfcImpl = data->nfcImpl;
+ if(settings->dontCheckFCD()) {
+ UIterNFDIterator leftIter(left);
+ UIterNFDIterator rightIter(right);
+ return compareNFDIter(nfcImpl, leftIter, rightIter);
+ } else {
+ FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
+ FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
+ return compareNFDIter(nfcImpl, leftIter, rightIter);
+ }
+}
+
+CollationKey &
+RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
+ UErrorCode &errorCode) const {
+ return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
+}
+
+CollationKey &
+RuleBasedCollator::getCollationKey(const char16_t *s, int32_t length, CollationKey& key,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) {
+ return key.setToBogus();
+ }
+ if(s == nullptr && length != 0) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return key.setToBogus();
+ }
+ key.reset(); // resets the "bogus" state
+ CollationKeyByteSink sink(key);
+ writeSortKey(s, length, sink, errorCode);
+ if(U_FAILURE(errorCode)) {
+ key.setToBogus();
+ } else if(key.isBogus()) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ } else {
+ key.setLength(sink.NumberOfBytesAppended());
+ }
+ return key;
+}
+
+int32_t
+RuleBasedCollator::getSortKey(const UnicodeString &s,
+ uint8_t *dest, int32_t capacity) const {
+ return getSortKey(s.getBuffer(), s.length(), dest, capacity);
+}
+
+int32_t
+RuleBasedCollator::getSortKey(const char16_t *s, int32_t length,
+ uint8_t *dest, int32_t capacity) const {
+ if((s == nullptr && length != 0) || capacity < 0 || (dest == nullptr && capacity > 0)) {
+ return 0;
+ }
+ uint8_t noDest[1] = { 0 };
+ if(dest == nullptr) {
+ // Distinguish pure preflighting from an allocation error.
+ dest = noDest;
+ capacity = 0;
+ }
+ FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
+ UErrorCode errorCode = U_ZERO_ERROR;
+ writeSortKey(s, length, sink, errorCode);
+ return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
+}
+
+void
+RuleBasedCollator::writeSortKey(const char16_t *s, int32_t length,
+ SortKeyByteSink &sink, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return; }
+ const char16_t *limit = (length >= 0) ? s + length : nullptr;
+ UBool numeric = settings->isNumeric();
+ CollationKeys::LevelCallback callback;
+ if(settings->dontCheckFCD()) {
+ UTF16CollationIterator iter(data, numeric, s, s, limit);
+ CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
+ sink, Collation::PRIMARY_LEVEL,
+ callback, true, errorCode);
+ } else {
+ FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
+ CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
+ sink, Collation::PRIMARY_LEVEL,
+ callback, true, errorCode);
+ }
+ if(settings->getStrength() == UCOL_IDENTICAL) {
+ writeIdenticalLevel(s, limit, sink, errorCode);
+ }
+ static const char terminator = 0; // TERMINATOR_BYTE
+ sink.Append(&terminator, 1);
+}
+
+void
+RuleBasedCollator::writeIdenticalLevel(const char16_t *s, const char16_t *limit,
+ SortKeyByteSink &sink, UErrorCode &errorCode) const {
+ // NFD quick check
+ const char16_t *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, nullptr, errorCode);
+ if(U_FAILURE(errorCode)) { return; }
+ sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
+ UChar32 prev = 0;
+ if(nfdQCYesLimit != s) {
+ prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
+ }
+ // Is there non-NFD text?
+ int32_t destLengthEstimate;
+ if(limit != nullptr) {
+ if(nfdQCYesLimit == limit) { return; }
+ destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
+ } else {
+ // s is NUL-terminated
+ if(*nfdQCYesLimit == 0) { return; }
+ destLengthEstimate = -1;
+ }
+ UnicodeString nfd;
+ data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
+ u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
+}
+
+namespace {
+
+/**
+ * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
+ * with an instance of this callback class.
+ * When another level is about to be written, the callback
+ * records the level and the number of bytes that will be written until
+ * the sink (which is actually a FixedSortKeyByteSink) fills up.
+ *
+ * When internalNextSortKeyPart() is called again, it restarts with the last level
+ * and ignores as many bytes as were written previously for that level.
+ */
+class PartLevelCallback : public CollationKeys::LevelCallback {
+public:
+ PartLevelCallback(const SortKeyByteSink &s)
+ : sink(s), level(Collation::PRIMARY_LEVEL) {
+ levelCapacity = sink.GetRemainingCapacity();
+ }
+ virtual ~PartLevelCallback() {}
+ virtual UBool needToWrite(Collation::Level l) override {
+ if(!sink.Overflowed()) {
+ // Remember a level that will be at least partially written.
+ level = l;
+ levelCapacity = sink.GetRemainingCapacity();
+ return true;
+ } else {
+ return false;
+ }
+ }
+ Collation::Level getLevel() const { return level; }
+ int32_t getLevelCapacity() const { return levelCapacity; }
+
+private:
+ const SortKeyByteSink &sink;
+ Collation::Level level;
+ int32_t levelCapacity;
+};
+
+} // namespace
+
+int32_t
+RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
+ uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(iter == nullptr || state == nullptr || count < 0 || (count > 0 && dest == nullptr)) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+ if(count == 0) { return 0; }
+
+ FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
+ sink.IgnoreBytes((int32_t)state[1]);
+ iter->move(iter, 0, UITER_START);
+
+ Collation::Level level = (Collation::Level)state[0];
+ if(level <= Collation::QUATERNARY_LEVEL) {
+ UBool numeric = settings->isNumeric();
+ PartLevelCallback callback(sink);
+ if(settings->dontCheckFCD()) {
+ UIterCollationIterator ci(data, numeric, *iter);
+ CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
+ sink, level, callback, false, errorCode);
+ } else {
+ FCDUIterCollationIterator ci(data, numeric, *iter, 0);
+ CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
+ sink, level, callback, false, errorCode);
+ }
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(sink.NumberOfBytesAppended() > count) {
+ state[0] = (uint32_t)callback.getLevel();
+ state[1] = (uint32_t)callback.getLevelCapacity();
+ return count;
+ }
+ // All of the normal levels are done.
+ if(settings->getStrength() == UCOL_IDENTICAL) {
+ level = Collation::IDENTICAL_LEVEL;
+ iter->move(iter, 0, UITER_START);
+ }
+ // else fall through to setting ZERO_LEVEL
+ }
+
+ if(level == Collation::IDENTICAL_LEVEL) {
+ int32_t levelCapacity = sink.GetRemainingCapacity();
+ UnicodeString s;
+ for(;;) {
+ UChar32 c = iter->next(iter);
+ if(c < 0) { break; }
+ s.append((char16_t)c);
+ }
+ const char16_t *sArray = s.getBuffer();
+ writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(sink.NumberOfBytesAppended() > count) {
+ state[0] = (uint32_t)level;
+ state[1] = (uint32_t)levelCapacity;
+ return count;
+ }
+ }
+
+ // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
+ state[0] = (uint32_t)Collation::ZERO_LEVEL;
+ state[1] = 0;
+ int32_t length = sink.NumberOfBytesAppended();
+ int32_t i = length;
+ while(i < count) { dest[i++] = 0; }
+ return length;
+}
+
+void
+RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return; }
+ const char16_t *s = str.getBuffer();
+ const char16_t *limit = s + str.length();
+ UBool numeric = settings->isNumeric();
+ if(settings->dontCheckFCD()) {
+ UTF16CollationIterator iter(data, numeric, s, s, limit);
+ int64_t ce;
+ while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
+ ces.addElement(ce, errorCode);
+ }
+ } else {
+ FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
+ int64_t ce;
+ while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
+ ces.addElement(ce, errorCode);
+ }
+ }
+}
+
+namespace {
+
+void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
+ UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode) || length == 0) { return; }
+ if(!s.isEmpty()) {
+ s.append('_', errorCode);
+ }
+ s.append(letter, errorCode);
+ for(int32_t i = 0; i < length; ++i) {
+ s.append(uprv_toupper(subtag[i]), errorCode);
+ }
+}
+
+void appendAttribute(CharString &s, char letter, UColAttributeValue value,
+ UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ if(!s.isEmpty()) {
+ s.append('_', errorCode);
+ }
+ static const char *valueChars = "1234...........IXO..SN..LU......";
+ s.append(letter, errorCode);
+ s.append(valueChars[value], errorCode);
+}
+
+} // namespace
+
+int32_t
+RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
+ char *buffer, int32_t capacity,
+ UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(buffer == nullptr ? capacity != 0 : capacity < 0) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+ if(locale == nullptr) {
+ locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
+ }
+
+ char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
+ int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
+ "collation", locale,
+ nullptr, &errorCode);
+ if(U_FAILURE(errorCode)) { return 0; }
+ resultLocale[length] = 0;
+
+ // Append items in alphabetic order of their short definition letters.
+ CharString result;
+ char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
+
+ if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
+ appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
+ }
+ // ATTR_VARIABLE_TOP not supported because 'B' was broken.
+ // See ICU tickets #10372 and #10386.
+ if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
+ appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
+ }
+ if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
+ appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
+ }
+ if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
+ appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
+ }
+ if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
+ appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
+ }
+ // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
+ length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
+ appendSubtag(result, 'K', subtag, length, errorCode);
+ length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
+ if (length == 0) {
+ appendSubtag(result, 'L', "root", 4, errorCode);
+ } else {
+ appendSubtag(result, 'L', subtag, length, errorCode);
+ }
+ if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
+ appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
+ }
+ length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
+ appendSubtag(result, 'R', subtag, length, errorCode);
+ if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
+ appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
+ }
+ length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
+ appendSubtag(result, 'V', subtag, length, errorCode);
+ length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
+ appendSubtag(result, 'Z', subtag, length, errorCode);
+
+ if(U_FAILURE(errorCode)) { return 0; }
+ return result.extract(buffer, capacity, errorCode);
+}
+
+UBool
+RuleBasedCollator::isUnsafe(UChar32 c) const {
+ return data->isUnsafeBackward(c, settings->isNumeric());
+}
+
+void U_CALLCONV
+RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
+ t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
+}
+
+UBool
+RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
+ umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
+ return U_SUCCESS(errorCode);
+}
+
+CollationElementIterator *
+RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ if(!initMaxExpansions(errorCode)) { return nullptr; }
+ CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
+ if(U_FAILURE(errorCode)) {
+ delete cei;
+ return nullptr;
+ }
+ return cei;
+}
+
+CollationElementIterator *
+RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ if(!initMaxExpansions(errorCode)) { return nullptr; }
+ CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
+ if(U_FAILURE(errorCode)) {
+ delete cei;
+ return nullptr;
+ }
+ return cei;
+}
+
+int32_t
+RuleBasedCollator::getMaxExpansion(int32_t order) const {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ (void)initMaxExpansions(errorCode);
+ return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
+}
+
+U_NAMESPACE_END
+
+#endif // !UCONFIG_NO_COLLATION