diff options
Diffstat (limited to 'intl/icu/source/i18n/collationrootelements.cpp')
-rw-r--r-- | intl/icu/source/i18n/collationrootelements.cpp | 341 |
1 files changed, 341 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/collationrootelements.cpp b/intl/icu/source/i18n/collationrootelements.cpp new file mode 100644 index 0000000000..9b46d14144 --- /dev/null +++ b/intl/icu/source/i18n/collationrootelements.cpp @@ -0,0 +1,341 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationrootelements.cpp +* +* created on: 2013mar05 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "collation.h" +#include "collationrootelements.h" +#include "uassert.h" + +U_NAMESPACE_BEGIN + +int64_t +CollationRootElements::lastCEWithPrimaryBefore(uint32_t p) const { + if(p == 0) { return 0; } + U_ASSERT(p > elements[elements[IX_FIRST_PRIMARY_INDEX]]); + int32_t index = findP(p); + uint32_t q = elements[index]; + uint32_t secTer; + if(p == (q & 0xffffff00)) { + // p == elements[index] is a root primary. Find the CE before it. + // We must not be in a primary range. + U_ASSERT((q & PRIMARY_STEP_MASK) == 0); + secTer = elements[index - 1]; + if((secTer & SEC_TER_DELTA_FLAG) == 0) { + // Primary CE just before p. + p = secTer & 0xffffff00; + secTer = Collation::COMMON_SEC_AND_TER_CE; + } else { + // secTer = last secondary & tertiary for the previous primary + index -= 2; + for(;;) { + p = elements[index]; + if((p & SEC_TER_DELTA_FLAG) == 0) { + p &= 0xffffff00; + break; + } + --index; + } + } + } else { + // p > elements[index] which is the previous primary. + // Find the last secondary & tertiary weights for it. + p = q & 0xffffff00; + secTer = Collation::COMMON_SEC_AND_TER_CE; + for(;;) { + q = elements[++index]; + if((q & SEC_TER_DELTA_FLAG) == 0) { + // We must not be in a primary range. + U_ASSERT((q & PRIMARY_STEP_MASK) == 0); + break; + } + secTer = q; + } + } + return ((int64_t)p << 32) | (secTer & ~SEC_TER_DELTA_FLAG); +} + +int64_t +CollationRootElements::firstCEWithPrimaryAtLeast(uint32_t p) const { + if(p == 0) { return 0; } + int32_t index = findP(p); + if(p != (elements[index] & 0xffffff00)) { + for(;;) { + p = elements[++index]; + if((p & SEC_TER_DELTA_FLAG) == 0) { + // First primary after p. We must not be in a primary range. + U_ASSERT((p & PRIMARY_STEP_MASK) == 0); + break; + } + } + } + // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0. + return ((int64_t)p << 32) | Collation::COMMON_SEC_AND_TER_CE; +} + +uint32_t +CollationRootElements::getPrimaryBefore(uint32_t p, UBool isCompressible) const { + int32_t index = findPrimary(p); + int32_t step; + uint32_t q = elements[index]; + if(p == (q & 0xffffff00)) { + // Found p itself. Return the previous primary. + // See if p is at the end of a previous range. + step = (int32_t)q & PRIMARY_STEP_MASK; + if(step == 0) { + // p is not at the end of a range. Look for the previous primary. + do { + p = elements[--index]; + } while((p & SEC_TER_DELTA_FLAG) != 0); + return p & 0xffffff00; + } + } else { + // p is in a range, and not at the start. + uint32_t nextElement = elements[index + 1]; + U_ASSERT(isEndOfPrimaryRange(nextElement)); + step = (int32_t)nextElement & PRIMARY_STEP_MASK; + } + // Return the previous range primary. + if((p & 0xffff) == 0) { + return Collation::decTwoBytePrimaryByOneStep(p, isCompressible, step); + } else { + return Collation::decThreeBytePrimaryByOneStep(p, isCompressible, step); + } +} + +uint32_t +CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const { + int32_t index; + uint32_t previousSec, sec; + if(p == 0) { + index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; + // Gap at the beginning of the secondary CE range. + previousSec = 0; + sec = elements[index] >> 16; + } else { + index = findPrimary(p) + 1; + previousSec = Collation::BEFORE_WEIGHT16; + sec = getFirstSecTerForPrimary(index) >> 16; + } + U_ASSERT(s >= sec); + while(s > sec) { + previousSec = sec; + U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0); + sec = elements[index++] >> 16; + } + U_ASSERT(sec == s); + return previousSec; +} + +uint32_t +CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const { + U_ASSERT((t & ~Collation::ONLY_TERTIARY_MASK) == 0); + int32_t index; + uint32_t previousTer, secTer; + if(p == 0) { + if(s == 0) { + index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX]; + // Gap at the beginning of the tertiary CE range. + previousTer = 0; + } else { + index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; + previousTer = Collation::BEFORE_WEIGHT16; + } + secTer = elements[index] & ~SEC_TER_DELTA_FLAG; + } else { + index = findPrimary(p) + 1; + previousTer = Collation::BEFORE_WEIGHT16; + secTer = getFirstSecTerForPrimary(index); + } + uint32_t st = (s << 16) | t; + while(st > secTer) { + if((secTer >> 16) == s) { previousTer = secTer; } + U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0); + secTer = elements[index++] & ~SEC_TER_DELTA_FLAG; + } + U_ASSERT(secTer == st); + return previousTer & 0xffff; +} + +uint32_t +CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const { + U_ASSERT(p == (elements[index] & 0xffffff00) || isEndOfPrimaryRange(elements[index + 1])); + uint32_t q = elements[++index]; + int32_t step; + if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int32_t)q & PRIMARY_STEP_MASK) != 0) { + // Return the next primary in this range. + if((p & 0xffff) == 0) { + return Collation::incTwoBytePrimaryByOffset(p, isCompressible, step); + } else { + return Collation::incThreeBytePrimaryByOffset(p, isCompressible, step); + } + } else { + // Return the next primary in the list. + while((q & SEC_TER_DELTA_FLAG) != 0) { + q = elements[++index]; + } + U_ASSERT((q & PRIMARY_STEP_MASK) == 0); + return q; + } +} + +uint32_t +CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const { + uint32_t secTer; + uint32_t secLimit; + if(index == 0) { + // primary = 0 + U_ASSERT(s != 0); + index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; + secTer = elements[index]; + // Gap at the end of the secondary CE range. + secLimit = 0x10000; + } else { + U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]); + secTer = getFirstSecTerForPrimary(index + 1); + // If this is an explicit sec/ter unit, then it will be read once more. + // Gap for secondaries of primary CEs. + secLimit = getSecondaryBoundary(); + } + for(;;) { + uint32_t sec = secTer >> 16; + if(sec > s) { return sec; } + secTer = elements[++index]; + if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; } + } +} + +uint32_t +CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const { + uint32_t secTer; + uint32_t terLimit; + if(index == 0) { + // primary = 0 + if(s == 0) { + U_ASSERT(t != 0); + index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX]; + // Gap at the end of the tertiary CE range. + terLimit = 0x4000; + } else { + index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; + // Gap for tertiaries of primary/secondary CEs. + terLimit = getTertiaryBoundary(); + } + secTer = elements[index] & ~SEC_TER_DELTA_FLAG; + } else { + U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]); + secTer = getFirstSecTerForPrimary(index + 1); + // If this is an explicit sec/ter unit, then it will be read once more. + terLimit = getTertiaryBoundary(); + } + uint32_t st = (s << 16) | t; + for(;;) { + if(secTer > st) { + U_ASSERT((secTer >> 16) == s); + return secTer & 0xffff; + } + secTer = elements[++index]; + // No tertiary greater than t for this primary+secondary. + if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; } + secTer &= ~SEC_TER_DELTA_FLAG; + } +} + +uint32_t +CollationRootElements::getFirstSecTerForPrimary(int32_t index) const { + uint32_t secTer = elements[index]; + if((secTer & SEC_TER_DELTA_FLAG) == 0) { + // No sec/ter delta. + return Collation::COMMON_SEC_AND_TER_CE; + } + secTer &= ~SEC_TER_DELTA_FLAG; + if(secTer > Collation::COMMON_SEC_AND_TER_CE) { + // Implied sec/ter. + return Collation::COMMON_SEC_AND_TER_CE; + } + // Explicit sec/ter below common/common. + return secTer; +} + +int32_t +CollationRootElements::findPrimary(uint32_t p) const { + // Requirement: p must occur as a root primary. + U_ASSERT((p & 0xff) == 0); // at most a 3-byte primary + int32_t index = findP(p); + // If p is in a range, then we just assume that p is an actual primary in this range. + // (Too cumbersome/expensive to check.) + // Otherwise, it must be an exact match. + U_ASSERT(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00)); + return index; +} + +int32_t +CollationRootElements::findP(uint32_t p) const { + // p need not occur as a root primary. + // For example, it might be a reordering group boundary. + U_ASSERT((p >> 24) != Collation::UNASSIGNED_IMPLICIT_BYTE); + // modified binary search + int32_t start = (int32_t)elements[IX_FIRST_PRIMARY_INDEX]; + U_ASSERT(p >= elements[start]); + int32_t limit = length - 1; + U_ASSERT(elements[limit] >= PRIMARY_SENTINEL); + U_ASSERT(p < elements[limit]); + while((start + 1) < limit) { + // Invariant: elements[start] and elements[limit] are primaries, + // and elements[start]<=p<=elements[limit]. + int32_t i = (start + limit) / 2; + uint32_t q = elements[i]; + if((q & SEC_TER_DELTA_FLAG) != 0) { + // Find the next primary. + int32_t j = i + 1; + for(;;) { + if(j == limit) { break; } + q = elements[j]; + if((q & SEC_TER_DELTA_FLAG) == 0) { + i = j; + break; + } + ++j; + } + if((q & SEC_TER_DELTA_FLAG) != 0) { + // Find the preceding primary. + j = i - 1; + for(;;) { + if(j == start) { break; } + q = elements[j]; + if((q & SEC_TER_DELTA_FLAG) == 0) { + i = j; + break; + } + --j; + } + if((q & SEC_TER_DELTA_FLAG) != 0) { + // No primary between start and limit. + break; + } + } + } + if(p < (q & 0xffffff00)) { // Reset the "step" bits of a range end primary. + limit = i; + } else { + start = i; + } + } + return start; +} + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION |