diff options
Diffstat (limited to 'intl/icu/source/common/uset.cpp')
-rw-r--r-- | intl/icu/source/common/uset.cpp | 641 |
1 files changed, 641 insertions, 0 deletions
diff --git a/intl/icu/source/common/uset.cpp b/intl/icu/source/common/uset.cpp new file mode 100644 index 0000000000..eae7981d52 --- /dev/null +++ b/intl/icu/source/common/uset.cpp @@ -0,0 +1,641 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2002-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uset.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002mar07 +* created by: Markus W. Scherer +* +* There are functions to efficiently serialize a USet into an array of uint16_t +* and functions to use such a serialized form efficiently without +* instantiating a new USet. +*/ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "unicode/uset.h" +#include "unicode/uniset.h" +#include "cmemory.h" +#include "unicode/ustring.h" +#include "unicode/parsepos.h" + +U_NAMESPACE_USE + +U_CAPI USet* U_EXPORT2 +uset_openEmpty() { + return (USet*) new UnicodeSet(); +} + +U_CAPI USet* U_EXPORT2 +uset_open(UChar32 start, UChar32 end) { + return (USet*) new UnicodeSet(start, end); +} + +U_CAPI void U_EXPORT2 +uset_close(USet* set) { + delete (UnicodeSet*) set; +} + +U_CAPI USet * U_EXPORT2 +uset_clone(const USet *set) { + return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); +} + +U_CAPI UBool U_EXPORT2 +uset_isFrozen(const USet *set) { + return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); +} + +U_CAPI void U_EXPORT2 +uset_freeze(USet *set) { + ((UnicodeSet*) set)->UnicodeSet::freeze(); +} + +U_CAPI USet * U_EXPORT2 +uset_cloneAsThawed(const USet *set) { + return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); +} + +U_CAPI void U_EXPORT2 +uset_set(USet* set, + UChar32 start, UChar32 end) { + ((UnicodeSet*) 
set)->UnicodeSet::set(start, end); +} + +U_CAPI void U_EXPORT2 +uset_addAll(USet* set, const USet *additionalSet) { + ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); +} + +U_CAPI void U_EXPORT2 +uset_add(USet* set, UChar32 c) { + ((UnicodeSet*) set)->UnicodeSet::add(c); +} + +U_CAPI void U_EXPORT2 +uset_addRange(USet* set, UChar32 start, UChar32 end) { + ((UnicodeSet*) set)->UnicodeSet::add(start, end); +} + +U_CAPI void U_EXPORT2 +uset_addString(USet* set, const UChar* str, int32_t strLen) { + // UnicodeString handles -1 for strLen + UnicodeString s(strLen<0, str, strLen); + ((UnicodeSet*) set)->UnicodeSet::add(s); +} + +U_CAPI void U_EXPORT2 +uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { + // UnicodeString handles -1 for strLen + UnicodeString s(str, strLen); + ((UnicodeSet*) set)->UnicodeSet::addAll(s); +} + +U_CAPI void U_EXPORT2 +uset_remove(USet* set, UChar32 c) { + ((UnicodeSet*) set)->UnicodeSet::remove(c); +} + +U_CAPI void U_EXPORT2 +uset_removeRange(USet* set, UChar32 start, UChar32 end) { + ((UnicodeSet*) set)->UnicodeSet::remove(start, end); +} + +U_CAPI void U_EXPORT2 +uset_removeString(USet* set, const UChar* str, int32_t strLen) { + UnicodeString s(strLen==-1, str, strLen); + ((UnicodeSet*) set)->UnicodeSet::remove(s); +} + +U_CAPI void U_EXPORT2 +uset_removeAll(USet* set, const USet* remove) { + ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); +} + +U_CAPI void U_EXPORT2 +uset_retain(USet* set, UChar32 start, UChar32 end) { + ((UnicodeSet*) set)->UnicodeSet::retain(start, end); +} + +U_CAPI void U_EXPORT2 +uset_retainAll(USet* set, const USet* retain) { + ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); +} + +U_CAPI void U_EXPORT2 +uset_compact(USet* set) { + ((UnicodeSet*) set)->UnicodeSet::compact(); +} + +U_CAPI void U_EXPORT2 +uset_complement(USet* set) { + ((UnicodeSet*) set)->UnicodeSet::complement(); +} + +U_CAPI void U_EXPORT2 
+uset_complementAll(USet* set, const USet* complement) { + ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); +} + +U_CAPI void U_EXPORT2 +uset_clear(USet* set) { + ((UnicodeSet*) set)->UnicodeSet::clear(); +} + +U_CAPI void U_EXPORT2 +uset_removeAllStrings(USet* set) { + ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); +} + +U_CAPI UBool U_EXPORT2 +uset_isEmpty(const USet* set) { + return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); +} + +U_CAPI UBool U_EXPORT2 +uset_contains(const USet* set, UChar32 c) { + return ((const UnicodeSet*) set)->UnicodeSet::contains(c); +} + +U_CAPI UBool U_EXPORT2 +uset_containsRange(const USet* set, UChar32 start, UChar32 end) { + return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); +} + +U_CAPI UBool U_EXPORT2 +uset_containsString(const USet* set, const UChar* str, int32_t strLen) { + UnicodeString s(strLen==-1, str, strLen); + return ((const UnicodeSet*) set)->UnicodeSet::contains(s); +} + +U_CAPI UBool U_EXPORT2 +uset_containsAll(const USet* set1, const USet* set2) { + return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); +} + +U_CAPI UBool U_EXPORT2 +uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { + // Create a string alias, since nothing is being added to the set. 
+ UnicodeString s(strLen==-1, str, strLen); + return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); +} + +U_CAPI UBool U_EXPORT2 +uset_containsNone(const USet* set1, const USet* set2) { + return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); +} + +U_CAPI UBool U_EXPORT2 +uset_containsSome(const USet* set1, const USet* set2) { + return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); +} + +U_CAPI int32_t U_EXPORT2 +uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { + return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); +} + +U_CAPI int32_t U_EXPORT2 +uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { + return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); +} + +U_CAPI int32_t U_EXPORT2 +uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { + return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); +} + +U_CAPI int32_t U_EXPORT2 +uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { + return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); +} + +U_CAPI UBool U_EXPORT2 +uset_equals(const USet* set1, const USet* set2) { + return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; +} + +U_CAPI int32_t U_EXPORT2 +uset_indexOf(const USet* set, UChar32 c) { + return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); +} + +U_CAPI UChar32 U_EXPORT2 +uset_charAt(const USet* set, int32_t index) { + return ((UnicodeSet*) set)->UnicodeSet::charAt(index); +} + +U_CAPI int32_t U_EXPORT2 +uset_size(const USet* set) { + return ((const UnicodeSet*) set)->UnicodeSet::size(); +} + +U_NAMESPACE_BEGIN +/** + * This class only exists to provide access to the UnicodeSet private + * USet support API. 
Declaring a class a friend is more portable than + * trying to declare extern "C" functions as friends. + */ +class USetAccess /* not : public UObject because all methods are static */ { +public: + /* Try to have the compiler inline these*/ + inline static int32_t getStringCount(const UnicodeSet& set) { + return set.stringsSize(); + } + inline static const UnicodeString* getString(const UnicodeSet& set, + int32_t i) { + return set.getString(i); + } +private: + /* do not instantiate*/ + USetAccess(); +}; +U_NAMESPACE_END + +U_CAPI int32_t U_EXPORT2 +uset_getItemCount(const USet* uset) { + const UnicodeSet& set = *(const UnicodeSet*)uset; + return set.getRangeCount() + USetAccess::getStringCount(set); +} + +U_CAPI int32_t U_EXPORT2 +uset_getItem(const USet* uset, int32_t itemIndex, + UChar32* start, UChar32* end, + UChar* str, int32_t strCapacity, + UErrorCode* ec) { + if (U_FAILURE(*ec)) return 0; + const UnicodeSet& set = *(const UnicodeSet*)uset; + int32_t rangeCount; + + if (itemIndex < 0) { + *ec = U_ILLEGAL_ARGUMENT_ERROR; + return -1; + } else if (itemIndex < (rangeCount = set.getRangeCount())) { + *start = set.getRangeStart(itemIndex); + *end = set.getRangeEnd(itemIndex); + return 0; + } else { + itemIndex -= rangeCount; + if (itemIndex < USetAccess::getStringCount(set)) { + const UnicodeString* s = USetAccess::getString(set, itemIndex); + return s->extract(str, strCapacity, *ec); + } else { + *ec = U_INDEX_OUTOFBOUNDS_ERROR; + return -1; + } + } +} + +//U_CAPI int32_t U_EXPORT2 +//uset_getRangeCount(const USet* set) { +// return ((const UnicodeSet*) set)->getRangeCount(); +//} +// +//U_CAPI UBool U_EXPORT2 +//uset_getRange(const USet* set, int32_t rangeIndex, +// UChar32* pStart, UChar32* pEnd) { +// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { +// return FALSE; +// } +// const UnicodeSet* us = (const UnicodeSet*) set; +// *pStart = us->getRangeStart(rangeIndex); +// *pEnd = us->getRangeEnd(rangeIndex); +// return TRUE; +//} + +/* + * 
Serialize a USet into 16-bit units. + * Store BMP code points as themselves with one 16-bit unit each. + * + * Important: the code points in the array are in ascending order, + * therefore all BMP code points precede all supplementary code points. + * + * Store each supplementary code point in 2 16-bit units, + * simply with higher-then-lower 16-bit halfs. + * + * Precede the entire list with the length. + * If there are supplementary code points, then set bit 15 in the length + * and add the bmpLength between it and the array. + * + * In other words: + * - all BMP: (length=bmpLength) BMP, .., BMP + * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. + */ +U_CAPI int32_t U_EXPORT2 +uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) { + if (ec==NULL || U_FAILURE(*ec)) { + return 0; + } + + return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec); +} + +U_CAPI UBool U_EXPORT2 +uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { + int32_t length; + + if(fillSet==NULL) { + return FALSE; + } + if(src==NULL || srcLength<=0) { + fillSet->length=fillSet->bmpLength=0; + return FALSE; + } + + length=*src++; + if(length&0x8000) { + /* there are supplementary values */ + length&=0x7fff; + if(srcLength<(2+length)) { + fillSet->length=fillSet->bmpLength=0; + return FALSE; + } + fillSet->bmpLength=*src++; + } else { + /* only BMP values */ + if(srcLength<(1+length)) { + fillSet->length=fillSet->bmpLength=0; + return FALSE; + } + fillSet->bmpLength=length; + } + fillSet->array=src; + fillSet->length=length; + return TRUE; +} + +U_CAPI void U_EXPORT2 +uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { + if(fillSet==NULL || (uint32_t)c>0x10ffff) { + return; + } + + fillSet->array=fillSet->staticArray; + if(c<0xffff) { + fillSet->bmpLength=fillSet->length=2; + fillSet->staticArray[0]=(uint16_t)c; + 
fillSet->staticArray[1]=(uint16_t)c+1; + } else if(c==0xffff) { + fillSet->bmpLength=1; + fillSet->length=3; + fillSet->staticArray[0]=0xffff; + fillSet->staticArray[1]=1; + fillSet->staticArray[2]=0; + } else if(c<0x10ffff) { + fillSet->bmpLength=0; + fillSet->length=4; + fillSet->staticArray[0]=(uint16_t)(c>>16); + fillSet->staticArray[1]=(uint16_t)c; + ++c; + fillSet->staticArray[2]=(uint16_t)(c>>16); + fillSet->staticArray[3]=(uint16_t)c; + } else /* c==0x10ffff */ { + fillSet->bmpLength=0; + fillSet->length=2; + fillSet->staticArray[0]=0x10; + fillSet->staticArray[1]=0xffff; + } +} + +U_CAPI UBool U_EXPORT2 +uset_serializedContains(const USerializedSet* set, UChar32 c) { + const uint16_t* array; + + if(set==NULL || (uint32_t)c>0x10ffff) { + return FALSE; + } + + array=set->array; + if(c<=0xffff) { + /* find c in the BMP part */ + int32_t lo = 0; + int32_t hi = set->bmpLength-1; + if (c < array[0]) { + hi = 0; + } else if (c < array[hi]) { + for(;;) { + int32_t i = (lo + hi) >> 1; + if (i == lo) { + break; // Done! + } else if (c < array[i]) { + hi = i; + } else { + lo = i; + } + } + } else { + hi += 1; + } + return (UBool)(hi&1); + } else { + /* find c in the supplementary part */ + uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; + int32_t base = set->bmpLength; + int32_t lo = 0; + int32_t hi = set->length - 2 - base; + if (high < array[base] || (high==array[base] && low<array[base+1])) { + hi = 0; + } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) { + for (;;) { + int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result + int32_t iabs = i + base; + if (i == lo) { + break; // Done! 
+ } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) { + hi = i; + } else { + lo = i; + } + } + } else { + hi += 2; + } + /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ + return (UBool)(((hi+(base<<1))&2)!=0); + } +} + +U_CAPI int32_t U_EXPORT2 +uset_getSerializedRangeCount(const USerializedSet* set) { + if(set==NULL) { + return 0; + } + + return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2; +} + +U_CAPI UBool U_EXPORT2 +uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, + UChar32* pStart, UChar32* pEnd) { + const uint16_t* array; + int32_t bmpLength, length; + + if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { + return FALSE; + } + + array=set->array; + length=set->length; + bmpLength=set->bmpLength; + + rangeIndex*=2; /* address start/limit pairs */ + if(rangeIndex<bmpLength) { + *pStart=array[rangeIndex++]; + if(rangeIndex<bmpLength) { + *pEnd=array[rangeIndex]-1; + } else if(rangeIndex<length) { + *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; + } else { + *pEnd=0x10ffff; + } + return TRUE; + } else { + rangeIndex-=bmpLength; + rangeIndex*=2; /* address pairs of pairs of units */ + length-=bmpLength; + if(rangeIndex<length) { + array+=bmpLength; + *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1]; + rangeIndex+=2; + if(rangeIndex<length) { + *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; + } else { + *pEnd=0x10ffff; + } + return TRUE; + } else { + return FALSE; + } + } +} + +// TODO The old, internal uset.c had an efficient uset_containsOne function. +// Returned the one and only code point, or else -1 or something. +// Consider adding such a function to both C and C++ UnicodeSet/uset. +// See tools/gennorm/store.c for usage, now usetContainsOne there. + +// TODO Investigate incorporating this code into UnicodeSet to improve +// efficiency. 
// ---
// #define USET_GROW_DELTA 20
//
// static int32_t
// findChar(const UChar32* array, int32_t length, UChar32 c) {
//     int32_t i;
//
//     /* check the last range limit first for more efficient appending */
//     if(length>0) {
//         if(c>=array[length-1]) {
//             return length;
//         }
//
//         /* do not check the last range limit again in the loop below */
//         --length;
//     }
//
//     for(i=0; i<length && c>=array[i]; ++i) {}
//     return i;
// }
//
// static UBool
// addRemove(USet* set, UChar32 c, int32_t doRemove) {
//     int32_t i, length, more;
//
//     if(set==NULL || (uint32_t)c>0x10ffff) {
//         return FALSE;
//     }
//
//     length=set->length;
//     i=findChar(set->array, length, c);
//     if((i&1)^doRemove) {
//         /* c is already in the set */
//         return TRUE;
//     }
//
//     /* how many more array items do we need? */
//     if(i<length && (c+1)==set->array[i]) {
//         /* c is just before the following range, extend that in-place by one */
//         set->array[i]=c;
//         if(i>0) {
//             --i;
//             if(c==set->array[i]) {
//                 /* the previous range collapsed, remove it */
//                 set->length=length-=2;
//                 if(i<length) {
//                     uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
//                 }
//             }
//         }
//         return TRUE;
//     } else if(i>0 && c==set->array[i-1]) {
//         /* c is just after the previous range, extend that in-place by one */
//         if(++c<=0x10ffff) {
//             set->array[i-1]=c;
//             if(i<length && c==set->array[i]) {
//                 /* the following range collapsed, remove it */
//                 --i;
//                 set->length=length-=2;
//                 if(i<length) {
//                     uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
//                 }
//             }
//         } else {
//             /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
//             set->length=i-1;
//         }
//         return TRUE;
//     } else if(i==length && c==0x10ffff) {
//         /* insert one range limit c */
//         more=1;
//     } else {
//         /* insert two range limits c, c+1 */
//         more=2;
//     }
//
//     /* insert <more> range limits */
//     if(length+more>set->capacity) {
//         /* reallocate */
//         int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
//         UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
//         if(newArray==NULL) {
//             return FALSE;
//         }
//         set->capacity=newCapacity;
//         uprv_memcpy(newArray, set->array, length*4);
//
//         if(set->array!=set->staticBuffer) {
//             uprv_free(set->array);
//         }
//         set->array=newArray;
//     }
//
//     if(i<length) {
//         uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
//     }
//     set->array[i]=c;
//     if(more==2) {
//         set->array[i+1]=c+1;
//     }
//     set->length+=more;
//
//     return TRUE;
// }
//
// U_CAPI UBool U_EXPORT2
// uset_add(USet* set, UChar32 c) {
//     return addRemove(set, c, 0);
// }
//
// U_CAPI void U_EXPORT2
// uset_remove(USet* set, UChar32 c) {
//     addRemove(set, c, 1);
// }